version 0.1.8

cran · Jul 15, 2024 · 248f7a5 · 248f7a5
1 parent e7b3fb9
commit 248f7a5
Show file tree

Hide file tree

Showing 42 changed files with 1,428 additions and 426 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6  1,6 @@
 Package: ppendemic
 Title: A Glimpse at the Diversity of Peru's Endemic Plants
-Version: 0.1.7
 Version: 0.1.8
 Authors@R: 
       c(person(given = "Paul E.",
                  family = "Santos Andrade",
@@ -19,15  19,17 @@ BugReports: https://github.com/PaulESantos/ppendemic/issues/
 Suggests: knitr, rmarkdown, testthat (>= 3.0.0)
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
 RoxygenNote: 7.3.2
 Depends: R (>= 3.5.0),
 Config/testthat/edition: 3
 Maintainer: Paul E. Santos Andrade <[email protected]>
 Imports: assertthat, dplyr, fuzzyjoin, memoise, progress, purrr, readr,
         stringr, tibble, tidyr
 NeedsCompilation: no
-Packaged: 2023-08-15 02:26:15 UTC; user
 Packaged: 2024-07-14 04:31:52 UTC; PC
 Author: Paul E. Santos Andrade [aut, cre]
     (<https://orcid.org/0000-0002-6635-0375>),
   Lucely L. Vilca Bustamante [aut]
     (<https://orcid.org/0000-0002-5559-1296>)
 Repository: CRAN
-Date/Publication: 2023-08-15 07:20:02 UTC
 Date/Publication: 2024-07-14 04:50:02 UTC
diff --git a/LICENSE b/LICENSE
@@ -1,2  1,2 @@
-YEAR: 2023
-COPYRIGHT HOLDER: ppendemic authors
 YEAR: 2023
 COPYRIGHT HOLDER: ppendemic authors
diff --git a/MD5 b/MD5
@@ -1,18  1,24 @@
-b3dc77ff78d95bb0dbde3d6ebfa1c498 *DESCRIPTION
-fd383bdc9aa08b46370a203e3c651a4e *LICENSE
-af6d284bd741bf7d8ec1cd6eaf2dc648 *NAMESPACE
-0f95e1cddd39ee3d6fecef4badc812e9 *R/globals.R
-ad15518b7bedaa16ca61c40e2d7baa1e *R/internals.R
-22e972d4cd9ada21f1dc99302a094d69 *R/is_ppendemic.R
-0a4e91eb88d16853287bc3bf7ff5e215 *R/ppendemic_dat.R
-2349583c5d4c717339dcc85a636700a0 *R/ppendemic_data.R
-1f13ce6edf44b4985923d2ea7d8499a0 *R/sysdata.rda
 810f55de75a3bd3ded27347dbf51ffff *DESCRIPTION
 d24c50004d7b441ca2027e7b85288b1f *LICENSE
 c040ecce2caa9d86a35324b9b7e75b51 *NAMESPACE
 a69d0f26d546733a1cdda99c528cbc12 *R/direct_match.R
 9066d53ce8e58d3a02f048956491dc83 *R/direct_match_spp_genus.R
 e0761f5805f5b3c00706e0fd59d7a3c2 *R/fuzzy_match_genus.R
 77a05811b9aadfeee0962457713ef647 *R/fuzzy_match_infraspp_within_species.R
 2088f58a2edf511816eb6258cd4dd4a8 *R/fuzzy_match_spp_within_genus.R
 87a21cb1e6142ebde08d2cf22e1ce074 *R/genus_match.R
 c9fa02029e5c5f198bdb24c8dd4b8ca8 *R/internal.R
 0e4a4f23e36d2c11bd97a0a457d5753b *R/is_ppendemic.R
 a7b77459a7488e44dc9841a5968717a7 *R/matching_ppendemic.R
 0eaa28488c54a648d4b6719ed94c2a0a *R/ppendemic_data.R
 c31d7ebeba7818bfa98858381646caf0 *R/suffix_match_spp_within_genus.R
 e79c6a72b29b483f1ae2745db57ec8b4 *R/zzz.R
-017aa0dc52e282a6e1b09d84482df3b1 *README.md
-d6007103d1a3580681f024e683326241 *data/ppendemic_tab.rda
-833d6bde0eb3f8f8acabc53d90d21d9e *inst/CITATION
-7770d22872ba7a0f23b86caa13217f5c *man/figures/README-map-1.png
-d2e601e90792c40c216098bc5b7bf348 *man/figures/README-unnamed-chunk-2-1.png
 12dc1f4708809a78e7484b69866ddbe5 *README.md
 05402f63fdc9b1d2af54850d02b0fac7 *data/ppendemic_tab13.rda
 a05324e037698659f6ae911a7afc4679 *inst/CITATION
 811fd4e9702658c6dbe7982858ac7365 *man/direct_match.Rd
 72dcae1905cef5c7367ace03d06a13c4 *man/direct_match_species_within_genus_helper.Rd
 2ac2a1661d378eec9dab07606eaf16f9 *man/figures/README-unnamed-chunk-2-1.png
 cb1e46f469cfbbbde29c8b5113e1d789 *man/figures/lifecycle-archived.svg
 c0d2e5a54f1fa4ff02bf9533079dd1f7 *man/figures/lifecycle-defunct.svg
 a1b8c987c676c16af790f563f96cbb1f *man/figures/lifecycle-deprecated.svg
@@ -21,10  27,14 @@ c3978703d8f40f2679795335715e98f4 *man/figures/lifecycle-experimental.svg
 27b879bf3677ea76e3991d56ab324081 *man/figures/lifecycle-questioning.svg
 53b3f893324260b737b3c46ed2a0e643 *man/figures/lifecycle-stable.svg
 1c1fe7a759b86dc6dbcbe7797ab8246c *man/figures/lifecycle-superseded.svg
-d8e771a29f9a09acb1fa183e93b42ce2 *man/get_ppendemic_data.Rd
-236de5836bb7517b8181a95e606fd723 *man/is_ppendemic.Rd
-9c872cf7d009475faadeebe62d1e0bcb *man/ppendemic_tab.Rd
-001567bc6737897162e8708030a66f71 *tests/testthat.R
-e25127e6f251f675e97219c2c096c676 *tests/testthat/test_get_ppendemic_data.R
-7898bffae0ce54849f92b4b040e36161 *tests/testthat/test_is_ppendemic.R
-579579a50c3de31707581e8455ffef5e *tests/testthat/test_ppendemic_tab.R
 9653e144338ec8844e7168b1d9f77c89 *man/figures/ppendemic_logo.png
 e429f13bc5b56a5f2c40c7fe2bdb6230 *man/fuzzy_match_genus.Rd
 2b95125ede132ab7fdb8308fe7113d7b *man/fuzzy_match_infraspecies_within_species.Rd
 9d12c9cd4373f02cf99434f9efa8458d *man/fuzzy_match_species_within_genus_helper.Rd
 fd1da418078e49fc904c09ab3cbcd8d4 *man/genus_match.Rd
 447f8dac1ecaf159215c2d0afc71a392 *man/is_ppendemic.Rd
 6759a44fef2a0d0aa586c6a1511fe098 *man/matching_ppendemic.Rd
 b2f505fa9af41f16161aa0e5d6edce8a *man/ppendemic_tab13.Rd
 f2fcedce9ba941f36bb766f1b3a8aecf *man/suffix_match_species_within_genus_helper.Rd
 c3a47a9e5de2436dfea235e9cdbdb6ff *tests/testthat.R
 7ce1f27eace30050149983b65326bb84 *tests/testthat/test_matching_ppendemic.R
diff --git a/NAMESPACE b/NAMESPACE
@@ -1,4  1,4 @@
 # Generated by roxygen2: do not edit by hand
 
-export(get_ppendemic_data)
 export(is_ppendemic)
 export(matching_ppendemic)
diff --git a/R/direct_match.R b/R/direct_match.R
@@ -0,0  1,52 @@
 #' Direct Match
 #'
 #' @description
 #' This function performs a direct match of species names. It matches the genus and species if the name is binomial, and matches the genus, species, and infra species if the name includes a subspecies.
 #'
 #' @param df A tibble containing the species data to be matched.
 #' @param target_df A tibble representing the ppendemic database containing the reference list of endemic species.
 #'
 #' @return
 #' A tibble with an additional logical column direct_match indicating whether the binomial or trinomial name was successfully matched (`TRUE`) or not (`FALSE`).
 #'
 #' @keywords internal
 direct_match <- function(df, target_df = NULL){
   assertthat::assert_that(all(c('Orig.Genus',
                                 'Orig.Species',
                                 'Orig.Infraspecies') %in%
                                 colnames(df)))
 
   ## solve issue of empty input tibble, and needed to ensure
   ## compatilbility with sequential_matching: because there the
   ## columns already exists for the second backbone
   if(!all(c('direct_match') %in% colnames(df))){
     if(nrow(df) == 0){
       return(tibble::add_column(df, direct_match = NA))
     }
   }
 
   matched <- df |>
     dplyr::semi_join(target_df,
                      by = c('Orig.Genus' = 'Genus',
                             'Orig.Species' = 'Species',
                             'Orig.Infraspecies' = 'infraspecies')) |>
     dplyr::mutate(Matched.Genus = Orig.Genus,
                   Matched.Species = Orig.Species,
                   Matched.Infraspecies = Orig.Infraspecies)
   unmatched <- df |>
     dplyr::anti_join( target_df,
                       c('Orig.Genus' = 'Genus',
                         'Orig.Species' = 'Species',
                         'Orig.Infraspecies' = 'infraspecies'))
 
   assertthat::assert_that(nrow(df) == (nrow(matched)   nrow(unmatched)))
 
   # combine matched and unmatched and add Boolean indicator: TRUE = matched, FALSE = unmatched
   combined <-  dplyr::bind_rows(matched, unmatched,
                                 .id = 'direct_match') |>
     dplyr::mutate(direct_match = (direct_match == 1)) |>  ## convert to Boolean
     dplyr::relocate(c('Orig.Genus', 'Orig.Species', 'Orig.Infraspecies'))
   ## Genus & Species column at the beginning of tibble
 
   return(combined)
 }
diff --git a/R/direct_match_spp_genus.R b/R/direct_match_spp_genus.R
@@ -0,0  1,71 @@
 #' Direct Match Species within Genus
 #'
 #' @description
 #' This function performs a direct match of specific epithets within an already matched genus from the list of endemic species in the ppendemic database.
 #'
 #' @param df A tibble containing the species data to be matched.
 #' @param target_df A tibble representing the ppendemic database containing the reference list of endemic species.
 #'
 #' @return
 #' A tibble with an additional logical column indicating whether the specific epithet was successfully matched within the matched genus (`TRUE`) or not (`FALSE`).
 #'
 #' @keywords internal
 direct_match_species_within_genus_helper <- function(df, target_df){
   # subset database
   genus <- df |>
     dplyr::distinct(Matched.Genus) |>
     unlist()
 
   database_subset <- memoised_get_trees_of_genus(genus, target_df)
 
 
   # match specific epithet within genus
   matched <- df |>
     dplyr::semi_join(database_subset,
                      by = c('Orig.Species' = 'Species')) |>
     dplyr::mutate(Matched.Species = Orig.Species)
 
   unmatched <- df |>
     dplyr::anti_join(database_subset,
                      by = c('Orig.Species' = 'Species'))
 
   assertthat::assert_that(nrow(df) == (nrow(matched)   nrow(unmatched)))
 
   # combine matched and unmatched and add Boolean indicator: TRUE = matched, FALSE = unmatched
   combined <-  dplyr::bind_rows(matched, unmatched,
                                 .id = 'direct_match_species_within_genus') |>
     dplyr::mutate(direct_match_species_within_genus = (direct_match_species_within_genus == 1)) |>  ## convert to Boolean
     dplyr::relocate(c('Orig.Genus',
                       'Orig.Species',
                       'Orig.Infraspecies')) ## Genus & Species column at the beginning of tibble
 
   return(combined)
 }
 
 
 direct_match_species_within_genus <- function(df, target_df = NULL){
 
   assertthat::assert_that(all(c('Orig.Genus', 'Orig.Species',
                                 'Orig.Infraspecies',
                                 'Matched.Genus') %in% colnames(df)))
 
   ## solve issue of empty input tibble, and needed to ensure compatilbility with sequential_matching: because there the columns already exists for the second backbone
   if(nrow(df) == 0){
     if(!all(c('direct_match_species_within_genus') %in% colnames(df))){
       return(tibble::add_column(df,
                                 direct_match_species_within_genus = NA))
     }
     else{
       return(df)
     }
   }
 
     res <- df |>
     dplyr::group_by(Matched.Genus) |>
     dplyr::group_split() |>
     map_dfr_progress(direct_match_species_within_genus_helper,
                      target_df)
 
   return(res)
 }
 
diff --git a/R/fuzzy_match_genus.R b/R/fuzzy_match_genus.R
@@ -0,0  1,97 @@
 #' Fuzzy Match Genus Name
 #'
 #' @description
 #' This function performs a fuzzy match of genus names against the ppendemic database using fuzzyjoin::stringdist() to account for slight variations in spelling.
 #'
 #' @param df A tibble containing the genus names to be matched.
 #' @param target_df A tibble representing the ppendemic database containing the reference list of endemic species.
 #'
 #' @return
 #' A tibble with two additional columns:
 #' - fuzzy_match_genus: A logical column indicating whether the genus was successfully matched (`TRUE`) or not (`FALSE`).
 #' - fuzzy_genus_dist: A numeric column representing the distance for each match.
 #'
 #' @keywords internal
 fuzzy_match_genus <- function(df, target_df = NULL){
   assertthat::assert_that(all(c('Orig.Genus',
                                 'Orig.Species',
                                 'Orig.Infraspecies') %in% colnames(df)))
 
   ## solve issue of empty input tibble, and needed to ensure compatilbility with sequential_matching: because there the columns already exists for the second backbone
   if(nrow(df) == 0){
     if(!all(c('fuzzy_match_genus', 'fuzzy_genus_dist') %in% colnames(df))){
       return(tibble::add_column(df, fuzzy_match_genus = NA, fuzzy_genus_dist = NA))
     }
     else{
       return(df)
     }
   }
   ## solve issue in second iteration of sequential_matching: necessary to remove fuzzy_species_dist column: otherwise 2 columns are generated 'fuzzy_species_dist...1, fuzzy_species_dist...2'
   if('fuzzy_genus_dist' %in% colnames(df)){
     df <- df |>
       dplyr::mutate(fuzzy_genus_dist = NULL)
   } ## TODO: can potentially be removed again????
 
   Tree.Genera <- target_df |>
     dplyr::distinct(Genus)
   # fuzzy match
   matched_temp <- df |>
     fuzzyjoin::stringdist_left_join(Tree.Genera,
                                     by = c('Orig.Genus' = 'Genus'),
                                     max_dist = 1,
                                     distance_col = 'fuzzy_genus_dist') |>
     # save matched Genus name to Matched.Genus
     dplyr::mutate(Matched.Genus = Genus) |>
     dplyr::select(-c('Genus')) |>
     dplyr::group_by(Orig.Genus, Orig.Species) |>
     dplyr::filter(fuzzy_genus_dist == min(fuzzy_genus_dist))
 
 
   ## If there are multiple matches for the same genus: raise warning and advise for manual checking
   if(matched_temp |>
      dplyr::filter(dplyr::n() > 1) |>
      nrow() > 0){
     message("Multiple fuzzy matches for genera with similar string distance:
             Please consider curating the ambiguous entries by hand and re-run the pipeline.
             The ambiguous matched genera were saved to 'treemendous_ambiguous_genera.csv' in the current working directory.
              The algorithm will choose the first genus to continue.")
     #Do you want save a list of the ambiguous matched genera current working directory in 'treemendous_ambiguous_genera.csv'?")
     ## Save ambiguous genera for manual curation:
     matched_temp |>
       dplyr::filter(dplyr::n() > 1) |>
       dplyr::select(Genus.x, Species, Matched.Genus) |>
       readr::write_csv(file = 'ambiguous_genera.csv') ##
     ## Alternative Idea: prompt the user to insert the correct name. Caution here however because this might cause trouble with unit testing
   }
 
   ## continue selecting first genus if more than one match
   matched <- matched_temp |>
     dplyr::group_modify(
       ~ifelse(nrow(.x) == 0,
               return(.x),
               return(dplyr::slice_head(.x,n = 1)))
       ## In cases of multiple matches: we choose first match.
       ## Alternatively could use something more sophisticated here:
       ## like for instance choosing the one with more support (present
       ## in more databases)
     ) |>
     dplyr::ungroup()
 
 
   unmatched <- df |>
     fuzzyjoin::stringdist_anti_join(Tree.Genera,
                                     by = c('Orig.Genus' = 'Genus'),
                                     max_dist = 1)
   #unmatched
   assertthat::assert_that(nrow(df) == (nrow(matched)   nrow(unmatched)))
 
   res <-  dplyr::bind_rows(matched, unmatched,
                            .id = 'fuzzy_match_genus') |>
     dplyr::mutate(fuzzy_match_genus = (fuzzy_match_genus == 1)) |>  ## convert to Boolean
     dplyr::arrange(Orig.Genus, Orig.Species) |>
     dplyr::relocate(c('Orig.Genus',
                       'Orig.Species',
                       'Orig.Infraspecies')) ## Genus & Species column at the beginning of tibble
   #res
   return(res)
 }