Skip to content

Commit

Permalink
version 0.1.8
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulESantos authored and cran-robot committed Jul 15, 2024
1 parent e7b3fb9 commit 248f7a5
Show file tree
Hide file tree
Showing 42 changed files with 1,428 additions and 426 deletions.
10 changes: 6 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 1,6 @@
Package: ppendemic
Title: A Glimpse at the Diversity of Peru's Endemic Plants
Version: 0.1.7
Version: 0.1.8
Authors@R:
c(person(given = "Paul E.",
family = "Santos Andrade",
Expand All @@ -19,15 19,17 @@ BugReports: https://github.com/PaulESantos/ppendemic/issues/
Suggests: knitr, rmarkdown, testthat (>= 3.0.0)
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Depends: R (>= 3.5.0),
Config/testthat/edition: 3
Maintainer: Paul E. Santos Andrade <[email protected]>
Imports: assertthat, dplyr, fuzzyjoin, memoise, progress, purrr, readr,
stringr, tibble, tidyr
NeedsCompilation: no
Packaged: 2023-08-15 02:26:15 UTC; user
Packaged: 2024-07-14 04:31:52 UTC; PC
Author: Paul E. Santos Andrade [aut, cre]
(<https://orcid.org/0000-0002-6635-0375>),
Lucely L. Vilca Bustamante [aut]
(<https://orcid.org/0000-0002-5559-1296>)
Repository: CRAN
Date/Publication: 2023-08-15 07:20:02 UTC
Date/Publication: 2024-07-14 04:50:02 UTC
4 changes: 2 additions & 2 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -1,2 1,2 @@
YEAR: 2023
COPYRIGHT HOLDER: ppendemic authors
YEAR: 2023
COPYRIGHT HOLDER: ppendemic authors
52 changes: 31 additions & 21 deletions MD5
Original file line number Diff line number Diff line change
@@ -1,18 1,24 @@
b3dc77ff78d95bb0dbde3d6ebfa1c498 *DESCRIPTION
fd383bdc9aa08b46370a203e3c651a4e *LICENSE
af6d284bd741bf7d8ec1cd6eaf2dc648 *NAMESPACE
0f95e1cddd39ee3d6fecef4badc812e9 *R/globals.R
ad15518b7bedaa16ca61c40e2d7baa1e *R/internals.R
22e972d4cd9ada21f1dc99302a094d69 *R/is_ppendemic.R
0a4e91eb88d16853287bc3bf7ff5e215 *R/ppendemic_dat.R
2349583c5d4c717339dcc85a636700a0 *R/ppendemic_data.R
1f13ce6edf44b4985923d2ea7d8499a0 *R/sysdata.rda
810f55de75a3bd3ded27347dbf51ffff *DESCRIPTION
d24c50004d7b441ca2027e7b85288b1f *LICENSE
c040ecce2caa9d86a35324b9b7e75b51 *NAMESPACE
a69d0f26d546733a1cdda99c528cbc12 *R/direct_match.R
9066d53ce8e58d3a02f048956491dc83 *R/direct_match_spp_genus.R
e0761f5805f5b3c00706e0fd59d7a3c2 *R/fuzzy_match_genus.R
77a05811b9aadfeee0962457713ef647 *R/fuzzy_match_infraspp_within_species.R
2088f58a2edf511816eb6258cd4dd4a8 *R/fuzzy_match_spp_within_genus.R
87a21cb1e6142ebde08d2cf22e1ce074 *R/genus_match.R
c9fa02029e5c5f198bdb24c8dd4b8ca8 *R/internal.R
0e4a4f23e36d2c11bd97a0a457d5753b *R/is_ppendemic.R
a7b77459a7488e44dc9841a5968717a7 *R/matching_ppendemic.R
0eaa28488c54a648d4b6719ed94c2a0a *R/ppendemic_data.R
c31d7ebeba7818bfa98858381646caf0 *R/suffix_match_spp_within_genus.R
e79c6a72b29b483f1ae2745db57ec8b4 *R/zzz.R
017aa0dc52e282a6e1b09d84482df3b1 *README.md
d6007103d1a3580681f024e683326241 *data/ppendemic_tab.rda
833d6bde0eb3f8f8acabc53d90d21d9e *inst/CITATION
7770d22872ba7a0f23b86caa13217f5c *man/figures/README-map-1.png
d2e601e90792c40c216098bc5b7bf348 *man/figures/README-unnamed-chunk-2-1.png
12dc1f4708809a78e7484b69866ddbe5 *README.md
05402f63fdc9b1d2af54850d02b0fac7 *data/ppendemic_tab13.rda
a05324e037698659f6ae911a7afc4679 *inst/CITATION
811fd4e9702658c6dbe7982858ac7365 *man/direct_match.Rd
72dcae1905cef5c7367ace03d06a13c4 *man/direct_match_species_within_genus_helper.Rd
2ac2a1661d378eec9dab07606eaf16f9 *man/figures/README-unnamed-chunk-2-1.png
cb1e46f469cfbbbde29c8b5113e1d789 *man/figures/lifecycle-archived.svg
c0d2e5a54f1fa4ff02bf9533079dd1f7 *man/figures/lifecycle-defunct.svg
a1b8c987c676c16af790f563f96cbb1f *man/figures/lifecycle-deprecated.svg
Expand All @@ -21,10 27,14 @@ c3978703d8f40f2679795335715e98f4 *man/figures/lifecycle-experimental.svg
27b879bf3677ea76e3991d56ab324081 *man/figures/lifecycle-questioning.svg
53b3f893324260b737b3c46ed2a0e643 *man/figures/lifecycle-stable.svg
1c1fe7a759b86dc6dbcbe7797ab8246c *man/figures/lifecycle-superseded.svg
d8e771a29f9a09acb1fa183e93b42ce2 *man/get_ppendemic_data.Rd
236de5836bb7517b8181a95e606fd723 *man/is_ppendemic.Rd
9c872cf7d009475faadeebe62d1e0bcb *man/ppendemic_tab.Rd
001567bc6737897162e8708030a66f71 *tests/testthat.R
e25127e6f251f675e97219c2c096c676 *tests/testthat/test_get_ppendemic_data.R
7898bffae0ce54849f92b4b040e36161 *tests/testthat/test_is_ppendemic.R
579579a50c3de31707581e8455ffef5e *tests/testthat/test_ppendemic_tab.R
9653e144338ec8844e7168b1d9f77c89 *man/figures/ppendemic_logo.png
e429f13bc5b56a5f2c40c7fe2bdb6230 *man/fuzzy_match_genus.Rd
2b95125ede132ab7fdb8308fe7113d7b *man/fuzzy_match_infraspecies_within_species.Rd
9d12c9cd4373f02cf99434f9efa8458d *man/fuzzy_match_species_within_genus_helper.Rd
fd1da418078e49fc904c09ab3cbcd8d4 *man/genus_match.Rd
447f8dac1ecaf159215c2d0afc71a392 *man/is_ppendemic.Rd
6759a44fef2a0d0aa586c6a1511fe098 *man/matching_ppendemic.Rd
b2f505fa9af41f16161aa0e5d6edce8a *man/ppendemic_tab13.Rd
f2fcedce9ba941f36bb766f1b3a8aecf *man/suffix_match_species_within_genus_helper.Rd
c3a47a9e5de2436dfea235e9cdbdb6ff *tests/testthat.R
7ce1f27eace30050149983b65326bb84 *tests/testthat/test_matching_ppendemic.R
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,4 1,4 @@
# Generated by roxygen2: do not edit by hand

export(get_ppendemic_data)
export(is_ppendemic)
export(matching_ppendemic)
52 changes: 52 additions & 0 deletions R/direct_match.R
Original file line number Diff line number Diff line change
@@ -0,0 1,52 @@
#' Direct Match
#'
#' @description
#' This function performs a direct match of species names. It matches the genus and species if the name is binomial, and matches the genus, species, and infra species if the name includes a subspecies.
#'
#' @param df A tibble containing the species data to be matched.
#' @param target_df A tibble representing the ppendemic database containing the reference list of endemic species.
#'
#' @return
#' A tibble with an additional logical column direct_match indicating whether the binomial or trinomial name was successfully matched (`TRUE`) or not (`FALSE`).
#'
#' @keywords internal
direct_match <- function(df, target_df = NULL){
assertthat::assert_that(all(c('Orig.Genus',
'Orig.Species',
'Orig.Infraspecies') %in%
colnames(df)))

## solve issue of empty input tibble, and needed to ensure
## compatilbility with sequential_matching: because there the
## columns already exists for the second backbone
if(!all(c('direct_match') %in% colnames(df))){
if(nrow(df) == 0){
return(tibble::add_column(df, direct_match = NA))
}
}

matched <- df |>
dplyr::semi_join(target_df,
by = c('Orig.Genus' = 'Genus',
'Orig.Species' = 'Species',
'Orig.Infraspecies' = 'infraspecies')) |>
dplyr::mutate(Matched.Genus = Orig.Genus,
Matched.Species = Orig.Species,
Matched.Infraspecies = Orig.Infraspecies)
unmatched <- df |>
dplyr::anti_join( target_df,
c('Orig.Genus' = 'Genus',
'Orig.Species' = 'Species',
'Orig.Infraspecies' = 'infraspecies'))

assertthat::assert_that(nrow(df) == (nrow(matched) nrow(unmatched)))

# combine matched and unmatched and add Boolean indicator: TRUE = matched, FALSE = unmatched
combined <- dplyr::bind_rows(matched, unmatched,
.id = 'direct_match') |>
dplyr::mutate(direct_match = (direct_match == 1)) |> ## convert to Boolean
dplyr::relocate(c('Orig.Genus', 'Orig.Species', 'Orig.Infraspecies'))
## Genus & Species column at the beginning of tibble

return(combined)
}
71 changes: 71 additions & 0 deletions R/direct_match_spp_genus.R
Original file line number Diff line number Diff line change
@@ -0,0 1,71 @@
#' Direct Match Species within Genus
#'
#' @description
#' This function performs a direct match of specific epithets within an already matched genus from the list of endemic species in the ppendemic database.
#'
#' @param df A tibble containing the species data to be matched.
#' @param target_df A tibble representing the ppendemic database containing the reference list of endemic species.
#'
#' @return
#' A tibble with an additional logical column indicating whether the specific epithet was successfully matched within the matched genus (`TRUE`) or not (`FALSE`).
#'
#' @keywords internal
direct_match_species_within_genus_helper <- function(df, target_df){
# subset database
genus <- df |>
dplyr::distinct(Matched.Genus) |>
unlist()

database_subset <- memoised_get_trees_of_genus(genus, target_df)


# match specific epithet within genus
matched <- df |>
dplyr::semi_join(database_subset,
by = c('Orig.Species' = 'Species')) |>
dplyr::mutate(Matched.Species = Orig.Species)

unmatched <- df |>
dplyr::anti_join(database_subset,
by = c('Orig.Species' = 'Species'))

assertthat::assert_that(nrow(df) == (nrow(matched) nrow(unmatched)))

# combine matched and unmatched and add Boolean indicator: TRUE = matched, FALSE = unmatched
combined <- dplyr::bind_rows(matched, unmatched,
.id = 'direct_match_species_within_genus') |>
dplyr::mutate(direct_match_species_within_genus = (direct_match_species_within_genus == 1)) |> ## convert to Boolean
dplyr::relocate(c('Orig.Genus',
'Orig.Species',
'Orig.Infraspecies')) ## Genus & Species column at the beginning of tibble

return(combined)
}


direct_match_species_within_genus <- function(df, target_df = NULL){

assertthat::assert_that(all(c('Orig.Genus', 'Orig.Species',
'Orig.Infraspecies',
'Matched.Genus') %in% colnames(df)))

## solve issue of empty input tibble, and needed to ensure compatilbility with sequential_matching: because there the columns already exists for the second backbone
if(nrow(df) == 0){
if(!all(c('direct_match_species_within_genus') %in% colnames(df))){
return(tibble::add_column(df,
direct_match_species_within_genus = NA))
}
else{
return(df)
}
}

res <- df |>
dplyr::group_by(Matched.Genus) |>
dplyr::group_split() |>
map_dfr_progress(direct_match_species_within_genus_helper,
target_df)

return(res)
}

97 changes: 97 additions & 0 deletions R/fuzzy_match_genus.R
Original file line number Diff line number Diff line change
@@ -0,0 1,97 @@
#' Fuzzy Match Genus Name
#'
#' @description
#' This function performs a fuzzy match of genus names against the ppendemic database using fuzzyjoin::stringdist() to account for slight variations in spelling.
#'
#' @param df A tibble containing the genus names to be matched.
#' @param target_df A tibble representing the ppendemic database containing the reference list of endemic species.
#'
#' @return
#' A tibble with two additional columns:
#' - fuzzy_match_genus: A logical column indicating whether the genus was successfully matched (`TRUE`) or not (`FALSE`).
#' - fuzzy_genus_dist: A numeric column representing the distance for each match.
#'
#' @keywords internal
fuzzy_match_genus <- function(df, target_df = NULL){
assertthat::assert_that(all(c('Orig.Genus',
'Orig.Species',
'Orig.Infraspecies') %in% colnames(df)))

## solve issue of empty input tibble, and needed to ensure compatilbility with sequential_matching: because there the columns already exists for the second backbone
if(nrow(df) == 0){
if(!all(c('fuzzy_match_genus', 'fuzzy_genus_dist') %in% colnames(df))){
return(tibble::add_column(df, fuzzy_match_genus = NA, fuzzy_genus_dist = NA))
}
else{
return(df)
}
}
## solve issue in second iteration of sequential_matching: necessary to remove fuzzy_species_dist column: otherwise 2 columns are generated 'fuzzy_species_dist...1, fuzzy_species_dist...2'
if('fuzzy_genus_dist' %in% colnames(df)){
df <- df |>
dplyr::mutate(fuzzy_genus_dist = NULL)
} ## TODO: can potentially be removed again????

Tree.Genera <- target_df |>
dplyr::distinct(Genus)
# fuzzy match
matched_temp <- df |>
fuzzyjoin::stringdist_left_join(Tree.Genera,
by = c('Orig.Genus' = 'Genus'),
max_dist = 1,
distance_col = 'fuzzy_genus_dist') |>
# save matched Genus name to Matched.Genus
dplyr::mutate(Matched.Genus = Genus) |>
dplyr::select(-c('Genus')) |>
dplyr::group_by(Orig.Genus, Orig.Species) |>
dplyr::filter(fuzzy_genus_dist == min(fuzzy_genus_dist))


## If there are multiple matches for the same genus: raise warning and advise for manual checking
if(matched_temp |>
dplyr::filter(dplyr::n() > 1) |>
nrow() > 0){
message("Multiple fuzzy matches for genera with similar string distance:
Please consider curating the ambiguous entries by hand and re-run the pipeline.
The ambiguous matched genera were saved to 'treemendous_ambiguous_genera.csv' in the current working directory.
The algorithm will choose the first genus to continue.")
#Do you want save a list of the ambiguous matched genera current working directory in 'treemendous_ambiguous_genera.csv'?")
## Save ambiguous genera for manual curation:
matched_temp |>
dplyr::filter(dplyr::n() > 1) |>
dplyr::select(Genus.x, Species, Matched.Genus) |>
readr::write_csv(file = 'ambiguous_genera.csv') ##
## Alternative Idea: prompt the user to insert the correct name. Caution here however because this might cause trouble with unit testing
}

## continue selecting first genus if more than one match
matched <- matched_temp |>
dplyr::group_modify(
~ifelse(nrow(.x) == 0,
return(.x),
return(dplyr::slice_head(.x,n = 1)))
## In cases of multiple matches: we choose first match.
## Alternatively could use something more sophisticated here:
## like for instance choosing the one with more support (present
## in more databases)
) |>
dplyr::ungroup()


unmatched <- df |>
fuzzyjoin::stringdist_anti_join(Tree.Genera,
by = c('Orig.Genus' = 'Genus'),
max_dist = 1)
#unmatched
assertthat::assert_that(nrow(df) == (nrow(matched) nrow(unmatched)))

res <- dplyr::bind_rows(matched, unmatched,
.id = 'fuzzy_match_genus') |>
dplyr::mutate(fuzzy_match_genus = (fuzzy_match_genus == 1)) |> ## convert to Boolean
dplyr::arrange(Orig.Genus, Orig.Species) |>
dplyr::relocate(c('Orig.Genus',
'Orig.Species',
'Orig.Infraspecies')) ## Genus & Species column at the beginning of tibble
#res
return(res)
}
Loading

0 comments on commit 248f7a5

Please sign in to comment.