We are hoping to move the SEOSAW database to use World Flora Online (WFO) as its taxonomic backbone, rather than the African Plant Database (APD) which we have been using since 2019.
The WorldFlora R package , developed by Roeland Kindt, provides code for querying a downloaded copy of the WFO database and providing taxonomic information. The key features we want for SEOSAW are: check the validity of taxonomic names in tree inventory data, fuzzy find accepted names to catch spelling errors, and provide accepted names for synonyms.
I’ve written a wrapper function around the code in the WorldFlora R package. Some features:
- Optionally submit a lookup table to replace unmatched names.
- Consolidate consecutive whitespaces in the WFO database to a single space, which can lead to poor matching.
- Use
WorldFlora::WFO.prepare()
with a default set of replacements to fix common orthographic errors. - Use
WorldFlora::WFO.one()
to find the best match where multiple names are fuzzy matched byWorldFlora::WFO.match()
. - Extract subspecies and variety epithets from matched names.
- Optionally return unmatched or multiply-matched names.
#' Replace taxonomic names using lookup tables
#'
#' @param x vector of species names
#' @param lookup a single dataframe or a list of dataframes containing lookup
#' tables. The first column should contain names in `x` to be changed. The
#' second column should contain the new names.
#'
#' @return Vector of corrected species names
#'
#' @details Lookup tables are run in order through the list of lookup tables, meaning
#' names may change incrementally multiple times.
#'
#' @export
#'
synonymyFix <- function(x, lookup) {
# Make list if not already
if (!inherits(lookup, "list")) {
lookup <- list(lookup)
}
# Combine lookup tables into a single dataframe
lookup_combi <- as.data.frame(fastRbind(lookup))
# Check no NAs
if (any(is.na(lookup_combi))) {
stop("Lookup table cannot contain NA entries")
}
# Do substitution
out <- lookup_combi[,2][match(x, lookup_combi[,1])]
out[is.na(out)] <- x[is.na(out)]
return(out)
}
#' Return default pattern substitution for `taxonCheck()`
#'
#' @return vector of regex patterns for use with `taxonCheck()` in argument
#' `sub.pattern`
#'
#' @export
#'
WFO.prepare_default <- function() {
c(
" indet$",
" sp[.]",
" spp[.]",
" ssp[.]",
" pl[.]",
" indet[.]",
" ind[.]",
" gen[.]",
" g[.]",
" fam[.]",
" nov[.]",
" prox[.]",
" cf[.]",
" aff[.]",
" s[.]s[.]",
" s[.]l[.]",
" p[.]p[.]",
" p[.] p[.]",
"[?]",
" inc[.]",
" stet[.]",
"nom[.] cons[.]",
"nom[.] dub[.]",
" nom[.] err[.]",
" nom[.] illeg[.]",
" nom[.] inval[.]",
" nom[.] nov[.]",
" nom[.] nud[.]",
" nom[.] obl[.]",
" nom[.] prot[.]",
" nom[.] rej[.]",
" nom[.] supp[.]",
" sensu auct[.]"
)
}
#' Correct and match taxonomic names to the World Flora Taxonomic Backbone
#'
#' @param x vector of taxonomic names
#' @param WFO.file optional file name of static copy of World Flora Online
#' Taxonomic Backbone. If not NULL, data will be reloaded from this file
#' @param WFO.data optional dataset with static copy of World Flora Online
#' Taxonomic backbone. Ignored if `WFO.file` is not NULL
#' @param lookup optional a single dataframe or a list of dataframes containing
#' lookup tables. The first column should contain names in `x` to be
#' changed. The second column should contain the new names.
#' @param ret_wfo logical, if TRUE the function stops after
#' `WorldFlora::WFO.match()` and returns the raw output from this function.
#' @param ret_unk logical, if TRUE taxa not matched in the World
#' Flora Online are returned to the user as a vector containing
#' the unmatched values. If FALSE these taxa are returned as NA.
#' @param ret_multi logical, if TRUE taxa matching multiple records in the
#' World Flora Online are returned to the user as a list with one element
#' for each original name containing the unmatched values. If FALSE the
#' "best" name is selected by `WorldFlora::WFO.one()`
#' @param sub.pattern vector with regular expressions defining sections of `x`
#' to be removed during correction of common orthographic errors by
#' `WorldFlora::WFO.prepare()`
#' @param fuzzy If larger than 0, then attempt fuzzy matching. See `WorldFlora::WFO.match()`
#' @param ... Additional arguments passed to `WorldFlora::WFO.match()`
#'
#' @return Dataframe with cleaned taxonomic names and metadata
#'
#' @details
#' Taxonomic names are matched against the World Flora Online database using
#' `WorldFlora::WFO.match()`.
#'
#' The search algorithm is as follows:
#' \enumerate{
#' \item{Optionally replace names with `lookup`}
#' \item{Correct common orthographic errors with `WorldFlora::WFO.prepare()`}
#' \item{Query `WorldFlora::WFO.match()` for accepted
#' name and taxonomic rank information}
#' \item{Optionally return multiple matches or unsuccessful matches}
#' \item{Consolidate multiple matches with `WorldFlora::WFO.one()`}
#' \item{Return formatted dataframe}
#' }
#'
#' Names that cannot be matched should be replaced with "Indet indet" in
#' `lookup`. These are replaced with NA_character_ before `WorldFlora::WFO.match()`
#'
#' @importFrom data.table fread data.table
#' @importFrom WorldFlora WFO.prepare WFO.match WFO.one
#'
#' @export
#'
taxonCheck <- function(x, WFO.file = NULL, WFO.data = NULL,
lookup = NULL, ret_wfo = FALSE, ret_unk = FALSE, ret_multi = FALSE,
sub.pattern = WFO.prepare_default(), fuzzy = 0.1, ...) {
# Check WFO data is available
if (is.null(WFO.data) & is.null(WFO.file)) {
stop("Either WFO.data or WFO.file must be provided")
}
if (is.null(WFO.data)) {
message(paste("Reading WFO data"))
if (!file.exists(WFO.file)) {
stop("If WFO.data is NULL, a valid WFO.file must be provided. See WorldFlora::WFO.download()")
}
WFO.data <- data.table::fread(WFO.file, encoding = "UTF-8")
} else {
WFO.data <- data.table::data.table(WFO.data)
}
WFO.data$scientificName <- gsub("\\s+", " ", WFO.data$scientificName)
# Get unique taxonomic names
xu <- unique(x)
# Substitute names with lookup table
if (!is.null(lookup)) {
message("Substituting names with `lookup`")
xf <- synonymyFix(xu, lookup = lookup)
} else {
xf <- xu
}
# Prepare taxonomic names for WFO query
xs <- WorldFlora::WFO.prepare(xf, sub.pattern = sub.pattern)$spec.name
# Replace Indet genera with ""
xi <- xs
xi[xi == "Indet"] <- ""
# Run WFO matching
message("Querying World Flora Online")
wfo <- WorldFlora::WFO.match(unique(xi),
WFO.data = WFO.data, Fuzzy = fuzzy, ...)
# Optionally return raw WFO output
if (ret_wfo) {
# Add original names
wfo_all <- dplyr::bind_rows(lapply(seq_along(xu), function(i) {
orig <- xu[i]
cbind("taxon_name_orig" = orig, wfo[wfo$spec.name.ORIG == xi[i],])
}))
# Check all original names are matched back
stopifnot(all(!is.na(wfo_all$taxon_name_orig)))
return(wfo_all)
}
# Consolidate to single best name per taxon
wfo_one <- WorldFlora::WFO.one(wfo, verbose = FALSE)
# Add original names
wfo_one_all <- dplyr::bind_rows(lapply(seq_along(xu), function(i) {
orig <- xu[i]
cbind("taxon_name_orig" = orig, wfo_one[wfo_one$spec.name.ORIG == xi[i],])
}))
# Check all original names are matched back
stopifnot(all(!is.na(wfo_one_all$taxon_name_orig)))
wfo_sel <- wfo_one_all[,c(
"taxon_name_orig",
"spec.name.ORIG", # taxon_name_sanit
"Old.name", # taxon_name_syn
"Old.ID", # taxon_wfo_syn
"scientificName", # taxon_name_acc
"taxonID", # taxon_wfo_acc
"scientificNameAuthorship", # taxon_auth_acc
"taxonRank", # taxon_rank_acc
"parentNameUsageID", # taxon_wfo_parent
"specificEpithet", # taxon_epithet_acc
"genus", # taxon_genus_acc
"family" # taxon_family_acc
)]
wfo_sel <- unique(wfo_sel)
# All submitted names should be included in WFO output
stopifnot(all(sort(unique(wfo_sel$spec.name.ORIG)) == sort(unique(xi))))
# Consolidate genus and species
wfo_sel$species <- trimws(paste(wfo_sel$genus, wfo_sel$specificEpithet))
wfo_sel$species <- ifelse(!wfo_sel$taxonRank %in%
c("species", "subspecies", "variety", "subvariety",
"form", "subform", "prole", "unranked"),
NA_character_, wfo_sel$species)
# Extract subsp. and var. epithets from accepted names
wfo_sel$taxon_subspecies_acc <- gsub(".*subsp\\.\\s", "", wfo_sel$scientificName)
wfo_sel$taxon_subspecies_acc[!grepl("\\ssubsp\\.\\s", wfo_sel$scientificName)] <- NA_character_
wfo_sel$taxon_variety_acc <- gsub(".*var\\.\\s", "", wfo_sel$scientificName)
wfo_sel$taxon_variety_acc[!grepl("\\svar\\.\\s", wfo_sel$scientificName)] <- NA_character_
# Fill wfo ID of synonyms
wfo_sel$Old.ID <- ifelse(wfo_sel$Old.ID == "",
wfo_sel$taxonID, wfo_sel$Old.ID)
wfo_sel$Old.name <- ifelse(wfo_sel$Old.name == "",
wfo_sel$scientificName, wfo_sel$Old.name)
# Add date of processing
wfo_sel$taxon_wfo_date <- Sys.Date()
# Create output dataframe
out <- wfo_sel[,c(
"taxon_name_orig",
"spec.name.ORIG", # taxon_name_sanit
"Old.name", # taxon_name_syn
"Old.ID", # taxon_wfo_syn
"scientificName", # taxon_name_acc
"taxonID", # taxon_wfo_acc
"scientificNameAuthorship", # taxon_auth_acc
"taxonRank", # taxon_rank_acc
"parentNameUsageID", # taxon_wfo_parent
"taxon_variety_acc",
"taxon_subspecies_acc",
"specificEpithet", # taxon_epithet_acc
"species", # taxon_species_acc
"genus", # taxon_genus_acc
"family", # taxon_family_acc
"taxon_wfo_date")]
names(out) <- c(
"taxon_name_orig",
"taxon_name_sanit",
"taxon_name_syn",
"taxon_wfo_syn",
"taxon_name_acc",
"taxon_wfo_acc",
"taxon_auth_acc",
"taxon_rank_acc",
"taxon_wfo_parent",
"taxon_variety_acc",
"taxon_subspecies_acc",
"taxon_epithet_acc",
"taxon_species_acc",
"taxon_genus_acc",
"taxon_family_acc",
"taxon_wfo_date")
# Optionally return unmatched names
if (ret_unk & any(out$taxon_name_sanit != out$taxon_name_syn, na.rm = TRUE)) {
unmatched <- out$taxon_name_orig[
(out$taxon_name_sanit != out$taxon_name_syn) |
is.na(out$taxon_name_syn) | is.na(out$taxon_name_sanit)]
warning("Some taxonomic names not matched by WFO.match, returning original names")
return(unmatched)
}
# Optionally return names with multiple matches
if (ret_multi & any(duplicated(wfo$taxon_name_orig))) {
multis <- wfo$taxon_name_orig[duplicated(wfo$taxon_name_orig)]
multis_df <- wfo[wfo$taxon_name_orig %in% multis,]
multis_list <- split(multis_df, multis_df$taxon_name_orig)
warning("Some taxonomic names matched to multiple names by WFO.match, returning options")
return(multis_list)
}
# Change "" to NA in all columns
out[] <- lapply(out, function(x) {
if (is.character(x)) {
x[x == ""] <- NA_character_
}
x
})
# All original names should be filled
stopifnot(all(!is.na(out$species[out$species_sanit != "Indet indet"])))
# Return
return(out)
}