Merge pull request #81 from stitam/issue77
 Prepare all SQLite databases locally
stitam authored Jun 2, 2024
2 parents 3a78dd4 + 3ef1b52 commit bc00b7b
Showing 18 changed files with 216 additions and 190 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -19,6 +19,7 @@ Encoding: UTF-8
Language: en-US
Imports:
curl (>= 2.4),
data.table,
DBI (>= 0.6-1),
RSQLite (>= 1.1.2),
dplyr (>= 0.7.0),
@@ -31,4 +32,4 @@ Imports:
Suggests:
testthat,
taxize
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
8 changes: 4 additions & 4 deletions R/ap_children.R
@@ -10,10 +10,10 @@
#' equivalent to the output of `taxize::children()`
#' @examples \dontrun{
#' children(c(3700, 2))
#' children(c(154395, 154357), db="itis")
#' children("wfo-4000032377", db="wfo")
#' children(2877951, db="gbif")
#' children(3960765, db="col") # Abies
#' children(c(154395, 154357), db = "itis")
#' children("wfo-4000032377", db = "wfo")
#' children(2877951, db = "gbif")
#' children("C66T4", db = "col") # Abies Mill. Mill.
#' }
children <- function(x, db='ncbi', verbose=TRUE, ...){
ap_dispatch(x=x, db=db, cmd='children', class='children', verbose=verbose, ...)
10 changes: 5 additions & 5 deletions R/ap_classification.R
@@ -15,11 +15,11 @@
#' exactly equivalent to the output of `taxize::classification()`
#' @examples \dontrun{
#' classification(c(3702, 9606))
#' classification(c(154395, 154357), db="itis")
#' classification(c("wfo-0000291463", "wfo-7000000057"), db="wfo")
#' classification(2878586, db="gbif")
#' classification(c(2878586, 2704179), db="gbif")
#' classification(3960765, db="col") # Abies
#' classification(c(154395, 154357), db = "itis")
#' classification(c("wfo-0000291463", "wfo-7000000057"), db = "wfo")
#' classification(2878586, db = "gbif")
#' classification(c(2878586, 2704179), db = "gbif")
#' classification("C66T4", db = "col") # Abies Mill.
#' }
classification <- function(x, db='ncbi', verbose=TRUE, ...){
ap_dispatch(
2 changes: 1 addition & 1 deletion R/ap_name2taxid.R
@@ -36,7 +36,7 @@
#' name2taxid("Austrobaileyaceae", db = "wfo")
#' name2taxid("Quercus kelloggii", db = "gbif")
#' name2taxid(c("Quercus", "Fabaceae", "Animalia"), db = "gbif")
#' name2taxid(c("Abies", "Pinales", "Tracheophyta"), db = "col")
#' name2taxid(c("Abies Mill.", "Pinales Gorozh.", "Tracheophyta"), db = "col")
#' name2taxid(c("Abies mangifica", "Acanthopale aethiogermanica",
#' "Acanthopale albosetulosa"), db = "tpl")
#' }
10 changes: 5 additions & 5 deletions R/ap_taxid2name.R
@@ -14,11 +14,11 @@
#' taxid2name(c(3702, 9606))
#' taxid2name(c(154395, 154357, 23041, 154396), db = "itis")
#' taxid2name(c('wfo-0000541830', 'wfo-0000291463'), db = "wfo")
#' taxid2name("wfo-7000000057", db="wfo")
#' taxid2name(2877951, db="gbif")
#' taxid2name(c(2877951, 5386), db="gbif")
#' taxid2name(c(3960765, 3953606, 3953010), db="col")
#' taxid2name(c("kew-2614538", "kew-2895433", "kew-2615007"), db="tpl")
#' taxid2name("wfo-7000000057", db = "wfo")
#' taxid2name(2877951, db = "gbif")
#' taxid2name(c(2877951, 5386), db = "gbif")
#' taxid2name(c("C66T4", "C7ZVH", "TP"), db = "col")
#' taxid2name(c("kew-2614538", "kew-2895433", "kew-2615007"), db = "tpl")
#' }
taxid2name <- function(x, db='ncbi', verbose=TRUE, warn=TRUE, ...){
result <- ap_vector_dispatch(
12 changes: 6 additions & 6 deletions R/ap_taxid2rank.R
@@ -13,12 +13,12 @@
#' @return character vector of ranks in the same order as the inputs
#' @examples \dontrun{
#' taxid2rank(c(3701, 9606))
#' taxid2rank(c(154395, 154357, 23041, 154396), db="itis")
#' taxid2rank(c('wfo-4000032377', 'wfo-0000541830'), db="wfo")
#' taxid2rank("wfo-7000000057", db="wfo")
#' taxid2rank(2877951, db="gbif")
#' taxid2rank(c(2877951, 5386), db="gbif")
#' taxid2rank(c(3960765, 3953606, 3953010), db="col")
#' taxid2rank(c(154395, 154357, 23041, 154396), db = "itis")
#' taxid2rank(c('wfo-4000032377', 'wfo-0000541830'), db = "wfo")
#' taxid2rank("wfo-7000000057", db = "wfo")
#' taxid2rank(2877951, db = "gbif")
#' taxid2rank(c(2877951, 5386), db = "gbif")
#' taxid2rank(c("C66T4", "C7ZVH", "TP"), db = "col")
#' }
taxid2rank <- function(x, db='ncbi', verbose=TRUE, warn=TRUE, ...){
result <- ap_vector_dispatch(
127 changes: 106 additions & 21 deletions R/db_download.R
@@ -254,10 +254,8 @@ db_download_itis <- function(verbose = TRUE, overwrite = FALSE) {

#' @export
#' @rdname db_download
db_download_tpl <- function(verbose = TRUE, overwrite = FALSE) {
db_url <- "https://taxize-dbs.s3-us-west-2.amazonaws.com/plantlist.zip" #nolint
db_path <- file.path(tdb_cache$cache_path_get(), 'plantlist.zip')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'plantlist')
db_download_tpl <- function(verbose = TRUE, overwrite = FALSE) {
db_path_dir <- file.path(tdb_cache$cache_path_get(), 'tpl')
final_file <- file.path(tdb_cache$cache_path_get(), 'plantlist.sqlite')

assert(verbose, "logical")
@@ -271,25 +269,29 @@ db_download_tpl <- function(verbose = TRUE, overwrite = FALSE) {
# make home dir if not already present
tdb_cache$mkdir()
# download data
mssg(verbose, 'downloading...')
curl::curl_download(db_url, db_path, quiet = TRUE)
# unzip
mssg(verbose, 'unzipping...')
utils::unzip(db_path, exdir = db_path_file)
# move database
file.rename(file.path(db_path_file, "plantlist.sqlite"), final_file)
# cleanup
mssg(verbose, 'cleaning up...')
unlink(db_path)
unlink(db_path_file, recursive = TRUE)
taxize::tpl_get(db_path_dir)
# convert to sqlite
files <- list.files(db_path_dir, full.names = TRUE)
df <- data.table::rbindlist(lapply(files, data.table::fread), fill = TRUE)
names(df) <- gsub("\\s", "_", tolower(names(df)))
df$scientificname <- paste(df$genus, df$species)
x <- DBI::dbConnect(RSQLite::SQLite(), final_file)
DBI::dbWriteTable(x, "tpl", df)
# create indices
RSQLite::dbExecute(x, 'CREATE UNIQUE INDEX id ON tpl (id)')
RSQLite::dbExecute(x, 'CREATE INDEX sciname ON tpl (scientificname)')
# disconnect
DBI::dbDisconnect(x)
# clean up
unlink(db_path_dir, recursive = TRUE)
# return path
return(final_file)
}
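
The TPL branch above uses a different ingestion pattern: `taxize::tpl_get()` writes one CSV per genus into a directory, and those CSVs are combined in memory before a single SQLite write. A standalone sketch of the combine step (the directory and column names are illustrative):

```r
library(data.table)

# Illustrative directory holding one CSV per genus, as tpl_get() writes it.
csv_dir <- "tpl"
files <- list.files(csv_dir, full.names = TRUE)

# fread() each file, then stack them; fill = TRUE pads columns that are
# missing from some files with NA instead of erroring.
df <- rbindlist(lapply(files, fread), fill = TRUE)

# Normalize column names: lowercase, spaces -> underscores.
setnames(df, gsub("\\s", "_", tolower(names(df))))
```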

#' @export
#' @rdname db_download
db_download_wfo <- function(verbose = TRUE, overwrite = FALSE) {
db_url <- "http://104.198.143.165/files/WFO_Backbone/_WFOCompleteBackbone/WFO_Backbone.zip" #nolint
db_url <- "https://zenodo.org/records/10425161/files/_DwC_backbone_R.zip?download=1" #nolint
db_path <- file.path(tdb_cache$cache_path_get(), 'WFO_Backbone.zip')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'WFO_Backbone')
class_file <- file.path(tdb_cache$cache_path_get(), 'WFO_Backbone/classification.csv')
@@ -385,9 +387,10 @@ db_download_wfo <- function(verbose = TRUE, overwrite = FALSE) {
#' @export
#' @rdname db_download
db_download_col <- function(verbose = TRUE, overwrite = FALSE) {
db_url <- 'https://taxize-dbs.s3-us-west-2.amazonaws.com/col.zip'
db_url <- 'https://api.checklistbank.org/dataset/294826/export.zip?extended=true&format=DwCA'
db_path <- file.path(tdb_cache$cache_path_get(), 'col.sqlite')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'col.zip')
db_path_dir <- file.path(tdb_cache$cache_path_get(), 'col')

assert(verbose, "logical")
assert(overwrite, "logical")
@@ -402,10 +405,64 @@ db_download_col <- function(verbose = TRUE, overwrite = FALSE) {
curl::curl_download(db_url, db_path_file, quiet = TRUE)

mssg(verbose, 'unzipping...')
utils::unzip(db_path_file, exdir = tdb_cache$cache_path_get())
utils::unzip(db_path_file, exdir = db_path_dir)

# Convert to SQLite database

mssg(verbose, 'building SQLite database...')
db <- RSQLite::dbConnect(RSQLite::SQLite(), dbname = db_path)

out <- readr::read_tsv_chunked(
paste0(db_path_dir, "/Distribution.tsv"),
callback = function(chunk, dummy) {
names(chunk) <- gsub("^.*:", "", names(chunk))
DBI::dbWriteTable(db, "distribution", chunk, append = T)
},
chunk_size = 10000,
col_types = "cccccc"
)

out <- readr::read_tsv_chunked(
paste0(db_path_dir, "/SpeciesProfile.tsv"),
callback = function(chunk, dummy) {
names(chunk) <- gsub("^.*:", "", names(chunk))
DBI::dbWriteTable(db, "speciesprofile", chunk, append = T)
},
chunk_size = 10000,
col_types = "ccccc"
)

out <- readr::read_tsv_chunked(
paste0(db_path_dir, "/VernacularName.tsv"),
callback = function(chunk, dummy) {
names(chunk) <- gsub("^.*:", "", names(chunk))
DBI::dbWriteTable(db, "vernacular", chunk, append = T)
},
chunk_size = 10000,
col_types = "ccc"
)

out <- readr::read_tsv_chunked(
paste0(db_path_dir, "/Taxon.tsv"),
callback = function(chunk, dummy) {
names(chunk) <- gsub("^.*:", "", names(chunk))
DBI::dbWriteTable(db, "taxa", chunk, append = T)
},
chunk_size = 10000,
col_types = "cccccicccclccccllccccc"
)

# create indices
RSQLite::dbExecute(db, 'CREATE UNIQUE INDEX id on taxa (taxonID)')
RSQLite::dbExecute(db, 'CREATE INDEX sciname on taxa (scientificName)')
RSQLite::dbExecute(db, 'CREATE INDEX parname on taxa (parentNameUsageID)')

# disconnect
RSQLite::dbDisconnect(db)

mssg(verbose, 'cleaning up...')
unlink(db_path_file)
unlink(db_path_dir, recursive = TRUE)

mssg(verbose, 'all done...')
return(db_path)
@@ -414,9 +471,10 @@ db_download_col <- function(verbose = TRUE, overwrite = FALSE) {
#' @export
#' @rdname db_download
db_download_gbif <- function(verbose = TRUE, overwrite = FALSE) {
db_url <- 'https://taxize-dbs.s3-us-west-2.amazonaws.com/gbif.zip'
db_url <- "https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip"
db_path <- file.path(tdb_cache$cache_path_get(), 'gbif.sqlite')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'gbif.zip')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'backbone.zip')
db_path_dir <- file.path(tdb_cache$cache_path_get(), 'gbif')

assert(verbose, "logical")
assert(overwrite, "logical")
Expand All @@ -431,10 +489,37 @@ db_download_gbif <- function(verbose = TRUE, overwrite = FALSE) {
curl::curl_download(db_url, db_path_file, quiet = TRUE)

mssg(verbose, 'unzipping...')
utils::unzip(db_path_file, exdir = tdb_cache$cache_path_get())
utils::unzip(
db_path_file,
exdir = paste0(tdb_cache$cache_path_get(), "/gbif")
)

# Convert to SQLite database

mssg(verbose, 'building SQLite database...')
db <- RSQLite::dbConnect(RSQLite::SQLite(), dbname = db_path)

out <- readr::read_tsv_chunked(
paste0(tdb_cache$cache_path_get(), "/gbif/Taxon.tsv"),
callback = function(chunk, dummy) {
DBI::dbWriteTable(db, "gbif", chunk, append = T)
},
chunk_size = 10000,
col_types = "icccccccccccccccccccccc"
)

# create indices
RSQLite::dbExecute(db, 'CREATE UNIQUE INDEX id on gbif (taxonID)')
RSQLite::dbExecute(db, 'CREATE INDEX conname on gbif (canonicalName)')
RSQLite::dbExecute(db, 'CREATE INDEX sciname on gbif (scientificName)')
RSQLite::dbExecute(db, 'CREATE INDEX parname on gbif (parentNameUsageID)')

# disconnect
RSQLite::dbDisconnect(db)

mssg(verbose, 'cleaning up...')
unlink(db_path_file)
unlink(db_path_dir, recursive = TRUE)

mssg(verbose, 'all done...')
return(db_path)
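
The new COL and GBIF builders above share one conversion pattern: stream a large Darwin Core TSV in fixed-size chunks, append each chunk to a SQLite table, and create indexes once after loading, so the full table never has to sit in memory. A minimal standalone sketch of that pattern (the file, table, and column names are illustrative, not the package's actual cache layout):

```r
library(DBI)
library(RSQLite)

# Illustrative paths; the package derives these from its tdb_cache object.
tsv_file <- "Taxon.tsv"
sqlite_file <- "taxa.sqlite"

con <- dbConnect(SQLite(), sqlite_file)

# Stream the TSV 10,000 rows at a time; each chunk is appended to the
# same table, so memory use stays flat regardless of file size.
readr::read_tsv_chunked(
  tsv_file,
  callback = function(chunk, pos) {
    # Strip "dwc:"-style namespace prefixes from the column names.
    names(chunk) <- gsub("^.*:", "", names(chunk))
    dbWriteTable(con, "taxa", chunk, append = TRUE)
  },
  chunk_size = 10000
)

# Index the lookup columns once, after all rows are in place.
dbExecute(con, "CREATE UNIQUE INDEX id ON taxa (taxonID)")
dbExecute(con, "CREATE INDEX sciname ON taxa (scientificName)")

dbDisconnect(con)
```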
12 changes: 6 additions & 6 deletions R/taxa_at.R
@@ -17,15 +17,15 @@
#' @return list of data.frame's for each input taxon key, where each data.frame
#' has fields: name, rank, id. When no results found, an empty data.frame
#' @examples \dontrun{
#' taxa_at(186803, rank = "order", db="ncbi", missing = "lower")
#' taxa_at(186803, rank = "order", db = "ncbi", missing = "lower")
#' taxa_at(c(186803, 541000, 216572, 186804, 31979, 186806),
#' rank = "family", missing = "lower")
#' taxa_at(c(154395, 154357, 23041, 154396), rank = "family", db="itis")
#' taxa_at(c('wfo-4000032377', 'wfo-0000541830'), rank = "family", db="wfo")
#' taxa_at("wfo-7000000057", rank = "order", db="wfo")
#' taxa_at(2877951, rank = "phylum", db="gbif")
#' taxa_at(c(2877951, 5386), rank = "family", db="gbif")
#' taxa_at(c(3960765, 3953606, 3953010), rank = "family", db="col")
#' taxa_at(c('wfo-4000032377', 'wfo-0000541830'), rank = "family", db = "wfo")
#' taxa_at("wfo-7000000057", rank = "order", db = "wfo")
#' taxa_at(2877951, rank = "phylum", db = "gbif")
#' taxa_at(c(2877951, 5386), rank = "family", db = "gbif")
#' taxa_at(c("C66T4", "C7ZVH", "TP"), rank = "family", db = "col")
#' }
taxa_at <- function(x, rank, db='ncbi', missing = "lower", verbose=TRUE,
warn=TRUE, ...) {
74 changes: 19 additions & 55 deletions README.Rmd
@@ -26,69 +26,33 @@ knitr::opts_chunk$set(
[![cran version](https://www.r-pkg.org/badges/version/taxizedb)](https://cran.r-project.org/package=taxizedb)
[![DOI](https://zenodo.org/badge/53961466.svg)](https://zenodo.org/badge/latestdoi/53961466)

`taxizedb` - Tools for Working with Taxonomic Databases on your machine
`taxizedb` - Tools for Working with Taxonomic Databases

Docs: <https://docs.ropensci.org/taxizedb/>

[taxize](https://github.com/ropensci/taxize) is a heavily used taxonomic toolbelt
package in R. However, it makes web requests for nearly all methods. That is fine
for most cases, but when the user has many, many names it is much more efficient
to query a local SQL database.
`taxizedb` is an R package for interacting with taxonomic databases. Its functionality can be divided into two parts: (1) you can download taxonomic databases to your machine, and (2) you can query the downloaded databases to retrieve taxonomic information.

This two-step approach is different from tools that interact with web services for each query, and has a number of advantages:

* Once you download a database, you can work with it offline
* Once you download a database, querying it is super fast
* As long as you store your database files, all the queries in your analysis will be fully reproducible
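
A minimal workflow sketch (the first call downloads the source data and builds the local SQLite database, so it is slow once; every later query runs against the local file):

```r
library(taxizedb)

# one-time (per machine) download and build of the NCBI taxonomy database
db_download_ncbi()

# subsequent queries are local, fast, and work offline
name2taxid("Poa annua")        # scientific name -> NCBI taxon ID
classification(9606)           # full lineage for Homo sapiens
children(9605, db = "ncbi")    # direct children of the genus Homo
```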

## Data sources

Not all taxonomic databases are publicly available, or possible to mash into a SQLized
version. Taxonomic DBs supported:

- NCBI: text files are provided by NCBI, which we stitch into a SQLite db
- ITIS: they provide a SQLite dump, which we use here
- The PlantList: created from stitching together CSV files. This
  source is no longer updated as far as we can tell; they say they've
  moved focus to the World Flora Online
- Catalogue of Life: created from Darwin Core Archive dump.
- GBIF: created from Darwin Core Archive dump. Right now we only have
  the taxonomy table (called gbif), but will add the other tables in the
  Darwin Core Archive later
- Wikidata: aggregated taxonomy of Open Tree of Life, GloBI and Wikidata.
  On Zenodo, created by Jorrit Poelen of GloBI.
- World Flora Online: http://www.worldfloraonline.org/

Update schedule for databases:

- NCBI: since `db_download_ncbi` creates the database when the function
is called, it's updated whenever you run the function
- ITIS: since ITIS provides the SQLite database as a download, you can
  delete the old file and run `db_download_itis` to get a new dump;
  I think they update the dumps every month or so
- The PlantList: no longer updated, so you shouldn't need to download
  this after the first download; hosted on Amazon S3
- Catalogue of Life: a GitHub Actions job runs once a day at 00:00 UTC,
  building the latest COL data into a SQLite database that's hosted on
  Amazon S3
- GBIF: a GitHub Actions job runs once a day at 00:00 UTC,
  building the latest GBIF data into a SQLite database that's hosted on
  Amazon S3
- Wikidata: last updated April 6, 2018. Scripts are available to
update the data if you prefer to do it yourself.
- World Flora Online: since `db_download_wfo` creates the database when
the function is called, it's updated whenever you run the function

Links:

- NCBI: ftp://ftp.ncbi.nih.gov/pub/taxonomy/
- ITIS: https://www.itis.gov/downloads/index.html
- The PlantList - http://www.theplantlist.org/
- Catalogue of Life:
- latest monthly edition via https://www.catalogueoflife.org/data/download
- GBIF: http://rs.gbif.org/datasets/backbone/
- Wikidata: https://zenodo.org/record/1213477
- World Flora Online: http://www.worldfloraonline.org/
When you download a database with `taxizedb`, it is automatically converted to SQLite, and all query functions then interact with this local SQLite database. However, not all taxonomic databases are publicly available or can be converted to SQLite. The following databases are supported:

- [NCBI Taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy)
- [ITIS](https://itis.gov/)
- [The Plant List (TPL)](http://www.theplantlist.org/) - Note that The Plant List has been superseded by World Flora Online.
- [World Flora Online (WFO)](http://www.worldfloraonline.org/)
- [Catalogue of Life (COL)](https://www.catalogueoflife.org/)
- [Global Biodiversity Information Facility (GBIF)](https://www.gbif.org/)
- [Wikidata](https://zenodo.org/records/1213477)

Get in touch [in the issues](https://github.com/ropensci/taxizedb/issues) with
any ideas on new data sources.

All databases are SQLite.

## Package API

For each data source, this package performs the following tasks:
@@ -106,9 +70,9 @@ This package for each data sources performs the following tasks:

You can use the `src` connections with `dplyr`, etc. for downstream operations, or use the underlying database connection to run raw SQL queries.
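
For example (a sketch, assuming the NCBI database has already been downloaded, and that the `names` table with `tax_id`, `name_txt`, and `name_class` columns follows the NCBI taxonomy dump layout):

```r
library(taxizedb)
library(dplyr)

src <- src_ncbi()  # dplyr/DBI source backed by the local SQLite file

# lazy dplyr pipeline; the filtering is translated to SQL and run in SQLite
tbl(src, "names") %>%
  filter(name_class == "scientific name") %>%
  select(tax_id, name_txt) %>%
  head(5) %>%
  collect()

# or raw SQL over the same connection
DBI::dbGetQuery(src$con, "SELECT COUNT(*) FROM names")
```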

## install
## Installation

cran version
CRAN version

```{r eval=FALSE}
install.packages("taxizedb")