Merge pull request #81 from stitam/issue77
 Prepare all SQLite databases locally
stitam authored Jun 2, 2024
2 parents 3a78dd4 + 3ef1b52 commit bc00b7b
Showing 18 changed files with 216 additions and 190 deletions.
3 changes: 2 additions & 1 deletion DESCRIPTION
@@ -19,6 +19,7 @@ Encoding: UTF-8
Language: en-US
Imports:
curl (>= 2.4),
data.table,
DBI (>= 0.6-1),
RSQLite (>= 1.1.2),
dplyr (>= 0.7.0),
@@ -31,4 +32,4 @@ Imports:
Suggests:
testthat,
taxize
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
8 changes: 4 additions & 4 deletions R/ap_children.R
@@ -10,10 +10,10 @@
#' equivalent to the output of `taxize::children()`
#' @examples \dontrun{
#' children(c(3700, 2))
#' children(c(154395, 154357), db="itis")
#' children("wfo-4000032377", db="wfo")
#' children(2877951, db="gbif")
#' children(3960765, db="col") # Abies
#' children(c(154395, 154357), db = "itis")
#' children("wfo-4000032377", db = "wfo")
#' children(2877951, db = "gbif")
#' children("C66T4", db = "col") # Abies Mill. Mill.
#' }
children <- function(x, db='ncbi', verbose=TRUE, ...){
ap_dispatch(x=x, db=db, cmd='children', class='children', verbose=verbose, ...)
10 changes: 5 additions & 5 deletions R/ap_classification.R
@@ -15,11 +15,11 @@
#' exactly equivalent to the output of `taxize::classification()`
#' @examples \dontrun{
#' classification(c(3702, 9606))
#' classification(c(154395, 154357), db="itis")
#' classification(c("wfo-0000291463", "wfo-7000000057"), db="wfo")
#' classification(2878586, db="gbif")
#' classification(c(2878586, 2704179), db="gbif")
#' classification(3960765, db="col") # Abies
#' classification(c(154395, 154357), db = "itis")
#' classification(c("wfo-0000291463", "wfo-7000000057"), db = "wfo")
#' classification(2878586, db = "gbif")
#' classification(c(2878586, 2704179), db = "gbif")
#' classification("C66T4", db = "col") # Abies Mill.
#' }
classification <- function(x, db='ncbi', verbose=TRUE, ...){
ap_dispatch(
2 changes: 1 addition & 1 deletion R/ap_name2taxid.R
@@ -36,7 +36,7 @@
#' name2taxid("Austrobaileyaceae", db = "wfo")
#' name2taxid("Quercus kelloggii", db = "gbif")
#' name2taxid(c("Quercus", "Fabaceae", "Animalia"), db = "gbif")
#' name2taxid(c("Abies", "Pinales", "Tracheophyta"), db = "col")
#' name2taxid(c("Abies Mill.", "Pinales Gorozh.", "Tracheophyta"), db = "col")
#' name2taxid(c("Abies mangifica", "Acanthopale aethiogermanica",
#' "Acanthopale albosetulosa"), db = "tpl")
#' }
10 changes: 5 additions & 5 deletions R/ap_taxid2name.R
@@ -14,11 +14,11 @@
#' taxid2name(c(3702, 9606))
#' taxid2name(c(154395, 154357, 23041, 154396), db = "itis")
#' taxid2name(c('wfo-0000541830', 'wfo-0000291463'), db = "wfo")
#' taxid2name("wfo-7000000057", db="wfo")
#' taxid2name(2877951, db="gbif")
#' taxid2name(c(2877951, 5386), db="gbif")
#' taxid2name(c(3960765, 3953606, 3953010), db="col")
#' taxid2name(c("kew-2614538", "kew-2895433", "kew-2615007"), db="tpl")
#' taxid2name("wfo-7000000057", db = "wfo")
#' taxid2name(2877951, db = "gbif")
#' taxid2name(c(2877951, 5386), db = "gbif")
#' taxid2name(c("C66T4", "C7ZVH", "TP"), db = "col")
#' taxid2name(c("kew-2614538", "kew-2895433", "kew-2615007"), db = "tpl")
#' }
taxid2name <- function(x, db='ncbi', verbose=TRUE, warn=TRUE, ...){
result <- ap_vector_dispatch(
12 changes: 6 additions & 6 deletions R/ap_taxid2rank.R
@@ -13,12 +13,12 @@
#' @return character vector of ranks in the same order as the inputs
#' @examples \dontrun{
#' taxid2rank(c(3701, 9606))
#' taxid2rank(c(154395, 154357, 23041, 154396), db="itis")
#' taxid2rank(c('wfo-4000032377', 'wfo-0000541830'), db="wfo")
#' taxid2rank("wfo-7000000057", db="wfo")
#' taxid2rank(2877951, db="gbif")
#' taxid2rank(c(2877951, 5386), db="gbif")
#' taxid2rank(c(3960765, 3953606, 3953010), db="col")
#' taxid2rank(c(154395, 154357, 23041, 154396), db = "itis")
#' taxid2rank(c('wfo-4000032377', 'wfo-0000541830'), db = "wfo")
#' taxid2rank("wfo-7000000057", db = "wfo")
#' taxid2rank(2877951, db = "gbif")
#' taxid2rank(c(2877951, 5386), db = "gbif")
#' taxid2rank(c("C66T4", "C7ZVH", "TP"), db = "col")
#' }
taxid2rank <- function(x, db='ncbi', verbose=TRUE, warn=TRUE, ...){
result <- ap_vector_dispatch(
127 changes: 106 additions & 21 deletions R/db_download.R
@@ -254,10 +254,8 @@ db_download_itis <- function(verbose = TRUE, overwrite = FALSE) {

#' @export
#' @rdname db_download
db_download_tpl <- function(verbose = TRUE, overwrite = FALSE) {
db_url <- "https://taxize-dbs.s3-us-west-2.amazonaws.com/plantlist.zip" #nolint
db_path <- file.path(tdb_cache$cache_path_get(), 'plantlist.zip')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'plantlist')
db_download_tpl <- function(verbose = TRUE, overwrite = FALSE) {
db_path_dir <- file.path(tdb_cache$cache_path_get(), 'tpl')
final_file <- file.path(tdb_cache$cache_path_get(), 'plantlist.sqlite')

assert(verbose, "logical")
@@ -271,25 +269,29 @@ db_download_tpl <- function(verbose = TRUE, overwrite = FALSE) {
# make home dir if not already present
tdb_cache$mkdir()
# download data
mssg(verbose, 'downloading...')
curl::curl_download(db_url, db_path, quiet = TRUE)
# unzip
mssg(verbose, 'unzipping...')
utils::unzip(db_path, exdir = db_path_file)
# move database
file.rename(file.path(db_path_file, "plantlist.sqlite"), final_file)
# cleanup
mssg(verbose, 'cleaning up...')
unlink(db_path)
unlink(db_path_file, recursive = TRUE)
taxize::tpl_get(db_path_dir)
# convert to sqlite
files <- list.files(db_path_dir, full.names = TRUE)
df <- data.table::rbindlist(lapply(files, data.table::fread), fill = TRUE)
names(df) <- gsub("\\s", "_", tolower(names(df)))
df$scientificname <- paste(df$genus, df$species)
x <- DBI::dbConnect(RSQLite::SQLite(), final_file)
DBI::dbWriteTable(x, "tpl", df)
# create indices
RSQLite::dbExecute(x, 'CREATE UNIQUE INDEX id ON tpl (id)')
RSQLite::dbExecute(x, 'CREATE INDEX sciname ON tpl (scientificname)')
# disconnect
DBI::dbDisconnect(x)
# clean up
unlink(db_path_dir, recursive = TRUE)
# return path
return(final_file)
}
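
The TPL branch above uses a different ingestion pattern: `taxize::tpl_get()` writes one CSV per genus into a directory, and those CSVs are combined in memory before a single SQLite write. A standalone sketch of the combine step (the directory and column names are illustrative):

```r
library(data.table)

# Illustrative directory holding one CSV per genus, as tpl_get() writes it.
csv_dir <- "tpl"
files <- list.files(csv_dir, full.names = TRUE)

# fread() each file, then stack them; fill = TRUE pads columns that are
# missing from some files with NA instead of erroring.
df <- rbindlist(lapply(files, fread), fill = TRUE)

# Normalize column names: lowercase, spaces -> underscores.
setnames(df, gsub("\\s", "_", tolower(names(df))))
```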

#' @export
#' @rdname db_download
db_download_wfo <- function(verbose = TRUE, overwrite = FALSE) {
db_url <- "http://104.198.143.165/files/WFO_Backbone/_WFOCompleteBackbone/WFO_Backbone.zip" #nolint
db_url <- "https://zenodo.org/records/10425161/files/_DwC_backbone_R.zip?download=1" #nolint
db_path <- file.path(tdb_cache$cache_path_get(), 'WFO_Backbone.zip')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'WFO_Backbone')
class_file <- file.path(tdb_cache$cache_path_get(), 'WFO_Backbone/classification.csv')
@@ -385,9 +387,10 @@ db_download_wfo <- function(verbose = TRUE, overwrite = FALSE) {
#' @export
#' @rdname db_download
db_download_col <- function(verbose = TRUE, overwrite = FALSE) {
db_url <- 'https://taxize-dbs.s3-us-west-2.amazonaws.com/col.zip'
db_url <- 'https://api.checklistbank.org/dataset/294826/export.zip?extended=true&format=DwCA'
db_path <- file.path(tdb_cache$cache_path_get(), 'col.sqlite')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'col.zip')
db_path_dir <- file.path(tdb_cache$cache_path_get(), 'col')

assert(verbose, "logical")
assert(overwrite, "logical")
@@ -402,10 +405,64 @@ db_download_col <- function(verbose = TRUE, overwrite = FALSE) {
curl::curl_download(db_url, db_path_file, quiet = TRUE)

mssg(verbose, 'unzipping...')
utils::unzip(db_path_file, exdir = tdb_cache$cache_path_get())
utils::unzip(db_path_file, exdir = db_path_dir)

# Convert to SQLite database

mssg(verbose, 'building SQLite database...')
db <- RSQLite::dbConnect(RSQLite::SQLite(), dbname = db_path)

out <- readr::read_tsv_chunked(
paste0(db_path_dir, "/Distribution.tsv"),
callback = function(chunk, dummy) {
names(chunk) <- gsub("^.*:", "", names(chunk))
DBI::dbWriteTable(db, "distribution", chunk, append = T)
},
chunk_size = 10000,
col_types = "cccccc"
)

out <- readr::read_tsv_chunked(
paste0(db_path_dir, "/SpeciesProfile.tsv"),
callback = function(chunk, dummy) {
names(chunk) <- gsub("^.*:", "", names(chunk))
DBI::dbWriteTable(db, "speciesprofile", chunk, append = T)
},
chunk_size = 10000,
col_types = "ccccc"
)

out <- readr::read_tsv_chunked(
paste0(db_path_dir, "/VernacularName.tsv"),
callback = function(chunk, dummy) {
names(chunk) <- gsub("^.*:", "", names(chunk))
DBI::dbWriteTable(db, "vernacular", chunk, append = T)
},
chunk_size = 10000,
col_types = "ccc"
)

out <- readr::read_tsv_chunked(
paste0(db_path_dir, "/Taxon.tsv"),
callback = function(chunk, dummy) {
names(chunk) <- gsub("^.*:", "", names(chunk))
DBI::dbWriteTable(db, "taxa", chunk, append = T)
},
chunk_size = 10000,
col_types = "cccccicccclccccllccccc"
)

# create indices
RSQLite::dbExecute(db, 'CREATE UNIQUE INDEX id on taxa (taxonID)')
RSQLite::dbExecute(db, 'CREATE INDEX sciname on taxa (scientificName)')
RSQLite::dbExecute(db, 'CREATE INDEX parname on taxa (parentNameUsageID)')

# disconnect
RSQLite::dbDisconnect(db)

mssg(verbose, 'cleaning up...')
unlink(db_path_file)
unlink(db_path_dir, recursive = TRUE)

mssg(verbose, 'all done...')
return(db_path)
@@ -414,9 +471,10 @@ db_download_col <- function(verbose = TRUE, overwrite = FALSE) {
#' @export
#' @rdname db_download
db_download_gbif <- function(verbose = TRUE, overwrite = FALSE) {
db_url <- 'https://taxize-dbs.s3-us-west-2.amazonaws.com/gbif.zip'
db_url <- "https://hosted-datasets.gbif.org/datasets/backbone/current/backbone.zip"
db_path <- file.path(tdb_cache$cache_path_get(), 'gbif.sqlite')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'gbif.zip')
db_path_file <- file.path(tdb_cache$cache_path_get(), 'backbone.zip')
db_path_dir <- file.path(tdb_cache$cache_path_get(), 'gbif')

assert(verbose, "logical")
assert(overwrite, "logical")
Expand All @@ -431,10 +489,37 @@ db_download_gbif <- function(verbose = TRUE, overwrite = FALSE) {
curl::curl_download(db_url, db_path_file, quiet = TRUE)

mssg(verbose, 'unzipping...')
utils::unzip(db_path_file, exdir = tdb_cache$cache_path_get())
utils::unzip(
db_path_file,
exdir = paste0(tdb_cache$cache_path_get(), "/gbif")
)

# Convert to SQLite database

mssg(verbose, 'building SQLite database...')
db <- RSQLite::dbConnect(RSQLite::SQLite(), dbname = db_path)

out <- readr::read_tsv_chunked(
paste0(tdb_cache$cache_path_get(), "/gbif/Taxon.tsv"),
callback = function(chunk, dummy) {
DBI::dbWriteTable(db, "gbif", chunk, append = T)
},
chunk_size = 10000,
col_types = "icccccccccccccccccccccc"
)

# create indices
RSQLite::dbExecute(db, 'CREATE UNIQUE INDEX id on gbif (taxonID)')
RSQLite::dbExecute(db, 'CREATE INDEX conname on gbif (canonicalName)')
RSQLite::dbExecute(db, 'CREATE INDEX sciname on gbif (scientificName)')
RSQLite::dbExecute(db, 'CREATE INDEX parname on gbif (parentNameUsageID)')

# disconnect
RSQLite::dbDisconnect(db)

mssg(verbose, 'cleaning up...')
unlink(db_path_file)
unlink(db_path_dir, recursive = TRUE)

mssg(verbose, 'all done...')
return(db_path)
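
The new COL and GBIF builders above share one conversion pattern: stream a large Darwin Core TSV in fixed-size chunks, append each chunk to a SQLite table, and create indexes once after loading, so the full table never has to sit in memory. A minimal standalone sketch of that pattern (the file, table, and column names are illustrative, not the package's actual cache layout):

```r
library(DBI)
library(RSQLite)

# Illustrative paths; the package derives these from its tdb_cache object.
tsv_file <- "Taxon.tsv"
sqlite_file <- "taxa.sqlite"

con <- dbConnect(SQLite(), sqlite_file)

# Stream the TSV 10,000 rows at a time; each chunk is appended to the
# same table, so memory use stays flat regardless of file size.
readr::read_tsv_chunked(
  tsv_file,
  callback = function(chunk, pos) {
    # Strip "dwc:"-style namespace prefixes from the column names.
    names(chunk) <- gsub("^.*:", "", names(chunk))
    dbWriteTable(con, "taxa", chunk, append = TRUE)
  },
  chunk_size = 10000
)

# Index the lookup columns once, after all rows are in place.
dbExecute(con, "CREATE UNIQUE INDEX id ON taxa (taxonID)")
dbExecute(con, "CREATE INDEX sciname ON taxa (scientificName)")

dbDisconnect(con)
```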
12 changes: 6 additions & 6 deletions R/taxa_at.R
@@ -17,15 +17,15 @@
#' @return list of data.frame's for each input taxon key, where each data.frame
#' has fields: name, rank, id. When no results found, an empty data.frame
#' @examples \dontrun{
#' taxa_at(186803, rank = "order", db="ncbi", missing = "lower")
#' taxa_at(186803, rank = "order", db = "ncbi", missing = "lower")
#' taxa_at(c(186803, 541000, 216572, 186804, 31979, 186806),
#' rank = "family", missing = "lower")
#' taxa_at(c(154395, 154357, 23041, 154396), rank = "family", db="itis")
#' taxa_at(c('wfo-4000032377', 'wfo-0000541830'), rank = "family", db="wfo")
#' taxa_at("wfo-7000000057", rank = "order", db="wfo")
#' taxa_at(2877951, rank = "phylum", db="gbif")
#' taxa_at(c(2877951, 5386), rank = "family", db="gbif")
#' taxa_at(c(3960765, 3953606, 3953010), rank = "family", db="col")
#' taxa_at(c('wfo-4000032377', 'wfo-0000541830'), rank = "family", db = "wfo")
#' taxa_at("wfo-7000000057", rank = "order", db = "wfo")
#' taxa_at(2877951, rank = "phylum", db = "gbif")
#' taxa_at(c(2877951, 5386), rank = "family", db = "gbif")
#' taxa_at(c("C66T4", "C7ZVH", "TP"), rank = "family", db = "col")
#' }
taxa_at <- function(x, rank, db='ncbi', missing = "lower", verbose=TRUE,
warn=TRUE, ...) {
74 changes: 19 additions & 55 deletions README.Rmd
@@ -26,69 +26,33 @@ knitr::opts_chunk$set(
[![cran version](https://www.r-pkg.org/badges/version/taxizedb)](https://cran.r-project.org/package=taxizedb)
[![DOI](https://zenodo.org/badge/53961466.svg)](https://zenodo.org/badge/latestdoi/53961466)

`taxizedb` - Tools for Working with Taxonomic Databases on your machine
`taxizedb` - Tools for Working with Taxonomic Databases

Docs: <https://docs.ropensci.org/taxizedb/>

[taxize](https://github.com/ropensci/taxize) is a heavily used taxonomic toolbelt
package in R. However, it makes web requests for nearly all methods. That is fine
for most cases, but when the user has many, many names it is much more efficient
to query a local SQL database.
`taxizedb` is an R package for interacting with taxonomic databases. Its functionality can be divided into two parts: (1) you can download taxonomic databases to your machine, and (2) you can query the downloaded databases to retrieve taxonomic information.

This two-step approach is different from tools that interact with web services for each query, and has a number of advantages:

* Once you download a database, you can work with it offline
* Once you download a database, querying it is super fast
* As long as you store your database files, all the queries in your analysis will be fully reproducible
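
A minimal workflow sketch (the first call downloads the source data and builds the local SQLite database, so it is slow once; every later query runs against the local file):

```r
library(taxizedb)

# one-time (per machine) download and build of the NCBI taxonomy database
db_download_ncbi()

# subsequent queries are local, fast, and work offline
name2taxid("Poa annua")        # scientific name -> NCBI taxon ID
classification(9606)           # full lineage for Homo sapiens
children(9605, db = "ncbi")    # direct children of the genus Homo
```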

## Data sources

Not all taxonomic databases are publicly available, or possible to mash into a SQLized
version. Taxonomic DBs supported:

- NCBI: text files are provided by NCBI, which we stitch into a SQLite db
- ITIS: they provide a SQLite dump, which we use here
- The PlantList: created from stitching together CSV files. This
  source is no longer updated as far as we can tell; they say they've
  moved focus to the World Flora Online
- Catalogue of Life: created from Darwin Core Archive dump.
- GBIF: created from Darwin Core Archive dump. Right now we only have
  the taxonomy table (called gbif), but will add the other tables in the
  Darwin Core Archive later
- Wikidata: aggregated taxonomy of Open Tree of Life, GloBI and Wikidata.
  On Zenodo, created by Jorrit Poelen of GloBI.
- World Flora Online: http://www.worldfloraonline.org/

Update schedule for databases:

- NCBI: since `db_download_ncbi` creates the database when the function
is called, it's updated whenever you run the function
- ITIS: since ITIS provides the SQLite database as a download, you can
  delete the old file and run `db_download_itis` to get a new dump;
  I think they update the dumps every month or so
- The PlantList: no longer updated, so you shouldn't need to download
  this after the first download; hosted on Amazon S3
- Catalogue of Life: a GitHub Actions job runs once a day at 00:00 UTC,
  building the latest COL data into a SQLite database that's hosted on
  Amazon S3
- GBIF: a GitHub Actions job runs once a day at 00:00 UTC,
  building the latest GBIF data into a SQLite database that's hosted on
  Amazon S3
- Wikidata: last updated April 6, 2018. Scripts are available to
update the data if you prefer to do it yourself.
- World Flora Online: since `db_download_wfo` creates the database when
the function is called, it's updated whenever you run the function

Links:

- NCBI: ftp://ftp.ncbi.nih.gov/pub/taxonomy/
- ITIS: https://www.itis.gov/downloads/index.html
- The PlantList - http://www.theplantlist.org/
- Catalogue of Life:
- latest monthly edition via https://www.catalogueoflife.org/data/download
- GBIF: http://rs.gbif.org/datasets/backbone/
- Wikidata: https://zenodo.org/record/1213477
- World Flora Online: http://www.worldfloraonline.org/
When you download a database with `taxizedb`, it is automatically converted to SQLite, and all query functions then interact with this local SQLite database. However, not all taxonomic databases are publicly available or can be converted to SQLite. The following databases are supported:

- [NCBI Taxonomy](https://www.ncbi.nlm.nih.gov/taxonomy)
- [ITIS](https://itis.gov/)
- [The Plant List (TPL)](http://www.theplantlist.org/) - Note that The Plant List has been superseded by World Flora Online.
- [World Flora Online (WFO)](http://www.worldfloraonline.org/)
- [Catalogue of Life (COL)](https://www.catalogueoflife.org/)
- [Global Biodiversity Information Facility (GBIF)](https://www.gbif.org/)
- [Wikidata](https://zenodo.org/records/1213477)

Get in touch [in the issues](https://github.com/ropensci/taxizedb/issues) with
any ideas on new data sources.

All databases are SQLite.

## Package API

For each data source, this package performs the following tasks:
@@ -106,9 +70,9 @@ This package for each data sources performs the following tasks:

You can use the `src` connections with `dplyr`, etc. for downstream operations, or use the underlying database connection to run raw SQL queries.
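
For example (a sketch, assuming the NCBI database has already been downloaded, and that the `names` table with `tax_id`, `name_txt`, and `name_class` columns follows the NCBI taxonomy dump layout):

```r
library(taxizedb)
library(dplyr)

src <- src_ncbi()  # dplyr/DBI source backed by the local SQLite file

# lazy dplyr pipeline; the filtering is translated to SQL and run in SQLite
tbl(src, "names") %>%
  filter(name_class == "scientific name") %>%
  select(tax_id, name_txt) %>%
  head(5) %>%
  collect()

# or raw SQL over the same connection
DBI::dbGetQuery(src$con, "SELECT COUNT(*) FROM names")
```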

## install
## Installation

cran version
CRAN version

```{r eval=FALSE}
install.packages("taxizedb")