Skip to content

Commit d645638

Browse files
authored
Merge pull request #131 from myushen/multiple_metadata_from_api
metadata accepts multiple databases
2 parents a338712 + 1ab4cd7 commit d645638

16 files changed

+114
-80
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ biocViews:
121121
Transcription,
122122
Transcriptomics
123123
Encoding: UTF-8
124-
RoxygenNote: 7.2.3
124+
RoxygenNote: 7.3.1
125125
LazyDataCompression: xz
126126
URL: https://github.com/stemangiola/CuratedAtlasQueryR
127127
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues

NAMESPACE

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Generated by roxygen2: do not edit by hand
22

33
S3method(as.sparse,DelayedMatrix)
4-
export(DATABASE_URL)
54
export(SAMPLE_DATABASE_URL)
65
export(get_SingleCellExperiment)
6+
export(get_database_url)
77
export(get_metadata)
88
export(get_seurat)
99
export(get_single_cell_experiment)
@@ -43,6 +43,7 @@ importFrom(dplyr,tbl)
4343
importFrom(dplyr,transmute)
4444
importFrom(duckdb,duckdb)
4545
importFrom(glue,glue)
46+
importFrom(glue,glue_sql)
4647
importFrom(httr,GET)
4748
importFrom(httr,HEAD)
4849
importFrom(httr,modify_url)

R/counts.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ get_single_cell_experiment <- function(
112112
has_name(raw_data, c("cell_", "file_id_db"))
113113
)
114114

115-
versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
115+
versioned_cache_directory <- cache_directory
116116
versioned_cache_directory |> dir.create(
117117
showWarnings = FALSE,
118118
recursive = TRUE

R/dev.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
135135
#' @keywords internal
136136
#' @return A character vector of the newly-created anndata files
137137
#' @examples
138-
#' \donttest{
138+
#' \dontrun{
139139
#' hdf5_to_anndata(
140140
#' "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
141141
#' "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
@@ -194,7 +194,7 @@ hdf5_to_anndata <- function(input_directory, output_directory){
194194
# @return A character vector of the newly-created anndata files
195195
# @noRd
196196
# @examples
197-
# \donttest{
197+
# \dontrun{
198198
# h5seurat_to_anndata(
199199
# "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
200200
# "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"

R/metadata.R

Lines changed: 44 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,28 @@ NULL
66
#' Environment that we use to cache the DuckDB connections
77
#' @noRd
88
cache <- rlang::env(
9-
metadata_table = rlang::env()
9+
metadata_table = rlang::env()
1010
)
1111

12-
#' URL pointing to the full metadata file
12+
#' Returns the URLs for all metadata files
13+
#' @param databases A character vector specifying the names of the metadata files. Defaults to c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet").
1314
#' @export
14-
#' @return A character scalar consisting of the URL
15+
#' @return A character vector of URLs to parquet files to download
1516
#' @examples
16-
#' get_metadata(remote_url = DATABASE_URL)
17-
DATABASE_URL <- single_line_str(
18-
"https://object-store.rc.nectar.org.au/v1/
19-
AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
20-
)
17+
#' get_database_url("metadata.0.2.3.parquet")
18+
get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
19+
glue::glue(
20+
"https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}")
21+
}
2122

2223
#' URL pointing to the sample metadata file, which is smaller and for test,
2324
#' demonstration, and vignette purposes only
2425
#' @export
2526
#' @return A character scalar consisting of the URL
2627
#' @examples
27-
#' get_metadata(remote_url = SAMPLE_DATABASE_URL)
28+
#' get_metadata(remote_url = SAMPLE_DATABASE_URL, cache_directory = tempdir())
2829
SAMPLE_DATABASE_URL <- single_line_str(
29-
"https://object-store.rc.nectar.org.au/v1/
30+
"https://object-store.rc.nectar.org.au/v1/
3031
AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/
3132
sample_metadata.0.2.3.parquet"
3233
)
@@ -38,8 +39,8 @@ SAMPLE_DATABASE_URL <- single_line_str(
3839
#' into [get_single_cell_experiment()] to obtain a
3940
#' [`SingleCellExperiment::SingleCellExperiment-class`]
4041
#'
41-
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
42-
#' to the location of the parquet database.
42+
#' @param remote_url Optional character vector of any length. One or more HTTP URLs
43+
#' pointing to the locations of the parquet database file(s).
4344
#' @param cache_directory Optional character vector of length 1. A file path on
4445
#' your local system to a directory (not a file) that will be used to store
4546
#' `metadata.parquet`
@@ -68,6 +69,7 @@ SAMPLE_DATABASE_URL <- single_line_str(
6869
#' @importFrom httr progress
6970
#' @importFrom cli cli_alert_info hash_sha256
7071
#' @importFrom glue glue
72+
#' @importFrom purrr walk
7173
#'
7274
#' @details
7375
#'
@@ -142,32 +144,38 @@ SAMPLE_DATABASE_URL <- single_line_str(
142144
#' get_metadata(cache_directory = path.expand('~'))
143145
#'
144146
get_metadata <- function(
145-
remote_url = DATABASE_URL,
147+
remote_url = get_database_url(),
146148
cache_directory = get_default_cache_dir(),
147149
use_cache = TRUE
148150
) {
149-
hash <- c(remote_url, cache_directory) |> paste0(collapse="") |>
150-
hash_sha256()
151-
cached_connection <- cache$metadata_table[[hash]]
152-
if (!is.null(cached_connection) && isTRUE(use_cache)) {
153-
cached_connection
154-
}
155-
else {
156-
db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")
157-
158-
if (!file.exists(db_path)){
159-
report_file_sizes(remote_url)
160-
sync_remote_file(
161-
remote_url,
162-
db_path,
163-
progress(type = "down", con = stderr())
164-
)
165-
}
166-
167-
table <- duckdb() |>
168-
dbConnect(drv = _, read_only = TRUE) |>
169-
read_parquet(db_path)
170-
cache$metadata_table[[hash]] <- table
171-
table
151+
# Synchronize remote files
152+
walk(remote_url, function(url) {
153+
# Calculate the file path from the URL
154+
path <- file.path(cache_directory, url |> basename())
155+
if (!file.exists(path)) {
156+
report_file_sizes(url)
157+
sync_remote_file(url,
158+
path,
159+
progress(type = "down", con = stderr()))
172160
}
161+
})
162+
all_parquet <- file.path(cache_directory, dir(cache_directory, pattern = ".parquet$"))
163+
# We try to avoid re-reading a set of parquet files
164+
# that is identical to a previous set by hashing the file list
165+
hash <- all_parquet |> paste0(collapse="") |>
166+
hash_sha256()
167+
cached_connection <- cache$metadata_table[[hash]]
168+
169+
if (!is.null(cached_connection) && isTRUE(use_cache)) {
170+
# If the file list is identical, just re-use the database table
171+
cached_connection
172+
}
173+
else {
174+
table <- duckdb() |>
175+
dbConnect(drv = _, read_only = TRUE) |>
176+
read_parquet(path = all_parquet)
177+
cache$metadata_table[[hash]] <- table
178+
table
179+
}
173180
}
181+

R/unharmonised.R

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ UNHARMONISED_URL <- single_line_str(
3737
#' @return A named list, where each name is a dataset file ID, and each value is
3838
#' a "lazy data frame", ie a `tbl`.
3939
#' @examples
40-
#' \donttest{
40+
#' \dontrun{
4141
#' dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
4242
#' harmonised_meta <- get_metadata() |>
4343
#' dplyr::filter(file_id == dataset) |> dplyr::collect()
@@ -54,7 +54,6 @@ get_unharmonised_dataset <- function(
5454
){
5555
unharmonised_root <- file.path(
5656
cache_directory,
57-
COUNTS_VERSION,
5857
"unharmonised"
5958
)
6059
file_name <- glue::glue("{dataset_id}.parquet")

R/utils.R

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ single_line_str <- function(text){
4141
str_remove_all(text, r"(\n\s*)")
4242
}
4343

44-
#' Returns the default cache directory
44+
#' Returns the default cache directory with a version number
4545
#' @return A length one character vector.
4646
#' @importFrom tools R_user_dir
4747
#' @importFrom utils packageName
@@ -51,6 +51,7 @@ get_default_cache_dir <- function() {
5151
R_user_dir(
5252
"cache"
5353
) |>
54+
file.path(COUNTS_VERSION) |>
5455
normalizePath() |>
5556
suppressWarnings()
5657
}
@@ -89,10 +90,11 @@ sync_remote_file <- function(full_url, output_file, ...) {
8990
#' @importFrom glue glue
9091
#' @importFrom dplyr tbl
9192
#' @importFrom dbplyr sql
93+
#' @importFrom glue glue_sql
9294
#' @return An SQL data frame
9395
#' @keywords internal
9496
read_parquet <- function(conn, path){
95-
from_clause <- glue("FROM read_parquet('{path}')") |> sql()
97+
from_clause <- glue_sql("FROM read_parquet([{`path`*}], union_by_name=true)", .con=conn) |> sql()
9698
tbl(conn, from_clause)
9799
}
98100

man/DATABASE_URL.Rd

Lines changed: 0 additions & 22 deletions
This file was deleted.

man/SAMPLE_DATABASE_URL.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/get_database_url.Rd

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)