Skip to content

Commit d645638

Browse files
authored
Merge pull request #131 from myushen/multiple_metadata_from_api
metadata accepts multiple databases
2 parents a338712 + 1ab4cd7 commit d645638

16 files changed

+114
-80
lines changed

DESCRIPTION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ biocViews:
121121
Transcription,
122122
Transcriptomics
123123
Encoding: UTF-8
124-
RoxygenNote: 7.2.3
124+
RoxygenNote: 7.3.1
125125
LazyDataCompression: xz
126126
URL: https://github.com/stemangiola/CuratedAtlasQueryR
127127
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues

NAMESPACE

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
# Generated by roxygen2: do not edit by hand
22

33
S3method(as.sparse,DelayedMatrix)
4-
export(DATABASE_URL)
54
export(SAMPLE_DATABASE_URL)
65
export(get_SingleCellExperiment)
6+
export(get_database_url)
77
export(get_metadata)
88
export(get_seurat)
99
export(get_single_cell_experiment)
@@ -43,6 +43,7 @@ importFrom(dplyr,tbl)
4343
importFrom(dplyr,transmute)
4444
importFrom(duckdb,duckdb)
4545
importFrom(glue,glue)
46+
importFrom(glue,glue_sql)
4647
importFrom(httr,GET)
4748
importFrom(httr,HEAD)
4849
importFrom(httr,modify_url)

R/counts.R

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ get_single_cell_experiment <- function(
112112
has_name(raw_data, c("cell_", "file_id_db"))
113113
)
114114

115-
versioned_cache_directory <- file.path(cache_directory, COUNTS_VERSION)
115+
versioned_cache_directory <- cache_directory
116116
versioned_cache_directory |> dir.create(
117117
showWarnings = FALSE,
118118
recursive = TRUE

R/dev.R

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,7 +135,7 @@ update_unharmonised <- function(unharmonised_parquet_dir, ...){
135135
#' @keywords internal
136136
#' @return A character vector of the newly-created anndata files
137137
#' @examples
138-
#' \donttest{
138+
#' \dontrun{
139139
#' hdf5_to_anndata(
140140
#' "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
141141
#' "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"
@@ -194,7 +194,7 @@ hdf5_to_anndata <- function(input_directory, output_directory){
194194
# @return A character vector of the newly-created anndata files
195195
# @noRd
196196
# @examples
197-
# \donttest{
197+
# \dontrun{
198198
# h5seurat_to_anndata(
199199
# "/vast/projects/cellxgene_curated/splitted_DB2_data_0.2.1",
200200
# "/vast/projects/cellxgene_curated/splitted_DB2_anndata_0.2.1"

R/metadata.R

Lines changed: 44 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -6,27 +6,28 @@ NULL
66
#' Environment that we use to cache the DuckDB connections
77
#' @noRd
88
cache <- rlang::env(
9-
metadata_table = rlang::env()
9+
metadata_table = rlang::env()
1010
)
1111

12-
#' URL pointing to the full metadata file
12+
#' Returns the URLs for all metadata files
13+
#' @param databases A character vector specifying the names of the metadata files. Defaults to c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet").
1314
#' @export
14-
#' @return A character scalar consisting of the URL
15+
#' @return A character vector of URLs to parquet files to download
1516
#' @examples
16-
#' get_metadata(remote_url = DATABASE_URL)
17-
DATABASE_URL <- single_line_str(
18-
"https://object-store.rc.nectar.org.au/v1/
19-
AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/metadata.0.2.3.parquet"
20-
)
17+
#' get_database_url("metadata.0.2.3.parquet")
18+
get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
19+
glue::glue(
20+
"https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}")
21+
}
2122

2223
#' URL pointing to the sample metadata file, which is smaller and for test,
2324
#' demonstration, and vignette purposes only
2425
#' @export
2526
#' @return A character scalar consisting of the URL
2627
#' @examples
27-
#' get_metadata(remote_url = SAMPLE_DATABASE_URL)
28+
#' get_metadata(remote_url = SAMPLE_DATABASE_URL, cache_directory = tempdir())
2829
SAMPLE_DATABASE_URL <- single_line_str(
29-
"https://object-store.rc.nectar.org.au/v1/
30+
"https://object-store.rc.nectar.org.au/v1/
3031
AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/
3132
sample_metadata.0.2.3.parquet"
3233
)
@@ -38,8 +39,8 @@ SAMPLE_DATABASE_URL <- single_line_str(
3839
#' into [get_single_cell_experiment()] to obtain a
3940
#' [`SingleCellExperiment::SingleCellExperiment-class`]
4041
#'
41-
#' @param remote_url Optional character vector of length 1. An HTTP URL pointing
42-
#' to the location of the parquet database.
42+
#' @param remote_url Optional character vector of any length. One or more HTTP URLs
43+
#' pointing to the locations of the parquet database file(s).
4344
#' @param cache_directory Optional character vector of length 1. A file path on
4445
#' your local system to a directory (not a file) that will be used to store
4546
#' `metadata.parquet`
@@ -68,6 +69,7 @@ SAMPLE_DATABASE_URL <- single_line_str(
6869
#' @importFrom httr progress
6970
#' @importFrom cli cli_alert_info hash_sha256
7071
#' @importFrom glue glue
72+
#' @importFrom purrr walk
7173
#'
7274
#' @details
7375
#'
@@ -142,32 +144,38 @@ SAMPLE_DATABASE_URL <- single_line_str(
142144
#' get_metadata(cache_directory = path.expand('~'))
143145
#'
144146
get_metadata <- function(
145-
remote_url = DATABASE_URL,
147+
remote_url = get_database_url(),
146148
cache_directory = get_default_cache_dir(),
147149
use_cache = TRUE
148150
) {
149-
hash <- c(remote_url, cache_directory) |> paste0(collapse="") |>
150-
hash_sha256()
151-
cached_connection <- cache$metadata_table[[hash]]
152-
if (!is.null(cached_connection) && isTRUE(use_cache)) {
153-
cached_connection
154-
}
155-
else {
156-
db_path <- file.path(cache_directory, "metadata.0.2.3.parquet")
157-
158-
if (!file.exists(db_path)){
159-
report_file_sizes(remote_url)
160-
sync_remote_file(
161-
remote_url,
162-
db_path,
163-
progress(type = "down", con = stderr())
164-
)
165-
}
166-
167-
table <- duckdb() |>
168-
dbConnect(drv = _, read_only = TRUE) |>
169-
read_parquet(db_path)
170-
cache$metadata_table[[hash]] <- table
171-
table
151+
# Synchronize remote files
152+
walk(remote_url, function(url) {
153+
# Calculate the file path from the URL
154+
path <- file.path(cache_directory, url |> basename())
155+
if (!file.exists(path)) {
156+
report_file_sizes(url)
157+
sync_remote_file(url,
158+
path,
159+
progress(type = "down", con = stderr()))
172160
}
161+
})
162+
all_parquet <- file.path(cache_directory, dir(cache_directory, pattern = ".parquet$"))
163+
# We try to avoid re-reading a set of parquet files
164+
# that is identical to a previous set by hashing the file list
165+
hash <- all_parquet |> paste0(collapse="") |>
166+
hash_sha256()
167+
cached_connection <- cache$metadata_table[[hash]]
168+
169+
if (!is.null(cached_connection) && isTRUE(use_cache)) {
170+
# If the file list is identical, just re-use the database table
171+
cached_connection
172+
}
173+
else {
174+
table <- duckdb() |>
175+
dbConnect(drv = _, read_only = TRUE) |>
176+
read_parquet(path = all_parquet)
177+
cache$metadata_table[[hash]] <- table
178+
table
179+
}
173180
}
181+

R/unharmonised.R

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ UNHARMONISED_URL <- single_line_str(
3737
#' @return A named list, where each name is a dataset file ID, and each value is
3838
#' a "lazy data frame", ie a `tbl`.
3939
#' @examples
40-
#' \donttest{
40+
#' \dontrun{
4141
#' dataset <- "838ea006-2369-4e2c-b426-b2a744a2b02b"
4242
#' harmonised_meta <- get_metadata() |>
4343
#' dplyr::filter(file_id == dataset) |> dplyr::collect()
@@ -54,7 +54,6 @@ get_unharmonised_dataset <- function(
5454
){
5555
unharmonised_root <- file.path(
5656
cache_directory,
57-
COUNTS_VERSION,
5857
"unharmonised"
5958
)
6059
file_name <- glue::glue("{dataset_id}.parquet")

R/utils.R

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ single_line_str <- function(text){
4141
str_remove_all(text, r"(\n\s*)")
4242
}
4343

44-
#' Returns the default cache directory
44+
#' Returns the default cache directory with a version number
4545
#' @return A length one character vector.
4646
#' @importFrom tools R_user_dir
4747
#' @importFrom utils packageName
@@ -51,6 +51,7 @@ get_default_cache_dir <- function() {
5151
R_user_dir(
5252
"cache"
5353
) |>
54+
file.path(COUNTS_VERSION) |>
5455
normalizePath() |>
5556
suppressWarnings()
5657
}
@@ -89,10 +90,11 @@ sync_remote_file <- function(full_url, output_file, ...) {
8990
#' @importFrom glue glue
9091
#' @importFrom dplyr tbl
9192
#' @importFrom dbplyr sql
93+
#' @importFrom glue glue_sql
9294
#' @return An SQL data frame
9395
#' @keywords internal
9496
read_parquet <- function(conn, path){
95-
from_clause <- glue("FROM read_parquet('{path}')") |> sql()
97+
from_clause <- glue_sql("FROM read_parquet([{`path`*}], union_by_name=true)", .con=conn) |> sql()
9698
tbl(conn, from_clause)
9799
}
98100

man/DATABASE_URL.Rd

Lines changed: 0 additions & 22 deletions
This file was deleted.

man/SAMPLE_DATABASE_URL.Rd

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

man/get_database_url.Rd

Lines changed: 22 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)