diff --git a/DESCRIPTION b/DESCRIPTION index 34057d9..5011bd5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Type: Package Package: CuratedAtlasQueryR Title: Queries the Human Cell Atlas -Version: 1.3.7 +Version: 1.4.7 Authors@R: c( person( "Stefano", @@ -126,7 +126,7 @@ biocViews: Transcription, Transcriptomics Encoding: UTF-8 -RoxygenNote: 7.3.1 +RoxygenNote: 7.3.2 LazyDataCompression: xz URL: https://github.com/stemangiola/CuratedAtlasQueryR BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues diff --git a/R/metadata.R b/R/metadata.R index 7d5de48..5bb6482 100644 --- a/R/metadata.R +++ b/R/metadata.R @@ -10,7 +10,8 @@ cache <- rlang::env( ) #' Returns the URLs for all metadata files -#' @param databases A character vector specifying the names of the metadata files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet") +#' @param databases A character vector specifying the names of the metadata files. +#' Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", "prostate.0.1.0.parquet") #' @export #' @return A character vector of URLs to parquet files to download #' @examples @@ -20,7 +21,8 @@ cache <- rlang::env( #' immune system across age, sex and ethnicity." bioRxiv (2023): 2023-06. #' doi:10.1101/2023.06.08.542671. #' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3) -get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) { +get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", + "prostate.0.1.0.parquet")) { glue::glue( "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}") } diff --git a/dev/census_accepted_assays.csv b/dev/census_accepted_assays.csv new file mode 100644 index 0000000..3488442 --- /dev/null +++ b/dev/census_accepted_assays.csv @@ -0,0 +1,99 @@ +id,assay +EFO:0003755,FL-cDNA +EFO:0008640,3'T-fill +EFO:0008641,3’-end-seq +EFO:0008643,3′-Seq +EFO:0008661,Bru-Seq +EFO:0008669,CAGEscan +EFO:0008673,CapSeq +EFO:0008675,CaptureSeq +EFO:0008679,CEL-seq +EFO:0008694,ClickSeq +EFO:0008697,cP-RNA-Seq +EFO:0008708,DeepCAGE +EFO:0008710,Digital RNA +EFO:0008718,DP-Seq +EFO:0008720,DroNc-seq +EFO:0008722,Drop-seq +EFO:0008735,FACS-seq +EFO:0008747,FRISCR +EFO:0008748,FRT-Seq +EFO:0008752,GMUCT 1.0 +EFO:0008753,GMUCT 2.0 +EFO:0008756,GRO-CAP +EFO:0008763,Hi-SCL +EFO:0008780,inDrop +EFO:0008796,MARS-seq +EFO:0008797,MATQ-seq +EFO:0008824,NanoCAGE +EFO:0008825,Nanogrid RNA-Seq +EFO:0008826,NET-Seq +EFO:0008850,PAS-Seq +EFO:0008859,PEAT +EFO:0008863,PLATE-Seq +EFO:0008868,PRO-cap +EFO:0008869,PRO-seq +EFO:0008877,Quartz-seq +EFO:0008896,RNA-Seq +EFO:0008897,RNAtag-Seq +EFO:0008898,RNET-seq +EFO:0008903,SC3-seq +EFO:0008908,SCI-seq +EFO:0008919,Seq-Well +EFO:0008929,SMA +EFO:0008930,Smart-seq +EFO:0008931,Smart-seq2 +EFO:0008937,snDrop-seq +EFO:0008941,sNuc-Seq +EFO:0008945,SPET-seq +EFO:0008953,STRT-seq +EFO:0008954,STRT-seq-2i +EFO:0008956,SUPeR-seq +EFO:0008962,TARDIS +EFO:0008966,TCR Chain Paring +EFO:0008967,TCR-LA-MC PCR +EFO:0008972,TL-seq +EFO:0008974,Tomo-Seq +EFO:0008975,TRAP-Seq +EFO:0008978,TSS Sequencing +EFO:0008980,UMI Method +EFO:0009309,Div-Seq +EFO:0009899,10x 3' v2 +EFO:0009900,10x 5' v2 +EFO:0009901,10x 3' v1 +EFO:0009919,SPLiT-seq +EFO:0009922,10x 3' v3 +EFO:0009991,Nuc-Seq +EFO:0010003,RASL-seq +EFO:0010004,SCRB-seq +EFO:0010010,CEL-seq2 +EFO:0010022,Smart-3Seq +EFO:0010034,Cappable-Seq +EFO:0010041,Nascent-Seq +EFO:0010058,Fluidigm C1-based 
library preparation
+EFO:0010184,Smart-like
+EFO:0010550,sci-RNA-seq
+EFO:0010713,10x immune profiling
+EFO:0010714,10x TCR enrichment
+EFO:0010715,10x Ig enrichment
+EFO:0010964,barcoded plate-based single cell RNA-seq
+EFO:0011025,10x 5' v1
+EFO:0022396,TruSeq
+EFO:0022488,Smart-seq3
+EFO:0022490,ScaleBio single cell RNA sequencing
+EFO:0030002,microwell-seq
+EFO:0030003,10x 3' transcription profiling
+EFO:0030004,10x 5' transcription profiling
+EFO:0030019,Seq-Well S3
+EFO:0030021,Nx1-seq
+EFO:0030028,sci-RNA-seq3
+EFO:0030030,Quant-seq
+EFO:0030031,SCOPE-chip
+EFO:0030061,mcSCRB-seq
+EFO:0030074,SORT-seq
+EFO:0030078,droplet-based single-cell RNA library preparation
+EFO:0700003,BD Rhapsody Whole Transcriptome Analysis
+EFO:0700004,BD Rhapsody Targeted mRNA
+EFO:0700010,TruDrop
+EFO:0700011,GEXSCOPE technology
+EFO:0700016,Smart-seq v4
\ No newline at end of file
diff --git a/dev/execute_hpcell_on_census_and_defining_data_tranformation.R b/dev/execute_hpcell_on_census_and_defining_data_tranformation.R
new file mode 100644
index 0000000..2e552ca
--- /dev/null
+++ b/dev/execute_hpcell_on_census_and_defining_data_tranformation.R
@@ -0,0 +1,206 @@
+library(glue)
+library(dplyr)
+library(arrow)
+library(SingleCellExperiment)
+library(tibble)
+library(SummarizedExperiment)
+library(purrr)
+library(zellkonverter)
+library(tidyr)
+library(ggplot2)
+library(plotly)
+library(targets)
+library(stringr)
+library(CuratedAtlasQueryR)
+library(fs)
+library(HPCell)
+library(crew.cluster)
+
+directory = "/home/users/allstaff/shen.m/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
+sample_anndata <- dir(glue("{directory}"), full.names = TRUE)
+downloaded_samples_tbl <- read_parquet("/home/users/allstaff/shen.m/scratch/Census_rerun/census_samples_to_download_groups.parquet")
+downloaded_samples_tbl <- downloaded_samples_tbl |>
+  rename(cell_number = list_length) |>
+  mutate(cell_number = cell_number |> as.integer(),
+         file_name = glue("{directory}{sample_2}.h5ad") |> as.character(),
+         tier = case_when(
+           cell_number < 500 ~ "tier_1",
+           cell_number >= 500 & cell_number < 1000 ~ "tier_2",
+           cell_number >= 1000 & cell_number < 10000 ~ "tier_3",
+           cell_number >= 10000 ~ "tier_4"
+         ))
+
+result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024"
+sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
+sample_tbl = downloaded_samples_tbl |>
+  left_join(get_metadata() |>
+              select(dataset_id, contains("norm")) |>
+              distinct() |>
+              filter(!is.na(x_normalization)) |>
+              as_tibble(),
+            by = "dataset_id")
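+
+# Sanity check (illustrative only, not part of the pipeline): the mapping below
+# picks a back-transformation for each reported normalisation, so natural-log
+# data is inverted with exp() and log1p data with expm1():
+# all.equal(exp(log(c(1, 3, 10))), c(1, 3, 10))      # TRUE
+# all.equal(expm1(log1p(c(0, 3, 10))), c(0, 3, 10))  # TRUE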
"exp", + transform_method %in% c("log1p", "log1p, base e", "Scanpy", + "scanpy.api.pp.normalize_per_cell method, scaling factor 10000") ~ "expm1", + transform_method == "log1p, base 2" ~ "expm1", + transform_method == "NORMAL" ~ "exp", + transform_method == "COUNT" ~ "identity", + is.na(transform_method) ~ "identity" + ) ) |> + mutate(comment = case_when(str_like(x_normalization, "Counts%") ~ "a checkpoint for max value of Assay must <= 50", + is.na(x_normalization) & is.na(x_approximate_distribution) ~ "round negative value to 0", + x_normalization == "normalized" ~ "round negative value to 0" + )) + +sample_tbl <- sample_tbl |> mutate(transformation_function = map( + method_to_apply, + ~ ( function(data) { + assay_name <- data@assays |> names() |> magrittr::extract2(1) + counts <- assay(data, assay_name) + density_est <- counts |> as.matrix() |> density() + mode_value <- density_est$x[which.max(density_est$y)] + if (mode_value < 0 ) counts <- counts + abs(mode_value) + + # Scale max counts to 20 to avoid any downstream failure + if (.x != "identity" && (max(counts) > 20)) { + scale_factor = 20 / max(counts) + counts <- counts * scale_factor} + + counts <- transform_method(counts) + # round counts to avoid potential substraction error due to different digits print out + counts <- counts |> round(5) + majority_gene_counts = names(which.max(table(as.vector(counts)))) |> as.numeric() + if (majority_gene_counts != 0) { + counts <- counts - majority_gene_counts + } + + # Avoid downstream failures negative counts + if((counts[,1:min(10000, ncol(counts))] |> min()) < 0) + counts[counts < 0] <- 0 + + # Assign counts back to data + assay(data, assay_name) <- counts + + col_sums <- colSums(counts) + # Drop all zero cells + data <- data[, col_sums > 0] + + # Avoid downstream binding error + rowData(data) = NULL + + data + + }) |> + # Meta programming, replacing the transformation programmatically + substitute( env = list(transform_method = as.name(.x))) |> + # Evaluate back to a working function + eval() + )) + +#sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds") +sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds") + +# Set the parent directory where the subdirectories will be created +# parent_dir <- "~/scratch/Census_rerun/" +# +# # Directory names to create +# dir_names <- paste0("run", 1:25) +# +# # Full paths of the directories +# full_dir_paths <- file.path(parent_dir, dir_names) +# +# # Create each directory if it does not exist +# for (dir_path in full_dir_paths) { +# if (!dir_exists(dir_path)) { +# dir_create(dir_path) +# } +# } + +# Run 1000 samples per run. 
+
+# sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
+sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
+
+# Set the parent directory where the subdirectories will be created
+# parent_dir <- "~/scratch/Census_rerun/"
+#
+# # Directory names to create
+# dir_names <- paste0("run", 1:25)
+#
+# # Full paths of the directories
+# full_dir_paths <- file.path(parent_dir, dir_names)
+#
+# # Create each directory if it does not exist
+# for (dir_path in full_dir_paths) {
+#   if (!dir_exists(dir_path)) {
+#     dir_create(dir_path)
+#   }
+# }
+
+# Run 1000 samples per run. Save the logs and results in the corresponding store.
+store = "~/scratch/Census_rerun/run3/"
+setwd(glue("{store}"))
+sliced_sample_tbl = sample_tbl |>
+  slice(2001:3000) |>
+  select(file_name, tier, cell_number, dataset_id, sample_2, transformation_function)
+
+# Named vector of input file paths, named by sample identifier
+sample_names <- sliced_sample_tbl |> pull(file_name) |> set_names(sliced_sample_tbl |> pull(sample_2))
+
+sample_names |>
+  initialise_hpc(
+    gene_nomenclature = "ensembl",
+    data_container_type = "anndata",
+    store = store,
+    tier = sliced_sample_tbl |> pull(tier),
+    computing_resources = list(
+      crew_controller_slurm(
+        name = "tier_1",
+        script_lines = "#SBATCH --mem 35G",
+        slurm_cpus_per_task = 1,
+        workers = 200,
+        tasks_max = 1,
+        verbose = TRUE
+      ),
+      crew_controller_slurm(
+        name = "tier_2",
+        script_lines = "#SBATCH --mem 60G",
+        slurm_cpus_per_task = 1,
+        workers = 50,
+        tasks_max = 1,
+        verbose = TRUE
+      ),
+      crew_controller_slurm(
+        name = "tier_3",
+        script_lines = "#SBATCH --mem 90G",
+        slurm_cpus_per_task = 1,
+        workers = 25,
+        tasks_max = 1,
+        verbose = TRUE
+      ),
+      crew_controller_slurm(
+        name = "tier_4",
+        script_lines = "#SBATCH --mem 100G",
+        slurm_cpus_per_task = 1,
+        workers = 14,
+        tasks_max = 1,
+        verbose = TRUE
+      )
+    )
+  ) |>
+  tranform_assay(fx = sliced_sample_tbl |> pull(transformation_function),
+                 target_output = "sce_transformed") |>
+
+  # Remove empty outliers based on the RNA count threshold per cell
+  remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>
+
+  # Remove dead cells
+  remove_dead_scuttle(target_input = "sce_transformed") |>
+
+  # Score cell cycle
+  score_cell_cycle_seurat(target_input = "sce_transformed") |>
+
+  # Remove doublets
+  remove_doublets_scDblFinder(target_input = "sce_transformed") |>
+
+  # Annotate cell types
+  annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref") |>
+
+  normalise_abundance_seurat_SCT(
+    factors_to_regress = c("subsets_Mito_percent", "subsets_Ribo_percent", "G2M.Score"),
+    target_input = "sce_transformed"
+  )
diff --git a/dev/split_census_anndata_base_on_sample_id.R b/dev/split_census_anndata_base_on_sample_id.R
new file mode 100644
index 0000000..502e1bf
--- /dev/null
+++ b/dev/split_census_anndata_base_on_sample_id.R
@@ -0,0 +1,134 @@
+library(targets)
+library(zellkonverter)
+library(tibble)
+library(dplyr)
+library(SummarizedExperiment)
+library(tidybulk)
+library(tidySingleCellExperiment)
+library(stringr)
+library(arrow)
+# Needed by rename_features() below
+library(ensembldb)
+library(EnsDb.Hsapiens.v86)
+
+anndata_path_based_on_dataset_id_to_read <- "~/scratch/Census_rerun/h5ad/"
+anndata_path_based_on_sample_id_to_save <- "~/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
+
+files <- list.files(anndata_path_based_on_dataset_id_to_read, pattern = "\\.h5ad$",
+                    full.names = TRUE)
+
+save_data <- function(data, file_name) {
+  filename <- paste0(file.path(anndata_path_based_on_sample_id_to_save), file_name, ".h5ad")
+  zellkonverter::writeH5AD(data, file = filename, compression = "gzip")
+  # Return the path so that targets declared with format = "file" can track it
+  filename
+}
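+
+# Usage sketch (illustrative only; the sample name is hypothetical): save_data()
+# is called from the `sliced_sce` target below, which declares format = "file",
+# so it must return the path of the .h5ad it wrote for {targets} to hash:
+# sce <- zellkonverter::readH5AD(files[[1]], use_hdf5 = TRUE)
+# save_data(sce, "example_sample")
+# #> "~/scratch/Census_rerun/split_h5ad_based_on_sample_id/example_sample.h5ad"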
+
+#' Subset samples from a dataset based on specified observation join IDs.
+#' Reads a single-cell expression dataset stored in the HDF5 AnnData format,
+#' removes unneeded metadata to reduce memory use, and extracts the subset of
+#' cells matching the provided observation join IDs.
+subset_samples <- function(dataset_id, observation_joinid, sample_id) {
+
+  # Construct the file path to the dataset
+  file_path <- paste0(anndata_path_based_on_dataset_id_to_read, dataset_id, ".h5ad")
+
+  # Read the dataset from the HDF5 file using the zellkonverter package
+  sce <- zellkonverter::readH5AD(file_path, use_hdf5 = TRUE, reader = "R")
+
+  # Extract the base name of the file and remove the '.h5ad' extension to get dataset_id
+  sce$dataset_id = file_path |> basename() |> str_remove("\\.h5ad$")
+
+  # Add the original cell identifiers from the column names to the dataset
+  sce$observation_originalid = colnames(sce)
+
+  # Create a new identifier by combining the original cell id with the dataset id
+  cell_modified = paste(sce$observation_originalid, sce$dataset_id, sep = "___")
+
+  # Update the column names with the new combined identifier
+  colnames(sce) <- cell_modified
+
+  # Clear the metadata to reduce memory overhead
+  S4Vectors::metadata(sce) <- list()
+
+  # Add the sample identifier to the dataset
+  sce <- sce |> mutate(sample_id = !!sample_id)
+
+  # Identify cells that match the observation_joinid
+  cells_to_subset <- which(colData(sce)$observation_joinid %in% unlist(observation_joinid))
+
+  # Subset and return the SingleCellExperiment object with only the selected cells
+  sce[, cells_to_subset]
+}
+
+rename_features <- function(data) {
+  edb <- EnsDb.Hsapiens.v86
+  edb_df <- genes(edb,
+                  columns = c("gene_name", "entrezid", "gene_biotype"),
+                  filter = GeneIdFilter("ENSG", condition = "startsWith"),
+                  return.type = "data.frame")
+
+  gene_names_map <- rowData(data) |>
+    as.data.frame() |>
+    tibble::rownames_to_column(var = "gene_id") |>
+    dplyr::select(gene_id, feature_name) |>
+    mutate(current_gene_names = gsub("_", "-", feature_name)) |>
+    dplyr::left_join(edb_df, by = "gene_id") |>
+    mutate(modified_gene_names = ifelse(stringr::str_like(current_gene_names, "%ENSG0%"), gene_name, current_gene_names))
+
+  gene_names_map_tidy <- gene_names_map |>
+    dplyr::filter(!is.na(modified_gene_names))
+
+  # Drop Ensembl IDs that cannot be converted to gene symbols
+  rownames_to_delete <- gene_names_map |>
+    dplyr::filter(is.na(modified_gene_names)) |> pull(gene_id)
+  data <- data[!rownames(data) %in% rownames_to_delete, ]
+
+  rownames(data) <- gene_names_map_tidy$modified_gene_names
+  # Access the first assay by name and rename its features to match
+  assay_name <- data@assays |> names() |> magrittr::extract2(1)
+  counts_slot <- assay(data, assay_name)
+  rownames(counts_slot) <- gene_names_map_tidy$modified_gene_names
+
+  SummarizedExperiment::rowData(data)$feature_name <- NULL
+  SummarizedExperiment::rowData(data)$gene_name <- gene_names_map_tidy$modified_gene_names
+
+  data
+}
+
+computing_resources = crew.cluster::crew_controller_slurm(
+  slurm_memory_gigabytes_per_cpu = 30,
+  slurm_cpus_per_task = 1,
+  workers = 75,
+  verbose = TRUE
+)
+tar_option_set(
+  memory = "transient",
+  garbage_collection = TRUE,
+  storage = "worker",
+  retrieval = "worker",
+  format = "qs",
+  controller = computing_resources
+)
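+
+# Dynamic branching sketch (illustrative only): `pattern = map(...)` in the
+# `sliced_sce` target below slices the parquet table row-wise, so each branch
+# processes one sample and writes one .h5ad file. A minimal analogue:
+# tar_target(rows, data.frame(n = 1:3))
+# tar_target(doubled, rows$n * 2, pattern = map(rows))  # three branches: 2, 4, 6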
+
+list(
+  tar_target(
+    file_paths,
+    list.files(anndata_path_based_on_dataset_id_to_read, pattern = "\\.h5ad$",
+               full.names = TRUE),
+    deployment = "main"
+  ),
+  tar_target(
+    grouped_observation_joinid_per_sample,
+    read_parquet("~/scratch/Census_rerun/census_samples_to_download_groups.parquet")
+  ),
+  tar_target(
+    sliced_sce,
+    subset_samples(grouped_observation_joinid_per_sample$dataset_id,
+                   grouped_observation_joinid_per_sample$observation_joinid,
+                   grouped_observation_joinid_per_sample$sample_2) |>
+      save_data(file_name = grouped_observation_joinid_per_sample$sample_2),
+    pattern = map(grouped_observation_joinid_per_sample),
+    format = "file"
+  )
+)
+
+# tar_make(store = "~/scratch/Census_rerun/split_h5ad_based_on_sample_id_target_store/",
+#          script = "~/git_control/CuratedAtlasQueryR/dev/split_census_anndata_base_on_sample_id.R",
+#          reporter = "summary")
diff --git a/man/get_database_url.Rd b/man/get_database_url.Rd
index 2962d1b..5c43517 100644
--- a/man/get_database_url.Rd
+++ b/man/get_database_url.Rd
@@ -8,11 +8,13 @@
 }
 \usage{
 get_database_url(
-  databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
+  databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet",
+    "prostate.0.1.0.parquet")
 )
 }
 \arguments{
-\item{databases}{A character vector specifying the names of the metadata files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")}
+\item{databases}{A character vector specifying the names of the metadata files.
+Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", "prostate.0.1.0.parquet")}
 }
 \value{
 A character vector of URLs to parquet files to download
diff --git a/tests/testthat/test-query.R b/tests/testthat/test-query.R
index 4a2cb1b..a6e4355 100755
--- a/tests/testthat/test-query.R
+++ b/tests/testthat/test-query.R
@@ -253,4 +253,16 @@ test_that("get_pseudobulk() syncs appropriate fixed file", {
    expect_gt(1)
 })
+test_that("get_single_cell_experiment() syncs prostate atlas", {
+  temp <- tempfile()
+  # A sample from the prostate atlas
+  sample <- "GSM4089151"
+  meta <- get_metadata(cache_directory = temp) |> filter(sample_ == sample)
+  sce <- meta |> get_single_cell_experiment(cache_directory = temp)
+  sce |>
+    row.names() |>
+    length() |>
+    expect_gt(1)
+})
+