stemangiola · stemangiola · Oct 24, 2024 · Oct 10, 2024 · Oct 24, 2024 · Oct 24, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: CuratedAtlasQueryR
 Title: Queries the Human Cell Atlas
-Version: 1.3.7
+Version: 1.4.7
 Authors@R: c(
     person(
         "Stefano",
@@ -126,7 +126,7 @@ biocViews:
     Transcription,
     Transcriptomics
 Encoding: UTF-8
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 LazyDataCompression: xz
 URL: https://github.com/stemangiola/CuratedAtlasQueryR
 BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues

diff --git a/R/metadata.R b/R/metadata.R
@@ -10,7 +10,8 @@ cache <- rlang::env(
 )
 
 #' Returns the URLs for all metadata files 
-#' @param databases A character vector specifying the names of the metadata files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
+#' @param databases A character vector specifying the names of the metadata files. 
+#' Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", "prostate.0.1.0.parquet")
 #' @export
 #' @return A character vector of URLs to parquet files to download
 #' @examples
@@ -20,7 +21,8 @@ cache <- rlang::env(
 #'   immune system across age, sex and ethnicity." bioRxiv (2023): 2023-06.
 #'   doi:10.1101/2023.06.08.542671.
 #' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3)
-get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
+get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet",
+                                           "prostate.0.1.0.parquet")) {
   glue::glue(
     "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}")
 }

diff --git a/dev/census_accepted_assays.csv b/dev/census_accepted_assays.csv
@@ -0,0 +1,99 @@
+id,assay
+EFO:0003755,FL-cDNA
+EFO:0008640,3'T-fill
+EFO:0008641,3’-end-seq
+EFO:0008643,3′-Seq
+EFO:0008661,Bru-Seq
+EFO:0008669,CAGEscan
+EFO:0008673,CapSeq
+EFO:0008675,CaptureSeq
+EFO:0008679,CEL-seq
+EFO:0008694,ClickSeq
+EFO:0008697,cP-RNA-Seq
+EFO:0008708,DeepCAGE
+EFO:0008710,Digital RNA
+EFO:0008718,DP-Seq
+EFO:0008720,DroNc-seq
+EFO:0008722,Drop-seq
+EFO:0008735,FACS-seq
+EFO:0008747,FRISCR
+EFO:0008748,FRT-Seq
+EFO:0008752,GMUCT 1.0
+EFO:0008753,GMUCT 2.0
+EFO:0008756,GRO-CAP
+EFO:0008763,Hi-SCL
+EFO:0008780,inDrop
+EFO:0008796,MARS-seq
+EFO:0008797,MATQ-seq
+EFO:0008824,NanoCAGE
+EFO:0008825,Nanogrid RNA-Seq
+EFO:0008826,NET-Seq
+EFO:0008850,PAS-Seq
+EFO:0008859,PEAT
+EFO:0008863,PLATE-Seq
+EFO:0008868,PRO-cap
+EFO:0008869,PRO-seq
+EFO:0008877,Quartz-seq
+EFO:0008896,RNA-Seq
+EFO:0008897,RNAtag-Seq
+EFO:0008898,RNET-seq
+EFO:0008903,SC3-seq
+EFO:0008908,SCI-seq
+EFO:0008919,Seq-Well
+EFO:0008929,SMA
+EFO:0008930,Smart-seq
+EFO:0008931,Smart-seq2
+EFO:0008937,snDrop-seq
+EFO:0008941,sNuc-Seq
+EFO:0008945,SPET-seq
+EFO:0008953,STRT-seq
+EFO:0008954,STRT-seq-2i
+EFO:0008956,SUPeR-seq
+EFO:0008962,TARDIS
+EFO:0008966,TCR Chain Paring
+EFO:0008967,TCR-LA-MC PCR
+EFO:0008972,TL-seq
+EFO:0008974,Tomo-Seq
+EFO:0008975,TRAP-Seq
+EFO:0008978,TSS Sequencing
+EFO:0008980,UMI Method
+EFO:0009309,Div-Seq
+EFO:0009899,10x 3' v2
+EFO:0009900,10x 5' v2
+EFO:0009901,10x 3' v1
+EFO:0009919,SPLiT-seq
+EFO:0009922,10x 3' v3
+EFO:0009991,Nuc-Seq
+EFO:0010003,RASL-seq
+EFO:0010004,SCRB-seq
+EFO:0010010,CEL-seq2
+EFO:0010022,Smart-3Seq
+EFO:0010034,Cappable-Seq
+EFO:0010041,Nascent-Seq
+EFO:0010058,Fluidigm C1-based library preparation
+EFO:0010184,Smart-like
+EFO:0010550,sci-RNA-seq
+EFO:0010713,10x immune profiling
+EFO:0010714,10x TCR enrichment
+EFO:0010715,10x Ig enrichment
+EFO:0010964,barcoded plate-based single cell RNA-seq
+EFO:0011025,10x 5' v1
+EFO:0022396,TruSeq
+EFO:0022488,Smart-seq3
+EFO:0022490,ScaleBio single cell RNA sequencing
+EFO:0030002,microwell-seq
+EFO:0030003,10x 3' transcription profiling
+EFO:0030004,10x 5' transcription profiling
+EFO:0030019,Seq-Well S3
+EFO:0030021,Nx1-seq
+EFO:0030028,sci-RNA-seq3
+EFO:0030030,Quant-seq
+EFO:0030031,SCOPE-chip
+EFO:0030061,mcSCRB-seq
+EFO:0030074,SORT-seq
+EFO:0030078,droplet-based single-cell RNA library preparation
+EFO:0700003,BD Rhapsody Whole Transcriptome Analysis
+EFO:0700004,BD Rhapsody Targeted mRNA
+EFO:0700010,TruDrop
+EFO:0700011,GEXSCOPE technology
+EFO:0700016,Smart-seq v4
diff --git a/dev/execute_hpcell_on_census_and_defining_data_tranformation.R b/dev/execute_hpcell_on_census_and_defining_data_tranformation.R
@@ -0,0 +1,206 @@
+library(glue)
+library(dplyr)
+library(arrow)
+library(SingleCellExperiment)
+library(tibble)
+library(SummarizedExperiment)
+library(glue)
+library(purrr)
+library(zellkonverter)
+library(tidyr)
+library(ggplot2)
+library(plotly)
+library(targets)
+library(stringr)
+library(CuratedAtlasQueryR)
+library(fs)
+library(HPCell)
+library(crew.cluster)
+directory = "/home/users/allstaff/shen.m/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
+sample_anndata <- dir(glue("{directory}"), full.names = T)
+downloaded_samples_tbl <- read_parquet("/home/users/allstaff/shen.m/scratch/Census_rerun/census_samples_to_download_groups.parquet")
+downloaded_samples_tbl <- downloaded_samples_tbl |>
+  rename(cell_number = list_length) |> 
+  mutate(cell_number = cell_number |> as.integer(),
+         file_name = glue("{directory}{sample_2}.h5ad") |> as.character(), 
+         tier = case_when(
+           cell_number < 500 ~ "tier_1", cell_number >= 500 &
+             cell_number < 1000 ~ "tier_2", cell_number >= 1000 &
+             cell_number < 10000 ~ "tier_3", cell_number >= 10000 ~ "tier_4"
+         ))
+
+result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024"
+sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
+sample_tbl = downloaded_samples_tbl |> left_join(get_metadata() |> select(dataset_id, contains("norm")) |>
+                                 distinct() |> filter(!is.na(x_normalization)) |>
+                                 as_tibble(), by = "dataset_id")
+
+
+sample_tbl <- sample_tbl |> left_join(sample_meta, by = "dataset_id") |> distinct(file_name, tier, cell_number, dataset_id, sample_2,
+                                                                                  x_normalization, x_approximate_distribution) |>
+  mutate(transform_method = case_when(str_like(x_normalization, "C%") ~ "log",
+                                      x_normalization == "none" ~ "log",
+                                      x_normalization == "normalized" ~ "log",
+                                      is.na(x_normalization) & is.na(x_approximate_distribution) ~ "log",
+                                      is.na(x_normalization) & x_approximate_distribution == "NORMAL" ~ "NORMAL",
+                                      is.na(x_normalization) & x_approximate_distribution == "COUNT" ~ "COUNT",
+                                      str_like(x_normalization, "%canpy%") ~ "log1p",
+                                      TRUE ~ x_normalization)) |>
+
+  mutate(method_to_apply =  case_when(transform_method %in% c("log","LogNormalization","LogNormalize","log-normalization") ~ "exp",
+                                      is.na(x_normalization) & is.na(x_approximate_distribution) ~ "exp",
+                                      str_like(transform_method, "Counts%") ~ "exp",
+                                      str_like(transform_method, "%log2%") ~ "exp",
+                                      transform_method %in% c("log1p", "log1p, base e", "Scanpy",
+                                                              "scanpy.api.pp.normalize_per_cell method, scaling factor 10000") ~ "expm1",
+                                      transform_method == "log1p, base 2" ~ "expm1",
+                                      transform_method == "NORMAL" ~ "exp",
+                                      transform_method == "COUNT" ~ "identity",
+                                      is.na(transform_method) ~ "identity"
+  ) ) |>
+  mutate(comment = case_when(str_like(x_normalization, "Counts%")  ~ "a checkpoint for max value of Assay must <= 50",
+                             is.na(x_normalization) & is.na(x_approximate_distribution) ~ "round negative value to 0",
+                             x_normalization == "normalized" ~ "round negative value to 0"
+  ))
+
+sample_tbl <- sample_tbl |> mutate(transformation_function = map(
+    method_to_apply,
+    ~ ( function(data) {
+      assay_name <- data@assays |> names() |> magrittr::extract2(1)
+      counts <- assay(data, assay_name)
+      density_est <- counts |> as.matrix() |> density()
+      mode_value <- density_est$x[which.max(density_est$y)]
+      if (mode_value < 0 )  counts <- counts + abs(mode_value)
+
+      # Scale max counts to 20 to avoid any downstream failure
+      if (.x != "identity" && (max(counts) > 20)) {
+        scale_factor = 20 / max(counts)
+        counts <- counts * scale_factor}
+
+      counts <- transform_method(counts)
+      # round counts to avoid potential substraction error due to different digits print out
+      counts <- counts |> round(5)
+      majority_gene_counts = names(which.max(table(as.vector(counts)))) |> as.numeric()
+      if (majority_gene_counts != 0) {
+        counts <- counts - majority_gene_counts
+      }
+
+      # Avoid downstream failures negative counts
+      if((counts[,1:min(10000, ncol(counts))] |> min()) < 0)
+        counts[counts < 0] <- 0
+
+      # Assign counts back to data
+      assay(data, assay_name) <- counts
+
+      col_sums <- colSums(counts)
+      # Drop all zero cells
+      data <- data[, col_sums > 0]
+
+      # Avoid downstream binding error
+      rowData(data) = NULL
+
+      data
+
+    }) |>
+      # Meta programming, replacing the transformation programmatically
+      substitute( env = list(transform_method = as.name(.x))) |>
+      # Evaluate back to a working function
+      eval()
+  ))
+
+#sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
+sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
+
+# Set the parent directory where the subdirectories will be created
+# parent_dir <- "~/scratch/Census_rerun/"
+# 
+# # Directory names to create
+# dir_names <- paste0("run", 1:25)
+# 
+# # Full paths of the directories
+# full_dir_paths <- file.path(parent_dir, dir_names)
+# 
+# # Create each directory if it does not exist
+# for (dir_path in full_dir_paths) {
+#   if (!dir_exists(dir_path)) {
+#     dir_create(dir_path)
+#   }
+# }
+
+# Run 1000 samples per run. Save log and result in the corresponding store
+store = "~/scratch/Census_rerun/run3/"
+setwd(glue("{store}"))
+sliced_sample_tbl = sample_tbl |> slice(2001:3000) |> select(file_name, tier, cell_number, dataset_id,
+                                                          sample_2, transformation_function)
+
+# Enable sample_names.rds to store sample names for the input
+sample_names <- sliced_sample_tbl |> pull(file_name)  |> set_names(sliced_sample_tbl |> pull(sample_2))
+
+sample_names |>
+  initialise_hpc(
+    gene_nomenclature = "ensembl",
+    data_container_type = "anndata",
+    store = store,
+    tier = sliced_sample_tbl |> pull(tier),
+    computing_resources = list(
+      crew_controller_slurm(
+        name = "tier_1",
+        script_lines = "#SBATCH --mem 35G",
+        slurm_cpus_per_task = 1,
+        workers = 200,
+        tasks_max = 1,
+        verbose = T
+      ),
+
+      crew_controller_slurm(
+        name = "tier_2",
+        script_lines = "#SBATCH --mem 60G",
+        slurm_cpus_per_task = 1,
+        workers = 50,
+        tasks_max = 1,
+        verbose = T
+      ),
+      crew_controller_slurm(
+        name = "tier_3",
+        script_lines = "#SBATCH --mem 90G",
+        slurm_cpus_per_task = 1,
+        workers = 25,
+        tasks_max = 1,
+        verbose = T
+      ),
+      crew_controller_slurm(
+        name = "tier_4",
+        script_lines = "#SBATCH --mem 100G",
+        slurm_cpus_per_task = 1,
+        workers = 14,
+        tasks_max = 1,
+        verbose = T
+      )
+    )
+
+  ) |>
+  tranform_assay(fx = sliced_sample_tbl |>
+                   pull(transformation_function),
+                 target_output = "sce_transformed") |>
+
+  # Remove empty outliers based on RNA count threshold per cell
+  remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>
+
+  # Remove dead cells
+  remove_dead_scuttle(target_input = "sce_transformed") |>
+
+  # Score cell cycle
+  score_cell_cycle_seurat(target_input = "sce_transformed") |>
+
+  # Remove doublets
+  remove_doublets_scDblFinder(target_input = "sce_transformed") |>
+
+  # Annotation
+  annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref") |>
+
+  normalise_abundance_seurat_SCT(
+    factors_to_regress = c("subsets_Mito_percent", "subsets_Ribo_percent", "G2M.Score"),
+    target_input = "sce_transformed"
+  )
+
+