Skip to content

Commit

Permalink
Merge pull request #152 from myushen/master
Browse files Browse the repository at this point in the history
Add prostate atlas
  • Loading branch information
stemangiola authored Oct 24, 2024
2 parents edd72cf + dbc4bc8 commit 2fed3a7
Show file tree
Hide file tree
Showing 7 changed files with 461 additions and 6 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: CuratedAtlasQueryR
Title: Queries the Human Cell Atlas
Version: 1.3.7
Version: 1.4.7
Authors@R: c(
person(
"Stefano",
Expand Down Expand Up @@ -126,7 +126,7 @@ biocViews:
Transcription,
Transcriptomics
Encoding: UTF-8
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
LazyDataCompression: xz
URL: https://github.com/stemangiola/CuratedAtlasQueryR
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
Expand Down
6 changes: 4 additions & 2 deletions R/metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ cache <- rlang::env(
)

#' Returns the URLs for all metadata files
#' @param databases A character vector specifying the names of the metadata files.
#' Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", "prostate.0.1.0.parquet")
#' @export
#' @return A character vector of URLs to parquet files to download
#' @examples
Expand All @@ -20,7 +21,8 @@ cache <- rlang::env(
#' immune system across age, sex and ethnicity." bioRxiv (2023): 2023-06.
#' doi:10.1101/2023.06.08.542671.
#' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3)
get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet",
                                           "prostate.0.1.0.parquet")) {
  # Interpolate each requested file name onto the Nectar object-store base URL.
  # `databases` is vectorised, so one URL is returned per requested file.
  glue::glue(
    "https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}")
}
Expand Down
99 changes: 99 additions & 0 deletions dev/census_accepted_assays.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
id,assay
EFO:0003755,FL-cDNA
EFO:0008640,3'T-fill
EFO:0008641,3’-end-seq
EFO:0008643,3′-Seq
EFO:0008661,Bru-Seq
EFO:0008669,CAGEscan
EFO:0008673,CapSeq
EFO:0008675,CaptureSeq
EFO:0008679,CEL-seq
EFO:0008694,ClickSeq
EFO:0008697,cP-RNA-Seq
EFO:0008708,DeepCAGE
EFO:0008710,Digital RNA
EFO:0008718,DP-Seq
EFO:0008720,DroNc-seq
EFO:0008722,Drop-seq
EFO:0008735,FACS-seq
EFO:0008747,FRISCR
EFO:0008748,FRT-Seq
EFO:0008752,GMUCT 1.0
EFO:0008753,GMUCT 2.0
EFO:0008756,GRO-CAP
EFO:0008763,Hi-SCL
EFO:0008780,inDrop
EFO:0008796,MARS-seq
EFO:0008797,MATQ-seq
EFO:0008824,NanoCAGE
EFO:0008825,Nanogrid RNA-Seq
EFO:0008826,NET-Seq
EFO:0008850,PAS-Seq
EFO:0008859,PEAT
EFO:0008863,PLATE-Seq
EFO:0008868,PRO-cap
EFO:0008869,PRO-seq
EFO:0008877,Quartz-seq
EFO:0008896,RNA-Seq
EFO:0008897,RNAtag-Seq
EFO:0008898,RNET-seq
EFO:0008903,SC3-seq
EFO:0008908,SCI-seq
EFO:0008919,Seq-Well
EFO:0008929,SMA
EFO:0008930,Smart-seq
EFO:0008931,Smart-seq2
EFO:0008937,snDrop-seq
EFO:0008941,sNuc-Seq
EFO:0008945,SPET-seq
EFO:0008953,STRT-seq
EFO:0008954,STRT-seq-2i
EFO:0008956,SUPeR-seq
EFO:0008962,TARDIS
EFO:0008966,TCR Chain Paring
EFO:0008967,TCR-LA-MC PCR
EFO:0008972,TL-seq
EFO:0008974,Tomo-Seq
EFO:0008975,TRAP-Seq
EFO:0008978,TSS Sequencing
EFO:0008980,UMI Method
EFO:0009309,Div-Seq
EFO:0009899,10x 3' v2
EFO:0009900,10x 5' v2
EFO:0009901,10x 3' v1
EFO:0009919,SPLiT-seq
EFO:0009922,10x 3' v3
EFO:0009991,Nuc-Seq
EFO:0010003,RASL-seq
EFO:0010004,SCRB-seq
EFO:0010010,CEL-seq2
EFO:0010022,Smart-3Seq
EFO:0010034,Cappable-Seq
EFO:0010041,Nascent-Seq
EFO:0010058,Fluidigm C1-based library preparation
EFO:0010184,Smart-like
EFO:0010550,sci-RNA-seq
EFO:0010713,10x immune profiling
EFO:0010714,10x TCR enrichment
EFO:0010715,10x Ig enrichment
EFO:0010964,barcoded plate-based single cell RNA-seq
EFO:0011025,10x 5' v1
EFO:0022396,TruSeq
EFO:0022488,Smart-seq3
EFO:0022490,ScaleBio single cell RNA sequencing
EFO:0030002,microwell-seq
EFO:0030003,10x 3' transcription profiling
EFO:0030004,10x 5' transcription profiling
EFO:0030019,Seq-Well S3
EFO:0030021,Nx1-seq
EFO:0030028,sci-RNA-seq3
EFO:0030030,Quant-seq
EFO:0030031,SCOPE-chip
EFO:0030061,mcSCRB-seq
EFO:0030074,SORT-seq
EFO:0030078,droplet-based single-cell RNA library preparation
EFO:0700003,BD Rhapsody Whole Transcriptome Analysis
EFO:0700004,BD Rhapsody Targeted mRNA
EFO:0700010,TruDrop
EFO:0700011,GEXSCOPE technology
EFO:0700016,Smart-seq v4
206 changes: 206 additions & 0 deletions dev/execute_hpcell_on_census_and_defining_data_tranformation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
library(glue)
library(dplyr)
library(arrow)
library(SingleCellExperiment)
library(tibble)
library(SummarizedExperiment)
library(glue)
library(purrr)
library(zellkonverter)
library(tidyr)
library(ggplot2)
library(plotly)
library(targets)
library(stringr)
library(CuratedAtlasQueryR)
library(fs)
library(HPCell)
library(crew.cluster)
# --- Locate per-sample AnnData files and annotate each sample with a size tier ---

# Directory holding one .h5ad file per sample (census data split by sample id).
directory = "/home/users/allstaff/shen.m/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
# NOTE(review): `sample_anndata` is not referenced again in this script — confirm it is needed.
sample_anndata <- dir(glue("{directory}"), full.names = T)
# One row per downloaded sample; `list_length` holds the cell count per sample.
downloaded_samples_tbl <- read_parquet("/home/users/allstaff/shen.m/scratch/Census_rerun/census_samples_to_download_groups.parquet")
# Derive, per sample: an integer cell count, the path of its .h5ad file, and a
# "tier" bucketing by cell number (tier_1 < 500 cells ... tier_4 >= 10000 cells).
# The tier is used later to choose SLURM memory/worker settings.
downloaded_samples_tbl <- downloaded_samples_tbl |>
  rename(cell_number = list_length) |>
  mutate(cell_number = cell_number |> as.integer(),
         file_name = glue("{directory}{sample_2}.h5ad") |> as.character(),
         tier = case_when(
           cell_number < 500 ~ "tier_1", cell_number >= 500 &
             cell_number < 1000 ~ "tier_2", cell_number >= 1000 &
             cell_number < 10000 ~ "tier_3", cell_number >= 10000 ~ "tier_4"
         ))

# targets store holding previously computed per-dataset sample metadata.
result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024"
sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
# Attach each dataset's normalization columns (x_normalization, ...) from the
# curated-atlas metadata so the inverse transform can be chosen per dataset.
sample_tbl = downloaded_samples_tbl |> left_join(get_metadata() |> select(dataset_id, contains("norm")) |>
                                                   distinct() |> filter(!is.na(x_normalization)) |>
                                                   as_tibble(), by = "dataset_id")


# --- Decide, per sample, which inverse transformation restores raw-like counts ---
# Step 1 (`transform_method`): normalise the free-text `x_normalization` values
# reported by data submitters into a small set of canonical labels, falling back
# to `x_approximate_distribution` when `x_normalization` is missing.
# Step 2 (`method_to_apply`): map each canonical label to the inverse function
# to apply to the expression matrix ("exp", "expm1", or "identity").
# Step 3 (`comment`): free-text caveats for cases needing extra care downstream.
sample_tbl <- sample_tbl |> left_join(sample_meta, by = "dataset_id") |> distinct(file_name, tier, cell_number, dataset_id, sample_2,
                                                                                 x_normalization, x_approximate_distribution) |>
  mutate(transform_method = case_when(str_like(x_normalization, "C%") ~ "log",
                                      x_normalization == "none" ~ "log",
                                      x_normalization == "normalized" ~ "log",
                                      is.na(x_normalization) & is.na(x_approximate_distribution) ~ "log",
                                      is.na(x_normalization) & x_approximate_distribution == "NORMAL" ~ "NORMAL",
                                      is.na(x_normalization) & x_approximate_distribution == "COUNT" ~ "COUNT",
                                      # Any value containing "canpy" (i.e. Scanpy-style strings) is log1p-transformed.
                                      str_like(x_normalization, "%canpy%") ~ "log1p",
                                      # Otherwise keep the submitter's original label; handled below.
                                      TRUE ~ x_normalization)) |>

  mutate(method_to_apply = case_when(transform_method %in% c("log","LogNormalization","LogNormalize","log-normalization") ~ "exp",
                                     is.na(x_normalization) & is.na(x_approximate_distribution) ~ "exp",
                                     str_like(transform_method, "Counts%") ~ "exp",
                                     str_like(transform_method, "%log2%") ~ "exp",
                                     transform_method %in% c("log1p", "log1p, base e", "Scanpy",
                                                             "scanpy.api.pp.normalize_per_cell method, scaling factor 10000") ~ "expm1",
                                     # NOTE(review): "log1p, base 2" is inverted with expm1 (base e) — confirm intended.
                                     transform_method == "log1p, base 2" ~ "expm1",
                                     transform_method == "NORMAL" ~ "exp",
                                     transform_method == "COUNT" ~ "identity",
                                     is.na(transform_method) ~ "identity"
  ) ) |>
  mutate(comment = case_when(str_like(x_normalization, "Counts%") ~ "a checkpoint for max value of Assay must <= 50",
                             is.na(x_normalization) & is.na(x_approximate_distribution) ~ "round negative value to 0",
                             x_normalization == "normalized" ~ "round negative value to 0"
  ))

# --- Build one concrete transformation closure per sample ---
# The template function below uses the placeholder name `transform_method`;
# `substitute()` rewrites that placeholder into the per-sample function name
# held in `.x` (e.g. exp, expm1, identity), and `eval()` turns the rewritten
# expression back into a callable function. The result is stored as a
# list-column so each sample carries its own inverse-transform function.
# NOTE(review): `.x` is also referenced inside the template body
# (`if (.x != "identity")`); that reference is NOT substituted and relies on
# the purrr lambda environment being captured by the closure — confirm.
sample_tbl <- sample_tbl |> mutate(transformation_function = map(
  method_to_apply,
  ~ ( function(data) {
    # Work on the first assay of the SingleCellExperiment/SummarizedExperiment.
    assay_name <- data@assays |> names() |> magrittr::extract2(1)
    counts <- assay(data, assay_name)
    # Shift the matrix so its density mode is non-negative before inverting.
    density_est <- counts |> as.matrix() |> density()
    mode_value <- density_est$x[which.max(density_est$y)]
    if (mode_value < 0 ) counts <- counts + abs(mode_value)

    # Scale max counts to 20 to avoid any downstream failure
    # (exp/expm1 of large values would overflow).
    if (.x != "identity" && (max(counts) > 20)) {
      scale_factor = 20 / max(counts)
      counts <- counts * scale_factor}

    # Placeholder call — replaced with the concrete function via substitute() below.
    counts <- transform_method(counts)
    # round counts to avoid potential subtraction error due to different digits print out
    counts <- counts |> round(5)
    # Re-centre so the most common value (assumed background/zero level) is 0.
    majority_gene_counts = names(which.max(table(as.vector(counts)))) |> as.numeric()
    if (majority_gene_counts != 0) {
      counts <- counts - majority_gene_counts
    }

    # Avoid downstream failures from negative counts: clamp to zero.
    # (min is probed on at most the first 10000 columns for speed.)
    if((counts[,1:min(10000, ncol(counts))] |> min()) < 0)
      counts[counts < 0] <- 0

    # Assign counts back to data
    assay(data, assay_name) <- counts

    col_sums <- colSums(counts)
    # Drop all zero cells
    data <- data[, col_sums > 0]

    # Avoid downstream binding error
    rowData(data) = NULL

    data

  }) |>
    # Meta programming, replacing the transformation programmatically
    substitute( env = list(transform_method = as.name(.x))) |>
    # Evaluate back to a working function
    eval()
))

# --- Select the batch of samples for this run ---
# The assembled table was cached once with saveRDS; subsequent runs reload it.
#sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")

# Set the parent directory where the subdirectories will be created
# parent_dir <- "~/scratch/Census_rerun/"
#
# # Directory names to create
# dir_names <- paste0("run", 1:25)
#
# # Full paths of the directories
# full_dir_paths <- file.path(parent_dir, dir_names)
#
# # Create each directory if it does not exist
# for (dir_path in full_dir_paths) {
#   if (!dir_exists(dir_path)) {
#     dir_create(dir_path)
#   }
# }

# Run 1000 samples per run. Save log and result in the corresponding store
# NOTE(review): `store`/`slice(2001:3000)` are edited by hand per batch
# (this is run 3); setwd() changes the working directory for the whole session.
store = "~/scratch/Census_rerun/run3/"
setwd(glue("{store}"))
sliced_sample_tbl = sample_tbl |> slice(2001:3000) |> select(file_name, tier, cell_number, dataset_id,
                                                             sample_2, transformation_function)

# Enable sample_names.rds to store sample names for the input
# Named character vector: values are .h5ad paths, names are sample ids (sample_2).
sample_names <- sliced_sample_tbl |> pull(file_name) |> set_names(sliced_sample_tbl |> pull(sample_2))

# --- HPCell pipeline over the selected samples ---
# Initialise the HPCell/targets pipeline on SLURM: one crew controller per
# size tier, with memory and worker counts scaled to expected sample size
# (tier_1 = smallest samples, many cheap workers; tier_4 = largest, few big ones).
sample_names |>
  initialise_hpc(
    gene_nomenclature = "ensembl",
    data_container_type = "anndata",
    store = store,
    tier = sliced_sample_tbl |> pull(tier),
    computing_resources = list(
      crew_controller_slurm(
        name = "tier_1",
        script_lines = "#SBATCH --mem 35G",
        slurm_cpus_per_task = 1,
        workers = 200,
        tasks_max = 1,
        verbose = T
      ),

      crew_controller_slurm(
        name = "tier_2",
        script_lines = "#SBATCH --mem 60G",
        slurm_cpus_per_task = 1,
        workers = 50,
        tasks_max = 1,
        verbose = T
      ),
      crew_controller_slurm(
        name = "tier_3",
        script_lines = "#SBATCH --mem 90G",
        slurm_cpus_per_task = 1,
        workers = 25,
        tasks_max = 1,
        verbose = T
      ),
      crew_controller_slurm(
        name = "tier_4",
        script_lines = "#SBATCH --mem 100G",
        slurm_cpus_per_task = 1,
        workers = 14,
        tasks_max = 1,
        verbose = T
      )
    )

  ) |>
  # Apply each sample's inverse-transform closure (built above) to its assay.
  # NOTE(review): `tranform_assay` (sic) — presumably HPCell's exported name; confirm.
  tranform_assay(fx = sliced_sample_tbl |>
                   pull(transformation_function),
                 target_output = "sce_transformed") |>

  # Remove empty outliers based on RNA count threshold per cell
  remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>

  # Remove dead cells
  remove_dead_scuttle(target_input = "sce_transformed") |>

  # Score cell cycle
  score_cell_cycle_seurat(target_input = "sce_transformed") |>

  # Remove doublets
  remove_doublets_scDblFinder(target_input = "sce_transformed") |>

  # Annotation
  annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref") |>

  # Normalise abundances, regressing out mito/ribo content and cell-cycle score.
  normalise_abundance_seurat_SCT(
    factors_to_regress = c("subsets_Mito_percent", "subsets_Ribo_percent", "G2M.Score"),
    target_input = "sce_transformed"
  )

Loading

0 comments on commit 2fed3a7

Please sign in to comment.