Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add prostate atlas #152

Merged
merged 4 commits into from
Oct 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: CuratedAtlasQueryR
Title: Queries the Human Cell Atlas
Version: 1.3.7
Version: 1.4.7
Authors@R: c(
person(
"Stefano",
Expand Down Expand Up @@ -126,7 +126,7 @@ biocViews:
Transcription,
Transcriptomics
Encoding: UTF-8
RoxygenNote: 7.3.1
RoxygenNote: 7.3.2
LazyDataCompression: xz
URL: https://github.com/stemangiola/CuratedAtlasQueryR
BugReports: https://github.com/stemangiola/CuratedAtlasQueryR/issues
Expand Down
6 changes: 4 additions & 2 deletions R/metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ cache <- rlang::env(
)

#' Returns the URLs for all metadata files
#' @param databases A character vector specifying the names of the metadata files. Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")
#' @param databases A character vector specifying the names of the metadata files.
#' Default is c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet", "prostate.0.1.0.parquet")
#' @export
#' @return A character vector of URLs to parquet files to download
#' @examples
Expand All @@ -20,7 +21,8 @@ cache <- rlang::env(
#' immune system across age, sex and ethnicity." bioRxiv (2023): 2023-06.
#' doi:10.1101/2023.06.08.542671.
#' @source [Mangiola et al.,2023](https://www.biorxiv.org/content/10.1101/2023.06.08.542671v3)
get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet")) {
get_database_url <- function(databases = c("metadata.0.2.3.parquet", "fibrosis.0.2.3.parquet",
"prostate.0.1.0.parquet")) {
glue::glue(
"https://object-store.rc.nectar.org.au/v1/AUTH_06d6e008e3e642da99d806ba3ea629c5/metadata/{databases}")
}
Expand Down
99 changes: 99 additions & 0 deletions dev/census_accepted_assays.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
id,assay
EFO:0003755,FL-cDNA
EFO:0008640,3'T-fill
EFO:0008641,3’-end-seq
EFO:0008643,3′-Seq
EFO:0008661,Bru-Seq
EFO:0008669,CAGEscan
EFO:0008673,CapSeq
EFO:0008675,CaptureSeq
EFO:0008679,CEL-seq
EFO:0008694,ClickSeq
EFO:0008697,cP-RNA-Seq
EFO:0008708,DeepCAGE
EFO:0008710,Digital RNA
EFO:0008718,DP-Seq
EFO:0008720,DroNc-seq
EFO:0008722,Drop-seq
EFO:0008735,FACS-seq
EFO:0008747,FRISCR
EFO:0008748,FRT-Seq
EFO:0008752,GMUCT 1.0
EFO:0008753,GMUCT 2.0
EFO:0008756,GRO-CAP
EFO:0008763,Hi-SCL
EFO:0008780,inDrop
EFO:0008796,MARS-seq
EFO:0008797,MATQ-seq
EFO:0008824,NanoCAGE
EFO:0008825,Nanogrid RNA-Seq
EFO:0008826,NET-Seq
EFO:0008850,PAS-Seq
EFO:0008859,PEAT
EFO:0008863,PLATE-Seq
EFO:0008868,PRO-cap
EFO:0008869,PRO-seq
EFO:0008877,Quartz-seq
EFO:0008896,RNA-Seq
EFO:0008897,RNAtag-Seq
EFO:0008898,RNET-seq
EFO:0008903,SC3-seq
EFO:0008908,SCI-seq
EFO:0008919,Seq-Well
EFO:0008929,SMA
EFO:0008930,Smart-seq
EFO:0008931,Smart-seq2
EFO:0008937,snDrop-seq
EFO:0008941,sNuc-Seq
EFO:0008945,SPET-seq
EFO:0008953,STRT-seq
EFO:0008954,STRT-seq-2i
EFO:0008956,SUPeR-seq
EFO:0008962,TARDIS
EFO:0008966,TCR Chain Paring
EFO:0008967,TCR-LA-MC PCR
EFO:0008972,TL-seq
EFO:0008974,Tomo-Seq
EFO:0008975,TRAP-Seq
EFO:0008978,TSS Sequencing
EFO:0008980,UMI Method
EFO:0009309,Div-Seq
EFO:0009899,10x 3' v2
EFO:0009900,10x 5' v2
EFO:0009901,10x 3' v1
EFO:0009919,SPLiT-seq
EFO:0009922,10x 3' v3
EFO:0009991,Nuc-Seq
EFO:0010003,RASL-seq
EFO:0010004,SCRB-seq
EFO:0010010,CEL-seq2
EFO:0010022,Smart-3Seq
EFO:0010034,Cappable-Seq
EFO:0010041,Nascent-Seq
EFO:0010058,Fluidigm C1-based library preparation
EFO:0010184,Smart-like
EFO:0010550,sci-RNA-seq
EFO:0010713,10x immune profiling
EFO:0010714,10x TCR enrichment
EFO:0010715,10x Ig enrichment
EFO:0010964,barcoded plate-based single cell RNA-seq
EFO:0011025,10x 5' v1
EFO:0022396,TruSeq
EFO:0022488,Smart-seq3
EFO:0022490,ScaleBio single cell RNA sequencing
EFO:0030002,microwell-seq
EFO:0030003,10x 3' transcription profiling
EFO:0030004,10x 5' transcription profiling
EFO:0030019,Seq-Well S3
EFO:0030021,Nx1-seq
EFO:0030028,sci-RNA-seq3
EFO:0030030,Quant-seq
EFO:0030031,SCOPE-chip
EFO:0030061,mcSCRB-seq
EFO:0030074,SORT-seq
EFO:0030078,droplet-based single-cell RNA library preparation
EFO:0700003,BD Rhapsody Whole Transcriptome Analysis
EFO:0700004,BD Rhapsody Targeted mRNA
EFO:0700010,TruDrop
EFO:0700011,GEXSCOPE technology
EFO:0700016,Smart-seq v4
206 changes: 206 additions & 0 deletions dev/execute_hpcell_on_census_and_defining_data_tranformation.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,206 @@
library(glue)
library(dplyr)
library(arrow)
library(SingleCellExperiment)
library(tibble)
library(SummarizedExperiment)
library(glue)
library(purrr)
library(zellkonverter)
library(tidyr)
library(ggplot2)
library(plotly)
library(targets)
library(stringr)
library(CuratedAtlasQueryR)
library(fs)
library(HPCell)
library(crew.cluster)
directory = "/home/users/allstaff/shen.m/scratch/Census_rerun/split_h5ad_based_on_sample_id/"
sample_anndata <- dir(glue("{directory}"), full.names = T)
downloaded_samples_tbl <- read_parquet("/home/users/allstaff/shen.m/scratch/Census_rerun/census_samples_to_download_groups.parquet")
downloaded_samples_tbl <- downloaded_samples_tbl |>
rename(cell_number = list_length) |>
mutate(cell_number = cell_number |> as.integer(),
file_name = glue("{directory}{sample_2}.h5ad") |> as.character(),
tier = case_when(
cell_number < 500 ~ "tier_1", cell_number >= 500 &
cell_number < 1000 ~ "tier_2", cell_number >= 1000 &
cell_number < 10000 ~ "tier_3", cell_number >= 10000 ~ "tier_4"
))

result_directory = "/vast/projects/cellxgene_curated/metadata_cellxgenedp_Apr_2024"
sample_meta <- tar_read(metadata_dataset_id_common_sample_columns, store = glue("{result_directory}/_targets"))
sample_tbl = downloaded_samples_tbl |> left_join(get_metadata() |> select(dataset_id, contains("norm")) |>
distinct() |> filter(!is.na(x_normalization)) |>
as_tibble(), by = "dataset_id")


sample_tbl <- sample_tbl |> left_join(sample_meta, by = "dataset_id") |> distinct(file_name, tier, cell_number, dataset_id, sample_2,
x_normalization, x_approximate_distribution) |>
mutate(transform_method = case_when(str_like(x_normalization, "C%") ~ "log",
x_normalization == "none" ~ "log",
x_normalization == "normalized" ~ "log",
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "log",
is.na(x_normalization) & x_approximate_distribution == "NORMAL" ~ "NORMAL",
is.na(x_normalization) & x_approximate_distribution == "COUNT" ~ "COUNT",
str_like(x_normalization, "%canpy%") ~ "log1p",
TRUE ~ x_normalization)) |>

mutate(method_to_apply = case_when(transform_method %in% c("log","LogNormalization","LogNormalize","log-normalization") ~ "exp",
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "exp",
str_like(transform_method, "Counts%") ~ "exp",
str_like(transform_method, "%log2%") ~ "exp",
transform_method %in% c("log1p", "log1p, base e", "Scanpy",
"scanpy.api.pp.normalize_per_cell method, scaling factor 10000") ~ "expm1",
transform_method == "log1p, base 2" ~ "expm1",
transform_method == "NORMAL" ~ "exp",
transform_method == "COUNT" ~ "identity",
is.na(transform_method) ~ "identity"
) ) |>
mutate(comment = case_when(str_like(x_normalization, "Counts%") ~ "a checkpoint for max value of Assay must <= 50",
is.na(x_normalization) & is.na(x_approximate_distribution) ~ "round negative value to 0",
x_normalization == "normalized" ~ "round negative value to 0"
))

sample_tbl <- sample_tbl |> mutate(transformation_function = map(
method_to_apply,
~ ( function(data) {
assay_name <- data@assays |> names() |> magrittr::extract2(1)
counts <- assay(data, assay_name)
density_est <- counts |> as.matrix() |> density()
mode_value <- density_est$x[which.max(density_est$y)]
if (mode_value < 0 ) counts <- counts + abs(mode_value)

# Scale max counts to 20 to avoid any downstream failure
if (.x != "identity" && (max(counts) > 20)) {
scale_factor = 20 / max(counts)
counts <- counts * scale_factor}

counts <- transform_method(counts)
# round counts to avoid potential substraction error due to different digits print out
counts <- counts |> round(5)
majority_gene_counts = names(which.max(table(as.vector(counts)))) |> as.numeric()
if (majority_gene_counts != 0) {
counts <- counts - majority_gene_counts
}

# Avoid downstream failures negative counts
if((counts[,1:min(10000, ncol(counts))] |> min()) < 0)
counts[counts < 0] <- 0

# Assign counts back to data
assay(data, assay_name) <- counts

col_sums <- colSums(counts)
# Drop all zero cells
data <- data[, col_sums > 0]

# Avoid downstream binding error
rowData(data) = NULL

data

}) |>
# Meta programming, replacing the transformation programmatically
substitute( env = list(transform_method = as.name(.x))) |>
# Evaluate back to a working function
eval()
))

#sample_tbl |> saveRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")
sample_tbl <- readRDS("~/scratch/Census_rerun/sample_tbl_input_for_hpcell.rds")

# Set the parent directory where the subdirectories will be created
# parent_dir <- "~/scratch/Census_rerun/"
#
# # Directory names to create
# dir_names <- paste0("run", 1:25)
#
# # Full paths of the directories
# full_dir_paths <- file.path(parent_dir, dir_names)
#
# # Create each directory if it does not exist
# for (dir_path in full_dir_paths) {
# if (!dir_exists(dir_path)) {
# dir_create(dir_path)
# }
# }

# Run 1000 samples per run. Save log and result in the corresponding store
store = "~/scratch/Census_rerun/run3/"
setwd(glue("{store}"))
sliced_sample_tbl = sample_tbl |> slice(2001:3000) |> select(file_name, tier, cell_number, dataset_id,
sample_2, transformation_function)

# Enable sample_names.rds to store sample names for the input
sample_names <- sliced_sample_tbl |> pull(file_name) |> set_names(sliced_sample_tbl |> pull(sample_2))

sample_names |>
initialise_hpc(
gene_nomenclature = "ensembl",
data_container_type = "anndata",
store = store,
tier = sliced_sample_tbl |> pull(tier),
computing_resources = list(
crew_controller_slurm(
name = "tier_1",
script_lines = "#SBATCH --mem 35G",
slurm_cpus_per_task = 1,
workers = 200,
tasks_max = 1,
verbose = T
),

crew_controller_slurm(
name = "tier_2",
script_lines = "#SBATCH --mem 60G",
slurm_cpus_per_task = 1,
workers = 50,
tasks_max = 1,
verbose = T
),
crew_controller_slurm(
name = "tier_3",
script_lines = "#SBATCH --mem 90G",
slurm_cpus_per_task = 1,
workers = 25,
tasks_max = 1,
verbose = T
),
crew_controller_slurm(
name = "tier_4",
script_lines = "#SBATCH --mem 100G",
slurm_cpus_per_task = 1,
workers = 14,
tasks_max = 1,
verbose = T
)
)

) |>
tranform_assay(fx = sliced_sample_tbl |>
pull(transformation_function),
target_output = "sce_transformed") |>

# Remove empty outliers based on RNA count threshold per cell
remove_empty_threshold(target_input = "sce_transformed", RNA_feature_threshold = 200) |>

# Remove dead cells
remove_dead_scuttle(target_input = "sce_transformed") |>

# Score cell cycle
score_cell_cycle_seurat(target_input = "sce_transformed") |>

# Remove doublets
remove_doublets_scDblFinder(target_input = "sce_transformed") |>

# Annotation
annotate_cell_type(target_input = "sce_transformed", azimuth_reference = "pbmcref") |>

normalise_abundance_seurat_SCT(
factors_to_regress = c("subsets_Mito_percent", "subsets_Ribo_percent", "G2M.Score"),
target_input = "sce_transformed"
)


Loading
Loading