Skip to content

fix ks_statistic metrics #8

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/methods/scdesign3_nb/config.vsh.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ engines:
setup:
- type: r
github:
- SONGDONGYUAN1994/scDesign3
- SONGDONGYUAN1994/scDesign3
url:
- https://cran.r-project.org/src/contrib/Archive/reticulate/reticulate_1.40.0.tar.gz

Expand Down
160 changes: 129 additions & 31 deletions src/metrics/ks_statistic_gene_cell/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,20 @@ library(Matrix)
library(matrixStats)

## VIASH START
dir <- "work/66/65ddd5a686b4c712172ff7aa70cce8/_viash_par"
par <- list(
input_spatial_dataset = "resources_test/spatialsimbench_mobnew/dataset_sp.h5ad",
input_singlecell_dataset = "resources_test/spatialsimbench_mobnew/dataset_sc.h5ad",
input_simulated_dataset = "resources_test/spatialsimbench_mobnew/simulated_dataset.h5ad",
input_spatial_dataset = paste0(
dir,
"/input_spatial_dataset_1/dataset_sp.h5ad"
),
input_singlecell_dataset = paste0(
dir,
"/input_singlecell_dataset_1/dataset_sc.h5ad"
),
input_simulated_dataset = paste0(
dir,
"/input_simulated_dataset_1/spatialsimbench_mobnew.negative_normal.generate_sim_spatialcluster.output_sp.h5ad"
),
output = "output.h5ad"
)
meta <- list(
Expand All @@ -20,88 +30,177 @@ meta <- list(

cat("Reading input files\n")
input_spatial_dataset <- anndata::read_h5ad(par[["input_spatial_dataset"]])
input_singlecell_dataset <- anndata::read_h5ad(par[["input_singlecell_dataset"]])
input_singlecell_dataset <- anndata::read_h5ad(par[[
"input_singlecell_dataset"
]])
input_simulated_dataset <- anndata::read_h5ad(par[["input_simulated_dataset"]])

real_counts <- input_spatial_dataset$layers[["counts"]]
sim_counts <- input_simulated_dataset$layers[["counts"]]

try_kde_test <- function(x1, x2) {
tryCatch(
{
ks::kde.test(x1 = x1, x2 = x2)
},
error = function(e) {
warning(
"Caught error in ks::kde.test: ",
e$message,
"\n\nTrying again with some random noise added to the vectors."
)
x1_noise <- stats::runif(length(x1), -1e-8, 1e-8)
x2_noise <- stats::runif(length(x2), -1e-8, 1e-8)
ks::kde.test(x1 = x1 + x1_noise, x2 = x2 + x2_noise)
}
)
}

cat("Computing ks statistic of fraction of zeros per gene\n")
frac_zero_real_genes <- colMeans(real_counts == 0)
frac_zero_sim_genes <- colMeans(sim_counts == 0)
ks_statistic_frac_zero_genes <- ks::kde.test(x1 = frac_zero_real_genes, x2 = frac_zero_sim_genes)
ks_statistic_frac_zero_genes <- try_kde_test(
x1 = frac_zero_real_genes,
x2 = frac_zero_sim_genes
)

cat("Computing ks statistic of fraction of zeros per cell\n")
frac_zero_real_cells <- rowMeans(real_counts == 0)
frac_zero_sim_cells <- rowMeans(sim_counts == 0)
ks_statistic_frac_zero_cells <- ks::kde.test(x1 = frac_zero_real_cells, x2 = frac_zero_sim_cells)
ks_statistic_frac_zero_cells <- try_kde_test(
x1 = frac_zero_real_cells,
x2 = frac_zero_sim_cells
)

cat("Computing ks statistic of the library size\n")
lib_size_real_cells <- log1p(rowSums(real_counts))
lib_size_sim_cells <- log1p(rowSums(sim_counts))
ks_statistic_lib_size_cells <- ks::kde.test(x1 = lib_size_real_cells, x2 = lib_size_sim_cells)
ks_statistic_lib_size_cells <- try_kde_test(
x1 = lib_size_real_cells,
x2 = lib_size_sim_cells
)

cat("Computing ks statistic of the effective library size\n")
efflib_size_real_cells <- log1p(rowSums(real_counts))
efflib_size_sim_cells <- log1p(rowSums(sim_counts))
ks_statistic_efflib_size_cells <- ks::kde.test(x1 = efflib_size_real_cells, x2 = efflib_size_sim_cells)
ks_statistic_efflib_size_cells <- try_kde_test(
x1 = efflib_size_real_cells,
x2 = efflib_size_sim_cells
)

cat("Computing ks statistic of TMM\n")
real_dge <- edgeR::DGEList(counts = Matrix::t(real_counts))
sim_dge <- edgeR::DGEList(counts = Matrix::t(sim_counts))
tmm_real_cells <- edgeR::calcNormFactors(real_dge, method = "TMM")$samples$norm.factors
tmm_sim_cells <- edgeR::calcNormFactors(sim_dge, method = "TMM")$samples$norm.factors
ks_statistic_tmm_cells <- ks::kde.test(x1 = tmm_real_cells, x2 = tmm_sim_cells)
tmm_real_cells <- edgeR::calcNormFactors(
real_dge,
method = "TMM"
)$samples$norm.factors
tmm_sim_cells <- edgeR::calcNormFactors(
sim_dge,
method = "TMM"
)$samples$norm.factors
ks_statistic_tmm_cells <- try_kde_test(x1 = tmm_real_cells, x2 = tmm_sim_cells)

cat("Computing ks statistic of the cell-level scaled variance\n")
scaled_var_real_cells <- scale(sparseMatrixStats::colVars(Matrix::t(real_counts)))
scaled_var_real_cells <- scale(sparseMatrixStats::colVars(Matrix::t(
real_counts
)))
scaled_var_sim_cells <- scale(sparseMatrixStats::colVars(Matrix::t(sim_counts)))
ks_statistic_scaled_var_cells <- ks::kde.test(x1 = as.numeric(scaled_var_sim_cells), x2 = as.numeric(scaled_var_sim_cells))
ks_statistic_scaled_var_cells <- try_kde_test(
x1 = as.numeric(scaled_var_sim_cells),
x2 = as.numeric(scaled_var_sim_cells)
)

cat("Computing ks statistic of the cell-level scaled mean\n")
scaled_mean_real_cells <- scale(colMeans(Matrix::t(real_counts)))
scaled_mean_sim_cells <- scale(colMeans(Matrix::t(sim_counts)))
ks_statistic_scaled_mean_cells <- ks::kde.test(x1 = as.numeric(scaled_mean_sim_cells), x2 = as.numeric(scaled_mean_sim_cells))
ks_statistic_scaled_mean_cells <- try_kde_test(
x1 = as.numeric(scaled_mean_sim_cells),
x2 = as.numeric(scaled_mean_sim_cells)
)

cat("Computing ks statistic of the library size vs fraction of zeros per cell\n")
lib_fraczero_real_cells <- data.frame(lib = lib_size_real_cells, fraczero = frac_zero_real_cells)
lib_fraczero_sim_cells <- data.frame(lib = lib_size_sim_cells, fraczero = frac_zero_sim_cells)
ks_statistic_lib_fraczero_cells <- ks::kde.test(x1 = lib_fraczero_real_cells, x2 = lib_fraczero_sim_cells)
cat(
"Computing ks statistic of the library size vs fraction of zeros per cell\n"
)
lib_fraczero_real_cells <- data.frame(
lib = lib_size_real_cells,
fraczero = frac_zero_real_cells
)
lib_fraczero_sim_cells <- data.frame(
lib = lib_size_sim_cells,
fraczero = frac_zero_sim_cells
)
ks_statistic_lib_fraczero_cells <- try_kde_test(
x1 = lib_fraczero_real_cells,
x2 = lib_fraczero_sim_cells
)

cat("Computing ks statistic of the sample Pearson correlation\n")
# pearson_real_cells <- reshape2::melt(cor(as.matrix(Matrix::t(real_counts)), method = "pearson"))
pearson_real_cells <- proxyC::simil(real_counts, method = "correlation")
# pearson_sim_cells <- reshape2::melt(cor(as.matrix(Matrix::t(sim_counts)), method = "pearson"))
pearson_sim_cells <- proxyC::simil(sim_counts, method = "correlation")

ks_statistic_pearson_cells <- ks::kde.test(x1 = sample(as.numeric(pearson_real_cells), 10000), x2 = sample(as.numeric(pearson_sim_cells), 10000))
ks_statistic_pearson_cells <- try_kde_test(
x1 = sample(as.numeric(pearson_real_cells), 10000),
x2 = sample(as.numeric(pearson_sim_cells), 10000)
)

cat("Computing ks statistic of the gene-level scaled variance\n")
scaled_var_real_genes <- scale(sparseMatrixStats::colVars(real_counts))
scaled_var_sim_genes <- scale(sparseMatrixStats::colVars(sim_counts))
ks_statistic_scaled_var_genes <- ks::kde.test(x1 = as.numeric(scaled_var_sim_genes), x2 = as.numeric(scaled_var_sim_genes))
ks_statistic_scaled_var_genes <- try_kde_test(
x1 = as.numeric(scaled_var_sim_genes),
x2 = as.numeric(scaled_var_sim_genes)
)

cat("Computing ks statistic of the gene-level scaled mean\n")
scaled_mean_real_genes <- scale(colMeans(real_counts))
scaled_mean_sim_genes <- scale(colMeans(sim_counts))
ks_statistic_scaled_mean_genes <- ks::kde.test(x1 = as.numeric(scaled_mean_real_genes), x2 = as.numeric(scaled_mean_sim_genes))
ks_statistic_scaled_mean_genes <- try_kde_test(
x1 = as.numeric(scaled_mean_real_genes),
x2 = as.numeric(scaled_mean_sim_genes)
)

cat("Computing ks statistic of the gene Pearson correlation\n")
# pearson_real_genes <- reshape2::melt(cor(as.matrix(real_counts), method = "pearson"))
pearson_real_genes <- proxyC::simil(real_counts, method = "correlation")
# pearson_sim_genes <- reshape2::melt(cor(as.matrix(sim_counts), method = "pearson"))
pearson_sim_genes <- proxyC::simil(sim_counts, method = "correlation")
ks_statistic_pearson_genes <- ks::kde.test(x1 = sample(as.numeric(pearson_real_genes), 10000), x2 = sample(as.numeric(pearson_sim_genes), 10000))
ks_statistic_pearson_genes <- try_kde_test(
x1 = sample(as.numeric(pearson_real_genes), 10000),
x2 = sample(as.numeric(pearson_sim_genes), 10000)
)

cat("Computing ks statistic of the mean expression vs variance expression\n")
mean_var_real_genes <- data.frame(mean = colMeans(real_counts), var = sparseMatrixStats::colVars(real_counts))
mean_var_sim_genes <- data.frame(mean = colMeans(sim_counts), var = sparseMatrixStats::colVars(sim_counts))
ks_statistic_mean_var_genes <- ks::kde.test(x1 = mean_var_real_genes, x2 = mean_var_sim_genes)
mean_var_real_genes <- data.frame(
mean = colMeans(real_counts),
var = sparseMatrixStats::colVars(real_counts)
)
mean_var_sim_genes <- data.frame(
mean = colMeans(sim_counts),
var = sparseMatrixStats::colVars(sim_counts)
)
ks_statistic_mean_var_genes <- try_kde_test(
x1 = mean_var_real_genes,
x2 = mean_var_sim_genes
)

cat("Computing ks statistic of the mean expression vs fraction of zeros per gene\n")
mean_fraczero_real_genes <- data.frame(mean = colMeans(real_counts), fraczero = frac_zero_real_genes)
mean_fraczero_sim_genes <- data.frame(mean = colMeans(sim_counts), fraczero = frac_zero_sim_genes)
ks_statistic_mean_fraczero_genes <- ks::kde.test(x1 = mean_fraczero_real_genes, x2 = mean_fraczero_sim_genes)
cat(
"Computing ks statistic of the mean expression vs fraction of zeros per gene\n"
)
mean_fraczero_real_genes <- data.frame(
mean = colMeans(real_counts),
fraczero = frac_zero_real_genes
)
mean_fraczero_sim_genes <- data.frame(
mean = colMeans(sim_counts),
fraczero = frac_zero_sim_genes
)
ks_statistic_mean_fraczero_genes <- try_kde_test(
x1 = mean_fraczero_real_genes,
x2 = mean_fraczero_sim_genes
)

cat("Combining metric values\n")
uns_metric_ids <- c(
Expand Down Expand Up @@ -134,7 +233,6 @@ uns_metric_ids <- c(
"ks_statistic_pearson_genes_tstat",
"ks_statistic_mean_var_genes_tstat",
"ks_statistic_mean_fraczero_genes_tstat"

)
uns_metric_values <- c(
ks_statistic_frac_zero_genes$zstat,
Expand All @@ -151,7 +249,7 @@ uns_metric_values <- c(
ks_statistic_pearson_genes$zstat,
ks_statistic_mean_var_genes$zstat,
ks_statistic_mean_fraczero_genes$zstat,

ks_statistic_frac_zero_genes$tstat,
ks_statistic_frac_zero_cells$tstat,
ks_statistic_lib_size_cells$tstat,
Expand Down
91 changes: 74 additions & 17 deletions src/metrics/ks_statistic_sc_features/script.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,33 @@ requireNamespace("SingleCellExperiment", quietly = TRUE)
requireNamespace("SummarizedExperiment", quietly = TRUE)


packages <- c("SingleCellExperiment", "SummarizedExperiment",
"concaveman", "sp", "Matrix", "methods",
"ggplot2", "ggcorrplot", "MuSiC", "fields",
"MCMCpack", "dplyr", "sf", "RANN", "stats",
"reshape2", "RColorBrewer", "scatterpie",
"grDevices", "nnls", "pbmcapply", "spatstat",
"gtools", "RcppML", "NMF")
packages <- c(
"SingleCellExperiment",
"SummarizedExperiment",
"concaveman",
"sp",
"Matrix",
"methods",
"ggplot2",
"ggcorrplot",
"MuSiC",
"fields",
"MCMCpack",
"dplyr",
"sf",
"RANN",
"stats",
"reshape2",
"RColorBrewer",
"scatterpie",
"grDevices",
"nnls",
"pbmcapply",
"spatstat",
"gtools",
"RcppML",
"NMF"
)

# Load each package
lapply(packages, library, character.only = TRUE)
Expand Down Expand Up @@ -43,8 +63,12 @@ real_log_count <- t(input_real_sp$layers[["logcounts"]])
real_prob_matrix <- input_real_sp$obsm[["celltype_proportions"]]
colnames(real_prob_matrix) <- paste0("ct", seq_len(ncol(real_prob_matrix)))

sim_log_count <- scuttle::normalizeCounts(t(input_simulated_sp$layers[["counts"]]))
real_log_count <- real_log_count[ , colnames(real_log_count) %in% rownames(real_prob_matrix)]
sim_log_count <- scuttle::normalizeCounts(t(input_simulated_sp$layers[[
"counts"
]]))
real_log_count <- real_log_count[,
colnames(real_log_count) %in% rownames(real_prob_matrix)
]


sim_sce <- scater::logNormCounts(SingleCellExperiment::SingleCellExperiment(
Expand All @@ -55,10 +79,31 @@ sim_sce <- scater::logNormCounts(SingleCellExperiment::SingleCellExperiment(

sim_log_count <- SummarizedExperiment::assay(sim_sce, "logcounts")

# helper function
try_kde_test <- function(x1, x2) {
tryCatch(
{
ks::kde.test(x1 = x1, x2 = x2)
},
error = function(e) {
warning(
"Caught error in ks::kde.test: ",
e$message,
"\n\nTrying again with some random noise added to the vectors."
)
x1_noise <- stats::runif(length(x1), -1e-8, 1e-8)
x2_noise <- stats::runif(length(x2), -1e-8, 1e-8)
ks::kde.test(x1 = x1 + x1_noise, x2 = x2 + x2_noise)
}
)
}

# build cell type deconvolution in simulated data ## error
sim_prob_matrix <- CARD_processing(input_real_sp, input_sc)

sim_log_count <- sim_log_count[ , colnames(sim_log_count) %in% rownames(sim_prob_matrix)]
sim_log_count <- sim_log_count[,
colnames(sim_log_count) %in% rownames(sim_prob_matrix)
]

feat_types <- c("L_stats", "celltype_interaction", "nn_correlation", "morans_I")

Expand All @@ -69,7 +114,7 @@ real_scfeatures_result <- scFeatures::scFeatures(
feature_types = feat_types,
type = "spatial_t",
species = sc_species,
spotProbability = t(real_prob_matrix)
spotProbability = t(real_prob_matrix)
)

sim_scfeatures_result <- scFeatures::scFeatures(
Expand All @@ -79,13 +124,25 @@ sim_scfeatures_result <- scFeatures::scFeatures(
feature_types = feat_types,
type = "spatial_t",
species = sc_species,
spotProbability = t(sim_prob_matrix)
spotProbability = t(sim_prob_matrix)
)

ks_statistic_L_stats <- ks::kde.test(x1 = as.numeric(real_scfeatures_result$L_stats), x2 = as.numeric(sim_scfeatures_result$L_stats))
ks_statistic_celltype_interaction <- ks::kde.test(x1 = as.numeric(real_scfeatures_result$celltype_interaction), x2 = as.numeric(sim_scfeatures_result$celltype_interaction))
ks_statistic_nn_correlation <- ks::kde.test(x1 = as.numeric(real_scfeatures_result$nn_correlation), x2 = as.numeric(sim_scfeatures_result$nn_correlation))
ks_statistic_morans_I <- ks::kde.test(x1 = as.numeric(real_scfeatures_result$morans_I), x2 = as.numeric(sim_scfeatures_result$morans_I))
ks_statistic_L_stats <- try_kde_test(
x1 = as.numeric(real_scfeatures_result$L_stats),
x2 = as.numeric(sim_scfeatures_result$L_stats)
)
ks_statistic_celltype_interaction <- try_kde_test(
x1 = as.numeric(real_scfeatures_result$celltype_interaction),
x2 = as.numeric(sim_scfeatures_result$celltype_interaction)
)
ks_statistic_nn_correlation <- try_kde_test(
x1 = as.numeric(real_scfeatures_result$nn_correlation),
x2 = as.numeric(sim_scfeatures_result$nn_correlation)
)
ks_statistic_morans_I <- try_kde_test(
x1 = as.numeric(real_scfeatures_result$morans_I),
x2 = as.numeric(sim_scfeatures_result$morans_I)
)

cat("Combining metric values\n")
uns_metric_ids <- c(
Expand All @@ -112,4 +169,4 @@ output <- anndata::AnnData(
),
shape = c(0L, 0L)
)
output$write_h5ad(par[["output"]], compression = "gzip")
output$write_h5ad(par[["output"]], compression = "gzip")