diff --git a/src/qc/integration_metrics/config.vsh.yaml b/src/qc/integration_metrics/config.vsh.yaml new file mode 100644 index 00000000000..157f7ff845c --- /dev/null +++ b/src/qc/integration_metrics/config.vsh.yaml @@ -0,0 +1,171 @@ +functionality: + name: integration_metrics + namespace: "qc" + description: | + Calculation integration qc metrics and aggregate scores for bio conservation and batch correction. + The calculated metrics and aggregate scores are stored in the uns column. + All metric calculations are based on the scib library: + - publication: https://doi.org/10.1038/s41592-021-01336-8 + - code: https://github.com/theislab/scib/ + - documentation: https://scib.readthedocs.io/ + + Bio conservation metrics: + - nmi_score: scib.metrics.nmi + - ari_score: scib.metrics.ari + - asw_label: scib.metrics.silhouette + - isolated_label_f1: scib.metrics.isolated_labels + - isolated_label_asw: scib.metrics.isolated_labels + - clisi_graph: scib.metrics.clisi_graph + Note that the scib bio conservation metrics for cell cycle conservation, HVG conservation and trajectory conservation are not currently provided, + as this requires integrated count matrices and pseudo time values to be calculated, which is not currently supported in OpenPipeline. + + Batch correction metrics: + - asw_batch: scib.metrics.silhouette_batch + - pcr_score: scib.metrics.pcr_comparison + - graph_connectivity: scib.metrics.graph_connectivity + - ilisi_graph: scib.metrics.ilisi_graph + - kbet: scib.metrics.kBET + + Aggregate scores: + - avg_bio: mean of calculated bio conservation metrics + - avg_batch: mean of calculated batch correction metrics + - overall_integration_score: weighted mean of bio and batch scores (60/40 ratio) + + authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + type: string + default: "rna" + - name: "--bio_conservation_metrics" + type: string + multiple: true + multiple_sep: ";" + choices: + - nmi + - ari + - asw_label + - isolated_label_f1 + - isolated_label_asw + - clisi_graph + default: nmi;ari;asw_label + description: | + The metrics to be calculated to assess biological conservation. + The provided metrics will be aggregated into a biological conservation score. + - name: "--batch_correction_metrics" + type: string + multiple: true + multiple_sep: ";" + choices: + - asw_batch + - pcr + - graph_connectivity + - ilisi_graph + - kbet + default: asw_batch;pcr;graph_connectivity + description: | + The metrics to be calculated for the batch correction. + The provided metrics will be aggregated into a batch correction score. + + - name: Input Annotations + arguments: + - name: "--obsm_embeddings" + type: string + required: true + example: "X_scGPT" + description: | + The name of the adata.obsm array containing scGPT cell embeddings. + - name: "--obs_batch_label" + type: string + example: "sample" + required: true + description: | + The name of the adata obs column containing the batch labels. + - name: "--obs_cell_label" + type: string + example: "cell_type" + required: true + description: | + The name of the adata obs column containing the cell type labels. + - name: "--obs_cluster" + type: string + example: "cluster" + required: true + description: | + The name of the adata obs column containing the cluster labels. + - name: "--uns_neighbors" + type: string + example: "neighbors" + required: true + description: + The name of the adata uns object containing the neighbors information. + - name: "--obsp_neighbor_connectivities" + type: string + example: "connectivities" + required: true + description: | + The name of the adata obsp object containing the neighbor connectivities. + + + - name: Outputs + arguments: + - name: "--output" + type: file + description: Output h5mu file. + direction: output + example: output.h5mu + - name: "--output_compression" + type: string + description: The compression format to be used on the output h5mu object. + choices: ["gzip", "lzf"] + required: false + example: "gzip" + + resources: + - type: python_script + path: script.py + +platforms: + - type: docker + image: rocker/r2u:22.04 + setup: + - type: apt + packages: + - procps + - libhdf5-dev + - gfortran + - cmake + - libopenblas-dev + - libgeos-dev + - python3-pip + - python3-dev + - python-is-python3 + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml ] + - type: python + packages: + - scib==1.1.5 + - rpy2==3.5.16 + - anndata2ri + - type: r + github: [ theislab/kBET ] + - type: docker + run: | + cd "$(pip show scib | grep Location | cut -d ' ' -f 2)/scib/knn_graph" && \ + g++ -std=c++11 -O3 knn_graph.cpp -o knn_graph.o + + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + - type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/qc/integration_metrics/script.py b/src/qc/integration_metrics/script.py new file mode 100644 index 00000000000..1c4e80a3c01 --- /dev/null +++ b/src/qc/integration_metrics/script.py @@ -0,0 +1,274 @@ +import scib +import mudata as mu +import numpy as np +import warnings + +## VIASH START +par = { + "input": "resources_test/scgpt/test_resources/Kim2020_Lung_integrated_leiden.h5mu", + "modality": "rna", + "bio_conservation_metrics": ["nmi","ari","asw_label"], + "batch_correction_metrics": ["asw_batch","pcr","graph_connectivity"], + "obsm_embeddings": "X_scGPT", + "obs_batch_label": "sample", + "obs_cell_label": "cell_type", + "uns_neighbors": "scGPT_integration_neighbors", + "obsp_neighbor_connectivities": "scGPT_integration_connectivities", + "output_compression": None, + "obs_cluster": "scGPT_integration_leiden_1.0", + "output": "resources_test/scgpt/test_resources/Kim2020_Lung_integrated_qc.h5mu", +} + +## VIASH END + +# START TEMPORARY WORKAROUND setup_logger +# reason: resources aren't available when using Nextflow fusion +# from setup_logger import setup_logger +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger +# END TEMPORARY WORKAROUND setup_logger +logger = setup_logger() + +warnings.filterwarnings("ignore", category=DeprecationWarning) +warnings.filterwarnings("ignore", category=FutureWarning) + +logger.info("Reading in data") +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +# Remove nan values to calculate scores +if adata.obs["cell_type"].isna().any(): + logger.warning("Cell label obs column contains nan values: removing rows with nan value") + adata_t = adata[~adata.obs["cell_type"].isna()].copy() +else: + adata_t = adata.copy() + +# Ensure all batch labels are str +adata.obs[par["obs_batch_label"]] = adata.obs[par["obs_batch_label"]].astype(str) + +# rename neighbors and connectivities names if necessary +if par["uns_neighbors"] != "neighbors": + adata_t.uns["neighbors"] = adata_t.uns[par["uns_neighbors"]] +if par["obsp_neighbor_connectivities"] != "connectivities": + adata_t.obsp["connectivities"] = adata_t.obsp[par["obsp_neighbor_connectivities"]] + +# Asses metrics to be calculated +assert len(par["bio_conservation_metrics"]) > 0, "Provide at least one metric for bio conservation." +assert len(par["batch_correction_metrics"]) > 0, "Provide at least one metric for bio conservation." + +bio_metric_calc = { + "nmi": False, + "ari": False, + "asw_label": False, + "isolated_label_f1": False, + "isolated_label_asw": False, + "clisi_graph": False, +} + +batch_metric_calc = { + "asw_batch": False, + "pcr": False, + "graph_connectivity": False, + "ilisi_graph": False, + "kbet": False +} + +for bio_metric in par["bio_conservation_metrics"]: + bio_metric_calc[bio_metric] = True +for batch_metric in par["batch_correction_metrics"]: + batch_metric_calc[batch_metric] = True + +# Batch correction metrics +logger.info(f">> Batch Correction Metrics: {par['batch_correction_metrics']}") +# Dictionary with results of calculated metrics +batch_metrics = {} + +# batch_metrics = {} +if batch_metric_calc["asw_batch"]: + logger.info("Calculating ASW score (batches)") + asw_batch = scib.metrics.silhouette_batch( + adata_t, + batch_key=par["obs_batch_label"], + label_key=par["obs_cell_label"], + embed=par["obsm_embeddings"], + metric="euclidean", + return_all=False, + verbose=False, + ) + batch_metrics["asw_batch"] = asw_batch +else: + logger.info("Skipping ASW score (batches) calculation") + batch_metrics["asw_batch"] = np.nan + +if batch_metric_calc["pcr"]: + logger.info("Calculating PC regression") + pcr_score = scib.metrics.pcr_comparison( + adata_t, + adata_t, + embed=par["obsm_embeddings"], + covariate=par["obs_batch_label"], + verbose=False + ) + batch_metrics["pcr"] = pcr_score +else: + logger.info("Skipping PC Regression calculation") + batch_metrics["pcr"] = np.nan + +if batch_metric_calc["graph_connectivity"]: + logger.info("Calculating graph connectivity") + graph_conn_score = scib.metrics.graph_connectivity( + adata_t, + label_key=par["obs_cell_label"] + ) + batch_metrics["graph_connectivity"] = graph_conn_score +else: + logger.info("Skipping graph connectivity calculation") + batch_metrics["graph_connectivity"] = np.nan + +if batch_metric_calc["ilisi_graph"]: + logger.info("Calculating graph iLISI") + graph_ilisi = scib.metrics.lisi.ilisi_graph( + adata_t, + batch_key=par["obs_batch_label"], + type_="knn", + use_rep=par["obsm_embeddings"] + ) + batch_metrics["ilisi_graph"] = graph_ilisi +else: + logger.info("Skipping graph iLISI calculation") + batch_metrics["ilisi_graph"] = np.nan + +if batch_metric_calc["kbet"]: + logger.info("Calculating kBET") + kbet_score = scib.metrics.kBET( + adata_t, + batch_key=par["obs_batch_label"], + label_key=par["obs_cell_label"], + type_="knn", + embed=par["obsm_embeddings"], + verbose=True + ) + batch_metrics["kbet"] = kbet_score +else: + logger.info("Skipping kBET calculation") + batch_metrics["kbet"] = np.nan + +# Bio conservation metrics +logger.info(f">>Bio Conservation Metrics: {par['bio_conservation_metrics']}") + +# Dictionary to store results of bio conservation metrics +bio_metrics = {} + +if bio_metric_calc["nmi"]: + logger.info("Calculating NMI score") + nmi_score = scib.metrics.nmi( + adata_t, + cluster_key=par["obs_cluster"], + label_key=par["obs_cell_label"] + ) + bio_metrics["nmi"] = nmi_score +else: + logger.info("Skipping NMI calculation") + bio_metrics["nmi"] = np.nan + +if bio_metric_calc["ari"]: + logger.info("Calculating ARI score") + ari_score = scib.metrics.ari( + adata_t, + cluster_key=par["obs_cluster"], + label_key=par["obs_cell_label"] + ) + bio_metrics["ari"] = ari_score +else: + logger.info("Skipping ARI calculation") + bio_metrics["ari"] = np.nan + +if bio_metric_calc["asw_label"]: + logger.info("Calculating ASW score (cell types)") + asw_label = scib.metrics.silhouette( + adata_t, + label_key=par["obs_cell_label"], + embed=par["obsm_embeddings"] + ) + bio_metrics["asw_label"] = asw_label +else: + logger.info("Skipping ASW score (cell types) calculation") + bio_metrics["asw_label"] = np.nan + +if bio_metric_calc["isolated_label_f1"]: + logger.info("Calculating isolated label F1") + il_score_f1 = scib.metrics.isolated_labels( + adata_t, + label_key=par["obs_cell_label"], + batch_key=par["obs_batch_label"], + embed=par["obsm_embeddings"], + cluster=True + ) + bio_metrics["isolated_label_f1"] = il_score_f1 +else: + logger.info("Skipping isolated label F1 calculation") + bio_metrics["isolated_label_f1"] = np.nan + +if bio_metric_calc["isolated_label_asw"]: + logger.info("Calculating isolated label silhouette") + il_score_asw = scib.metrics.isolated_labels( + adata_t, + label_key=par["obs_cell_label"], + batch_key=par["obs_batch_label"], + embed=par["obsm_embeddings"], + cluster=False + ) + bio_metrics["isolated_label_asw"] = il_score_asw +else: + logger.info("Skipping isolated label ASW calculation") + bio_metrics["isolated_label_asw"] = np.nan + +if bio_metric_calc["clisi_graph"]: + logger.info("Calculating graph cLISI") + graph_clisi = scib.metrics.clisi_graph( + adata_t, + label_key=par["obs_cell_label"], + type_="knn", + use_rep=par["obsm_embeddings"] + ) + bio_metrics["clisi_graph"] = graph_clisi +else: + logger.info("Skipping isolated graph cLISI calculation") + bio_metrics["clisi_graph"] = np.nan + +logger.info(">> Aggregate Metrics") +logger.info(f"Calculating Bio Conservation Score based on {par['bio_conservation_metrics']} ") +# Calculate the mean of the bio conservation metrics ignoring the nan values +avg_bio = np.nanmean(list(bio_metrics.values())) + +logger.info(f"Calculating Batch Correction Score based on {par['batch_correction_metrics']}") +# Calculate the mean of the bio conservation metrics ignoring the nan values +avg_batch = np.nanmean(list(batch_metrics.values())) + +logger.info("Calculating Overall Score") +# Weighted mean of the bio conservation and batch correction scores +# See https://doi.org/10.1038/s41592-021-01336-8 +overall_score = (0.4 * avg_batch) + (0.6 * avg_bio) + + +logger.info("Writing output data") +adata.uns["bio_conservation_metrics"] = bio_metrics +adata.uns["batch_correction_metrics"] = batch_metrics +adata.uns["bio_conservation_score"] = avg_bio +adata.uns["batch_correction_score"] = avg_batch +adata.uns["overall_integration_score"] = overall_score + +mdata.mod[par["modality"]] = adata +mdata.write(par["output"], compression=par["output_compression"]) diff --git a/src/qc/integration_report/config.vsh.yaml b/src/qc/integration_report/config.vsh.yaml new file mode 100644 index 00000000000..7b1285e5289 --- /dev/null +++ b/src/qc/integration_report/config.vsh.yaml @@ -0,0 +1,122 @@ +functionality: + name: integration_report + namespace: "qc" + description: | + Generates a .md file containing an overview of integration metrics (calculated in qc/integration_metrics) and umap visualizations. + + authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author ] + argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + type: string + default: "rna" + - name: "--var_gene_names" + type: string + required: false + description: | + The name of the adata var column containing gene names. When no gene_name_layer is provided, the var index will be used. + - name: "--obs_batch_label" + type: string + example: "sample" + required: true + description: | + The name of the adata obs column containing the batch labels. + - name: "--obs_cell_label" + type: string + example: "cell_type" + required: true + description: | + The name of the adata obs column containing the cell type labels. + - name: "--uns_neighbors" + type: string + example: "neighbors" + required: true + description: + The name of the adata uns object containing the neighbors information. + - name: "--obsm_umap" + type: string + example: "X_scGPT_umap" + required: true + description: "The obsm slot containing UMAP embeddings." + + - name: Outputs + arguments: + - name: "--output" + type: file + required: true + description: Name of the integration qc output pdf report. + direction: output + example: output.pdf + - name: "--output_raw" + type: boolean_true + description: | + If provided, the raw data (unrendered report (md), figures (png) and metrics (json)) will be published. + - name: "--output_umap_label_fig" + type: file + direction: output + default: umap_label.png + must_exist: False + description: | + Name of the UMAP label figure (png). Will only be saved if --output_raw is set to true. + - name: "--output_umap_batch_fig" + type: file + direction: output + must_exist: False + default: umap_batch.png + description: | + Name of the UMAP batch figure (png). Will only be saved if --output_raw is set to true. + - name: "--output_md_report" + type: file + direction: output + default: report.qmd + must_exist: False + description: | + Name of the unrendered report file (qmd). Will only be saved if --output_raw is set to true. + - name: "--output_metrics" + type: file + direction: output + must_exist: False + default: metrics.json + description: | + Name of the json file containing integration metrics (json). Will only be saved if --output_raw is set to true. + + resources: + - type: python_script + path: script.py + - path: report_template.md + +platforms: + - type: docker + image: python:3.11 + setup: + - type: apt + packages: + - procps + - libhdf5-dev + - gfortran + - cmake + - libopenblas-dev + - type: python + __merge__: [ /src/base/requirements/anndata_mudata.yaml, /src/base/requirements/scanpy.yaml ] + - type: docker + run: | + wget https://github.com/quarto-dev/quarto-cli/releases/download/v1.4.549/quarto-1.4.549-linux-amd64.deb -O /tmp/quarto.deb && \ + dpkg -i /tmp/quarto.deb && \ + rm /tmp/quarto.deb && \ + quarto install tinytex + + test_setup: + - type: python + __merge__: [ /src/base/requirements/viashpy.yaml, .] + - type: nextflow + directives: + label: [singlecpu, lowmem] \ No newline at end of file diff --git a/src/qc/integration_report/report_template.md b/src/qc/integration_report/report_template.md new file mode 100644 index 00000000000..43abeff1b17 --- /dev/null +++ b/src/qc/integration_report/report_template.md @@ -0,0 +1,49 @@ +--- +title: "Integration QC report" +format: pdf +tbl-colwidths: [70,20] +--- + +## Overview of integration metrics and scores + +The overall integration score is **{overall_integration}**. The integration score is a 40/60 weighted average of the batch correction score ({batch_correction}, table 1) and the biological conservation score ({bio_conservation}, table 2). + + +::: {{layout-ncol=2}} + +| metric | value | +|--------|--------| +| NMI | {nmi} | +| ARI | {ari} | +| ASW (label) | {asw_label} | +| Isolated label (f1) | {isolated_label_f1} | +| Isolated label (ASW) | {isolated_label_asw} | +| cLISI | {clisi_graph} | +| **biological conservation score** | **{bio_conservation}** | + +: Biological conservation {{.striped}} + +| metric | value | +|--------|--------| +| ASW (batch) | {asw_batch} | +| PCR | {pcr} | +| Graph connectivity | {graph_connectivity} | +| iLISI | {ilisi_graph} | +| kBET | {kbet} | +| **batch correction score** | **{batch_correction}** | + +: Batch correction {{.striped}} +::: + +The biological conservation metrics consist of NMI (normalized mutual information), ARI (adjusted rand index), ASW (average silhouette width) label, isolated label (f1 and ASW) and graph cLISI (cell-type local inverse Simpson's index) metrics. The batch correction metrics consist of ASW, PCR (principal component regression), graph connectivity, iLISI (integration LISI) and kBET (k-nearest-neighbor batch effect test) metrics. +The biological conservation and batch correction scores are the means of their individual metrics. + +All score and metrics calculations are based on the scib (single cell integration benchmark) library (https://doi.org/10.1038/s41592-021-01336-8). + +## Visualizations of the integration embeddings + +![](umap_batch.png){{fig-align="center" width=70%}} + +![](umap_label.png){{fig-align="center" width=70%}} + +The integration embedding figures display the UMAP (uniform manifold approximation and projection) visualizations of the integrated data, colored by batch annotation or cell identity. \ No newline at end of file diff --git a/src/qc/integration_report/script.py b/src/qc/integration_report/script.py new file mode 100644 index 00000000000..2682ae49698 --- /dev/null +++ b/src/qc/integration_report/script.py @@ -0,0 +1,165 @@ +import scanpy as sc +import mudata as mu +import os +import json +import shutil +import subprocess +import tempfile +from pathlib import Path + + +## VIASH START +par = { + "input": "resources_test/scgpt/test_resources/Kim2020_Lung_integrated_leiden_qc.h5mu", + "modality": "rna", + "var_gene_names": None, + "uns_neighbors": "scGPT_integration_neighbors", + "obsm_umap": "X_scGPT_umap", + "obs_batch_label": "sample", + "obs_cell_label": "cell_type", + "output": "test_report.pdf", + "output_raw": True, + "output_umap_label_fig": "test_umap_label.png", + "output_umap_batch_fig": "test_umap_batch.png", + "output_md_report": "test_report.qmd", + "output_metrics": "test_metrics.json" + } + +meta = { + "resources_dir": "src/qc/integration_report", + "functionality_name": "integration_report", + "temp_dir": "tmp" +} + +## VIASH END + +# START TEMPORARY WORKAROUND setup_logger +# reason: resources aren't available when using Nextflow fusion +# from setup_logger import setup_logger +def setup_logger(): + import logging + from sys import stdout + + logger = logging.getLogger() + logger.setLevel(logging.INFO) + console_handler = logging.StreamHandler(stdout) + logFormatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s") + console_handler.setFormatter(logFormatter) + logger.addHandler(console_handler) + + return logger +# END TEMPORARY WORKAROUND setup_logger +logger = setup_logger() + +logger.info("Reading in data") +# Read in data +mdata = mu.read(par["input"]) +input_adata = mdata.mod[par["modality"]] +adata = input_adata.copy() + +# Ensure all batch labels are str +adata.obs[par["obs_batch_label"]] = adata.obs[par["obs_batch_label"]].astype(str) + +# Generate vizualizations + +logger.info("Generating UMAP visualizations") +fig_cell_type_clusters = sc.pl.embedding( + adata, + basis=par["obsm_umap"], + gene_symbols=par["var_gene_names"], + neighbors_key=par["uns_neighbors"], + color=par["obs_cell_label"], + title="UMAP visualization (label)", + frameon=False, + return_fig=True, + show=False, +) + +fig_batch_clusters = sc.pl.embedding( + adata, + basis=par["obsm_umap"], + gene_symbols=par["var_gene_names"], + neighbors_key=par["uns_neighbors"], + color=par["obs_batch_label"], + title="UMAP visualization (batch)", + frameon=False, + return_fig=True, + show=False, +) + +# Generate metrics and report + +# Open template +with open(meta["resources_dir"] + '/report_template.md', 'r') as file: + template = file.read() + +# Context manager to generate temporary directory to save intermediary files +with tempfile.TemporaryDirectory( + prefix=f"{meta['functionality_name']}-", + dir=meta["temp_dir"], + ignore_cleanup_errors=True) as temp_dir: + + temp_dir = Path(temp_dir) + temp_dir.mkdir(parents=True, exist_ok=True) + + # Directories for intermediary files + report_md = f"{temp_dir}/report.qmd" + umap_label_fig = f"{temp_dir}/umap_label.png" + umap_batch_fig = f"{temp_dir}/umap_batch.png" + + # Save figures + fig_cell_type_clusters.savefig(umap_label_fig, dpi=300, bbox_inches='tight') + fig_batch_clusters.savefig(umap_batch_fig, dpi=300, bbox_inches='tight') + + # Fetch variables required for report from the adata object + logger.info("Fetching integration metrics") + variables = { + "overall_integration": round(adata.uns["overall_integration_score"], 2), + "bio_conservation": round(adata.uns["bio_conservation_score"], 2), + "batch_correction": round(adata.uns["batch_correction_score"], 2), + "umap_batch": os.path.basename(umap_batch_fig), + "umap_label": os.path.basename(umap_label_fig) + } + + # Round all variables, replace nan's with "-" + for k, v in adata.uns["bio_conservation_metrics"].items(): + # Check if value is NaN and add to variables dict + variables[k] = round(v, 2) if v == v else "-" + + for k, v in adata.uns["batch_correction_metrics"].items(): + # Check if value is NaN and add to variables dict + variables[k] = round(v, 2) if v == v else "-" + + logger.info("Generating report") + # Fill report template with variables + markdown_content = template.format(**variables) + + # Save report + with open(report_md, 'w') as f: + f.write(markdown_content) + + # PDF rendering + logger.info("Rendering report to pdf") + subprocess.run(['quarto', 'render', report_md, '--to', 'pdf']) + + # Move report to output + logger.info("Publishing of pdf report") + shutil.move(f"{temp_dir}/report.pdf", par["output"]) + + # Save raw data + if not par["output_raw"]: + logger.info("Skipping publishing of raw output data because --output_raw was not provided.") + else: + logger.info("Publishing of raw output data") + shutil.move(umap_label_fig, par["output_umap_label_fig"]) + shutil.move(umap_batch_fig, par["output_umap_batch_fig"]) + shutil.move(report_md, par["output_md_report"]) + + # Remove figures from variable dict before saving as json + del variables["umap_batch"] + del variables["umap_label"] + + metrics = {k: (str(v) if v == v else "-") for k, v in variables.items()} + + with open(par["output_metrics"], 'w') as f: + json.dump(metrics, f) diff --git a/src/workflows/qc/integration_qc/config.vsh.yaml b/src/workflows/qc/integration_qc/config.vsh.yaml new file mode 100644 index 00000000000..c5d375484c8 --- /dev/null +++ b/src/workflows/qc/integration_qc/config.vsh.yaml @@ -0,0 +1,152 @@ +functionality: + name: integration_qc + namespace: "workflows/qc" + description: | + Workflow to add integration qc metrics to a MuData and generate an integration qc output report. + + authors: + - __merge__: /src/authors/dorien_roosen.yaml + roles: [ author, maintainer ] + argument_groups: + - name: Inputs + arguments: + - name: "--input" + type: file + description: Input h5mu file + direction: input + required: true + example: input.h5mu + - name: "--modality" + type: string + default: "rna" + - name: "--var_gene_names" + type: string + required: false + description: | + The name of the adata var column containing gene names. When no gene_name_layer is provided, the var index will be used. + - name: "--bio_conservation_metrics" + type: string + multiple: true + multiple_sep: ";" + choices: + - nmi + - ari + - asw_label + - isolated_label_f1 + - isolated_label_asw + - clisi_graph + default: nmi;ari;asw_label + description: | + The metrics to be calculated to assess biological conservation. + The provided metrics will be aggregated into a biological conservation score. + - name: "--batch_correction_metrics" + type: string + multiple: true + multiple_sep: ";" + choices: + - asw_batch + - pcr + - graph_connectivity + - ilisi_graph + - kbet + default: asw_batch;pcr;graph_connectivity + description: | + The metrics to be calculated for the batch correction. + The provided metrics will be aggregated into a batch correction score. + + - name: Input annotations + arguments: + - name: "--obsm_embeddings" + type: string + required: true + example: "X_scGPT" + description: | + The name of the adata.obsm array containing scGPT cell embeddings. + - name: "--obs_batch_label" + type: string + example: "sample" + required: true + description: | + The name of the adata obs column containing the batch labels. + - name: "--obs_cell_label" + type: string + example: "cell_type" + required: true + description: | + The name of the adata obs column containing the cell type labels. + - name: "--obs_cluster" + type: string + example: "cluster" + required: true + description: | + The name of the adata obs column containing the cluster labels. + - name: "--obsp_neighbor_connectivities" + type: string + example: "connectivities" + required: true + description: | + The name of the adata obsp object containing the neighbor connectivities. + - name: "--uns_neighbors" + type: string + example: "neighbors" + required: true + description: + The name of the adata uns object containing the neighbors information. + - name: "--obsm_umap" + type: string + example: "X_scGPT_umap" + required: true + description: "The obsm slot containing UMAP embeddings." + + - name: Outputs + arguments: + - name: "--output" + type: file + required: true + description: Name of the integration qc output pdf report. + direction: output + example: output.pdf + - name: "--output_raw" + type: boolean_true + description: | + If provided, the raw data (unrendered report (md), figures (png) and metrics (json)) will be published. + - name: "--output_md_report" + type: file + direction: output + default: report.qmd + must_exist: False + description: | + Name of the unrendered report file (qmd). Will only be saved if --output_raw is set to true. + - name: "--output_umap_label_fig" + type: file + direction: output + default: umap_label.png + must_exist: False + description: | + Name of the UMAP label figure (png). Will only be saved if --output_raw is set to true. + - name: "--output_umap_batch_fig" + type: file + direction: output + must_exist: False + default: umap_batch.png + description: | + Name of the UMAP batch figure (png). Will only be saved if --output_raw is set to true. + - name: "--output_metrics" + type: file + direction: output + must_exist: False + default: metrics.json + description: | + Name of the json file containing integration metrics (json). Will only be saved if --output_raw is set to true. + + + dependencies: + - name: qc/integration_metrics + - name: qc/integration_report + resources: + - type: nextflow_script + path: main.nf + entrypoint: run_wf + +platforms: + - type: nextflow diff --git a/src/workflows/qc/integration_qc/main.nf b/src/workflows/qc/integration_qc/main.nf new file mode 100644 index 00000000000..146969c5d81 --- /dev/null +++ b/src/workflows/qc/integration_qc/main.nf @@ -0,0 +1,46 @@ +workflow run_wf { + take: + input_ch + + main: + output_ch = input_ch + // Calculate integration qc metrics and add them to the MuData file + | integration_metrics.run( + fromState: [ + "input": "input", + "modality": "modality", + "obsm_output": "obsm_output", + "bio_conservation_metrics": "bio_conservation_metrics", + "batch_correction_metrics": "batch_correction_metrics", + "obsm_embeddings": "obsm_embeddings", + "obs_batch_label": "obs_batch_label", + "obs_cell_label": "obs_cell_label", + "obs_cluster": "obs_cluster", + "uns_neighbors": "uns_neighbors", + "obsp_neighbor_connectivities": "obsp_neighbor_connectivities", + ], + toState: [ + "input": "output" + ] + ) + + // Generate visualizations and a report based on the metrics + | integration_report.run( + fromState: [ + "input": "input", + "modality": "modality", + "var_gene_names": "var_gene_names", + "obs_batch_label": "obs_batch_label", + "obs_cell_label": "obs_cell_label", + "uns_neighbors": "uns_neighbors", + "obsm_umap": "obsm_umap", + "output_raw": "output_raw" + ], + toState: { id, output, state -> + output + } + ) + + emit: + output_ch +} \ No newline at end of file diff --git a/src/workflows/qc/integration_qc/nextflow.config b/src/workflows/qc/integration_qc/nextflow.config new file mode 100644 index 00000000000..8108bc25e84 --- /dev/null +++ b/src/workflows/qc/integration_qc/nextflow.config @@ -0,0 +1,10 @@ +manifest { + nextflowVersion = '!>=20.12.1-edge' +} + +params { + rootDir = java.nio.file.Paths.get("$projectDir/../../../../").toAbsolutePath().normalize().toString() +} + +// include common settings +includeConfig("${params.rootDir}/src/workflows/utils/labels.config") \ No newline at end of file