Skip to content

Commit 3de84d0

Browse files
committed
add conversion component
1 parent 8c33533 commit 3de84d0

File tree

6 files changed

+332
-41
lines changed

6 files changed

+332
-41
lines changed

scripts/generate_test_resource.sh

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#!/bin/bash

# Build the MOBNEW test resources: run the dataset conversion component on the
# raw single-cell and spatial RDS files and write the resulting h5ad files.

# Arguments handed to the component (everything after the `--` separator).
convert_args=(
  # raw inputs
  --input_sc resources_test/datasets_raw/MOBNEW/dataset_sc.rds
  --input_sp resources_test/datasets_raw/MOBNEW/dataset_sp.rds
  # converted outputs
  --output_sc resources_test/datasets/MOBNEW/dataset_sc.h5ad
  --output_sp resources_test/datasets/MOBNEW/dataset_sp.h5ad
  # dataset metadata
  --dataset_id MOBNEW
  --dataset_name "MOBNEW"
  --dataset_description "MOBNEW"
  --dataset_summary "MOBNEW"
  --dataset_reference "..."
  --dataset_organism "mus_musculus"
)

viash run src/process_datasets/convert/config.vsh.yaml -- "${convert_args[@]}"

src/api/file_common_dataset.yaml

Lines changed: 0 additions & 41 deletions
This file was deleted.

src/api/file_dataset_sc.yaml

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
# API specification of the single-cell dataset file.
type: file
example: "resources_test/common/mobnew/dataset_sc.h5ad"
info:
  label: Single-cell dataset
  summary: An unprocessed single-cell dataset as output by a dataset loader.
  description: |
    This dataset contains raw counts and metadata as output by a dataset loader.

    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
  slots:
    # Expression matrices.
    layers:
      - name: counts
        type: integer
        description: Raw counts
        required: true
    # Per-cell annotations.
    obs:
      - name: cell_type
        type: string
        description: Classification of the cell type based on its characteristics and function within the tissue or organism.
        required: false

      - name: donor_id
        type: string
        description: Identifier for the donor from whom the cell sample is obtained.
        required: false
    # Per-feature annotations.
    var:
      - name: feature_id
        type: string
        description: Unique identifier for the feature, usually a ENSEMBL gene id.
        required: false
      - name: feature_name
        type: string
        description: A human-readable name for the feature, usually a gene symbol.
        required: true
    # Dataset-level metadata.
    uns:
      - name: dataset_id
        type: string
        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
        required: true
      - name: dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - name: dataset_url
        type: string
        description: Link to the original source of the dataset.
        required: false
      - name: dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        multiple: true

src/api/file_dataset_sp.yaml

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# API specification of the spatial dataset file.
type: file
example: "resources_test/common/mobnew/dataset_sp.h5ad"
info:
  label: Spatial dataset
  summary: An unprocessed spatial dataset as output by a dataset loader.
  description: |
    This dataset contains raw counts and metadata as output by a dataset loader.

    The format of this file is derived from the [CELLxGENE schema v4.0.0](https://github.com/chanzuckerberg/single-cell-curation/blob/main/schema/4.0.0/schema.md).
  slots:
    # Expression matrices.
    layers:
      - type: integer
        name: counts
        description: Raw counts
        required: true
      # Log-transformed values are fractional, so this layer is declared as
      # `double` rather than `integer`.
      - type: double
        name: logcounts
        description: Log-transformed counts
        required: true
    # Per-spot annotations.
    obs:
      - type: double
        name: col
        description: Column index of the cell in the spatial grid.
        required: true
      - type: double
        name: row
        description: Row index of the cell in the spatial grid.
        required: true
      - type: double
        name: sizeFactor
        description: Size factor for the cell.
        required: true
      - type: integer
        name: spatial_cluster
        description: Spatial cluster assignment for the cell.
        required: true
    # Per-feature annotations.
    var:
      - type: string
        name: feature_id
        description: Unique identifier for the feature, usually a ENSEMBL gene id.
        required: false
      - type: string
        name: feature_name
        description: A human-readable name for the feature, usually a gene symbol.
        required: true
    # Per-spot matrices.
    obsm:
      # Uses `double`, the floating-point type name used by the other slots in
      # this file. Marked as required because the convert script always writes
      # this slot.
      - type: double
        name: celltype_proportions
        description: Spot-by-celltype matrix of celltype proportions generated by CARD.
        required: true
    # Dataset-level metadata.
    uns:
      - type: string
        name: dataset_id
        description: A unique identifier for the dataset. This is different from the `obs.dataset_id` field, which is the identifier for the dataset from which the cell data is derived.
        required: true
      - name: dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - type: string
        name: dataset_url
        description: Link to the original source of the dataset.
        required: false
      - name: dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: dataset_organism
        type: string
        description: The organism of the sample in the dataset.
        required: false
        multiple: true
src/process_datasets/convert/config.vsh.yaml

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Component definition for converting the raw RDS datasets to h5ad files.
name: convert
namespace: process_datasets
description: Transform the figshare rds into an HDF5-backed AnnData file.
argument_groups:
  # Raw RDS files to convert.
  - name: Inputs
    arguments:
      - name: --input_sc
        type: file
        description: Raw single-cell dataset
        example: resources_test/datasets_raw/MOBNEW/dataset_sc.rds
        required: true

      - name: --input_sp
        type: file
        description: Raw spatial dataset
        example: resources_test/datasets_raw/MOBNEW/dataset_sp.rds
        required: true
  # Converted h5ad files.
  - name: Outputs
    arguments:
      - name: --output_sc
        type: file
        description: Processed single-cell dataset
        example: resources_test/datasets/MOBNEW/dataset_sc.h5ad
        direction: output
        required: true

      - name: --output_sp
        type: file
        description: Processed spatial dataset
        example: resources_test/datasets/MOBNEW/dataset_sp.h5ad
        direction: output
        required: true
  # Metadata values the script stores in the `uns` slot of both outputs.
  - name: Dataset metadata
    arguments:
      - name: --dataset_id
        type: string
        description: A unique identifier for the dataset.
        required: true
      - name: --dataset_name
        type: string
        description: A human-readable name for the dataset.
        required: true
      - name: --dataset_url
        type: string
        description: Link to the original source of the dataset.
        required: false
      - name: --dataset_reference
        type: string
        description: Bibtex reference of the paper in which the dataset was published.
        required: false
        multiple: true
      - name: --dataset_summary
        type: string
        description: Short description of the dataset.
        required: true
      - name: --dataset_description
        type: string
        description: Long description of the dataset.
        required: true
      - name: --dataset_organism
        type: string
        description: Organism from which the dataset was derived.
        required: true

resources:
  - type: r_script
    path: script.R

engines:
  - type: docker
    image: ghcr.io/openproblems-bio/base_images/r:1.1.0
    setup:
      # Bioconductor packages installed on top of the base image.
      - type: r
        bioc:
          - SingleCellExperiment

runners:
  - type: executable
  - type: nextflow

src/process_datasets/convert/script.R

Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
# Convert the raw single-cell and spatial datasets (RDS files holding objects
# that expose assay(), colData() and metadata()) into AnnData objects and write
# them to disk as gzip-compressed h5ad files.
suppressMessages(library(SingleCellExperiment, quietly = TRUE))

## VIASH START
# Default parameter values for running this script directly during development.
par <- list(
  # inputs
  input_sc = "resources_test/datasets_raw/MOBNEW/dataset_sc.rds",
  input_sp = "resources_test/datasets_raw/MOBNEW/dataset_sp.rds",

  # outputs: written with write_h5ad() below, so they carry the .h5ad extension
  output_sc = "resources_test/datasets/MOBNEW/dataset_sc.h5ad",
  output_sp = "resources_test/datasets/MOBNEW/dataset_sp.h5ad",

  # dataset metadata
  dataset_id = "MOBNEW",
  dataset_name = "MOBNEW",
  dataset_description = "MOBNEW",
  dataset_url = "...",
  dataset_reference = "...",
  dataset_summary = "...",
  dataset_organism = "..."
)
## VIASH END

cat("Read input files\n")
input_sc <- readRDS(par$input_sc)
input_sp <- readRDS(par$input_sp)

cat("Single cell dataset:\n")
print(input_sc)

cat("Spatial dataset:\n")
print(input_sp)

# Dataset-level metadata; the same values are stored in `uns` of both outputs.
dataset_uns <- list(
  dataset_id = par$dataset_id,
  dataset_name = par$dataset_name,
  dataset_description = par$dataset_description,
  dataset_url = par$dataset_url,
  dataset_reference = par$dataset_reference,
  dataset_summary = par$dataset_summary,
  dataset_organism = par$dataset_organism
)

cat("Transforming single cell into AnnData\n")
output_sc <- anndata::AnnData(
  layers = list(
    # Transposed so that the columns of the input become the rows (obs) here.
    counts = Matrix::t(assay(input_sc, "counts"))
  ),
  obs = data.frame(
    row.names = colnames(input_sc),
    cell_type = colData(input_sc)$cellType,
    donor_id = colData(input_sc)$sampleInfo
  ),
  var = data.frame(
    row.names = rownames(input_sc),
    # The input row names are used both as feature id and as feature name.
    feature_id = rownames(input_sc),
    feature_name = rownames(input_sc)
  ),
  uns = dataset_uns
)

cat("Transforming spatial into AnnData\n")
# Stored in the object's metadata rather than in colData.
celltype_proportions <- metadata(input_sp)[["celltype_prop"]]

output_sp <- anndata::AnnData(
  layers = list(
    counts = Matrix::t(assay(input_sp, "counts")),
    logcounts = Matrix::t(assay(input_sp, "logcounts"))
  ),
  obs = data.frame(
    row.names = colnames(input_sp),
    col = colData(input_sp)$col,
    row = colData(input_sp)$row,
    sizeFactor = colData(input_sp)$sizeFactor,
    spatial_cluster = colData(input_sp)$spatial.cluster
  ),
  var = data.frame(
    row.names = rownames(input_sp),
    feature_id = rownames(input_sp),
    feature_name = rownames(input_sp)
  ),
  obsm = list(
    celltype_proportions = celltype_proportions
  ),
  uns = dataset_uns
)

cat("Write output files\n")
output_sc$write_h5ad(par$output_sc, compression = "gzip")
output_sp$write_h5ad(par$output_sp, compression = "gzip")

0 commit comments

Comments
 (0)