Skip to content

Commit 3c8814f

Browse files
sahuno and claude committed
feat: add modkit/extract/full module
Add new nf-core module wrapping `modkit extract full`, which transforms the MM/ML tags in a modBAM into a tab-separated per-read-per-position probability table. Output can be BGZF-compressed via `--bgzf` in `ext.args`. Useful for downstream custom filtering, plotting, and ML training on read-level methylation probabilities. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 135d883 commit 3c8814f

6 files changed

Lines changed: 341 additions & 0 deletions

File tree

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
# Conda environment for the module; the modkit version is pinned to the same
# release (0.6.1) that the container images in main.nf reference.
channels:
  - conda-forge
  - bioconda
dependencies:
  - "bioconda::ont-modkit=0.6.1"
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
// Runs `modkit extract full` on a modBAM and writes the resulting table to
// `<prefix>.tsv`, or `<prefix>.tsv.gz` when `--bgzf` is present in `ext.args`.
//
// Inputs:
//   meta, bam, bai     - sample meta map, the modBAM and its index
//   meta2, fasta, fai  - reference meta map, FASTA and its index; when `fasta`
//                        is empty no `--reference` option is passed
// Outputs:
//   tsv             - the extracted table (plain or gzip-suffixed)
//   log             - any `*.log` file left in the work dir (optional)
//   versions_modkit - [process name, 'modkit', version string], also sent to
//                     the `versions` topic
process MODKIT_EXTRACT_FULL {
    tag "$meta.id"
    label 'process_high'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine in ['singularity', 'apptainer'] && !task.ext.singularity_pull_docker_container ?
        'https://depot.galaxyproject.org/singularity/ont-modkit:0.6.1--hcdda2d0_0':
        'biocontainers/ont-modkit:0.6.1--hcdda2d0_0' }"

    input:
    tuple val(meta), path(bam), path(bai)
    tuple val(meta2), path(fasta), path(fai)

    output:
    tuple val(meta), path("*.tsv{,.gz}"), emit: tsv
    tuple val(meta), path("*.log")      , emit: log, optional: true
    tuple val("${task.process}"), val('modkit'), eval("modkit --version | sed 's/modkit //'"), emit: versions_modkit, topic: versions

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    // Only pass --reference when a FASTA was actually supplied.
    def reference = fasta ? "--reference ${fasta}" : ''
    // The output file name follows the presence of --bgzf in ext.args so that
    // the suffix matches what is requested from modkit.
    def out_suffix = args.tokenize().contains('--bgzf') ? 'tsv.gz' : 'tsv'
    """
    modkit \\
        extract \\
        full \\
        $args \\
        --threads ${task.cpus} \\
        ${reference} \\
        ${bam} \\
        ${prefix}.${out_suffix}
    """

    stub:
    def args = task.ext.args ?: ''
    def prefix = task.ext.prefix ?: "${meta.id}"
    def bgzf = args.tokenize().contains('--bgzf')
    // A zero-byte file is not a valid gzip stream, so the compressed stub is
    // written through gzip; the uncompressed stub stays an empty file.
    def make_output = bgzf ? "echo '' | gzip > ${prefix}.tsv.gz" : "touch ${prefix}.tsv"
    """
    ${make_output}
    """
}
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
# Module metadata: describes the tool, the two input channels and the emitted
# output channels of the MODKIT_EXTRACT_FULL process.
name: modkit_extract_full
description: |
  Transform the probabilities from the MM/ML tags in a modBAM into a
  tab-separated per-read-per-position table. Emits one row for every
  modified-base probability call in every read — useful for downstream
  custom filtering, plotting, and ML training. Optionally BGZF-compressed
  via `--bgzf` in `ext.args`.
keywords:
  - modkit
  - methylation
  - extract
  - read-level
  - modbam
  - nanopore
  - ont
tools:
  - "modkit":
      description: A bioinformatics tool for working with modified bases in Oxford Nanopore
        sequencing data.
      homepage: https://github.com/nanoporetech/modkit
      documentation: https://nanoporetech.github.io/modkit/
      tool_dev_url: https://github.com/nanoporetech/modkit
      licence:
        - "Oxford Nanopore Technologies PLC. Public License Version 1.0"
      identifier: ""
# First input channel: sample meta + modBAM + index.
# Second input channel: reference meta + FASTA + index.
input:
  - - meta:
        type: map
        description: |
          Groovy Map containing sample information
          e.g. `[ id:'sample1' ]`. The output inherits this meta.
    - bam:
        type: file
        description: Input modBAM with MM/ML tags.
        pattern: "*.{bam,cram}"
        ontologies:
          - edam: http://edamontology.org/format_2572
    - bai:
        type: file
        description: BAM index (`.bai` or `.csi`). Optional — if absent, modkit does
          a serial scan of the BAM (see `--ignore-index`).
        pattern: "*.{bai,csi}"
        ontologies: []
  - - meta2:
        type: map
        description: |
          Groovy Map containing reference information
          e.g. `[ id:'mm10' ]`. May be `[[], [], []]` to skip (modkit works
          unaligned, but reference context columns will be missing).
    - fasta:
        type: file
        description: Reference FASTA. Required to populate reference-context columns
          in the output (`ref_kmer`, strand, etc.).
        pattern: "*.{fa,fasta,fna}"
        ontologies:
          - edam: http://edamontology.org/format_1929
    - fai:
        type: file
        description: Samtools FASTA index for `fasta`.
        pattern: "*.fai"
        ontologies:
          - edam: http://edamontology.org/format_3475
output:
  tsv:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`.
      - "*.tsv{,.gz}":
          type: file
          description: |
            Per-read-per-position probability table. BGZF-compressed when
            `--bgzf` is passed via `ext.args`.
          pattern: "*.{tsv,tsv.gz}"
          ontologies:
            - edam: http://edamontology.org/format_3475
  log:
    - - meta:
          type: map
          description: |
            Groovy Map containing sample information
            e.g. `[ id:'sample1' ]`.
      - "*.log":
          type: file
          description: |
            Optional modkit debug log (only emitted when `--log-filepath
            <name>.log` is passed via `ext.args`).
          pattern: "*.log"
          ontologies: []
  versions_modkit:
    - - ${task.process}:
          type: string
          description: The name of the process
      - modkit:
          type: string
          description: The name of the tool
      - modkit --version | sed 's/modkit //':
          type: eval
          description: The expression to obtain the version of the tool
# Mirrors the `versions_modkit` output, which is also published to the
# `versions` topic by the process.
topics:
  versions:
    - - ${task.process}:
          type: string
          description: The name of the process
      - modkit:
          type: string
          description: The name of the tool
      - modkit --version | sed 's/modkit //':
          type: eval
          description: The expression to obtain the version of the tool
authors:
  - "@sahuno"
maintainers:
  - "@sahuno"
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// nf-test specification for MODKIT_EXTRACT_FULL: one stub run and one real run,
// both fed the same nanopore BAM + index and human genome FASTA + index.
nextflow_process {

    name "Test Process MODKIT_EXTRACT_FULL"
    script "../main.nf"
    process "MODKIT_EXTRACT_FULL"
    config "./nextflow.config"

    tag "modules"
    tag "modules_nfcore"
    tag "modkit"
    tag "modkit/extract"
    tag "modkit/extract/full"

    // Stub run: only the declared outputs and the version tuple are snapshotted.
    test("homo sapiens - nanopore modbam - stub") {

        options "-stub"

        when {
            process {
                """
                input[0] = [
                    [ id: 'test' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam.bai', checkIfExists: true)
                ]
                input[1] = [
                    [ id: 'genome' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true)
                ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert snapshot(process.out).match() }
            )
        }
    }

    // Real run: additionally checks that the TSV exists and is non-empty
    // before comparing the full output against the snapshot.
    test("homo sapiens - nanopore modbam") {

        when {
            process {
                """
                input[0] = [
                    [ id: 'test' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/nanopore/bam/test.sorted.phased.bam.bai', checkIfExists: true)
                ]
                input[1] = [
                    [ id: 'genome' ],
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true),
                    file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true)
                ]
                """
            }
        }

        then {
            assertAll(
                { assert process.success },
                { assert process.out.tsv },
                { assert path(process.out.tsv[0][1]).exists() },
                { assert path(process.out.tsv[0][1]).size() > 0 },
                { assert snapshot(process.out).match() }
            )
        }
    }
}
Lines changed: 96 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
{
2+
"homo sapiens - nanopore modbam": {
3+
"content": [
4+
{
5+
"0": [
6+
[
7+
{
8+
"id": "test"
9+
},
10+
"test.tsv:md5,8300986c98d1b67973439ced4e7f233c"
11+
]
12+
],
13+
"1": [
14+
15+
],
16+
"2": [
17+
[
18+
"MODKIT_EXTRACT_FULL",
19+
"modkit",
20+
"0.6.1"
21+
]
22+
],
23+
"log": [
24+
25+
],
26+
"tsv": [
27+
[
28+
{
29+
"id": "test"
30+
},
31+
"test.tsv:md5,8300986c98d1b67973439ced4e7f233c"
32+
]
33+
],
34+
"versions_modkit": [
35+
[
36+
"MODKIT_EXTRACT_FULL",
37+
"modkit",
38+
"0.6.1"
39+
]
40+
]
41+
}
42+
],
43+
"timestamp": "2026-04-23T22:03:49.629847264",
44+
"meta": {
45+
"nf-test": "0.9.5",
46+
"nextflow": "25.04.6"
47+
}
48+
},
49+
"homo sapiens - nanopore modbam - stub": {
50+
"content": [
51+
{
52+
"0": [
53+
[
54+
{
55+
"id": "test"
56+
},
57+
"test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
58+
]
59+
],
60+
"1": [
61+
62+
],
63+
"2": [
64+
[
65+
"MODKIT_EXTRACT_FULL",
66+
"modkit",
67+
"0.6.1"
68+
]
69+
],
70+
"log": [
71+
72+
],
73+
"tsv": [
74+
[
75+
{
76+
"id": "test"
77+
},
78+
"test.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
79+
]
80+
],
81+
"versions_modkit": [
82+
[
83+
"MODKIT_EXTRACT_FULL",
84+
"modkit",
85+
"0.6.1"
86+
]
87+
]
88+
}
89+
],
90+
"timestamp": "2026-04-23T22:03:35.292337916",
91+
"meta": {
92+
"nf-test": "0.9.5",
93+
"nextflow": "25.04.6"
94+
}
95+
}
96+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
// Test-only configuration: MODKIT_EXTRACT_FULL runs with no extra arguments,
// so the module's default (uncompressed `.tsv`) output path is exercised.
process {
    withName: 'MODKIT_EXTRACT_FULL' {
        ext.args = ''
    }
}

0 commit comments

Comments
 (0)