Commit 03b0d60

[MRG] add VCF-calculating code from #124 (#165)
* steal VCF code from #124
* fix n_snps calculation
1 parent cfdf4db commit 03b0d60

3 files changed: +51, -8 lines

genome_grist/combine_csvs.py

Lines changed: 4 additions & 0 deletions

@@ -24,6 +24,10 @@ def main():
         rows.extend(list(r))
         fieldnames = r.fieldnames
 
+    if not rows:
+        print(f"error: file {first_csv} is empty!?", file=sys.stderr)
+        sys.exit(-1)
+
     if args.fields:
         fieldnames = args.fields.split(',')

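For context, csv.DictReader over an empty file yields no rows and leaves fieldnames as None, which is the failure mode the new check reports up front. A minimal sketch of that behavior; the in-memory "file" and the error text are illustrative only:

import csv
import io
import sys

# Simulate an empty input CSV: no header line, no rows.
fp = io.StringIO("")
r = csv.DictReader(fp)
rows = list(r)

print(rows)          # [] -- nothing to combine
print(r.fieldnames)  # None -- no header line was found

# The guard added in this commit catches exactly this case:
if not rows:
    print("error: file first.csv is empty!?", file=sys.stderr)
    sys.exit(-1)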
genome_grist/conf/Snakefile

Lines changed: 3 additions & 2 deletions

@@ -715,12 +715,13 @@ rule build_new_consensus_wc:
 # summarize depth into a CSV
 rule summarize_samtools_depth_wc:
     input:
-        Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt")
+        depth = Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt"),
+        vcf = Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.vcf.gz")
     output:
         csv = f"{outdir}/{{dir}}/{{sample}}.summary.csv"
     shell: """
         python -m genome_grist.summarize_mapping {wildcards.sample} \
-            {input} -o {output.csv}
+            {input.depth} -o {output.csv}
     """
 
 # compute sourmash signature from abundtrim reads

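Only {input.depth} is passed on the command line; the .vcf.gz files are declared as inputs so snakemake builds them first, and summarize_mapping derives each VCF path from the matching depth file name (see the next diff). A small sketch of that derivation, with a made-up path:

# Made-up path in the pattern the rule produces; purely illustrative.
depth_txt = "some_outdir/SAMPLE.x.IDENT.depth.txt"

assert depth_txt.endswith('.depth.txt')
vcf_gz = depth_txt[:-len('.depth.txt')] + '.vcf.gz'

print(vcf_gz)   # some_outdir/SAMPLE.x.IDENT.vcf.gz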
genome_grist/summarize_mapping.py

Lines changed: 44 additions & 6 deletions

@@ -1,11 +1,43 @@
 #! /usr/bin/env python
 """
 Summarize mapping depth information (produced by samtools depth -aa {bamfile}).
+Also summarize SNP counts from VCF file.
 """
 import argparse
 import sys
 import pandas as pd
 import os
+import gzip
+from collections import defaultdict
+
+
+def summarize_vcf(vcf_gz):
+    "Count number of distinct SNP locations"
+    by_pos = defaultdict(dict)
+    n_lines = 0
+    with gzip.open(vcf_gz, 'rt') as fp:
+        for line in fp:
+            # skip comments
+            if line.startswith('#'):
+                continue
+
+            n_lines += 1
+            chrom, pos, ident, ref, alt, qual, *rest = line.split('\t')
+
+            # skip indels for now
+            if len(ref) > 1 or len(alt) > 1:
+                continue
+
+            pos = int(pos)
+            by_pos[chrom][pos] = 1
+
+    n_chrom = len(by_pos)
+
+    n_snps = 0
+    for chrom in by_pos:
+        n_snps += len(by_pos[chrom])
+
+    return n_lines, n_chrom, n_snps
 
 
 def main():
@@ -19,20 +51,25 @@ def main():
     sample = args.sample_name
     runs = {}
     for n, depth_txt in enumerate(args.depth_txts):
-        print(f'reading from {depth_txt}', file=sys.stderr)
+        assert depth_txt.endswith('.depth.txt')
+        vcf_gz = depth_txt[:-len('.depth.txt')] + '.vcf.gz'
+        assert os.path.exists(vcf_gz)
+        print(f"reading from '{vcf_gz}'", file=sys.stderr)
+        _, n_chrom, n_snps = summarize_vcf(vcf_gz)
 
-        data = pd.read_table(depth_txt, names=["contig", "pos", "coverage"])
+        print(f"reading from '{depth_txt}'", file=sys.stderr)
 
-        if not len(data):
-            print("empty?")
-            continue
+        data = pd.read_table(depth_txt, names=["contig", "pos", "coverage"])
 
         filename = os.path.basename(depth_txt)
         sample_check, _, genome_id, *rest = filename.split(".")
 
         assert sample_check == sample, (sample_check, sample)
 
         d = {}
+        d['n_chrom'] = n_chrom
+        d['n_snps'] = n_snps
+
         value_counts = data['coverage'].value_counts()
         d['genome bp'] = int(len(data))
         d['missed'] = int(value_counts.get(0, 0))
@@ -43,7 +80,8 @@ def main():
             d['unique_mapped_coverage'] = uniq_cov
         else:
            d['unique_mapped_coverage'] = d['coverage']
-        d['covered_bp'] = (1 - d['percent missed']/100.0) * d['genome bp']
+        covered_bp = (1 - d['percent missed']/100.0) * d['genome bp']
+        d['covered_bp'] = round(covered_bp + 0.5)
         d['genome_id'] = genome_id
         d['sample_id'] = sample

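A usage sketch of the new summarize_vcf() helper on a tiny hand-written VCF. The records and file name below are invented for illustration, and the import assumes genome_grist is installed:

import gzip

from genome_grist.summarize_mapping import summarize_vcf

records = [
    "##fileformat=VCFv4.2",                           # header, skipped
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",  # header, skipped
    "contig1\t100\t.\tA\tG\t60\tPASS\t.",             # SNP
    "contig1\t100\t.\tA\tT\t55\tPASS\t.",             # same position, counted once
    "contig1\t250\t.\tAC\tA\t40\tPASS\t.",            # indel, skipped for now
    "contig2\t17\t.\tC\tT\t70\tPASS\t.",              # SNP on a second contig
]

with gzip.open("example.vcf.gz", "wt") as fp:
    fp.write("\n".join(records) + "\n")

n_lines, n_chrom, n_snps = summarize_vcf("example.vcf.gz")
print(n_lines, n_chrom, n_snps)   # 4 2 2 -- four records, two contigs, two distinct SNP positions

Note that n_snps counts distinct (chrom, position) pairs rather than raw VCF records, so the two alternate alleles at contig1:100 above contribute a single SNP.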