Commit 03b0d60

[MRG] add VCF-calculating code from #124 (#165)
* steal VCF code from #124
* fix n_snps calculation
1 parent cfdf4db commit 03b0d60

3 files changed: +51, -8 lines

genome_grist/combine_csvs.py

Lines changed: 4 additions & 0 deletions

@@ -24,6 +24,10 @@ def main():
         rows.extend(list(r))
         fieldnames = r.fieldnames
 
+    if not rows:
+        print(f"error: file {first_csv} is empty!?", file=sys.stderr)
+        sys.exit(-1)
+
     if args.fields:
         fieldnames = args.fields.split(',')

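For context, csv.DictReader over an empty file yields no rows and leaves fieldnames as None, which is the failure mode the new check reports up front. A minimal sketch of that behavior; the in-memory "file" and the error text are illustrative only:

import csv
import io
import sys

# Simulate an empty input CSV: no header line, no rows.
fp = io.StringIO("")
r = csv.DictReader(fp)
rows = list(r)

print(rows)          # [] -- nothing to combine
print(r.fieldnames)  # None -- no header line was found

# The guard added in this commit catches exactly this case:
if not rows:
    print("error: file first.csv is empty!?", file=sys.stderr)
    sys.exit(-1)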
genome_grist/conf/Snakefile

Lines changed: 3 additions & 2 deletions

@@ -715,12 +715,13 @@ rule build_new_consensus_wc:
 # summarize depth into a CSV
 rule summarize_samtools_depth_wc:
     input:
-        Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt")
+        depth = Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.depth.txt"),
+        vcf = Checkpoint_GatherResults(outdir + f"/{{dir}}/{{sample}}.x.{{ident}}.vcf.gz")
     output:
         csv = f"{outdir}/{{dir}}/{{sample}}.summary.csv"
     shell: """
         python -m genome_grist.summarize_mapping {wildcards.sample} \
-            {input} -o {output.csv}
+            {input.depth} -o {output.csv}
     """
 
 # compute sourmash signature from abundtrim reads

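Only {input.depth} is passed on the command line; the .vcf.gz files are declared as inputs so snakemake builds them first, and summarize_mapping derives each VCF path from the matching depth file name (see the next diff). A small sketch of that derivation, with a made-up path:

# Made-up path in the pattern the rule produces; purely illustrative.
depth_txt = "some_outdir/SAMPLE.x.IDENT.depth.txt"

assert depth_txt.endswith('.depth.txt')
vcf_gz = depth_txt[:-len('.depth.txt')] + '.vcf.gz'

print(vcf_gz)   # some_outdir/SAMPLE.x.IDENT.vcf.gz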
genome_grist/summarize_mapping.py

Lines changed: 44 additions & 6 deletions

@@ -1,11 +1,43 @@
 #! /usr/bin/env python
 """
 Summarize mapping depth information (produced by samtools depth -aa {bamfile}).
+Also summarize SNP counts from VCF file.
 """
 import argparse
 import sys
 import pandas as pd
 import os
+import gzip
+from collections import defaultdict
+
+
+def summarize_vcf(vcf_gz):
+    "Count number of distinct SNP locations"
+    by_pos = defaultdict(dict)
+    n_lines = 0
+    with gzip.open(vcf_gz, 'rt') as fp:
+        for line in fp:
+            # skip comments
+            if line.startswith('#'):
+                continue
+
+            n_lines += 1
+            chrom, pos, ident, ref, alt, qual, *rest = line.split('\t')
+
+            # skip indels for now
+            if len(ref) > 1 or len(alt) > 1:
+                continue
+
+            pos = int(pos)
+            by_pos[chrom][pos] = 1
+
+    n_chrom = len(by_pos)
+
+    n_snps = 0
+    for chrom in by_pos:
+        n_snps += len(by_pos[chrom])
+
+    return n_lines, n_chrom, n_snps
 
 
 def main():
@@ -19,20 +51,25 @@ def main():
     sample = args.sample_name
     runs = {}
     for n, depth_txt in enumerate(args.depth_txts):
-        print(f'reading from {depth_txt}', file=sys.stderr)
+        assert depth_txt.endswith('.depth.txt')
+        vcf_gz = depth_txt[:-len('.depth.txt')] + '.vcf.gz'
+        assert os.path.exists(vcf_gz)
+        print(f"reading from '{vcf_gz}'", file=sys.stderr)
+        _, n_chrom, n_snps = summarize_vcf(vcf_gz)
 
-        data = pd.read_table(depth_txt, names=["contig", "pos", "coverage"])
+        print(f"reading from '{depth_txt}'", file=sys.stderr)
 
-        if not len(data):
-            print("empty?")
-            continue
+        data = pd.read_table(depth_txt, names=["contig", "pos", "coverage"])
 
         filename = os.path.basename(depth_txt)
         sample_check, _, genome_id, *rest = filename.split(".")
 
         assert sample_check == sample, (sample_check, sample)
 
         d = {}
+        d['n_chrom'] = n_chrom
+        d['n_snps'] = n_snps
+
         value_counts = data['coverage'].value_counts()
         d['genome bp'] = int(len(data))
         d['missed'] = int(value_counts.get(0, 0))
@@ -43,7 +80,8 @@ def main():
             d['unique_mapped_coverage'] = uniq_cov
         else:
            d['unique_mapped_coverage'] = d['coverage']
-        d['covered_bp'] = (1 - d['percent missed']/100.0) * d['genome bp']
+        covered_bp = (1 - d['percent missed']/100.0) * d['genome bp']
+        d['covered_bp'] = round(covered_bp + 0.5)
         d['genome_id'] = genome_id
         d['sample_id'] = sample

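A usage sketch of the new summarize_vcf() helper on a tiny hand-written VCF. The records and file name below are invented for illustration, and the import assumes genome_grist is installed:

import gzip

from genome_grist.summarize_mapping import summarize_vcf

records = [
    "##fileformat=VCFv4.2",                           # header, skipped
    "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",  # header, skipped
    "contig1\t100\t.\tA\tG\t60\tPASS\t.",             # SNP
    "contig1\t100\t.\tA\tT\t55\tPASS\t.",             # same position, counted once
    "contig1\t250\t.\tAC\tA\t40\tPASS\t.",            # indel, skipped for now
    "contig2\t17\t.\tC\tT\t70\tPASS\t.",              # SNP on a second contig
]

with gzip.open("example.vcf.gz", "wt") as fp:
    fp.write("\n".join(records) + "\n")

n_lines, n_chrom, n_snps = summarize_vcf("example.vcf.gz")
print(n_lines, n_chrom, n_snps)   # 4 2 2 -- four records, two contigs, two distinct SNP positions

Note that n_snps counts distinct (chrom, position) pairs rather than raw VCF records, so the two alternate alleles at contig1:100 above contribute a single SNP.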