1
1
#! /usr/bin/env python
2
2
"""
3
3
Summarize mapping depth information (produced by samtools depth -aa {bamfile}).
4
+ Also summarize SNP counts from VCF file.
4
5
"""
5
6
import argparse
6
7
import sys
7
8
import pandas as pd
8
9
import os
10
+ import gzip
11
+ from collections import defaultdict
12
+
13
+
14
+ def summarize_vcf (vcf_gz ):
15
+ "Count number of distinct SNP locations"
16
+ by_pos = defaultdict (dict )
17
+ n_lines = 0
18
+ with gzip .open (vcf_gz , 'rt' ) as fp :
19
+ for line in fp :
20
+ # skip comments
21
+ if line .startswith ('#' ):
22
+ continue
23
+
24
+ n_lines += 1
25
+ chrom , pos , ident , ref , alt , qual , * rest = line .split ('\t ' )
26
+
27
+ # skip indels for now
28
+ if len (ref ) > 1 or len (alt ) > 1 :
29
+ continue
30
+
31
+ pos = int (pos )
32
+ by_pos [chrom ][pos ] = 1
33
+
34
+ n_chrom = len (by_pos )
35
+
36
+ n_snps = 0
37
+ for chrom in by_pos :
38
+ n_snps += len (by_pos [chrom ])
39
+
40
+ return n_lines , n_chrom , n_snps
9
41
10
42
11
43
def main ():
@@ -19,20 +51,25 @@ def main():
19
51
sample = args .sample_name
20
52
runs = {}
21
53
for n , depth_txt in enumerate (args .depth_txts ):
22
- print (f'reading from { depth_txt } ' , file = sys .stderr )
54
+ assert depth_txt .endswith ('.depth.txt' )
55
+ vcf_gz = depth_txt [:- len ('.depth.txt' )] + '.vcf.gz'
56
+ assert os .path .exists (vcf_gz )
57
+ print (f"reading from '{ vcf_gz } '" , file = sys .stderr )
58
+ _ , n_chrom , n_snps = summarize_vcf (vcf_gz )
23
59
24
- data = pd . read_table ( depth_txt , names = [ "contig" , "pos" , "coverage" ] )
60
+ print ( f"reading from ' { depth_txt } " , file = sys . stderr )
25
61
26
- if not len (data ):
27
- print ("empty?" )
28
- continue
62
+ data = pd .read_table (depth_txt , names = ["contig" , "pos" , "coverage" ])
29
63
30
64
filename = os .path .basename (depth_txt )
31
65
sample_check , _ , genome_id , * rest = filename .split ("." )
32
66
33
67
assert sample_check == sample , (sample_check , sample )
34
68
35
69
d = {}
70
+ d ['n_chrom' ] = n_chrom
71
+ d ['n_snps' ] = n_snps
72
+
36
73
value_counts = data ['coverage' ].value_counts ()
37
74
d ['genome bp' ] = int (len (data ))
38
75
d ['missed' ] = int (value_counts .get (0 , 0 ))
@@ -43,7 +80,8 @@ def main():
43
80
d ['unique_mapped_coverage' ] = uniq_cov
44
81
else :
45
82
d ['unique_mapped_coverage' ] = d ['coverage' ]
46
- d ['covered_bp' ] = (1 - d ['percent missed' ]/ 100.0 ) * d ['genome bp' ]
83
+ covered_bp = (1 - d ['percent missed' ]/ 100.0 ) * d ['genome bp' ]
84
+ d ['covered_bp' ] = round (covered_bp + 0.5 )
47
85
d ['genome_id' ] = genome_id
48
86
d ['sample_id' ] = sample
49
87
0 commit comments