|
20 | 20 |
|
21 | 21 | import varcode
|
22 | 22 | import skbio
|
| 23 | +import numpy as np |
23 | 24 | from pysam import AlignmentFile
|
24 | 25 |
|
25 | 26 | from isovar import gather_variant_reads, sequence_counts
|
|
39 | 40 | default=None)
|
40 | 41 |
|
41 | 42 | parser.add_argument(
|
42 |
| - "--min-count", type=int, default=3) |
| 43 | + "--min-read-count", |
| 44 | + type=int, |
| 45 | + default=3) |
43 | 46 |
|
44 | 47 | parser.add_argument(
|
45 |
| - "--context-size", |
46 |
| - default=45, |
| 48 | + "--sequence-length", |
| 49 | + default=105, |
47 | 50 | type=int)
|
48 | 51 |
|
| 52 | +parser.add_argument( |
| 53 | + "--max-sequences-per-variant", |
| 54 | + type=int, |
| 55 | + default=5) |
49 | 56 |
|
50 | 57 | if __name__ == "__main__":
|
51 | 58 | args = parser.parse_args()
|
|
54 | 61 | samfile = AlignmentFile(args.bam)
|
55 | 62 |
|
56 | 63 | for variant in variants:
|
| 64 | + print(variant) |
57 | 65 | variant_reads = gather_variant_reads(
|
58 | 66 | samfile=samfile,
|
59 | 67 | chromosome="chr" + variant.contig,
|
60 | 68 | base1_location=variant.start,
|
61 | 69 | ref=variant.ref,
|
62 | 70 | alt=variant.alt)
|
63 |
| - if len(variant_reads) < args.min_count: |
| 71 | + if len(variant_reads) < args.min_read_count: |
64 | 72 | continue
|
| 73 | + |
| 74 | + # the number of context nucleotides on either side of the variant |
| 75 | + # is half the desired length (minus the number of variant nucleotides) |
| 76 | + context_size = int( |
| 77 | + np.ceil((args.sequence_length - len(variant.alt)) / 2.0)) |
65 | 78 | sequence_count_info = sequence_counts(
|
66 |
| - variant_reads, context_size=args.context_size) |
67 |
| - for ((prefix, suffix), count) in sorted( |
| 79 | + variant_reads, |
| 80 | + context_size=context_size) |
| 81 | + for i, ((prefix, suffix), count) in enumerate(sorted( |
68 | 82 | sequence_count_info.full_read_counts.items(),
|
69 |
| - key=lambda x: -x[1]): |
70 |
| - if count < args.min_count: |
| 83 | + key=lambda x: -x[1])): |
| 84 | + if i >= args.max_sequences_per_variant: |
71 | 85 | break
|
72 | 86 |
|
73 |
| - variant = sequence_count_info.variant_nucleotides |
| 87 | + if count < args.min_read_count: |
| 88 | + break |
| 89 | + |
| 90 | + variant_seq = sequence_count_info.variant_nucleotides |
| 91 | + |
74 | 92 | print("\t%s_%s_%s: %d" % (
|
75 | 93 | prefix,
|
76 |
| - variant, |
| 94 | + variant_seq, |
77 | 95 | suffix,
|
78 | 96 | count))
|
79 | 97 |
|
80 | 98 | # translate in three reading frames:
|
81 |
| - seq = "%s%s%s" % (prefix, variant, suffix) |
| 99 | + seq = "%s%s%s" % (prefix, variant_seq, suffix) |
82 | 100 | for offset in range(3):
|
83 | 101 | dna = skbio.DNA(seq[offset:])
|
84 | 102 | print("\t\tframe=%d: %s" % (offset, dna.translate()))
|
0 commit comments