Skip to content

Commit 09ce9a5

Browse files
committed
Merge pull request #18 from hammerlab/trim-N-from-read-sequence
Trim 'N' from nucleotide sequences when creating VariantRead
2 parents 6ab307a + ede53f1 commit 09ce9a5

File tree

4 files changed

+166
-73
lines changed

4 files changed

+166
-73
lines changed

isovar/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@
1414

1515
from __future__ import print_function, division, absolute_import
1616

17-
__version__ = "0.0.2"
17+
__version__ = "0.0.3"

isovar/translation.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -158,9 +158,13 @@ def trim_sequences(variant_sequence, reference_context):
158158
# take the sequence PREFIX|VARIANT|SUFFIX
159159
# and take the complement of XIFFUS|TNAIRAV|XIFERP
160160
if reference_context.strand == "-":
161-
cdna_prefix = cdna_suffix.reverse_complement()
162-
cdna_alt = cdna_alt.reverse_complement()
163-
cdna_suffix = cdna_prefix.reverse_complement()
161+
# notice that we are setting the *prefix* to be reverse complement
162+
# of the *suffix* and vice versa
163+
cdna_prefix, cdna_alt, cdna_suffix = (
164+
cdna_suffix.reverse_complement(),
165+
cdna_alt.reverse_complement(),
166+
cdna_prefix.reverse_complement()
167+
)
164168

165169
reference_sequence_before_variant = reference_context.sequence_before_variant_locus
166170

isovar/variant_read.py

Lines changed: 128 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,130 @@
3434
VariantRead = namedtuple(
3535
"VariantRead", "prefix alt suffix name")
3636

37+
38+
def trim_N_nucleotides(prefix, suffix):
39+
"""
40+
Drop all occurrences of 'N' from prefix and suffix nucleotide strings
41+
by trimming.
42+
"""
43+
if 'N' in prefix:
44+
# trim prefix to exclude all occurrences of N
45+
rightmost_index = prefix.rfind('N')
46+
logging.debug("Trimming %d nucleotides from read prefix '%s'" % (
47+
rightmost_index + 1, prefix))
48+
prefix = prefix[rightmost_index + 1:]
49+
50+
if 'N' in suffix:
51+
leftmost_index = suffix.find('N')
52+
logging.debug("Trimming %d nucleotides from read suffix '%s'" % (
53+
len(suffix) - leftmost_index,
54+
suffix))
55+
suffix = suffix[:leftmost_index]
56+
57+
return prefix, suffix
58+
59+
def convert_from_bytes_if_necessary(prefix, suffix):
60+
"""
61+
Depending on how we extract data from pysam we may end up with either
62+
a string or a byte array of nucleotides. For consistency and simplicity,
63+
we want to only use strings in the rest of our code.
64+
"""
65+
if isinstance(prefix, bytes):
66+
prefix = prefix.decode('ascii')
67+
68+
if isinstance(suffix, bytes):
69+
suffix = suffix.decode('ascii')
70+
71+
return prefix, suffix
72+
73+
def variant_read_from_single_read_at_locus(read, ref, alt):
74+
"""
75+
Given a single ReadAtLocus object, return either a VariantRead or None
76+
(if the read's sequence didn't contain the variant nucleotides).
77+
78+
Parameters
79+
----------
80+
read : ReadAtLocus
81+
Read which may possibly contain the alternate nucleotides
82+
83+
ref : str
84+
Reference sequence of the variant (empty for insertions)
85+
86+
alt : str
87+
Alternate sequence of the variant (empty for deletions)
88+
89+
"""
90+
sequence = read.sequence
91+
reference_positions = read.reference_positions
92+
93+
# positions of the nucleotides before and after the variant within
94+
# the read sequence
95+
read_pos_before = read.base0_read_position_before_variant
96+
read_pos_after = read.base0_read_position_after_variant
97+
98+
# positions of the nucleotides before and after the variant on the
99+
# reference genome
100+
ref_pos_before = reference_positions[read_pos_before]
101+
102+
if ref_pos_before is None:
103+
logging.warn(
104+
"Missing reference pos for nucleotide before variant on read: %s" % (
105+
read,))
106+
return None
107+
108+
ref_pos_after = reference_positions[read_pos_after]
109+
110+
if ref_pos_after is None:
111+
logging.warn(
112+
"Missing reference pos for nucleotide after variant on read: %s" % (
113+
read,))
114+
return None
115+
116+
if len(ref) == 0:
117+
if ref_pos_after - ref_pos_before != 1:
118+
# if the number of nucleotides skipped isn't the same
119+
# as the number of reference nucleotides in the variant then
120+
# don't use this read
121+
logging.debug(
122+
"Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
123+
ref_pos_before,
124+
ref_pos_after,
125+
read))
126+
return None
127+
128+
# insertions require a sequence of non-aligned bases
129+
# followed by the subsequence reference position
130+
ref_positions_for_inserted = reference_positions[
131+
read_pos_before + 1:read_pos_after]
132+
if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
133+
# all these inserted nucleotides should *not* align to the
134+
# reference
135+
logging.debug(
136+
"Skipping read, inserted nucleotides shouldn't map to reference")
137+
return None
138+
else:
139+
# substitutions and deletions
140+
if ref_pos_after - ref_pos_before != len(ref) + 1:
141+
# if the number of nucleotides skipped isn't the same
142+
# as the number of reference nucleotides in the variant then
143+
# don't use this read
144+
logging.debug(
145+
"Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
146+
ref_pos_before,
147+
ref_pos_after,
148+
read))
149+
return None
150+
151+
prefix = sequence[:read_pos_before + 1]
152+
suffix = sequence[read_pos_after:]
153+
prefix, suffix = convert_from_bytes_if_necessary(prefix, suffix)
154+
prefix, suffix = trim_N_nucleotides(prefix, suffix)
155+
return VariantRead(prefix, alt, suffix, name=read.name)
156+
37157
def variant_reads_from_reads_at_locus(reads, ref, alt):
38158
"""
39-
Given a collection of pysam.AlignedSegment objects, generates a
40-
sequence of VariantRead objects (which are split into prefix/variant/suffix
159+
Given a collection of ReadAtLocus objects, returns a
160+
list of VariantRead objects (which are split into prefix/variant/suffix
41161
nucleotides).
42162
43163
Parameters
@@ -50,75 +170,14 @@ def variant_reads_from_reads_at_locus(reads, ref, alt):
50170
alt : str
51171
Alternate sequence of the variant (empty for deletions)
52172
53-
Returns a sequence of VariantRead objects.
173+
Returns a list of VariantRead objects.
54174
"""
175+
variant_reads = []
55176
for read in reads:
56-
sequence = read.sequence
57-
reference_positions = read.reference_positions
58-
59-
# positions of the nucleotides before and after the variant within
60-
# the read sequence
61-
read_pos_before = read.base0_read_position_before_variant
62-
read_pos_after = read.base0_read_position_after_variant
63-
64-
# positions of the nucleotides before and after the variant on the
65-
# reference genome
66-
ref_pos_before = reference_positions[read_pos_before]
67-
if ref_pos_before is None:
68-
logging.warn(
69-
"Missing reference pos for nucleotide before variant on read: %s" % (
70-
read,))
71-
continue
72-
73-
ref_pos_after = reference_positions[read_pos_after]
74-
if ref_pos_after is None:
75-
logging.warn(
76-
"Missing reference pos for nucleotide after variant on read: %s" % (
77-
read,))
78-
continue
79-
80-
if len(ref) == 0:
81-
if ref_pos_after - ref_pos_before != 1:
82-
# if the number of nucleotides skipped isn't the same
83-
# as the number of reference nucleotides in the variant then
84-
# don't use this read
85-
logging.debug(
86-
"Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
87-
ref_pos_before,
88-
ref_pos_after,
89-
read))
90-
continue
91-
# insertions require a sequence of non-aligned bases
92-
# followed by the subsequence reference position
93-
ref_positions_for_inserted = reference_positions[
94-
read_pos_before + 1:read_pos_after]
95-
if any(insert_pos is not None for insert_pos in ref_positions_for_inserted):
96-
# all these inserted nucleotides should *not* align to the
97-
# reference
98-
logging.debug(
99-
"Skipping read, inserted nucleotides shouldn't map to reference")
100-
else:
101-
# substitutions and deletions
102-
if ref_pos_after - ref_pos_before != len(ref) + 1:
103-
# if the number of nucleotides skipped isn't the same
104-
# as the number of reference nucleotides in the variant then
105-
# don't use this read
106-
logging.debug(
107-
"Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
108-
ref_pos_before,
109-
ref_pos_after,
110-
read))
111-
continue
112-
prefix = sequence[:read_pos_before + 1]
113-
suffix = sequence[read_pos_after:]
114-
115-
if isinstance(prefix, bytes):
116-
prefix = prefix.decode('ascii')
117-
if isinstance(suffix, bytes):
118-
suffix = suffix.decode('ascii')
119-
120-
yield VariantRead(prefix, alt, suffix, name=read.name)
121-
177+
variant_read = variant_read_from_single_read_at_locus(read, ref, alt)
178+
if variant_read is not None:
179+
variant_reads.append(variant_read)
180+
return variant_reads
122181

123182
def gather_reads_for_single_variant(
124183
samfile,

test/test_variant_read.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
import isovar
2+
import isovar.variant_read
3+
from isovar.variant_read import (
4+
variant_read_from_single_read_at_locus,
5+
VariantRead,
6+
)
7+
from isovar.read_at_locus import ReadAtLocus
8+
from nose.tools import eq_
9+
10+
def make_read_at_locus(prefix, alt, suffix, base_quality=30, name="dummy"):
11+
dummy_sequence = prefix + alt + suffix
12+
return ReadAtLocus(
13+
name="dummy",
14+
sequence=dummy_sequence,
15+
reference_positions=list(range(1, len(dummy_sequence) + 1)),
16+
quality_scores=[base_quality] * len(dummy_sequence),
17+
base0_read_position_before_variant=len(prefix) - 1,
18+
base0_read_position_after_variant=len(prefix) + len(alt),
19+
)
20+
21+
def test_variant_read_from_single_read_at_locus_trim_N_nucleotides():
22+
read_at_locus =make_read_at_locus(prefix="NCCN", alt="A", suffix="TNNA")
23+
variant_read = variant_read_from_single_read_at_locus(
24+
read_at_locus, ref="T", alt="A")
25+
print(variant_read)
26+
expected = VariantRead(prefix="", alt="A", suffix="T", name="dummy")
27+
eq_(variant_read, expected)
28+
29+
if __name__ == "__main__":
30+
test_variant_read_from_single_read_at_locus_trim_N_nucleotides()

0 commit comments

Comments
 (0)