34
34
VariantRead = namedtuple (
35
35
"VariantRead" , "prefix alt suffix name" )
36
36
37
+
38
+ def trim_N_nucleotides (prefix , suffix ):
39
+ """
40
+ Drop all occurrences of 'N' from prefix and suffix nucleotide strings
41
+ by trimming.
42
+ """
43
+ if 'N' in prefix :
44
+ # trim prefix to exclude all occurrences of N
45
+ rightmost_index = prefix .rfind ('N' )
46
+ logging .debug ("Trimming %d nucleotides from read prefix '%s'" % (
47
+ rightmost_index + 1 , prefix ))
48
+ prefix = prefix [rightmost_index + 1 :]
49
+
50
+ if 'N' in suffix :
51
+ leftmost_index = suffix .find ('N' )
52
+ logging .debug ("Trimming %d nucleotides from read suffix '%s'" % (
53
+ len (suffix ) - leftmost_index ,
54
+ suffix ))
55
+ suffix = suffix [:leftmost_index ]
56
+
57
+ return prefix , suffix
58
+
59
+ def convert_from_bytes_if_necessary (prefix , suffix ):
60
+ """
61
+ Depending on how we extract data from pysam we may end up with either
62
+ a string or a byte array of nucleotides. For consistency and simplicity,
63
+ we want to only use strings in the rest of our code.
64
+ """
65
+ if isinstance (prefix , bytes ):
66
+ prefix = prefix .decode ('ascii' )
67
+
68
+ if isinstance (suffix , bytes ):
69
+ suffix = suffix .decode ('ascii' )
70
+
71
+ return prefix , suffix
72
+
73
+ def variant_read_from_single_read_at_locus (read , ref , alt ):
74
+ """
75
+ Given a single ReadAtLocus object, return either a VariantRead or None
76
+ (if the read's sequence didn't contain the variant nucleotides).
77
+
78
+ Parameters
79
+ ----------
80
+ read : ReadAtLocus
81
+ Read which may possibly contain the alternate nucleotides
82
+
83
+ ref : str
84
+ Reference sequence of the variant (empty for insertions)
85
+
86
+ alt : str
87
+ Alternate sequence of the variant (empty for deletions)
88
+
89
+ """
90
+ sequence = read .sequence
91
+ reference_positions = read .reference_positions
92
+
93
+ # positions of the nucleotides before and after the variant within
94
+ # the read sequence
95
+ read_pos_before = read .base0_read_position_before_variant
96
+ read_pos_after = read .base0_read_position_after_variant
97
+
98
+ # positions of the nucleotides before and after the variant on the
99
+ # reference genome
100
+ ref_pos_before = reference_positions [read_pos_before ]
101
+
102
+ if ref_pos_before is None :
103
+ logging .warn (
104
+ "Missing reference pos for nucleotide before variant on read: %s" % (
105
+ read ,))
106
+ return None
107
+
108
+ ref_pos_after = reference_positions [read_pos_after ]
109
+
110
+ if ref_pos_after is None :
111
+ logging .warn (
112
+ "Missing reference pos for nucleotide after variant on read: %s" % (
113
+ read ,))
114
+ return None
115
+
116
+ if len (ref ) == 0 :
117
+ if ref_pos_after - ref_pos_before != 1 :
118
+ # if the number of nucleotides skipped isn't the same
119
+ # as the number of reference nucleotides in the variant then
120
+ # don't use this read
121
+ logging .debug (
122
+ "Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
123
+ ref_pos_before ,
124
+ ref_pos_after ,
125
+ read ))
126
+ return None
127
+
128
+ # insertions require a sequence of non-aligned bases
129
+ # followed by the subsequence reference position
130
+ ref_positions_for_inserted = reference_positions [
131
+ read_pos_before + 1 :read_pos_after ]
132
+ if any (insert_pos is not None for insert_pos in ref_positions_for_inserted ):
133
+ # all these inserted nucleotides should *not* align to the
134
+ # reference
135
+ logging .debug (
136
+ "Skipping read, inserted nucleotides shouldn't map to reference" )
137
+ return None
138
+ else :
139
+ # substitutions and deletions
140
+ if ref_pos_after - ref_pos_before != len (ref ) + 1 :
141
+ # if the number of nucleotides skipped isn't the same
142
+ # as the number of reference nucleotides in the variant then
143
+ # don't use this read
144
+ logging .debug (
145
+ "Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
146
+ ref_pos_before ,
147
+ ref_pos_after ,
148
+ read ))
149
+ return None
150
+
151
+ prefix = sequence [:read_pos_before + 1 ]
152
+ suffix = sequence [read_pos_after :]
153
+ prefix , suffix = convert_from_bytes_if_necessary (prefix , suffix )
154
+ prefix , suffix = trim_N_nucleotides (prefix , suffix )
155
+ return VariantRead (prefix , alt , suffix , name = read .name )
156
+
37
157
def variant_reads_from_reads_at_locus (reads , ref , alt ):
38
158
"""
39
- Given a collection of pysam.AlignedSegment objects, generates a
40
- sequence of VariantRead objects (which are split into prefix/variant/suffix
159
+ Given a collection of ReadAtLocus objects, returns a
160
+ list of VariantRead objects (which are split into prefix/variant/suffix
41
161
nucleotides).
42
162
43
163
Parameters
@@ -50,75 +170,14 @@ def variant_reads_from_reads_at_locus(reads, ref, alt):
50
170
alt : str
51
171
Alternate sequence of the variant (empty for deletions)
52
172
53
- Returns a sequence of VariantRead objects.
173
+ Returns a list of VariantRead objects.
54
174
"""
175
+ variant_reads = []
55
176
for read in reads :
56
- sequence = read .sequence
57
- reference_positions = read .reference_positions
58
-
59
- # positions of the nucleotides before and after the variant within
60
- # the read sequence
61
- read_pos_before = read .base0_read_position_before_variant
62
- read_pos_after = read .base0_read_position_after_variant
63
-
64
- # positions of the nucleotides before and after the variant on the
65
- # reference genome
66
- ref_pos_before = reference_positions [read_pos_before ]
67
- if ref_pos_before is None :
68
- logging .warn (
69
- "Missing reference pos for nucleotide before variant on read: %s" % (
70
- read ,))
71
- continue
72
-
73
- ref_pos_after = reference_positions [read_pos_after ]
74
- if ref_pos_after is None :
75
- logging .warn (
76
- "Missing reference pos for nucleotide after variant on read: %s" % (
77
- read ,))
78
- continue
79
-
80
- if len (ref ) == 0 :
81
- if ref_pos_after - ref_pos_before != 1 :
82
- # if the number of nucleotides skipped isn't the same
83
- # as the number of reference nucleotides in the variant then
84
- # don't use this read
85
- logging .debug (
86
- "Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
87
- ref_pos_before ,
88
- ref_pos_after ,
89
- read ))
90
- continue
91
- # insertions require a sequence of non-aligned bases
92
- # followed by the subsequence reference position
93
- ref_positions_for_inserted = reference_positions [
94
- read_pos_before + 1 :read_pos_after ]
95
- if any (insert_pos is not None for insert_pos in ref_positions_for_inserted ):
96
- # all these inserted nucleotides should *not* align to the
97
- # reference
98
- logging .debug (
99
- "Skipping read, inserted nucleotides shouldn't map to reference" )
100
- else :
101
- # substitutions and deletions
102
- if ref_pos_after - ref_pos_before != len (ref ) + 1 :
103
- # if the number of nucleotides skipped isn't the same
104
- # as the number of reference nucleotides in the variant then
105
- # don't use this read
106
- logging .debug (
107
- "Positions before (%d) and after (%d) variant should be adjacent on read %s" % (
108
- ref_pos_before ,
109
- ref_pos_after ,
110
- read ))
111
- continue
112
- prefix = sequence [:read_pos_before + 1 ]
113
- suffix = sequence [read_pos_after :]
114
-
115
- if isinstance (prefix , bytes ):
116
- prefix = prefix .decode ('ascii' )
117
- if isinstance (suffix , bytes ):
118
- suffix = suffix .decode ('ascii' )
119
-
120
- yield VariantRead (prefix , alt , suffix , name = read .name )
121
-
177
+ variant_read = variant_read_from_single_read_at_locus (read , ref , alt )
178
+ if variant_read is not None :
179
+ variant_reads .append (variant_read )
180
+ return variant_reads
122
181
123
182
def gather_reads_for_single_variant (
124
183
samfile ,
0 commit comments