-
Notifications
You must be signed in to change notification settings - Fork 3
/
correct_barcode_from_fastq.seq
215 lines (182 loc) · 8.26 KB
/
correct_barcode_from_fastq.seq
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
import bio
import sys
from bio.kmer import Kmer
from seq_lib.barcode_correction import (
correct_bc_by_qual_order_with_whitelist_or_correct_bc_with_Ns_with_whitelist,
map_barcodes_from_whitelist1_to_whitelist2,
read_barcode_whitelist_from_file
)
from seq_lib.utils import get_seq_and_qual_barcode_read
# bc_length needs to be passed when invoking seqc: "seqc run -D bc_length=16 -release correct_barcode_from_fastq.seq"
K = Kmer[bc_length]
# Minimum percentage of barcodes that should be found before returning with an error.
min_frac_bcs_to_find_default = 0.5
def correct_bc_from_fastq(
bc_whitelist: set[K],
bc_whitelist1_to_whitelist2_dict: Optional[dict[str, str]],
fastq_with_raw_bc_filename: str,
corrected_bc_filename: str,
corrected_bc_stats_tsv_filename: str,
bc_suffix: str = "1",
max_hamming_dist: int = 1,
min_frac_bcs_to_find: float = min_frac_bcs_to_find_default,
):
"""
Read index 2 FASTQ file with barcodes and get read name, raw barcode sequences and
qualities and corrected barcode sequences (if the barcode matches the whitelist
(exactly or with max max_hamming_dist mismatch)).
Parameters
----------
bc_whitelist:
Barcode whitelist.
bc_whitelist1_to_whitelist2_dict:
Barcode mapping dictionary to map corrected barcode to other barcode.
For example, map 10x Multiome ATAC barcodes to 10X Multiome RNA barcodes.
If None, no barcode remapping is done.
fastq_with_raw_bc_filename:
Input FASTQ file with raw barcode reads.
corrected_bc_filename:
Output file with read name and raw barcode sequences and qualities and
corrected barcode sequences.
corrected_bc_stats_tsv_filename:
File with barcode correction statistics.
bc_suffix:
Barcode suffix to append to corrected barcode sequences. Default: "1".
max_hamming_dist:
Maximum hamming distance allowed for the barcode to consider it a match with
the whitelist. Default: 1.
min_frac_bcs_to_find:
Required minimum fraction of barcode reads that need to contain a barcode.
Default: 0.5.
Returns
-------
enough_bcs_found, frac_bcs_found
"""
if max_hamming_dist > 3 or max_hamming_dist < 0:
raise ValueError("Only hamming distances 0, 1, 2 and 3 are supported.")
# Store number of reads in FASTQ file.
total_reads = 0
# Store number of reads which have hamming distance of 0, 1, 2 or 3 from barcodes whitelist.
bc_mismatches_stats = [0 , 0, 0, 0]
if corrected_bc_filename == "-":
corrected_bc_filename = "/dev/stdout"
# Check if corrected barcode need to be remapped to another barcode.
remap_barcodes = True if bc_whitelist1_to_whitelist2_dict else False
with open(corrected_bc_filename, "w") as corrected_bc_fh:
for record in bio.FASTQ(fastq_with_raw_bc_filename, gzip=True, validate=False, copy=True):
total_reads += 1
# Get barcode sequence and associated quality from FASTQ record
# reverse complemented if needed based on the sequencer.
bc_seq, bc_qual, _is_bc_rev = get_seq_and_qual_barcode_read(
fastq_record=record,
seq_start=0,
seq_end=bc_length,
)
corrected_bc = correct_bc_by_qual_order_with_whitelist_or_correct_bc_with_Ns_with_whitelist(
bc_whitelist=bc_whitelist,
bc_length=bc_length,
bc_seq=bc_seq,
bc_qual=bc_qual,
max_hamming_dist=max_hamming_dist,
)
if corrected_bc:
bc_mismatches_stats[corrected_bc.hamming_dist] += 1
# Remap corrected barcode, if it was requested.
final_corrected_bc = (
bc_whitelist1_to_whitelist2_dict[corrected_bc.corrected_bc]
if remap_barcodes
else corrected_bc.corrected_bc
)
# Write original FASTQ record name and raw cell barcode, raw cell
# barcode quality and corrected cell barcode.
corrected_bc_fh.write(
f"@{record.name} " +
f"CR:Z:{bc_seq}\t" +
f"CY:Z:{bc_qual}\t" +
f"CB:Z:{final_corrected_bc}" +
f"{'-' + bc_suffix if bc_suffix else ''}\n"
)
else:
# Write original FASTQ record name and raw cell barcode and raw cell
# barcode quality.
corrected_bc_fh.write(
f"@{record.name} " +
f"CR:Z:{bc_seq}\t" +
f"CY:Z:{bc_qual}\n"
)
# Calculate number of reads with/without a barcode.
bc_reads = sum(bc_mismatches_stats)
reads_without_bc = total_reads - bc_reads
with open(corrected_bc_stats_tsv_filename, 'w') as corrected_bc_stats_tsv_fh:
corrected_bc_stats_tsv_fh.write(
f"reads\t{total_reads}\t100.00%\n" +
f"bc_reads\t{bc_reads}\t" +
f"{(bc_reads / total_reads * 100.0)}%\n" +
f"reads_without_bc\t{reads_without_bc}\t" +
f"{(reads_without_bc / total_reads * 100.0)}%\n\n"
)
for i in range(0, max_hamming_dist + 1):
corrected_bc_stats_tsv_fh.write(
f"bc_reads_with_{i}_mismatches\t{bc_mismatches_stats[i]}\t" +
f"{(bc_mismatches_stats[i] / total_reads * 100.0)}%\n"
)
# Calculate fraction of barcodes found in all reads.
frac_bcs_found = bc_reads / total_reads
# Are there enough reads that had a barcode?
enough_bcs_found = (frac_bcs_found >= min_frac_bcs_to_find)
return enough_bcs_found, frac_bcs_found
def main():
if len(sys.argv) < 8:
print(
f"Usage: seqc run -D bc_length={bc_length} -release {sys.argv[0]} " +
"bc_whitelist_file bc_remapping_file fastq_with_raw_bc_file corrected_bc_file " +
"corrected_bc_stats_file bc_suffix max_mismatches [min_frac_bcs_to_find]",
file=sys.stderr,
)
sys.exit(1)
else:
bc_whitelist_filename = sys.argv[1]
bc_remapping_filename = sys.argv[2]
fastq_with_raw_bc_filename = sys.argv[3]
corrected_bc_filename = sys.argv[4]
corrected_bc_stats_tsv_filename = sys.argv[5]
bc_suffix = sys.argv[6]
max_mismatches = int(sys.argv[7])
min_frac_bcs_to_find = float(sys.argv[8]) if len(sys.argv) == 9 else min_frac_bcs_to_find_default
# Read whitelisted barcodes from file and convert to a set of Kmers.
bc_whitelist = read_barcode_whitelist_from_file(
bc_whitelist_filename=bc_whitelist_filename,
bc_length=bc_length,
bc_column_idx=0,
warning=True,
)
# Create barcode mapping from barcodes in file 1 to barcodes in file 2.
# If barcode whitelist 2 is None, "", "none" or "false", no barcode mapping is created.
bc_whitelist1_to_whitelist2_dict = map_barcodes_from_whitelist1_to_whitelist2(
bc_whitelist_filename=bc_whitelist_filename,
bc_remapping_filename=bc_remapping_filename,
bc_column_idxs=(0, 0),
warning=True,
)
# Read FASTQ with barcodes and write a FASTQ with corrected barcodes for barcodes
# that match the whitelist closely enough, else write the original barcodes.
enough_bcs_found, frac_bcs_found = correct_bc_from_fastq(
bc_whitelist=bc_whitelist,
bc_whitelist1_to_whitelist2_dict=bc_whitelist1_to_whitelist2_dict,
fastq_with_raw_bc_filename=fastq_with_raw_bc_filename,
corrected_bc_filename=corrected_bc_filename,
corrected_bc_stats_tsv_filename=corrected_bc_stats_tsv_filename,
bc_suffix=bc_suffix,
max_hamming_dist=max_mismatches,
min_frac_bcs_to_find=min_frac_bcs_to_find,
)
if not enough_bcs_found:
print(
f"Warning: Only {frac_bcs_found * 100.00}% of the reads have a barcode.",
" Check if you provided the correct barcode whitelist file.\n",
sep="\n",
file=sys.stderr,
)
sys.exit(1)
if __name__ == "__main__":
main()