1
1
#!/usr/bin/env python
2
2
3
3
# Caleb Lareau, Broad Institute
4
- # Finished: 2 June 2018
4
+ # Finished: 16 June 2018
5
5
# This program will demultiplex barcoded Tn5-based
6
6
# scATAC from v2.1 scheme
7
7
11
11
import regex
12
12
import sys
13
13
import gzip
14
+ from barcodeHelp import * # local python script
15
+
14
16
from optparse import OptionParser
15
17
from multiprocessing import Pool , freeze_support
16
18
from itertools import repeat
28
30
29
31
opts .add_option ("-a" , "--fastq1" , help = "<Read1> Accepts fastq or fastq.gz" )
30
32
opts .add_option ("-b" , "--fastq2" , help = "<Read2> Accepts fastq or fastq.gz" )
33
+
31
34
opts .add_option ("-n" , "--nreads" , default = 5000000 , help = "Number of reads in each split output file" )
32
35
opts .add_option ("-c" , "--ncores" , default = 4 , help = "Number of cores for parallel processing." )
33
36
36
39
opts .add_option ("-l" , "--nextera" , default = "TCGTCGGCAGCGTC" , help = "Nextera Adaptor Sequence" )
37
40
opts .add_option ("-m" , "--me" , default = "AGATGTGTATAAGAGACAG" , help = "ME Sequence" )
38
41
39
- opts .add_option ("-o" , "--out" , help = "Output sample convention" )
42
+ opts .add_option ("-x" , "--nmismatches" , default = 1 , help = "Number of mismatches" )
43
+ opts .add_option ("-o" , "--output" , help = "Output sample convention" )
44
+
40
45
options , arguments = opts .parse_args ()
41
46
42
47
print (options )
49
54
##### INPUTS #####
50
55
a = options .fastq1
51
56
b = options .fastq2
52
- outname = options .out
53
- o = options .out
57
+ outname = options .output
58
+ o = options .output
54
59
55
60
cpu = int (options .ncores )
56
61
n = int (options .nreads )
59
64
c2 = options .constant2
60
65
nxt = options .nextera
61
66
me = options .me
67
+ n_mismatch = int (options .nmismatches )
62
68
63
69
# Infer the length from the adaptors
64
70
c1_len = len (c1 )
78
84
79
85
# Define global variables
80
86
dumb = "N" * 7 + "_" + "N" * 7 + "_" + "N" * 7 + "_" + "N" * 6
87
+ dumb2 = "N" * 27
81
88
82
89
# Define barcodes
83
90
barcodes = ["GGACGAC" ,"GCAGTGT" ,"GAGAGGT" ,"GAACCGT" ,"GGTTAGT" ,"GCCTTTG" ,"GATAGAC" ,"GTGGTAG" ,"GTAATAC" ,"CGAGGTC" ,"CATCAGT" ,"CCAAGCT" ,"CCTTAGG" ,"CACGGAC" ,"CAGGCGG" ,"CCGAACC" ,"CACTTCT" ,"CTGGCAT" ,"CGATTAC" ,"TCGTTCT" ,"TGCTACT" ,"TTCCTCT" ,"TACTTTC" ,"TGAATCC" ,"TAGTACC" ,"TTATCAT" ,"TGATTGT" ,"TGGCAAC" ,"TGTTTAG" ,"AGTTTCT" ,"ATGGTGT" ,"ATTGCCT" ,"ACTCAAT" ,"AGACCAT" ,"AGCGAAT" ,"ACCTACC" ,"AGATAGG" ,"AAGGTTC" ,"AGGCATG" ,"GTGGCGC" ,"GGTCGTA" ,"GTGTCCA" ,"GAGGACA" ,"GTCCTTC" ,"GAGCGTG" ,"GATCACC" ,"GTTGATG" ,"CATACGC" ,"CTGCGCC" ,"CGTAGCC" ,"CGCGGCG" ,"CATCTTA" ,"CCAGTCA" ,"CGTTTGA" ,"CCACTTG" ,"CTAACTC" ,"CGAGTGG" ,"TCCTGGC" ,"TGACCGC" ,"TAAGGTA" ,"TCGCGCA" ,"TCATACA" ,"TAAGAGG" ,"TGGAAGG" ,"TCCGCTC" ,"TAACGCC" ,"TGCGTTG" ,"TCGGATG" ,"AGCCGCC" ,"ACACGCG" ,"ACTACGA" ,"AATGGCC" ,"ATGTTCC" ,"ACGTTGG" ,"AGACTTC" ,"ATATAAC" ,"ATAGTTG" ,"GCACAGC" ,"GACAATA" ,"GAATCAA" ,"GCTCCAA" ,"GCGTAGA" ,"GGAAGTT" ,"GGAGCCT" ,"GAATATG" ,"GGTTCAC" ,"CTAGAGC" ,"CGTGATA" ,"CGCCTAA" ,"CGATGCA" ,"CTTGCGA" ,"CCATAAT" ,"CCTATGT" ,"CGCGCTT" ,"CCGCGAT" ,"CGGCCAG" ,"TTGAGGC" ,"TTTCCTA" ,"TCAGCAA" ,"TCCTTAA" ,"TGGACCA" ,"TAGTGTT" ,"TATACTT" ,"TGTCGCT" ,"TACGCAT" ,"TTGTAAG" ,"TGTAGTG" ,"AGTAAGC" ,"ATGAATA" ,"AACGTAA" ,"AATTCCA" ,"AATGATT" ,"AAGTTAT" ,"ACAGCTT" ,"AGCTGAG" ,"ACAGTAC" ,"GGCAGGC" ,"GCGCACG" ,"GAGCTAA" ,"GGTAACA" ,"GCTAATT" ,"GTCGGTT" ,"GGTGTTT" ,"GCGACTC" ,"CTTACCG" ,"CTATTCG" ,"CTAAGAA" ,"CACGCCA" ,"CGGAGGA" ,"CTTGTCC" ,"CTCATTT" ,"CGGATCT" ,"CAGAATT" ,"CGCAATC" ,"TGCGAGC" ,"TTAAGCG" ,"TCTTGTA" ,"TACCGAA" ,"TTCTGCA" ,"TCCAGTT" ,"TGGCCTT" ,"TCGGCGT" ,"TCTGAAC" ,"TCGACAG" ,"AAGCAGC" ,"ATTCACG" ,"AAGTGCG" ,"ATAGGCA" ,"ATTCGTT" ,"ACGTATT" ,"ACCGGCT" ,"AATTGGT" ,"ATTATTC" ,"AACGGTG" ,"GAGTTGC" ,"GGCGGAA" ,"GTTAGGA" ,"GTGCATT" ,"GCCTCGT" ,"GCTTTAT" ,"GTGTGTC" ,"GGCGTCC" ,"CTCTTGC" ,"CGGCTGC" ,"CGGTACG" ,"CGTACAA" ,"CACATGA" ,"CCGGTTT" ,"CGACACT" ,"CCTCCTT" ,"CATGTAT" ,"CTTCATC" ,"CAGAGAG" ,"TATGTGC" ,"TCAAGAC" ,"TTGGTTA" ,"TGGTGAA" ,"TTACAGA" ,"TGAGATT" ,"TTTGGTC" 
,"TTGGACT" ,"TTCGTAC" ,"TGAGGAG" ,"ACCATGC" ,"AGAGACC" ,"AGCAACG" ,"ACGAGAA" ,"AACCACA" ,"AACTCTT" ,"ATGAGCT" ,"AGGACGT" ,"AGGATAC" ]
84
91
tn5 = ["AAAGAA" ,"AACAGC" ,"AACGTG" ,"AAGCCA" ,"AAGTAT" ,"AATTGG" ,"ACAAGG" ,"ACCCAA" ,"ACCTTC" ,"ACGGAC" ,"ACTGCA" ,"AGACCC" ,"AGATGT" ,"AGCACG" ,"AGGTTA" ,"AGTAAA" ,"AGTCTG" ,"ATACTT" ,"ATAGCG" ,"ATATAC" ,"ATCCGG" ,"ATGAAG" ,"ATTAGT" ,"CAACCG" ,"CAAGTC" ,"CACCAC" ,"CACTGT" ,"CAGACT" ,"CAGGAG" ,"CATAGA" ,"CCACGC" ,"CCGATG" ,"CCGTAA" ,"CCTCTA" ,"CGAAAG" ,"CGAGCA" ,"CGCATA" ,"CGGCGT" ,"CGGTCC" ,"CGTTAT" ,"CTAGGT" ,"CTATTA" ,"CTCAAT" ,"CTGTGG" ,"CTTACG" ,"CTTGAA" ,"GAAATA" ,"GAAGGG" ,"GACTCG" ,"GAGCTT" ,"GAGGCC" ,"GAGTGA" ,"GATCAA" ,"GCCAGA" ,"GCCGTT" ,"GCGAAT" ,"GCGCGG" ,"GCTCCC" ,"GCTGAG" ,"GCTTGT" ,"GGACGA" ,"GGATTG" ,"GGCCAT" ,"GGGATC" ,"GGTAGG" ,"GGTGCT" ,"GTACAG" ,"GTCCTA" ,"GTCGGC" ,"GTGGTG" ,"GTTAAC" ,"GTTTCA" ,"TAAGCT" ,"TAATAG" ,"TACCGA" ,"TAGAGG" ,"TATTTC" ,"TCAGTG" ,"TCATCA" ,"TCCAAG" ,"TCGCCT" ,"TCGGGA" ,"TCTAGC" ,"TGAATT" ,"TGAGAC" ,"TGCGGT" ,"TGCTAA" ,"TGGCAG" ,"TGTGTA" ,"TGTTCG" ,"TTAAGA" ,"TTCGCA" ,"TTCTTG" ,"TTGCTC" ,"TTGGAT" ,"TTTGGG" ]
85
92
86
93
#------------------------------
87
94
88
def prove_barcode(bc):
    """
    Resolve a putative 7-base barcode against the module-level ``barcodes`` list.

    Returns ``bc`` unchanged when it is already a member of ``barcodes``.
    Otherwise returns the best candidate reported by ``process.extractOne``
    when its score is at least 71, and the placeholder "NNNNNNN" when it is not.
    """
    # Exact hits need no fuzzy lookup.
    if bc in barcodes:
        return bc

    # NOTE(review): `process` is not defined in the visible part of the file;
    # presumably a fuzzy string-matching helper -- confirm the import.
    hit = process.extractOne(bc, barcodes)

    # Per the original author's note, 71 stands for 5 of 7 matching positions.
    if hit[1] < 71:
        return "NNNNNNN"
    return hit[0]
101
-
102
def prove_tn5(bc):
    """
    Resolve a putative 6-base Tn5 barcode against the module-level ``tn5`` list.

    Returns ``bc`` unchanged when it is already a member of ``tn5``.
    Otherwise returns the best candidate reported by ``process.extractOne``
    when its score is at least 66, and the placeholder "NNNNNN" when it is not.
    """
    # Exact hits need no fuzzy lookup.
    if bc in tn5:
        return bc

    # NOTE(review): `process` is not defined in the visible part of the file;
    # presumably a fuzzy string-matching helper -- confirm the import.
    hit = process.extractOne(bc, tn5)

    # Per the original author's note, 66 stands for 4 of 6 matching positions.
    if hit[1] < 66:
        return "NNNNNN"
    return hit[0]
115
-
116
-
117
def formatRead(title, sequence, quality):
    """
    Stitch the three components of a read into one FASTQ record string.

    Parameters:
        title: read name, written after the leading "@".
        sequence: base calls, written on the second line.
        quality: quality string, written on the fourth line.

    Returns:
        A four-line, newline-terminated record: "@title", sequence, "+", quality.
    """
    # No padding may follow the newlines: every FASTQ line has to start at
    # column 0, otherwise the sequence, separator and quality lines are invalid.
    return "@%s\n%s\n+\n%s\n" % (title, sequence, quality)
122
-
123
95
def extractbarcode_v2_tn5 (sequence1 ):
124
96
'''
125
97
Function to extract barcodes
126
98
'''
127
- # Parse out sequence features and split based on constant sequences
128
- bc1 = prove_barcode (sequence1 [0 :7 ])
129
99
130
100
# Parse out barcodes if we can ID the constants
131
101
try :
@@ -136,14 +106,15 @@ def extractbarcode_v2_tn5(sequence1):
136
106
me_hit = find_near_matches (me , sequence1 [55 :], max_l_dist = 2 )
137
107
138
108
# Now grab the barcodes
139
- bc2 = prove_barcode (sequence1 [c1_hit [0 ][1 ]+ 7 :c2_hit [0 ][0 ]+ 23 ])
140
- bc3 = prove_barcode (sequence1 [c2_hit [0 ][1 ]+ 23 :nxt_hit [0 ][0 ]+ 33 ])
141
- bc_tn5 = prove_tn5 (sequence1 [nxt_hit [0 ][1 ]+ 33 :me_hit [0 ][0 ]+ 55 ])
109
+ bc1 , mm1 = prove_barcode (sequence1 [0 :7 ], barcodes , n_mismatch )
110
+ bc2 , mm2 = prove_barcode (sequence1 [c1_hit [0 ][1 ]+ 7 :c2_hit [0 ][0 ]+ 23 ], barcodes , n_mismatch )
111
+ bc3 , mm3 = prove_barcode (sequence1 [c2_hit [0 ][1 ]+ 23 :nxt_hit [0 ][0 ]+ 33 ], barcodes , n_mismatch )
112
+ bc_tn5 , mm4 = prove_barcode (sequence1 [nxt_hit [0 ][1 ]+ 33 :me_hit [0 ][0 ]+ 55 ], tn5 , n_mismatch )
142
113
seq = sequence1 [me_hit [0 ][1 ]+ 55 :]
143
114
144
- return (bc1 + "_" + bc2 + "_" + bc3 + "_" + bc_tn5 , seq )
115
+ return (bc1 + "_" + bc2 + "_" + bc3 + "_" + bc_tn5 , seq , str ( mm1 ) + "," + str ( mm2 ) + "," + str ( mm3 ) + "," + str ( mm4 ) )
145
116
except :
146
- return (dumb , sequence1 )
117
+ return (dumb , sequence1 , "0,0,0,0" )
147
118
148
119
149
120
def debarcode_multiplexed (duo ):
@@ -156,6 +127,7 @@ def debarcode_multiplexed(duo):
156
127
# parameters to return
157
128
fq1 = ""
158
129
fq2 = ""
130
+ mm_quant = ""
159
131
160
132
nbc1 = 0
161
133
nbc2 = 0
@@ -170,7 +142,7 @@ def debarcode_multiplexed(duo):
170
142
title2 = listRead2 [0 ]; sequence2 = listRead2 [1 ]; quality2 = listRead2 [2 ]
171
143
172
144
# Extract the underscore-delimited barcode, the remaining biological sequence, and the mismatch counts
173
- barcode , sequence1 = extractbarcode_tn5 (sequence1 )
145
+ barcode , sequence1 , mm = extractbarcode_v2_tn5 (sequence1 )
174
146
quality1 = quality1 [- 1 * len (sequence1 ):]
175
147
176
148
four = barcode .split ("_" )
@@ -190,14 +162,16 @@ def debarcode_multiplexed(duo):
190
162
npass = 1
191
163
fq1 = formatRead ("" .join (four ) + "_" + title1 , sequence1 , quality1 )
192
164
fq2 = formatRead ("" .join (four ) + "_" + title2 , sequence2 , quality2 )
193
- return ([[fq1 , fq2 ], [nbc1 , nbc2 , nbc3 , ntn5 , npass , nfail ]])
165
+ mm_quant = mm_quant + "" .join (four ) + "," + mm + "\n "
166
+ return ([[fq1 , fq2 ], [nbc1 , nbc2 , nbc3 , ntn5 , npass , nfail ], [mm_quant ]])
194
167
195
168
196
169
# Define variables to keep track of things that fail
197
170
nbc1 = 0
198
171
nbc2 = 0
199
172
nbc3 = 0
200
173
ntn5 = 0
174
+
201
175
npass = 0
202
176
nfail = 0
203
177
@@ -211,7 +185,7 @@ def debarcode_multiplexed(duo):
211
185
# iterate over batches of length n
212
186
for i , batch1 in enumerate (it1 ):
213
187
batch2 = it2 .__next__ ()
214
- output = o + "-parse " + str (i + 1 ).zfill (3 )
188
+ output = o + "-c " + str (i + 1 ).zfill (3 )
215
189
216
190
# Process the barcodes in parallel and tally the failures.
217
191
pool = Pool (processes = cpu )
@@ -221,6 +195,8 @@ def debarcode_multiplexed(duo):
221
195
# Aggregate output
222
196
fqs = list (map ('' .join , zip (* [item .pop (0 ) for item in pm ])))
223
197
counts = list (map (sum , zip (* [item .pop (0 ) for item in pm ])))
198
+ mm_values = list (map ('' .join , zip (* [item .pop (0 ) for item in pm ])))
199
+
224
200
nbc1 = nbc1 + counts [0 ]
225
201
nbc2 = nbc2 + counts [1 ]
226
202
nbc3 = nbc3 + counts [2 ]
@@ -232,9 +208,10 @@ def debarcode_multiplexed(duo):
232
208
# Export one chunk in parallel
233
209
filename1 = output + '_1.fastq.gz'
234
210
filename2 = output + '_2.fastq.gz'
211
+ filenameMM = output + '_mismatches.csv.gz'
235
212
236
- pool = Pool (processes = 2 )
237
- toke = pool .starmap (chunkWriterGzip , [(filename1 , fqs [0 ]), (filename2 , fqs [1 ])])
213
+ pool = Pool (processes = 3 )
214
+ toke = pool .starmap (chunk_writer_gzip , [(filename1 , fqs [0 ]), (filename2 , fqs [1 ]), ( filenameMM , mm_values )])
238
215
pool .close ()
239
216
240
217
with open (o + "-debarcode" + '.sumstats.log' , 'w' ) as logfile :
0 commit comments