Skip to content

Commit 4048acc

Browse files
author
caleblareau
committed
v.4.0: full support for v2.1-multi
1 parent ade0a2d commit 4048acc

File tree

6 files changed

+35
-54
lines changed

6 files changed

+35
-54
lines changed

bap/barcode/cli_barcode.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ def main(mode, fastq1, fastq2, output, ncores, nreads, nmismatches):
3636
bap-barcode: De-barcode samples from BioRad bead single cell atac \n
3737
Caleb Lareau, clareau <at> broadinstitute <dot> org \n
3838
39-
mode = ['v1', 'v2', 'v2-multi'] for bead design\n
39+
mode = ['v1.0', 'v2.0', 'v2.1', 'v2.1-multi'] for bead design\n
4040
"""
4141

4242
__version__ = get_distribution('bap').version
@@ -61,6 +61,10 @@ def main(mode, fastq1, fastq2, output, ncores, nreads, nmismatches):
6161
cmd = 'python '+script_dir+'/modes/biorad_v2.py '
6262
earlier = " --constant1 " + "TATGCATGAC" + " --constant2 " + "AGTCACTGAG"
6363
later = " --nextera " + "TCGTCGGCAGCGTC" + " --me " + "AGATGTGTATAAGAGACAG"
64+
elif(mode == "v2.1-multi"):
65+
cmd = 'python '+script_dir+'/modes/biorad_v2-multi.py '
66+
earlier = " --constant1 " + "TATGCATGAC" + " --constant2 " + "AGTCACTGAG"
67+
later = " --nextera " + "TCGTCGGCAGCGTC" + " --me " + "AGATGTGTATAAGAGACAG"
6468
else:
6569
sys.exit(gettime() + "User-supplied mode %s not found!" % mode)
6670

bap/barcode/modes/biorad_v2-multi.py

Lines changed: 28 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22

33
# Caleb Lareau, Broad Institute
4-
# Finished: 2 June 2018
4+
# Finished: 16 June 2018
55
# This program will demultiplex barcoded Tn5-based
66
# scATAC from v2.1 scheme
77

@@ -11,6 +11,8 @@
1111
import regex
1212
import sys
1313
import gzip
14+
from barcodeHelp import * # local python script
15+
1416
from optparse import OptionParser
1517
from multiprocessing import Pool, freeze_support
1618
from itertools import repeat
@@ -28,6 +30,7 @@
2830

2931
opts.add_option("-a", "--fastq1", help="<Read1> Accepts fastq or fastq.gz")
3032
opts.add_option("-b", "--fastq2", help="<Read2> Accepts fastq or fastq.gz")
33+
3134
opts.add_option("-n", "--nreads", default = 5000000, help="Number of reads in each split output file")
3235
opts.add_option("-c", "--ncores", default = 4, help="Number of cores for parallel processing.")
3336

@@ -36,7 +39,9 @@
3639
opts.add_option("-l", "--nextera", default="TCGTCGGCAGCGTC", help="Nextera Adaptor Sequence")
3740
opts.add_option("-m", "--me", default="AGATGTGTATAAGAGACAG", help="ME Sequence")
3841

39-
opts.add_option("-o", "--out", help="Output sample convention")
42+
opts.add_option("-x", "--nmismatches", default=1, help="Number of mismatches")
43+
opts.add_option("-o", "--output", help="Output sample convention")
44+
4045
options, arguments = opts.parse_args()
4146

4247
print(options)
@@ -49,8 +54,8 @@
4954
##### INPUTS #####
5055
a = options.fastq1
5156
b = options.fastq2
52-
outname = options.out
53-
o = options.out
57+
outname = options.output
58+
o = options.output
5459

5560
cpu = int(options.ncores)
5661
n = int(options.nreads)
@@ -59,6 +64,7 @@
5964
c2 = options.constant2
6065
nxt = options.nextera
6166
me = options.me
67+
n_mismatch = int(options.nmismatches)
6268

6369
# Infer the length from the adaptors
6470
c1_len = len(c1)
@@ -78,54 +84,18 @@
7884

7985
# Define global variables
8086
dumb = "N"*7 + "_" + "N"*7 + "_" + "N"*7 + "_" + "N"*6
87+
dumb2 = "N"*27
8188

8289
# Define barcodes
8390
barcodes = ["GGACGAC","GCAGTGT","GAGAGGT","GAACCGT","GGTTAGT","GCCTTTG","GATAGAC","GTGGTAG","GTAATAC","CGAGGTC","CATCAGT","CCAAGCT","CCTTAGG","CACGGAC","CAGGCGG","CCGAACC","CACTTCT","CTGGCAT","CGATTAC","TCGTTCT","TGCTACT","TTCCTCT","TACTTTC","TGAATCC","TAGTACC","TTATCAT","TGATTGT","TGGCAAC","TGTTTAG","AGTTTCT","ATGGTGT","ATTGCCT","ACTCAAT","AGACCAT","AGCGAAT","ACCTACC","AGATAGG","AAGGTTC","AGGCATG","GTGGCGC","GGTCGTA","GTGTCCA","GAGGACA","GTCCTTC","GAGCGTG","GATCACC","GTTGATG","CATACGC","CTGCGCC","CGTAGCC","CGCGGCG","CATCTTA","CCAGTCA","CGTTTGA","CCACTTG","CTAACTC","CGAGTGG","TCCTGGC","TGACCGC","TAAGGTA","TCGCGCA","TCATACA","TAAGAGG","TGGAAGG","TCCGCTC","TAACGCC","TGCGTTG","TCGGATG","AGCCGCC","ACACGCG","ACTACGA","AATGGCC","ATGTTCC","ACGTTGG","AGACTTC","ATATAAC","ATAGTTG","GCACAGC","GACAATA","GAATCAA","GCTCCAA","GCGTAGA","GGAAGTT","GGAGCCT","GAATATG","GGTTCAC","CTAGAGC","CGTGATA","CGCCTAA","CGATGCA","CTTGCGA","CCATAAT","CCTATGT","CGCGCTT","CCGCGAT","CGGCCAG","TTGAGGC","TTTCCTA","TCAGCAA","TCCTTAA","TGGACCA","TAGTGTT","TATACTT","TGTCGCT","TACGCAT","TTGTAAG","TGTAGTG","AGTAAGC","ATGAATA","AACGTAA","AATTCCA","AATGATT","AAGTTAT","ACAGCTT","AGCTGAG","ACAGTAC","GGCAGGC","GCGCACG","GAGCTAA","GGTAACA","GCTAATT","GTCGGTT","GGTGTTT","GCGACTC","CTTACCG","CTATTCG","CTAAGAA","CACGCCA","CGGAGGA","CTTGTCC","CTCATTT","CGGATCT","CAGAATT","CGCAATC","TGCGAGC","TTAAGCG","TCTTGTA","TACCGAA","TTCTGCA","TCCAGTT","TGGCCTT","TCGGCGT","TCTGAAC","TCGACAG","AAGCAGC","ATTCACG","AAGTGCG","ATAGGCA","ATTCGTT","ACGTATT","ACCGGCT","AATTGGT","ATTATTC","AACGGTG","GAGTTGC","GGCGGAA","GTTAGGA","GTGCATT","GCCTCGT","GCTTTAT","GTGTGTC","GGCGTCC","CTCTTGC","CGGCTGC","CGGTACG","CGTACAA","CACATGA","CCGGTTT","CGACACT","CCTCCTT","CATGTAT","CTTCATC","CAGAGAG","TATGTGC","TCAAGAC","TTGGTTA","TGGTGAA","TTACAGA","TGAGATT","TTTGGTC","TTGGACT","TTCGTAC","TGAGGAG","ACCATGC","AGAGACC","AGCAACG","ACGAGAA","AACCACA","AACTCTT","ATGAGCT","AGGACGT","AGGATAC"]
8491
tn5 = ["AAAGAA","AACAGC","AACGTG","AAGCCA","AAGTAT","AATTGG","ACAAGG","ACCCAA","ACCTTC","ACGGAC","ACTGCA","AGACCC","AGATGT","AGCACG","AGGTTA","AGTAAA","AGTCTG","ATACTT","ATAGCG","ATATAC","ATCCGG","ATGAAG","ATTAGT","CAACCG","CAAGTC","CACCAC","CACTGT","CAGACT","CAGGAG","CATAGA","CCACGC","CCGATG","CCGTAA","CCTCTA","CGAAAG","CGAGCA","CGCATA","CGGCGT","CGGTCC","CGTTAT","CTAGGT","CTATTA","CTCAAT","CTGTGG","CTTACG","CTTGAA","GAAATA","GAAGGG","GACTCG","GAGCTT","GAGGCC","GAGTGA","GATCAA","GCCAGA","GCCGTT","GCGAAT","GCGCGG","GCTCCC","GCTGAG","GCTTGT","GGACGA","GGATTG","GGCCAT","GGGATC","GGTAGG","GGTGCT","GTACAG","GTCCTA","GTCGGC","GTGGTG","GTTAAC","GTTTCA","TAAGCT","TAATAG","TACCGA","TAGAGG","TATTTC","TCAGTG","TCATCA","TCCAAG","TCGCCT","TCGGGA","TCTAGC","TGAATT","TGAGAC","TGCGGT","TGCTAA","TGGCAG","TGTGTA","TGTTCG","TTAAGA","TTCGCA","TTCTTG","TTGCTC","TTGGAT","TTTGGG"]
8592

8693
#------------------------------
8794

88-
def prove_barcode(bc):
89-
'''
90-
Function that takes a putative barcode and returns the nearest valid one
91-
'''
92-
93-
if(bc in barcodes):
94-
return(bc)
95-
else:
96-
eo = process.extractOne(bc, barcodes)
97-
if(eo[1] >= 71): # 71 comes from 5/7... the score is the score homology
98-
return(eo[0])
99-
else:
100-
return("NNNNNNN")
101-
102-
def prove_tn5(bc):
103-
'''
104-
Function that takes a putative barcode and returns the nearest valid one
105-
'''
106-
107-
if(bc in tn5):
108-
return(bc)
109-
else:
110-
eo = process.extractOne(bc, tn5)
111-
if(eo[1] >= 66): # 66 comes from 4/6... the score is the score homology
112-
return(eo[0])
113-
else:
114-
return("NNNNNN")
115-
116-
117-
def formatRead(title, sequence, quality):
118-
"""
119-
Takes three components of fastq file and stiches them together in a string
120-
"""
121-
return("@%s\n%s\n+\n%s\n" % (title, sequence, quality))
122-
12395
def extractbarcode_v2_tn5(sequence1):
12496
'''
12597
Function to extract barcodes
12698
'''
127-
# Parse out sequence features and split based on constant sequences
128-
bc1 = prove_barcode(sequence1[0:7])
12999

130100
# Parse out barcodes if we can ID the constants
131101
try:
@@ -136,14 +106,15 @@ def extractbarcode_v2_tn5(sequence1):
136106
me_hit = find_near_matches(me, sequence1[55:], max_l_dist=2)
137107

138108
# Now grab the barcodes
139-
bc2 = prove_barcode(sequence1[c1_hit[0][1]+7:c2_hit[0][0]+23])
140-
bc3 = prove_barcode(sequence1[c2_hit[0][1]+23:nxt_hit[0][0]+33])
141-
bc_tn5 = prove_tn5(sequence1[nxt_hit[0][1]+33:me_hit[0][0]+55])
109+
bc1, mm1 = prove_barcode(sequence1[0:7], barcodes, n_mismatch)
110+
bc2, mm2 = prove_barcode(sequence1[c1_hit[0][1]+7:c2_hit[0][0]+23], barcodes, n_mismatch)
111+
bc3, mm3 = prove_barcode(sequence1[c2_hit[0][1]+23:nxt_hit[0][0]+33], barcodes, n_mismatch)
112+
bc_tn5, mm4 = prove_barcode(sequence1[nxt_hit[0][1]+33:me_hit[0][0]+55], tn5, n_mismatch)
142113
seq = sequence1[me_hit[0][1]+55:]
143114

144-
return(bc1 + "_" + bc2 + "_" + bc3 + "_" + bc_tn5, seq)
115+
return(bc1 + "_" + bc2 + "_" + bc3 + "_" + bc_tn5, seq, str(mm1)+","+str(mm2)+","+str(mm3)+","+str(mm4))
145116
except:
146-
return(dumb, sequence1)
117+
return(dumb, sequence1, "0,0,0,0")
147118

148119

149120
def debarcode_multiplexed(duo):
@@ -156,6 +127,7 @@ def debarcode_multiplexed(duo):
156127
# parameters to return
157128
fq1 = ""
158129
fq2 = ""
130+
mm_quant = ""
159131

160132
nbc1 = 0
161133
nbc2 = 0
@@ -170,7 +142,7 @@ def debarcode_multiplexed(duo):
170142
title2 = listRead2[0]; sequence2 = listRead2[1]; quality2 = listRead2[2]
171143

172144
# Return the barcode with underscores + the biological sequence learned
173-
barcode, sequence1 = extractbarcode_tn5(sequence1)
145+
barcode, sequence1, mm = extractbarcode_v2_tn5(sequence1)
174146
quality1 = quality1[-1*len(sequence1):]
175147

176148
four = barcode.split("_")
@@ -190,14 +162,16 @@ def debarcode_multiplexed(duo):
190162
npass = 1
191163
fq1 = formatRead("".join(four) + "_" + title1, sequence1, quality1)
192164
fq2 = formatRead("".join(four) + "_" + title2, sequence2, quality2)
193-
return([[fq1, fq2], [nbc1, nbc2, nbc3, ntn5, npass, nfail]])
165+
mm_quant = mm_quant + "".join(four) + "," + mm +"\n"
166+
return([[fq1, fq2], [nbc1, nbc2, nbc3, ntn5, npass, nfail], [mm_quant]])
194167

195168

196169
# Define variables to keep track of things that fail
197170
nbc1 = 0
198171
nbc2 = 0
199172
nbc3 = 0
200173
ntn5 = 0
174+
201175
npass = 0
202176
nfail = 0
203177

@@ -211,7 +185,7 @@ def debarcode_multiplexed(duo):
211185
# iterate over batches of length n
212186
for i, batch1 in enumerate(it1):
213187
batch2 = it2.__next__()
214-
output = o + "-parse" + str(i+1).zfill(3)
188+
output = o + "-c" + str(i+1).zfill(3)
215189

216190
# parallel process the barcode processing and accounting of failures.
217191
pool = Pool(processes=cpu)
@@ -221,6 +195,8 @@ def debarcode_multiplexed(duo):
221195
# Aggregate output
222196
fqs = list(map(''.join, zip(*[item.pop(0) for item in pm])))
223197
counts = list(map(sum, zip(*[item.pop(0) for item in pm])))
198+
mm_values = list(map(''.join, zip(*[item.pop(0) for item in pm])))
199+
224200
nbc1 = nbc1 + counts[0]
225201
nbc2 = nbc2 + counts[1]
226202
nbc3 = nbc3 + counts[2]
@@ -232,9 +208,10 @@ def debarcode_multiplexed(duo):
232208
# Export one chunk in parallel
233209
filename1 = output +'_1.fastq.gz'
234210
filename2 = output +'_2.fastq.gz'
211+
filenameMM = output +'_mismatches.csv.gz'
235212

236-
pool = Pool(processes=2)
237-
toke = pool.starmap(chunkWriterGzip, [(filename1, fqs[0]), (filename2, fqs[1])])
213+
pool = Pool(processes=3)
214+
toke = pool.starmap(chunk_writer_gzip, [(filename1, fqs[0]), (filename2, fqs[1]), (filenameMM, mm_values)])
238215
pool.close()
239216

240217
with open(o + "-debarcode" + '.sumstats.log', 'w') as logfile:

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
setup(
1010
name='bap',
11-
version='0.3.9994',
11+
version='0.4.0',
1212
url='https://github.com/caleblareau/bap',
1313
license='MIT',
1414
author='Caleb Lareau',

tests/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ bap bam -i data/test.small.bam -z -bt CB -ji 0.002 -em -pf data/test.small.peaks
1919
```
2020
bap-barcode v1.0 -a fastq_br/biorad_v1_R1.fastq.gz -b fastq_br/biorad_v1_R2.fastq.gz --nmismatches 1
2121
bap-barcode v2.1 -a fastq_br/biorad_v2_R1.fastq.gz -b fastq_br/biorad_v2_R2.fastq.gz --nmismatches 1
22-
22+
bap-barcode v2.1-multi -a fastq_br/biorad_v2-multi_R1.fastq.gz -b fastq_br/biorad_v2-multi_R2.fastq.gz --nmismatches 1
2323
2424
2525
bwa mem /Volumes/dat/genomes/hg19_bwa/hg19.fa debarcode-c001_1.fastq.gz debarcode-c001_2.fastq.gz | samtools view -bS - | samtools sort -@ 4 - -o debarcode.bam
660 KB
Binary file not shown.
299 KB
Binary file not shown.

0 commit comments

Comments
 (0)