-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathtigmint-make
executable file
·396 lines (310 loc) · 13.4 KB
/
tigmint-make
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
#!/usr/bin/make -rRf
# Correct misassemblies using Tigmint
# Written by Shaun Jackman @sjackman
# Contributions by Lauren Coombe
# Usage:
# tigmint-make draft=myassembly reads=myreads
# To run Tigmint and calculate assembly metrics:
# tigmint-make draft=myassembly reads=myreads ref=GRCh38 G=3088269832
# To run Tigmint with long reads:
# tigmint-make tigmint-long draft=myassembly reads=myreads span=auto G=genomesize
# Name of the draft assembly, draft.fa
draft=draft
# Name of the reads, reads.fq.gz, after running longranger basic (for linked reads)
# Can be reads.fq.gz, reads.fq, reads.fastq, reads.fa, reads.fasta or reads.fa.gz for long reads
reads=reads
# Find the complete read file name
fastq_gz=$(shell test -f $(reads).fq.gz && echo "true")
fastq=$(shell test -f $(reads).fq && echo "true")
fastq_long=$(shell test -f $(reads).fastq && echo "true")
fasta_gz=$(shell test -f $(reads).fa.gz && echo "true")
fasta=$(shell test -f $(reads).fa && echo "true")
fasta_long=$(shell test -f $(reads).fasta && echo "true")
ifeq ($(fastq_gz), true)
longreads=$(reads).fq.gz
endif
ifeq ($(fastq), true)
longreads=$(reads).fq
endif
ifeq ($(fastq_long), true)
longreads=$(reads).fastq
endif
ifeq ($(fasta_gz), true)
longreads=$(reads).fa.gz
endif
ifeq ($(fasta), true)
longreads=$(reads).fa
endif
ifeq ($(fasta_long), true)
longreads=$(reads).fasta
endif
# Reference genome, ref.fa, for calculating assembly contiguity metrics
ref=ref
# Haploid size of the reference genome, for calculating span, NG50 and NGA50
G=0
# Minimap2 long read map parameter
longmap=ont
# Mapping approach
mapping=minimap2 # Can be minimap2 or ntLink
ntLink_k=32
ntLink_w=100
# Minimum molecule size
minsize=2000
# Minimum AS/Read length ratio
as=0.65
# Maximum number of mismatches
nm=5
# Maximum distance between reads to be considered the same molecule
dist=50000
# Mapping quality threshold
mapq=0
# Bp of contigs to trim after cutting at error
trim=0
#Threshold for number of spanning molecules to consider properly assembled
span=20
#Window size for checking for spanning molecules
window=1000
#Length for tigmint-long cut reads
cut=500
# Minimum length of contig for tallying attempted corrections
ac=3000
# Parameters of ARCS
c=5
e=30000
r=0.05
# Parameters of LINKS
a=0.1
l=10
# Number of threads
t=8
SHELL=bash -e -o pipefail
ifneq ($(shell command -v zsh),)
# Set pipefail to ensure that all commands of a pipe succeed.
SHELL=zsh -e -o pipefail
# Report run time and memory usage with zsh.
export REPORTTIME=1
export TIMEFMT=time user=%U system=%S elapsed=%E cpu=%P memory=%M job=%J
endif
# Determine path to tigmint executables
bin=$(shell dirname `command -v $(MAKEFILE_LIST)`)
ifdef bin
PATH:=$(bin):$(PATH)
endif
# Use pigz or bgzip for parallel compression if available.
ifneq ($(shell command -v pigz),)
gzip=pigz -p$t
else
ifneq ($(shell command -v bgzip),)
gzip=bgzip -@$t
else
gzip=gzip
endif
endif
# Record run time and memory usage in a file using GNU time.
ifdef time
ifneq ($(shell command -v memusg),)
gtime=command memusg -t -o [email protected]
else
ifneq ($(shell command -v gtime),)
gtime=command gtime -v -o [email protected]
else
gtime=command time -v -o [email protected]
endif
endif
endif
.DELETE_ON_ERROR:
.SECONDARY:
.PHONY: help version all tigmint tigmint-long arcs metrics draft_metrics tigmint_metrics arcs_metrics check_span_g
help:
@echo 'Tigmint: Correct misassemblies using linked or long reads'
@echo 'Usage: tigmint-make [COMMAND]... [PARAMETER=VALUE]...'
@echo 'Example: tigmint-make tigmint draft=myassembly reads=myreads'
@echo 'For more information see https://bcgsc.github.io/tigmint/'
version:
@echo "Tigmint 1.2.9"
@echo "Written by Shaun Jackman @sjackman."
all: tigmint arcs
ifneq ($(ref), ref)
ifneq ($G, 0)
all: metrics
endif
endif
metrics: draft_metrics tigmint_metrics arcs_metrics
# Calculate assembly metrics of the draft assembly using abyss-fac and abyss-samtobreak
draft_metrics: \
$(draft).abyss-fac.tsv \
$(draft).scaftigs.abyss-fac.tsv \
$(draft).scaftigs.$(ref).samtobreak.tsv
# Correct misassemblies using Tigmint
tigmint: \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa \
$(draft).tigmint.fa
# Index the linked reads
tigmint-index: $(draft).fa.bwt
# Align the linked reads (using bwa mem)
tigmint-align: $(draft).$(reads).sortbx.bam
# Perform the tigmint-molecule step for linked reads
tigmint-molecule: $(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).bed
# Perform the final tigmint-cut step for linked reads
tigmint-cut: \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa \
$(draft).tigmint.fa
# Correct misassemblies for long reads using Tigmint
tigmint-long: \
$(draft).$(reads).cut$(cut).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa \
$(draft).cut$(cut).tigmint.fa
# Estimate the tigmint parameters for long reads
tigmint-long-estimate: $(reads).tigmint-long.params.tsv
# Convert the long reads to pseudo-linked reads, perform minimap2 alignment and the tigmint-molecule step
tigmint-long-to-linked: $(draft).$(reads).cut$(cut).molecule.size$(minsize).dist$(dist).bed
# Perform the final tigmint-cut step for long reads
tigmint-long-cut: \
$(draft).$(reads).cut$(cut).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa \
$(draft).cut$(cut).tigmint.fa
# Calculate assembly metrics after Tigmint
tigmint_metrics: \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.abyss-fac.tsv \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.scaftigs.abyss-fac.tsv \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.scaftigs.$(ref).samtobreak.tsv
# Scaffold using ARCS
arcs: \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.fa \
$(draft).tigmint.arcs.fa
# Calculate assembly metrics after ARCS
arcs_metrics: \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.abyss-fac.tsv \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.scaftigs.abyss-fac.tsv \
$(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.scaftigs.$(ref).samtobreak.tsv
# Symlink the Tigmint results
$(draft).tigmint.fa: $(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa
ln -sf $< $@
# Symlink the long read Tigmint results
$(draft).cut$(cut).tigmint.fa: $(draft).$(reads).cut$(cut).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.fa
ln -sf $< $@
# Symlink the ARCS results
$(draft).tigmint.arcs.fa: $(draft).$(reads).as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).trim$(trim).window$(window).span$(span).breaktigs.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.fa
ln -sf $< $@
# BWA
# Index the genome.
%.fa.bwt: %.fa
bwa index $<
# Align paired-end reads to the draft genome and sort by BX tag.
$(draft).%.sortbx.bam: %.fq.gz $(draft).fa.bwt
$(gtime) bwa mem -t$t -pC $(draft).fa $< | samtools view -u -F4 | samtools sort -@$t -tBX -T$$(mktemp -u -t [email protected]) -o $@
# Minimap2
# Align cut long reads to the draft genome and output primary alignments sorted by BX tag.
$(draft).%.cut$(cut).sortbx.bam: %.cut$(cut).fa.gz $(draft).fa
$(gtime) minimap2 -y -t$t -ax map-$(longmap) --secondary=no $(draft).fa $< | samtools view -b -u -F4 | samtools sort -@$t -tBX -T$$(mktemp -u -t [email protected]) -o $@
# Check that G is set if span=auto
check_span_g:
ifeq ($(span), auto)
ifeq ($G, 0)
$(error Must set genome size parameter (G) to calculate span automatically)
endif
endif
# samtools
# Index a FASTA file.
%.fa.fai: %.fa
samtools faidx $<
# Create molecule extents BED
%.as$(as).nm$(nm).molecule.size$(minsize).dist$(dist).bed: %.sortbx.bam
$(gtime) $(bin)/tigmint_molecule.py -a$(as) -n$(nm) -q$(mapq) -d$(dist) -s$(minsize) $< | sort -k1,1 -k2,2n -k3,3n $(SORT_OPTS) >$@
# Estimate dist parameter for tigmint-long
dist_sample=1000000
$(reads).tigmint-long.params.tsv: $(longreads)
$(gtime) $(bin)/tigmint_estimate_dist.py $< -n $(dist_sample) -o $@
# Set dist options for the molecule extents rule
dist_options=
ifeq ($(dist), auto)
dist_options+=-p $(reads).tigmint-long.params.tsv
else
dist_options+=-d$(dist)
endif
# Create molecule extents BED using cut long reads
$(draft).$(reads).cut$(cut).molecule.size$(minsize).dist$(dist).bed: $(longreads) $(draft).fa $(reads).tigmint-long.params.tsv
ifeq ($(span), auto)
ifeq ($G, 0)
$(error Must set genome size parameter (G) to calculate span automatically)
endif
endif
ifeq ($(mapping), ntLink)
$(gtime) $(bin)/tigmint-ntlink-map --bed $@ -k$(ntLink_k) -w$(ntLink_w) -t$t --path $(bin) -m$(minsize) --span $(span) $(draft) $<
else
$(gtime) sh -c '$(bin)/../src/long-to-linked-pe -l $(cut) -m$(minsize) -g$G -s -b $(reads).barcode-multiplicity.tsv --bx -t$t --fasta -f $(reads).tigmint-long.params.tsv $< | \
minimap2 -y -t$t -x map-$(longmap) --secondary=no $(draft).fa - | \
$(bin)/tigmint_molecule_paf.py -q$(mapq) -s$(minsize) $(dist_options) - | sort -k1,1 -k2,2n -k3,3n $(SORT_OPTS) > $@'
endif
# Create molecule extents TSV
%.as$(as).nm$(nm).molecule.size$(minsize).tsv: %.sortbx.bam
$(gtime) $(bin)/tigmint_molecule.py --tsv -a$(as) -n$(nm) -q$(mapq) -d$(dist) -s$(minsize) -o $@ $<
# Create molecule extents TSV using cut long reads
%.cut$(cut).as$(as).nm$(cut).molecule.size$(minsize).tsv: %.cut$(cut).sortbx.bam
$(gtime) $(bin)/tigmint_molecule.py --tsv -a$(as) -n$(cut) -q$(mapq) -d$(dist) -s$(minsize) -o $@ $<
# Report summary statistics of a Chromium library
%.molecule.summary.html: %.molecule.tsv
Rscript -e 'rmarkdown::render("$(bin)/../summary.rmd", "html_document", "$@", params = list(input_tsv="$<", output_tsv="$*.summary.tsv"))'
# bedtools
# Compute statistics on the depth of coverage of a BED file.
%.bed.genomecov.tsv: %.bed $(draft).fa.fai
(printf "Rname\tDepth\tCount\tRsize\tFraction\n"; awk '$$2 != $$3' $< | bedtools genomecov -g $(draft).fa.fai -i -) >$@
# Calculate depth of coverage statistics from bedtools genomecov.
%.genomecov.stats.tsv: %.genomecov.tsv
mlr --tsvlite \
then filter '$$Rname == "genome" && $$Depth > 0' \
then step -a rsum -f Fraction \
then put -q '@Depth_count += $$Count; if (is_null(@p25) && $$Fraction_rsum >= 0.25) { @p25 = $$Depth }; if (is_null(@p50) && $$Fraction_rsum >= 0.50) { @p50 = $$Depth }; if (is_null(@p75) && $$Fraction_rsum >= 0.75) { @p75 = $$Depth } end { emitf @Depth_count, @p25, @p50, @p75 }' \
then rename p25,Depth_p25,p50,Depth_p50,p75,Depth_p75 \
then put '$$Depth_IQR = $$Depth_p75 - $$Depth_p25' \
$< >$@
# Identify breakpoints
# Make breakpoints BED file
%.trim$(trim).window$(window).span$(span).breaktigs.fa: %.bed $(draft).fa $(draft).fa.fai
ifeq ($(span), auto)
$(gtime) $(bin)/tigmint-cut -p$t -w$(window) -t$(trim) -m$(ac) -f $(reads).tigmint-long.params.tsv -o $@ $(draft).fa $<
else
$(gtime) $(bin)/tigmint-cut -p$t -w$(window) -n$(span) -t$(trim) -m$(ac) -o $@ $(draft).fa $<
endif
################################################################################
# Calculate assembly contiguity and correctness metrics.
# BWA
# Align an assembly to the reference using BWA-MEM.
%.$(ref).sam.gz: %.fa $(ref).fa.bwt
bwa mem -xintractg -t$t $(ref).fa $< | $(gzip) >$@
# Align paired-end reads to the draft genome and do not sort.
%.$(reads).sortn.bam: %.fa.bwt $(reads).fq.gz
bwa mem -t$t -pC $*.fa $(reads).fq.gz | samtools view -@$t -h -F4 -o $@
# ARCS
# Create a graph of linked contigs using ARCS.
%.$(reads).c$c_e$e_r$r.arcs_original.gv %.$(reads).c$c_e$e_r$r.arcs.dist.gv %.$(reads).c$c_e$e_r$r.arcs.dist.tsv: %.$(reads).sortn.bam %.fa
arcs -s98 -c$c -l0 -z500 -m4-20000 -d0 -e$e -r$r -v \
-f $*.fa \
-b $*.$(reads).c$c_e$e_r$r.arcs \
-g $*.$(reads).c$c_e$e_r$r.arcs.dist.gv \
--tsv=$*.$(reads).c$c_e$e_r$r.arcs.dist.tsv \
--barcode-counts=$<.barcode-counts.tsv \
$<
# Convert the ARCS graph to LINKS TSV format.
%.$(reads).c$c_e$e_r$r.arcs.links.tsv: %.$(reads).c$c_e$e_r$r.arcs_original.gv %.fa
$(bin)/tigmint-arcs-tsv $< $@ $*.fa
# Scaffold the assembly using the ARCS graph and LINKS.
%.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.scaffolds.fa %.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.assembly_correspondence.tsv: %.$(reads).c$c_e$e_r$r.arcs.links.tsv %.fa
cp $< $*.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links.tigpair_checkpoint.tsv
LINKS -k20 -l$l -t2 -a$a -x1 -s /dev/null -f $*.fa -b $*.$(reads).c$c_e$e_r$r.arcs.a$a_l$l.links
# Rename the scaffolds.
%.links.fa: %.links.scaffolds.fa
sed -r 's/^>scaffold([^,]*),(.*)/>\1 scaffold\1,\2/' $< >$@
# ABySS
# Convert scaffolds to scaftigs.
%.scaftigs.fa: %.fa
seqtk seq $< | tr _ '~' | abyss-fatoagp -f $@ >[email protected]
# Calculate assembly contiguity metrics with abyss-fac.
%.abyss-fac.tsv: %.fa
abyss-fac -G$G -t500 $< >$@
ifneq ($G, 0)
abyss_samtobreak=abyss-samtobreak -l500 -G$G
else
abyss_samtobreak=abyss-samtobreak -l500
endif
# Calculate assembly contiguity and correctness metrics.
%.samtobreak.tsv: %.sam.gz
gunzip -c $< | $(abyss_samtobreak) >$@