"""
Processes input fastq files to create consensus genomes and their associated
summary statistics for the Seattle Flu Study.
Basic steps:
1. Trim raw fastq's with Trimmomatic
2. Map trimmed reads to each reference genomes in the reference panel using
bowtie2 # This step may change with time
~3. Remove duplicate reads using Picard~
4. Call consensus(SNPs/indels) using varscan
5. Use consensus to generate full consensus genomes for each sample x reference
virus combination
6. Compute summary statistics for each sample x refernce virus combination
Adapted from Louise Moncla's illumina pipeline for influenza snp calling:
https://github.com/lmoncla/illumina_pipeline
"""
import glob
from datetime import datetime
from itertools import product
#### Helper functions and variable definitions
def generate_sample_ids(config: dict) -> set:
"""
    Find all fastq.gz files within the fastq_directory in the *config* and
parse the sample ids from these files to return as a set.
Expects the file names to have the format of <sample_id>_*.fastq.gz
Will ignore all sample ids listed under "ignored_samples" of *config*
"""
all_ids = set()
for filename in glob.glob("{}/*".format(config['fastq_directory'])):
if filename.endswith('.fastq.gz'):
sample_id = filename.split('.')[0].split('/')[-1].split('_')[0]
if sample_id in config['ignored_samples']:
continue
all_ids.add(sample_id)
return all_ids
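# generate_sample_ids example (hypothetical filename):
# "fastq/ABC123_S1_L001_R1_001.fastq.gz" yields the sample id "ABC123"
# via the split chain above.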
def generate_all_files(sample: str, config: dict) -> tuple:
"""
For a given sample, determine all fastqs as a tuple of forward
and reverse within the fastq_directory in the *config*.
    The fastqs are expected to have *sample* in the filename and
R1/R2 in the filename to differentiate forward and reverse reads.
"""
r1 = glob.glob('{}/*{}*R1*'.format(config['fastq_directory'], sample))
r2 = glob.glob('{}/*{}*R2*'.format(config['fastq_directory'], sample))
return (r1, r2)
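# generate_all_files example (hypothetical files): generate_all_files("ABC123", config)
# would return (["fastq/ABC123_S1_R1_001.fastq.gz"], ["fastq/ABC123_S1_R2_001.fastq.gz"]),
# i.e. one list of lane files per read direction.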
def filter_combinator(combinator, config):
"""
Custom combinatoric function to be used with :py:func:`snakemake.io.expand`
Only generates combination of sample and reference wildcards that are
listed under `sample_reference_pairs` of the *config*.
If no references are listed for a sample, will generate all possible
combinations with all references.
"""
def filtered_combinator(*args, **kwargs):
for wc_comb in combinator(*args, **kwargs):
if wc_comb[0][1] not in config["sample_reference_pairs"]:
yield wc_comb
elif len(config["sample_reference_pairs"][wc_comb[0][1]]) == 0:
yield wc_comb
elif wc_comb[1][1] in config["sample_reference_pairs"][wc_comb[0][1]]:
yield wc_comb
return filtered_combinator
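# filter_combinator example (hypothetical config): with
#   sample_reference_pairs: {"ABC123": ["h3n2"], "XYZ789": []}
# sample ABC123 is paired only with the h3n2 reference, while XYZ789 (empty
# list) and any sample not listed at all are paired with every reference.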
# Build static lists of reference genomes, sample IDs, and a map of sample ID -> input fastqs
all_references = list(config['reference_viruses'].keys())
all_ids = generate_sample_ids(config)
mapped = {id: generate_all_files(id, config) for id in all_ids}
filtered_product = filter_combinator(product, config)
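# `filtered_product` replaces the default combinator in expand(); a sketch of
# the intended call in a downstream target rule (note sample must come before
# reference, matching the wc_comb indexing above):
#   expand("consensus_genomes/{reference}/{sample}.consensus.fasta",
#          filtered_product, sample=all_ids, reference=all_references)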
#### Main pipeline
rule index_reference_genome:
input:
raw_reference = "references/{reference}.fasta"
output:
indexed_reference = "references/{reference}.1.bt2"
shell:
"""
bowtie2-build {input.raw_reference} references/{wildcards.reference}
"""
rule merge_lanes:
input:
all_r1 = lambda wildcards: mapped[wildcards.sample][0],
all_r2 = lambda wildcards: mapped[wildcards.sample][1]
output:
merged_r1 = "process/merged/{sample}_R1.fastq.gz",
merged_r2 = "process/merged/{sample}_R2.fastq.gz"
shell:
"""
cat {input.all_r1} >> {output.merged_r1}
cat {input.all_r2} >> {output.merged_r2}
"""
rule trim_fastqs:
input:
fastq_f = "process/merged/{sample}_R1.fastq.gz",
fastq_r = "process/merged/{sample}_R2.fastq.gz"
output:
trimmed_fastq_1p = "process/trimmed/{sample}.trimmed_1P.fastq",
trimmed_fastq_2p = "process/trimmed/{sample}.trimmed_2P.fastq",
trimmed_fastq_1u = "process/trimmed/{sample}.trimmed_1U.fastq",
trimmed_fastq_2u = "process/trimmed/{sample}.trimmed_2U.fastq"
params:
paired_end = config["params"]["trimmomatic"]["paired_end"],
adapters = config["params"]["trimmomatic"]["adapters"],
illumina_clip = config["params"]["trimmomatic"]["illumina_clip"],
window_size = config["params"]["trimmomatic"]["window_size"],
trim_qscore = config["params"]["trimmomatic"]["trim_qscore"],
minimum_length = config["params"]["trimmomatic"]["minimum_length"]
benchmark:
"benchmarks/{sample}.trimmo"
shell:
"""
trimmomatic \
{params.paired_end} \
-phred33 \
{input.fastq_f} {input.fastq_r} \
-baseout process/trimmed/{wildcards.sample}.trimmed.fastq \
ILLUMINACLIP:{params.adapters}:{params.illumina_clip} \
SLIDINGWINDOW:{params.window_size}:{params.trim_qscore} \
MINLEN:{params.minimum_length}
"""
rule post_trim_fastqc:
input:
p1 = rules.trim_fastqs.output.trimmed_fastq_1p,
p2 = rules.trim_fastqs.output.trimmed_fastq_2p,
u1 = rules.trim_fastqs.output.trimmed_fastq_1u,
u2 = rules.trim_fastqs.output.trimmed_fastq_2u
output:
qc_1p_html = "summary/post_trim_fastqc/{sample}.trimmed_1P_fastqc.html",
qc_2p_html = "summary/post_trim_fastqc/{sample}.trimmed_2P_fastqc.html",
qc_1u_html = "summary/post_trim_fastqc/{sample}.trimmed_1U_fastqc.html",
qc_2u_html = "summary/post_trim_fastqc/{sample}.trimmed_2U_fastqc.html",
qc_1p_zip = "summary/post_trim_fastqc/{sample}.trimmed_1P_fastqc.zip",
qc_2p_zip = "summary/post_trim_fastqc/{sample}.trimmed_2P_fastqc.zip",
qc_1u_zip = "summary/post_trim_fastqc/{sample}.trimmed_1U_fastqc.zip",
qc_2u_zip = "summary/post_trim_fastqc/{sample}.trimmed_2U_fastqc.zip"
shell:
"""
fastqc {input.p1} -o summary/post_trim_fastqc
fastqc {input.p2} -o summary/post_trim_fastqc
fastqc {input.u1} -o summary/post_trim_fastqc
fastqc {input.u2} -o summary/post_trim_fastqc
"""
rule map:
input:
p1 = rules.trim_fastqs.output.trimmed_fastq_1p,
p2 = rules.trim_fastqs.output.trimmed_fastq_2p,
u1 = rules.trim_fastqs.output.trimmed_fastq_1u,
u2 = rules.trim_fastqs.output.trimmed_fastq_2u,
ref_file = rules.index_reference_genome.output.indexed_reference
output:
mapped_bam_file = "process/mapped/{reference}/{sample}.bam",
bt2_log = "summary/bowtie2/{reference}/{sample}.log"
params:
threads = config["params"]["bowtie2"]["threads"],
map_all = config["params"]["bowtie2"]["all"]
benchmark:
"benchmarks/{sample}_{reference}.bowtie2"
shell:
"""
bowtie2 \
-x references/{wildcards.reference} \
-1 {input.p1} \
-2 {input.p2} \
-U {input.u1},{input.u2} \
            -p {params.threads} \
{params.map_all} \
--local 2> {output.bt2_log} | \
samtools view -bSF4 - > {output.mapped_bam_file}
"""
rule sort:
input:
mapped_bam = rules.map.output.mapped_bam_file
output:
sorted_bam_file = "process/sorted/{reference}/{sample}.sorted.bam"
benchmark:
"benchmarks/{sample}_{reference}.sort"
shell:
"""
        samtools sort {input.mapped_bam} -o {output.sorted_bam_file}
"""
rule bamstats:
input:
sorted_bam = rules.sort.output.sorted_bam_file
output:
bamstats_file = "summary/bamstats/{reference}/{sample}.coverage_stats.txt"
benchmark:
"benchmarks/{sample}_{reference}.bamstats"
shell:
"""
BAMStats -i {input.sorted_bam} > {output.bamstats_file}
"""
checkpoint mapped_reads:
input:
bt2_log = rules.map.output.bt2_log,
bamstats = rules.bamstats.output.bamstats_file,
reference = "references/{reference}.fasta"
output:
"summary/checkpoint/{reference}/{sample}.json"
params:
min_cov = config["params"]["varscan"]["min_cov"],
raw_read_length = config["raw_read_length"]
shell:
"""
python scripts/checkpoint_mapped_reads.py \
--bamstats {input.bamstats} \
--bowtie2 {input.bt2_log} \
--min-cov {params.min_cov} \
--raw-read-length {params.raw_read_length} \
--reference {input.reference} \
            --output {output}
"""
rule not_mapped:
input:
sorted_bam = rules.sort.output.sorted_bam_file,
reference = "references/{reference}.fasta"
output:
not_mapped = "summary/not_mapped/{reference}/{sample}.txt"
shell:
"""
touch {output.not_mapped}
"""
rule pileup:
input:
sorted_bam = rules.sort.output.sorted_bam_file,
reference = "references/{reference}.fasta"
output:
pileup = "process/mpileup/{reference}/{sample}.pileup"
params:
depth = config["params"]["mpileup"]["depth"],
min_base_qual = config["params"]["varscan"]["snp_qual_threshold"]
benchmark:
"benchmarks/{sample}_{reference}.mpileup"
shell:
"""
        samtools mpileup -a -A -B \
            -Q {params.min_base_qual} \
            -d {params.depth} \
            -f {input.reference} \
            {input.sorted_bam} > {output.pileup}
"""
rule create_bed_file:
input:
pileup = rules.pileup.output.pileup
output:
bed_file = "summary/low_coverage/{reference}/{sample}.bed"
params:
min_cov = config["params"]["varscan"]["min_cov"],
min_freq = config["params"]["varscan"]["snp_frequency"]
shell:
"""
python scripts/create_bed_file_for_masking.py \
--pileup {input.pileup} \
--min-cov {params.min_cov} \
--min-freq {params.min_freq} \
--bed-file {output.bed_file}
"""
rule call_consensus:
input:
pileup = rules.pileup.output.pileup
output:
vcf = "process/vcfs/{reference}/{sample}.vcf"
params:
min_cov = config["params"]["varscan"]["min_cov"],
snp_qual_threshold = config["params"]["varscan"]["snp_qual_threshold"],
snp_frequency = config["params"]["varscan"]["snp_frequency"]
benchmark:
"benchmarks/{sample}_{reference}.varscan"
shell:
"""
varscan mpileup2cns \
{input.pileup} \
--min-coverage {params.min_cov} \
--min-avg-qual {params.snp_qual_threshold} \
--min-var-freq {params.snp_frequency} \
--strand-filter 0 \
--variants 1 \
--output-vcf 1 > {output.vcf}
"""
rule zip_vcf:
input:
vcf = rules.call_consensus.output.vcf
output:
bcf = "process/vcfs/{reference}/{sample}.vcf.gz"
shell:
"""
bgzip {input.vcf}
"""
rule index_bcf:
input:
bcf = rules.zip_vcf.output.bcf
output:
index = "process/vcfs/{reference}/{sample}.vcf.gz.csi"
shell:
"""
bcftools index {input}
"""
rule vcf_to_consensus:
input:
bcf = rules.zip_vcf.output.bcf,
index = rules.index_bcf.output.index,
mask = rules.create_bed_file.output.bed_file,
ref = "references/{reference}.fasta"
output:
consensus_genome = "consensus_genomes/{reference}/{sample}.consensus.fasta"
benchmark:
"benchmarks/{sample}_{reference}.consensus"
shell:
"""
cat {input.ref} | \
bcftools consensus {input.bcf} \
--mask {input.mask} > \
{output.consensus_genome}
"""
rule clean:
message: "Removing directories: {params}"
params:
"summary "
"process "
"benchmarks "
"references/*.bt2 "
"references/*.fai"
shell:
"""
rm -rfv {params}
"""