-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathAnnotate_NLR
70 lines (70 loc) · 2.76 KB
/
Annotate_NLR
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/bin/bash -i
#Necessary packages:-parser, -annotator, BRAKER, samtools, blast, interproscan, etc.#
#Put input genome in a folder named genome#
import os
genome="genome/"
configfile: "FindPlantNLRs.config"
SAMPLES = list(set([x.split(".")[0] for x in os.listdir(genome) if x.endswith("fa")]))
path = 'tmp'
if not os.path.exists(path):
os.mkdir(path)
result = 'result'
if not os.path.exists(result):
os.mkdir(result)
rule all:
input:
expand('tmp/{sample}_all_20kbflanking_removed.fasta', sample=SAMPLES),
expand('result/{sample}_braker_aa.fasta', sample=SAMPLES),
expand('result/{sample}.codingseq', sample=SAMPLES),
expand('result/{sample}_braker_aa.fasta.tsv', sample=SAMPLES),
expand('result/{sample}_NLRclassification.done', sample=SAMPLES),
expand('result/{sample}_ID.done', sample=SAMPLES)
#RefPlant_aa.fa is from https://www.biorxiv.org/content/10.1101/2020.07.08.193961v2
#Remember to correct the path for braker.pl and replace braker_2.1.6 with the version you use
#Remove sample species from config file of braker if you stopped once#
#Set alignment tools as
rule braker:
input:
raw="tmp/{sample}.all_20kbflanking_merged_upper.fasta"
output:
removed="tmp/{sample}_all_20kbflanking_removed.fasta",
hints_gtf="tmp/{sample}_braker.gtf",
codingseq="result/{sample}.codingseq",
gff3="result/{sample}_braker.gff3",
aa="result/{sample}_braker_aa.fasta"
params:
"{sample}"
run:
shell("sed 's/(//;s/)//' {input.raw} > {output.removed}")
shell("singularity exec -B $PWD:$PWD {config[braker]} braker.pl --threads={config[CPUs]} --genome={output.removed} --prot_seq={config[ref]} --gff3")
shell("cp braker/braker.codingseq {output.codingseq}")
shell("sed -e 's/\*//g' braker/braker.aa > {output.aa}")
shell("rm -r braker/GeneMark-EP braker/GeneMark-ES")
#Interproscan
rule Interproscan_classification:
input:
fasta="result/{sample}_braker_aa.fasta",
gff="result/{sample}_braker.gff3",
genome="genome/{sample}.fa"
output:
interpro="result/{sample}_braker_aa.fasta.tsv",
touch_file="result/{sample}_NLRclassification.done"
shell:
"""
bash {config[interproscan]} -t p -appl Pfam, COILS, Gene3D -i {input.fasta} -cpu {config[CPUs]} -f tsv -o {output.interpro}
bash NLR_classification.sh {output.interpro} {input.gff} {input.fasta} {input.genome}
touch {output.touch_file}
"""
#Find Integrate domain
rule Integrate_domain:
input:
"result/{sample}_braker_aa.fasta.tsv"
output:
"result/{sample}_ID.done"
params:
"{sample}"
shell:
"""
bash get_integrated_domains.sh {params} NLR
touch {output}
"""