-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
executable file
·153 lines (130 loc) · 6.82 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
## Important note:
## All paths defined in this configuration file must be either absolute or relative to the
## location of the Snakefile!
proj_dir: '..'
## Reference annotation details
##--------------------------------------------------------------------------------------------
## Annotation source: set to either "Ensembl" or "Gencode".
annotation: 'Ensembl'
## Organism name; separate the words with an underscore.
organism: 'Homo_sapiens'
build: 'GRCh38'
release: 87
##--------------------------------------------------------------------------------------------
## Locations of reference files that must already exist
##--------------------------------------------------------------------------------------------
txome: '/dataVolume/storage/Homo_sapiens/grch38/txome/Homo_sapiens.GRCh38.cdna.all.fa.gz'
genome: '/dataVolume/storage/Homo_sapiens/grch38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa'
known-variants: '/dataVolume/storage/Homo_sapiens/grch38/dbsnp_146.hg38.vcf.gz'
bed: '/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.bed'
exonic_bed: '/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.exonic.bed'
## Numeric setting (the only entry in this section that is not a path).
bin_size: 500000
gtf: '/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.gtf'
repeat_mask: '/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.rmsk.gtf'
dbtss_bed: '/dataVolume/storage/Homo_sapiens/dbtss/corrected_dbtss.bed'
chrom_sizes: '/dataVolume/storage/Homo_sapiens/grch38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.chrom.sizes'
##--------------------------------------------------------------------------------------------
## Locations of the indexes that the workflow will generate
##--------------------------------------------------------------------------------------------
salmonindex: '/dataVolume/storage/Homo_sapiens/grch38/SalmonIndex/Homo_sapiens.GRCh38.9870.8.2'
salmonk: 31
STARindex: '/dataVolume/storage/Homo_sapiens/grch38/STARIndex/Homo_sapiens.GRCh38.87.STAR.idx'
HISAT2index: '/dataVolume/storage/Homo_sapiens/grch38_tran/genome_tran'
##--------------------------------------------------------------------------------------------
## Information about the experiment
##--------------------------------------------------------------------------------------------
readlength: 150
## Fragment length distribution (mean and standard deviation), for use with Salmon.
## Specifying these is important for single-end reads.
## With paired-end reads the values only define the prior, which is then updated
## based on the observed fragment lengths.
fldMean: 250
fldSD: 25
## Metadata text file. It must contain at least the following columns:
##   names: the sample identifiers = the names of the FASTQ files (excluding the _R1/R2.fastq.gz part)
##   type: either SE or PE, indicating whether the sample was analyzed
##         via single-end or paired-end sequencing.
metatxt: '../data/metadata.txt'
## Model-fitting variables
##   design: design formula for use with edgeR, camera and DRIMSeq. Must be a string
##           of the form "~ <predictors>"
##   contrast: contrast(s) to estimate in edgeR_dge.Rmd (comma-separated if more than one)
design: '~ 0 + celline'
contrast: 'cellineN61311-cellineN052611,cellineN052611-cellineN61311'
## Gene set categories to test with camera, given as a comma-separated list.
## Must be a subset of H,C1,C2,C3,C4,C5,C6,C7.
## Only required if the variable "run_camera" is enabled (see below).
genesets: 'H,C5'
## Upper bound on the number of cores used by FastQC, STAR, Salmon and DRIMSeq.
## The number of cores actually available to Snakemake is determined by
## the --cores argument given when Snakemake is invoked.
ncores: 6
n_coverage: 3
##---------------------------------------------------------------------------------------------
## Folder holding the gzipped fastq files, plus the file suffix (typically either fastq or fq).
## For paired-end fastq files, also define the extension that distinguishes the two read files.
## More precisely, ARMOR assumes that paired-end fastq files are named
## <sample-name>_<fqext1>.<fqsuffix>.gz and <sample-name>_<fqext2>.<fqsuffix>.gz,
## while single-end fastq files are expected to be named
## <sample-name>.<fqsuffix>.gz.
##---------------------------------------------------------------------------------------------
FASTQ: '../data/FASTQ'
fqext1: 'R1'
fqext2: 'R2'
fqsuffix: 'fastq'
##---------------------------------------------------------------------------------------------
## JBrowse setup
##---------------------------------------------------------------------------------------------
refdir: '/dataVolume/storage/Homo_sapiens'
gff: '/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.sorted.gff3.gz'
gff_tbi: '/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.sorted.gff3.gz.tbi'
refseq: '/dataVolume/storage/Homo_sapiens/grch38_tran/refSeqs.json'
##---------------------------------------------------------------------------------------------
## Folder in which the workflow stores the output it generates.
## The workflow creates additional subfolders inside this folder.
## Set output to "." to write the output to the current directory.
##---------------------------------------------------------------------------------------------
output: '../output/'
##---------------------------------------------------------------------------------------------
## R setup
##---------------------------------------------------------------------------------------------
## Set to true if R should be installed in a conda environment, or to false if you want to
## use your own R installation (in that case, set the path to your library in the .Renviron file).
useCondaR: false
Rbin: '/usr/bin/R'
##---------------------------------------------------------------------------------------------
## Conditional conda rules
##---------------------------------------------------------------------------------------------
## Switches for the optional steps (e.g. read trimming, STAR mapping, DRIMSeq analysis and
## gene set analysis). Set a flag to false if the step is not required.
run_trimming: true
run_STAR: false
run_HISAT2: true
run_SALMON: false
run_DRIMSeq: false
run_camera: false
run_genebodycoverage: false
##---------------------------------------------------------------------------------------------
## Per-tool settings, nested as params -> tool -> option.
## The indentation is significant: each level must sit under its parent, otherwise the
## parent keys parse as empty (null) values and the two "trimmer" lists become
## duplicate keys of the same mapping.
## NOTE(review): the string values look like extra command-line arguments for the
## named tools - confirm against the rules that read them.
params:
  gatk:
    HaplotypeCaller: "--intervals Homo_sapiens.GRCh38.87.exonic.autosomes.bed"
    BaseRecalibrator: ""
    GenotypeGVCFs: ""
    VariantRecalibrator: ""
  picard:
    MarkDuplicates: "REMOVE_DUPLICATES=true"
  trimmomatic:
    # Settings for paired-end (pe) and single-end (se) reads are kept separately.
    pe:
      trimmer:
        # See trimmomatic manual for adding additional options, e.g. for adapter trimming.
        - "LEADING:3"
        - "TRAILING:3"
        - "SLIDINGWINDOW:4:15"
        - "MINLEN:36"
    se:
      trimmer:
        # See trimmomatic manual for adding additional options, e.g. for adapter trimming.
        - "LEADING:3"
        - "TRAILING:3"
        - "SLIDINGWINDOW:4:15"
        - "MINLEN:36"