## config.yaml

## Important note:
## Every path in this configuration file must be given either as an absolute path or
## relative to the directory that contains the Snakefile.

## NOTE(review): presumably the project root as seen from the Snakefile - confirm.
proj_dir: ".."

## Reference annotation details
##--------------------------------------------------------------------------------------------
## Annotation source: must be either "Ensembl" or "Gencode".
annotation: "Ensembl"

## Organism name; separate the words with an underscore.
organism: "Homo_sapiens"
## NOTE(review): presumably the genome build and the annotation release number - confirm.
build: "GRCh38"
release: 87
##--------------------------------------------------------------------------------------------


## Paths to existing reference files
##--------------------------------------------------------------------------------------------
txome: "/dataVolume/storage/Homo_sapiens/grch38/txome/Homo_sapiens.GRCh38.cdna.all.fa.gz"
genome: "/dataVolume/storage/Homo_sapiens/grch38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.fa"
## NOTE(review): this key is hyphenated, unlike the other keys in this file. It is left
## unchanged because renaming it would break any lookup by this exact name.
known-variants: "/dataVolume/storage/Homo_sapiens/grch38/dbsnp_146.hg38.vcf.gz"
bed: "/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.bed"
exonic_bed: "/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.exonic.bed"
## NOTE(review): a number, not a path; its unit and consumer are not evident here - confirm.
bin_size: 500000
gtf: "/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.gtf"
repeat_mask: "/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.rmsk.gtf"
dbtss_bed: "/dataVolume/storage/Homo_sapiens/dbtss/corrected_dbtss.bed"
chrom_sizes: "/dataVolume/storage/Homo_sapiens/grch38/Homo_sapiens.GRCh38.dna_sm.primary_assembly.chrom.sizes"
##--------------------------------------------------------------------------------------------


## Paths to indexes that will be generated by the workflow
##--------------------------------------------------------------------------------------------
## NOTE(review): the ".9870.8.2" suffix looks unusual (possibly release 87 fused with a
## Salmon version number) - confirm that this is the intended directory name.
salmonindex: "/dataVolume/storage/Homo_sapiens/grch38/SalmonIndex/Homo_sapiens.GRCh38.9870.8.2"
## NOTE(review): not a path; presumably the k-mer length used for the Salmon index - confirm.
salmonk: 31
STARindex: "/dataVolume/storage/Homo_sapiens/grch38/STARIndex/Homo_sapiens.GRCh38.87.STAR.idx"
HISAT2index: "/dataVolume/storage/Homo_sapiens/grch38_tran/genome_tran"
##--------------------------------------------------------------------------------------------


## Information about the experiment
##--------------------------------------------------------------------------------------------
## NOTE(review): presumably the sequencing read length in bases - confirm.
readlength: 150

## Mean and standard deviation of the fragment length distribution, for use with Salmon.
## This is important to specify for single-end reads.
## For paired-end reads, these values will define the prior, which is then updated
## based on the observed fragment lengths.
fldMean: 250
fldSD: 25

## Path to the metadata text file. The file must contain at least these columns:
##   names: the sample identifiers, i.e. the names of the FASTQ files
##          (excluding the _R1/R2.fastq.gz part)
##   type:  either SE or PE, indicating whether the sample was analyzed via
##          single-end or paired-end sequencing
metatxt: "../data/metadata.txt"

## Variables used for model fitting
##   design:   design formula for use with edgeR, camera and DRIMSeq; must be a string
##             of the form "~ <predictors>"
##   contrast: contrast(s) to estimate in edgeR_dge.Rmd; separate multiple contrasts
##             with commas
design: "~ 0 + celline"
contrast: "cellineN61311-cellineN052611,cellineN052611-cellineN61311"

## Gene sets used for gene set analysis with camera
## Comma-separated list of gene set categories to test with camera.
## Must be a subset of H,C1,C2,C3,C4,C5,C6,C7
## Only required if the variable "run_camera" is enabled (see below).
genesets: "H,C5"

## The maximal number of cores to use for FastQC, STAR, Salmon and DRIMSeq.
## Note that the actual number of cores available to Snakemake is determined by
## the --cores argument when it is invoked.
ncores: 6

## NOTE(review): undocumented setting; its meaning is not evident from this file -
## confirm against the rule that reads it.
n_coverage: 3
##---------------------------------------------------------------------------------------------


## Path to a folder containing gzipped FASTQ files, and the file suffix (typically fastq or fq).
## For paired-end FASTQ files, also define the extensions that distinguish the two read files.
## More precisely, ARMOR assumes that paired-end FASTQ files are named
## <sample-name>_<fqext1>.<fqsuffix>.gz and <sample-name>_<fqext2>.<fqsuffix>.gz,
## and that single-end FASTQ files are named
## <sample-name>.<fqsuffix>.gz.
##---------------------------------------------------------------------------------------------
FASTQ: "../data/FASTQ"
fqext1: "R1"
fqext2: "R2"
fqsuffix: "fastq"
##---------------------------------------------------------------------------------------------

## JBrowse setup
##---------------------------------------------------------------------------------------------
refdir: "/dataVolume/storage/Homo_sapiens"
## Sorted, compressed GFF3 annotation and its .tbi index file.
gff: "/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.sorted.gff3.gz"
gff_tbi: "/dataVolume/storage/Homo_sapiens/grch38_tran/Homo_sapiens.GRCh38.87.sorted.gff3.gz.tbi"
refseq: "/dataVolume/storage/Homo_sapiens/grch38_tran/refSeqs.json"
##---------------------------------------------------------------------------------------------

## Path to the folder that will hold the output generated by the workflow.
## The workflow creates additional subfolders inside this folder.
## To put output in the current directory, set output to ".".
##---------------------------------------------------------------------------------------------
output: "../output/"
##---------------------------------------------------------------------------------------------

## R setup
##---------------------------------------------------------------------------------------------
## Set to true if R should be installed in a conda environment, or to false if you want to use
## your own R installation (then you have to set the path to your library in the .Renviron file).
useCondaR: false
## NOTE(review): presumably the R executable to use when useCondaR is false - confirm.
Rbin: "/usr/bin/R"
##---------------------------------------------------------------------------------------------

## Conditional conda rules
##---------------------------------------------------------------------------------------------
## Switches for the optional workflow steps: read trimming, STAR mapping, HISAT2 mapping,
## Salmon, DRIMSeq analysis, gene set analysis with camera, and gene body coverage.
## Set a switch to false if the step is not required.
run_trimming: true
run_STAR: false
run_HISAT2: true
run_SALMON: false
run_DRIMSeq: false
run_camera: false
run_genebodycoverage: false
##---------------------------------------------------------------------------------------------


## NOTE(review): presumably extra arguments handed to the named tools and subcommands;
## an empty string means no extra arguments - confirm against the rules that read them.
params:
  gatk:
    # NOTE(review): bare file name, unlike the absolute paths used elsewhere in this
    # file - confirm that it resolves from the directory in which the rule runs.
    HaplotypeCaller: "--intervals Homo_sapiens.GRCh38.87.exonic.autosomes.bed"
    BaseRecalibrator: ""
    GenotypeGVCFs: ""
    VariantRecalibrator: ""
  picard:
    MarkDuplicates: "REMOVE_DUPLICATES=true"
  trimmomatic:
    # Trimming steps for paired-end (pe) and single-end (se) samples; both lists are
    # currently identical.
    pe:
      trimmer:
        # See the Trimmomatic manual for additional options, e.g. for adapter trimming.
        - "LEADING:3"
        - "TRAILING:3"
        - "SLIDINGWINDOW:4:15"
        - "MINLEN:36"
    se:
      trimmer:
        # See the Trimmomatic manual for additional options, e.g. for adapter trimming.
        - "LEADING:3"
        - "TRAILING:3"
        - "SLIDINGWINDOW:4:15"
        - "MINLEN:36"