diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 1e35a38..7ff41d2 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -12,12 +12,12 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout with submodules - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: submodules: recursive fetch-depth: 0 - name: Formatting - uses: github/super-linter@v4 + uses: github/super-linter@v5 env: VALIDATE_ALL_CODEBASE: false DEFAULT_BRANCH: master @@ -26,13 +26,13 @@ jobs: linting: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Linting uses: snakemake/snakemake-github-action@v1.22.0 with: directory: .test snakefile: workflow/Snakefile - args: "--lint" + args: "--configfile .test/config_complex/config.yaml --lint" run-workflow: runs-on: ubuntu-latest @@ -41,18 +41,30 @@ jobs: - formatting steps: - name: Checkout repository with submodules - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: submodules: recursive - - name: Test workflow + - name: Test workflow (basic model, no batch_effects) uses: snakemake/snakemake-github-action@v1.22.0 with: directory: .test snakefile: workflow/Snakefile - args: "--use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache" - - name: Test report + args: "--configfile .test/config_basic/config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache" + - name: Test report (basic model, no batch_effects) uses: snakemake/snakemake-github-action@v1.22.0 with: directory: .test snakefile: workflow/Snakefile - args: "--report report.zip" + args: "--configfile .test/config_basic/config.yaml --report report.zip" + - name: Test workflow (multiple variables_of_interest, include batch_effects) + uses: snakemake/snakemake-github-action@v1.22.0 + with: + directory: .test + snakefile: workflow/Snakefile + args: "--configfile .test/config_complex/config.yaml --use-conda --show-failed-logs --cores 2 --conda-cleanup-pkgs cache" + - name: Test report (multiple variables_of_interest, include batch_effects) + uses: snakemake/snakemake-github-action@v1.22.0 + with: + directory: .test + snakefile: workflow/Snakefile + args: "--configfile .test/config_complex/config.yaml --report report.zip" diff --git a/.test/config/config.yaml b/.test/config/config.yaml deleted file mode 100644 index 1db3919..0000000 --- a/.test/config/config.yaml +++ /dev/null @@ -1,41 +0,0 @@ -# path or URL to sample sheet (TSV format, columns: sample, condition, ...) -samples: config/samples.tsv -# path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) -# Units are technical replicates (e.g. lanes, or resequencing of the same biological -# sample). -units: config/units.tsv - - -ref: - # Ensembl species name - species: saccharomyces_cerevisiae - # Ensembl release - release: 100 - # Genome build - build: R64-1-1 - - -trimming: - activate: False - -mergeReads: - activate: False - -pca: - activate: True - labels: - # columns of sample sheet to use for PCA - - condition - -diffexp: - # contrasts for the deseq2 results method - contrasts: - treated-vs-untreated: - - treated - - untreated - model: ~condition - -params: - cutadapt-pe: "" - cutadapt-se: "" - star: "" diff --git a/.test/config_basic/config.yaml b/.test/config_basic/config.yaml new file mode 100644 index 0000000..650bc7d --- /dev/null +++ b/.test/config_basic/config.yaml @@ -0,0 +1,64 @@ +# path or URL to sample sheet (TSV format, columns: sample, condition, ...) +samples: config_basic/samples.tsv +# path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) +# Units are technical replicates (e.g. lanes, or resequencing of the same biological +# sample). +units: config_basic/units.tsv + + +ref: + # Ensembl species name + species: saccharomyces_cerevisiae + # Ensembl release + release: 100 + # Genome build + build: R64-1-1 + + +trimming: + # If you activate trimming by setting this to `True`, you will have to + # specify the respective cutadapt adapter trimming flag for each unit + # in the `units.tsv` file's `adapters` column + activate: True + +mergeReads: + activate: False + +pca: + activate: True + # Per default, a separate PCA plot is generated for each of the + # `variables_of_interest` and the `batch_effects`, coloring according to + # that variables groups. + # If you want PCA plots for further columns in the samples.tsv sheet, you + # can request them under labels as a list, for example: + # - relatively_uninteresting_variable_X + # - possible_batch_effect_Y + labels: + - condition + +diffexp: + # variables where you are interested in whether they have + # an effect on expression levels + variables_of_interest: + condition: + # any fold change will be relative to this factor level + base_level: untreated + batch_effects: "" + # contrasts for the deseq2 results method to determine fold changes + contrasts: + treated-vs-untreated: + # must be one of the variables_of_interest + variable_of_interest: condition + level_of_interest: treated + # The default model includes all interactions among variables_of_interest + # and batch_effects added on. For the example above this implicitly is: + # model: ~condition + # For the default model to be used, simply specify an empty `model: ""` + # With more variables_of_interest or batch_effects, you could introduce different + # assumptions into your model, by specicifying a different model here. + model: ~condition + +params: + cutadapt-pe: "" + cutadapt-se: "" + star: "" diff --git a/.test/config/samples.tsv b/.test/config_basic/samples.tsv similarity index 100% rename from .test/config/samples.tsv rename to .test/config_basic/samples.tsv diff --git a/.test/config/units.tsv b/.test/config_basic/units.tsv similarity index 55% rename from .test/config/units.tsv rename to .test/config_basic/units.tsv index ed1a602..c86b0bf 100644 --- a/.test/config/units.tsv +++ b/.test/config_basic/units.tsv @@ -1,5 +1,5 @@ sample_name unit_name fq1 fq2 sra adapters strandedness -A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq -B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -A2 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq +A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA yes +B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA none +A2 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA none +B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCA reverse diff --git a/.test/config_complex/config.yaml b/.test/config_complex/config.yaml new file mode 100644 index 0000000..b1e04f2 --- /dev/null +++ b/.test/config_complex/config.yaml @@ -0,0 +1,80 @@ +# path or URL to sample sheet (TSV format, columns: sample, condition, ...) +samples: config_complex/samples.tsv +# path or URL to sequencing unit sheet (TSV format, columns: sample, unit, fq1, fq2) +# Units are technical replicates (e.g. lanes, or resequencing of the same biological +# sample). +units: config_complex/units.tsv + + +ref: + # Ensembl species name + species: saccharomyces_cerevisiae + # Ensembl release + release: 100 + # Genome build + build: R64-1-1 + + +trimming: + # If you activate trimming by setting this to `True`, you will have to + # specify the respective cutadapt adapter trimming flag for each unit + # in the `units.tsv` file's `adapters` column + activate: False + +mergeReads: + activate: False + +pca: + activate: True + # Per default, a separate PCA plot is generated for each of the + # `variables_of_interest` and the `batch_effects`, coloring according to + # that variables groups. + # If you want PCA plots for further columns in the samples.tsv sheet, you + # can request them under labels as a list, for example: + # - relatively_uninteresting_variable_X + # - possible_batch_effect_Y + labels: + # columns of sample sheet to use for PCA + - jointly_handled + +diffexp: + # variables where you are interested in whether they have + # an effect on expression levels + variables_of_interest: + treatment_1: + # any fold change will be relative to this factor level + base_level: untreated + treatment_2: + # any fold change will be relative to this factor level + base_level: untreated + batch_effects: + - jointly_handled + # contrasts for the deseq2 results method to determine fold changes + contrasts: + treatment_1_alone: + # must be one of the variables_of_interest + variable_of_interest: treatment_1 + # the variable's level to test against the base_level + level_of_interest: treated + treatment_2_alone: + # must be one of the variables_of_interest + variable_of_interest: treatment_2 + # the variable's level to test against the base_level + level_of_interest: treated + # Must be a valid expression for option two in the contrasts description + # of ?results in the DESeq2 package. For a more detailed intro, also see: + # https://github.com/tavareshugo/tutorial_DESeq2_contrasts/blob/main/DESeq2_contrasts.md + both_treatments: 'list(c("treatment_1_treated_vs_untreated", "treatment_2_treated_vs_untreated", "treatment_1treated.treatment_2treated"))' + # The default model includes all interactions among variables_of_interest, + # and batch_effects added on. For the example above this implicitly is: + # model: ~jointly_handled + treatment_1 * treatment_2 + # For the default model to be used, simply specify an empty `model: ""` below. + # If you want to introduce different assumptions into your model, you can + # specify a different model to use, for example skipping the interaction: + # model: ~jointly_handled + treatment_1 + treatment_2 + model: "" + +params: + cutadapt-pe: "" + cutadapt-se: "" + star: "" diff --git a/.test/config_complex/samples.tsv b/.test/config_complex/samples.tsv new file mode 100644 index 0000000..4cd11f0 --- /dev/null +++ b/.test/config_complex/samples.tsv @@ -0,0 +1,9 @@ +sample_name treatment_1 treatment_2 jointly_handled +A1 treated treated 1 +A2 treated treated 2 +A3 treated untreated 1 +A4 treated untreated 2 +B1 untreated treated 1 +B2 untreated treated 2 +B3 untreated untreated 1 +B4 untreated untreated 2 diff --git a/.test/config_complex/units.tsv b/.test/config_complex/units.tsv new file mode 100644 index 0000000..7cdf4f7 --- /dev/null +++ b/.test/config_complex/units.tsv @@ -0,0 +1,9 @@ +sample_name unit_name fq1 fq2 sra adapters strandedness +A1 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq +A2 1 ngs-test-data/reads/a.scerevisiae.1.fq ngs-test-data/reads/a.scerevisiae.2.fq +A3 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq +A4 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq +B1 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq +B2 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq +B3 1 ngs-test-data/reads/b.scerevisiae.1.fq ngs-test-data/reads/b.scerevisiae.2.fq +B4 1 ngs-test-data/reads/c.scerevisiae.1.fq ngs-test-data/reads/c.scerevisiae.2.fq diff --git a/config/README.md b/config/README.md index 4f22172..095b49f 100644 --- a/config/README.md +++ b/config/README.md @@ -1,16 +1,54 @@ -# General settings -To configure this workflow, modify ``config/config.yaml`` according to your needs, following the explanations provided in the file. +# General configuration -# Sample and unit sheet +To configure this workflow, modify `config/config.yaml` according to your needs, following the explanations provided in the file. -* Add samples to `config/samples.tsv`. For each sample, the columns `sample_name`, and `condition` have to be defined. The `condition` (healthy/tumor, before Treatment / after Treatment) will be used as contrast for the DEG analysis in DESeq2. To include other relevant variables such as batches, add a new column to the sheet. -* For each sample, add one or more sequencing units (runs, lanes or replicates) to the unit sheet `config/units.tsv`. By activating or deactivating `mergeReads` in the `config/config.yaml`, you can decide wether to merge replicates or run them individually. For each unit, define adapters, and either one (column `fq1`) or two (columns `fq1`, `fq2`) FASTQ files (these can point to anywhere in your system). Alternatively, you can define an SRA (sequence read archive) accession (starting with e.g. ERR or SRR) by using a column `sra`. In the latter case, the pipeline will automatically download the corresponding paired end reads from SRA. If both local files and SRA accession are available, the local files will be preferred. -To choose the correct geneCounts produced by STAR, you can define the strandedness of a unit. STAR produces counts for unstranded ('None' - default), forward oriented ('yes') and reverse oriented ('reverse') protocols. +## `DESeq2` differential expression analysis configuration +To successfully run the differential expression analysis, you will need to tell DESeq2 which sample annotations to use (annotations are columns in the `samples.tsv` file described below). +This is done in the `config.yaml` file with the entries under `diffexp:`. +The comments for the entries should give all the necessary infos and linkouts. +But if in doubt, please also consult the [`DESeq2` manual](https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html). + +# Sample and unit setup + +The sample and unit setup is specified via tab-separated tabular files (`.tsv`). Missing values can be specified by empty columns or by writing `NA`. -# DESeq scenario +## sample sheet + +The default sample sheet is `config/samples.tsv` (as configured in `config/config.yaml`). +Each sample refers to an actual physical sample, and replicates (both biological and technical) may be specified as separate samples. +For each sample, you will always have to specify a `sample_name`. +In addition, all `variables_of_interest` and `batch_effects` specified in the `config/config.yaml` under the `diffexp:` entry, will have to have corresponding columns in the `config/samples.tsv`. +Finally, the sample sheet can contain any number of additional columns. +So if in doubt about whether you might at some point need some metadata you already have at hand, just put it into the sample sheet already---your future self will thank you. + +## unit sheet + +The default unit sheet is `config/units.tsv` (as configured in `config/config.yaml`). +For each sample, add one or more sequencing units (for example if you have several runs or lanes per sample). + +### `.fastq` file source + +For each unit, you will have to define a source for your `.fastq` files. +This can be done via the columns `fq1`, `fq2` and `sra`, with either of: +1. A single `.fastq` file for single-end reads (`fq1` column only; `fq2` and `sra` columns present, but empty). + The entry can be any path on your system, but we suggest something like a `raw/` data directory within your analysis directory. +2. Two `.fastq` files for paired-end reads (columns `fq1` and `fq2`; column `sra` present, but empty). + As for the `fq1` column, the `fq2` column can also point to anywhere on your system. +3. A sequence read archive (SRA) accession number (`sra` column only; `fq1` and `fq2` columns present, but empty). + The workflow will automatically download the corresponding `.fastq` data (currently assumed to be paired-end). + The accession numbers usually start with SRR or ERR and you can find accession numbers for studies of interest with the [SRA Run Selector](https://trace.ncbi.nlm.nih.gov/Traces/study/). +If both local files and an SRA accession are specified for the same unit, the local files will be used. + +### adapter trimming + +If you set `trimming: activate:` in the `config/config.yaml` to `True`, you will have to provide at least one `cutadapt` adapter argument for each unit in the `adapters` column of the `units.tsv` file. +You will need to find out the adapters used in the sequencing protocol that generated a unit: from your sequencing provider, or for published data from the study's metadata (or its authors). +Then, enter the adapter sequences into the `adapters` column of that unit, preceded by the [correct `cutadapt` adapter argument](https://cutadapt.readthedocs.io/en/stable/guide.html#adapter-types). -To initialize the DEG analysis, you need to define a model in the `config/config.yaml`. The model can include all variables introduced as columns in `config/samples.tsv`. -* The standard model is `~condition` - to include a batch variable, write `~batch + condition`. +### strandedness of library preparation protocol +To get the correct `geneCounts` from `STAR` output, you can provide information on the strandedness of the library preparation protocol used for a unit. +`STAR` can produce counts for unstranded (`none` - this is the default), forward oriented (`yes`) and reverse oriented (`reverse`) protocols. +Enter the respective value into a `strandedness` column in the `units.tsv` file. diff --git a/config/config.yaml b/config/config.yaml index 9abea28..220ad38 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -15,23 +15,56 @@ ref: build: GRCh38 trimming: + # If you activate trimming by setting this to `True`, you will have to + # specify the respective cutadapt adapter trimming flag for each unit + # in the `units.tsv` file's `adapters` column activate: False pca: activate: True - labels: - # columns of sample sheet to use for PCA - - condition + # Per default, a separate PCA plot is generated for each of the + # `variables_of_interest` and the `batch_effects`, coloring according to + # that variables groups. + # If you want PCA plots for further columns in the samples.tsv sheet, you + # can request them under labels as a list, for example: + # - relatively_uninteresting_variable_X + # - possible_batch_effect_Y + labels: "" diffexp: - # contrasts for the deseq2 results method + # variables for whome you are interested in whether they have an effect on + # expression levels + variables_of_interest: + treatment_1: + # any fold change will be relative to this factor level + base_level: B + treatment_2: + # any fold change will be relative to this factor level + base_level: C + # variables whose effect you want to model to separate them from your + # variables_of_interest + batch_effects: + - jointly_handled + # contrasts for the deseq2 results method to determine fold changes contrasts: - A-vs-B: - - A - - B - model: ~condition + A-vs-B_treatment_1: + # must be one of the variables_of_interest, for details see: + # https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts + variable_of_interest: treatment_1 + # must be a level present in the variable_of_interest that is not the + # base_level specified above + level_of_interest: A + # The default model includes all interactions among variables_of_interest + # and batch_effects added on. For the example above this implicitly is: + # model: ~jointly_handled + treatment_1 * treatment_2 + # For the default model to be used, simply specify an empty `model: ""` below. + # If you want to introduce different assumptions into your model, you can + # specify a different model to use, for example skipping the interaction: + # model: ~jointly_handled + treatment_1 + treatment_2 + model: "" + params: cutadapt-pe: "" cutadapt-se: "" - star: "--outSAMtype BAM Unsorted" + star: "" diff --git a/config/samples.tsv b/config/samples.tsv index de2bbd4..a505f54 100644 --- a/config/samples.tsv +++ b/config/samples.tsv @@ -1,3 +1,6 @@ -sample_name condition -A untreated -B treated +sample_name treatment_1 treatment_2 jointly_handled +A untreated untreated 1 +B untreated treated 1 +C treated untreated 1 +D treated untreated 2 +E treated treated 2 diff --git a/config/units.tsv b/config/units.tsv index 85e35b0..ae8303f 100644 --- a/config/units.tsv +++ b/config/units.tsv @@ -2,3 +2,6 @@ sample_name unit_name fq1 fq2 sra adapters strandedness A lane1 A.1.fq.gz A.2.fq.gz A lane2 A2.1.fq.gz A2.2.fq.gz B lane1 B.1.fq.gz B.2.fq.gz +C lane1 C.1.fq.gz C.2.fq.gz +D lane1 D.1.fq.gz D.2.fq.gz +E lane1 E.1.fq.gz E.2.fq.gz diff --git a/workflow/Snakefile b/workflow/Snakefile index b29416b..c0605f1 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -36,5 +36,3 @@ include: "rules/diffexp.smk" rule all: input: get_final_output(), - "results/qc/multiqc_report.html", - "results/pca.svg", diff --git a/workflow/envs/deseq2.yaml b/workflow/envs/deseq2.yaml index 384ee41..dcdc1ea 100644 --- a/workflow/envs/deseq2.yaml +++ b/workflow/envs/deseq2.yaml @@ -4,4 +4,5 @@ channels: - nodefaults dependencies: - bioconductor-deseq2 =1.38 + - r-stringr =1.5 - r-ashr =2.2_54 diff --git a/workflow/rules/align.smk b/workflow/rules/align.smk index ee443ce..24aa523 100644 --- a/workflow/rules/align.smk +++ b/workflow/rules/align.smk @@ -2,6 +2,7 @@ rule align: input: unpack(get_fq), index="resources/star_genome", + gtf="resources/genome.gtf", output: aln="results/star/{sample}_{unit}/Aligned.sortedByCoord.out.bam", reads_per_gene="results/star/{sample}_{unit}/ReadsPerGene.out.tab", @@ -9,9 +10,7 @@ rule align: "logs/star/{sample}_{unit}.log", params: idx=lambda wc, input: input.index, - extra="--outSAMtype BAM SortedByCoordinate --quantMode GeneCounts --sjdbGTFfile {} {}".format( - "resources/genome.gtf", config["params"]["star"] - ), + extra=lambda wc, input: f'--outSAMtype BAM SortedByCoordinate --quantMode GeneCounts --sjdbGTFfile {input.gtf} {config["params"]["star"]}', threads: 24 wrapper: "v1.21.4/bio/star/align" diff --git a/workflow/rules/common.smk b/workflow/rules/common.smk index b36a4fa..515aef1 100644 --- a/workflow/rules/common.smk +++ b/workflow/rules/common.smk @@ -22,6 +22,18 @@ def get_final_output(): ) final_output.append("results/deseq2/normcounts.symbol.tsv") final_output.append("results/counts/all.symbol.tsv") + final_output.append("results/qc/multiqc_report.html") + + if config["pca"]["activate"]: + # get all the variables to plot a PCA for + pca_variables = list(config["diffexp"]["variables_of_interest"]) + if config["diffexp"]["batch_effects"]: + pca_variables.extend(config["diffexp"]["batch_effects"]) + if config["pca"]["labels"]: + pca_variables.extend(config["pca"]["labels"]) + final_output.extend( + expand("results/pca.{variable}.svg", variable=pca_variables) + ) return final_output @@ -35,6 +47,11 @@ units = ( validate(units, schema="../schemas/units.schema.yaml") +wildcard_constraints: + sample="|".join(samples["sample_name"]), + unit="|".join(units["unit_name"]), + + def get_cutadapt_input(wildcards): unit = units.loc[wildcards.sample].loc[wildcards.unit] diff --git a/workflow/rules/diffexp.smk b/workflow/rules/diffexp.smk index 301bcdb..074a136 100644 --- a/workflow/rules/diffexp.smk +++ b/workflow/rules/diffexp.smk @@ -38,9 +38,6 @@ rule deseq2_init: output: "results/deseq2/all.rds", "results/deseq2/normcounts.tsv", - params: - samples=config["samples"], - model=config["diffexp"]["model"], conda: "../envs/deseq2.yaml" log: @@ -54,13 +51,11 @@ rule pca: input: "results/deseq2/all.rds", output: - report("results/pca.svg", "../report/pca.rst"), - params: - pca_labels=config["pca"]["labels"], + report("results/pca.{variable}.svg", "../report/pca.rst"), conda: "../envs/deseq2.yaml" log: - "logs/pca.log", + "logs/pca.{variable}.log", script: "../scripts/plot-pca.R" diff --git a/workflow/rules/trim.smk b/workflow/rules/trim.smk index 640900f..9acf527 100644 --- a/workflow/rules/trim.smk +++ b/workflow/rules/trim.smk @@ -32,7 +32,7 @@ rule cutadapt_pe: log: "logs/cutadapt/{sample}_{unit}.log", params: - others=config["params"]["cutadapt-pe"], + extra=config["params"]["cutadapt-pe"], adapters=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]), threads: 8 wrapper: @@ -48,7 +48,7 @@ rule cutadapt_se: log: "logs/cutadapt/{sample}_{unit}.log", params: - others=config["params"]["cutadapt-se"], + extra=config["params"]["cutadapt-se"], adapters_r1=lambda w: str(units.loc[w.sample].loc[w.unit, "adapters"]), threads: 8 wrapper: diff --git a/workflow/schemas/config.schema.yaml b/workflow/schemas/config.schema.yaml index 46f3a8c..84e7fe2 100644 --- a/workflow/schemas/config.schema.yaml +++ b/workflow/schemas/config.schema.yaml @@ -38,11 +38,12 @@ properties: activate: type: boolean labels: - type: array + type: + - array + - string items: type: string required: - - labels - activate diffexp: diff --git a/workflow/schemas/samples.schema.yaml b/workflow/schemas/samples.schema.yaml index f387b9c..c823570 100644 --- a/workflow/schemas/samples.schema.yaml +++ b/workflow/schemas/samples.schema.yaml @@ -5,10 +5,6 @@ properties: sample_name: type: string description: sample name/identifier - condition: - type: string - description: sample condition that will be compared during differential expression analysis (e.g. a treatment, a tissue time, a disease) required: - sample_name - - condition diff --git a/workflow/scripts/deseq2-init.R b/workflow/scripts/deseq2-init.R index 03cf4c9..c3745c3 100644 --- a/workflow/scripts/deseq2-init.R +++ b/workflow/scripts/deseq2-init.R @@ -1,7 +1,8 @@ -log <- file(snakemake@log[[1]], open="wt") +log <- file(snakemake@log[[1]], open = "wt") sink(log) sink(log, type="message") +library(stringr) library("DESeq2") parallel <- FALSE @@ -12,25 +13,78 @@ if (snakemake@threads > 1) { parallel <- TRUE } -# colData and countData must have the same sample order, but this is ensured -# by the way we create the count matrix -cts <- read.table(snakemake@input[["counts"]], header=TRUE, row.names="gene", check.names=FALSE) -cts <- cts[ , order(names(cts))] +counts_data <- read.table( + snakemake@input[["counts"]], + header = TRUE, + row.names = "gene", + check.names = FALSE +) +counts_data <- counts_data[, order(names(counts_data))] -coldata <- read.table(snakemake@params[["samples"]], header=TRUE, row.names="sample_name", check.names=FALSE) -coldata <- coldata[order(row.names(coldata)), , drop=F] +col_data <- read.table( + snakemake@config[["samples"]], + header = TRUE, + row.names = "sample_name", + check.names = FALSE +) +col_data <- col_data[order(row.names(col_data)), , drop = FALSE] -dds <- DESeqDataSetFromMatrix(countData=cts, - colData=coldata, - design=as.formula(snakemake@params[["model"]])) +# properly set the base level to the configuration in config.yaml, avoiding +# the default behaviour of choosing the alphabetical minimum level +for (vof in names(snakemake@config[["diffexp"]][["variables_of_interest"]])) { + snakemake@config[["diffexp"]][["variables_of_interest"]][[vof]] + base_level <- snakemake@config[["diffexp"]][["variables_of_interest"]][[vof]][["base_level"]] + col_data[[vof]] <- relevel( + factor(col_data[[vof]]), base_level + ) +} + +# properly turn all batch effects into factors, even if they are numeric +batch_effects <- snakemake@config[["diffexp"]][["batch_effects"]] +for (effect in batch_effects) { + if (str_length(effect) > 0) { + col_data[[effect]] <- factor(col_data[[effect]]) + } +} + +# build up formula with additive batch_effects and all interactions between the +# variables_of_interes + +design_formula <- snakemake@config[["diffexp"]][["model"]] + +if (str_length(design_formula) == 0) { + batch_effects <- str_flatten(batch_effects, " + ") + if (str_length(batch_effects) > 0) { + batch_effects <- str_c(batch_effects, " + ") + } + vof_interactions <- str_flatten( + names(snakemake@config[["diffexp"]][["variables_of_interest"]]), + " * " + ) + design_formula <- str_c("~", batch_effects, vof_interactions) +} + +dds <- DESeqDataSetFromMatrix( + countData = counts_data, + colData = col_data, + design = as.formula(design_formula) +) # remove uninformative columns -dds <- dds[ rowSums(counts(dds)) > 1, ] +dds <- dds[rowSums(counts(dds)) > 1, ] # normalization and preprocessing -dds <- DESeq(dds, parallel=parallel) +dds <- DESeq(dds, parallel = parallel) # Write dds object as RDS -saveRDS(dds, file=snakemake@output[[1]]) +saveRDS(dds, file = snakemake@output[[1]]) # Write normalized counts -norm_counts = counts(dds, normalized=T) -write.table(data.frame("gene"=rownames(norm_counts), norm_counts), file=snakemake@output[[2]], sep='\t', row.names=F) +norm_counts <- counts(dds, normalized = TRUE) +write.table( + data.frame( + "gene" = rownames(norm_counts), + norm_counts + ), + file = snakemake@output[[2]], + sep = "\t", + row.names = FALSE +) diff --git a/workflow/scripts/deseq2.R b/workflow/scripts/deseq2.R index bee231c..a7b246e 100644 --- a/workflow/scripts/deseq2.R +++ b/workflow/scripts/deseq2.R @@ -1,7 +1,8 @@ -log <- file(snakemake@log[[1]], open="wt") +log <- file(snakemake@log[[1]], open = "wt") sink(log) -sink(log, type="message") +sink(log, type = "message") +library("cli") library("DESeq2") parallel <- FALSE @@ -14,21 +15,79 @@ if (snakemake@threads > 1) { dds <- readRDS(snakemake@input[[1]]) -contrast <- c("condition", snakemake@params[["contrast"]]) -res <- results(dds, contrast=contrast, parallel=parallel) +contrast_config <- snakemake@config[["diffexp"]][["contrasts"]][[ + snakemake@wildcards[["contrast"]] +]] + +# basic case of contrast specification, see: +# https://www.bioconductor.org/packages/devel/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#contrasts +if (length(contrast_config) == 2 && typeof(contrast_config) == "list") { + if ( + # check for existence contrast's variable_of_interest to + # provide useful error message + !(contrast_config[["variable_of_interest"]] %in% + names(snakemake@config[["diffexp"]][["variables_of_interest"]]) + ) + ) { + cli_abort( + c( + "config.yaml: All variable_of_interest entries under `diffexp: contrasts:`", + " " = "must also exist under `diffexp: variables_of_interest:`.", + "x" = "Could not find variable_of_interest: {contrast_config[['variable_of_interest']]}", + " " = "It was not among the `diffexp: variables_of_interest:`", + " " = "{names(snakemake@config[['diffexp']][['variables_of_interest']])}", + "i" = "Are there any typos in the contrasts' `variable_of_interest:` entries?" + ) + ) + } + contrast <- c( + contrast_config[["variable_of_interest"]], + contrast_config[["level_of_interest"]], + snakemake@config[["diffexp"]][["variables_of_interest"]][[ + contrast_config[["variable_of_interest"]] + ]][["base_level"]] + ) +# more complex contrast specification via list(c(), c()), see ?results docs of +# the DESeq2 package and this tutorial (plus the linked seqanswers thread): +# https://github.com/tavareshugo/tutorial_DESeq2_contrasts/blob/main/DESeq2_contrasts.md +} else if ( + length(contrast_config) == 1 && + typeof(contrast_config) == "character" + ) { + contrast <- d <- eval(parse(text = contrast_config)) +} + +res <- results( + dds, + contrast = contrast, + parallel = parallel +) # shrink fold changes for lowly expressed genes -# use ashr so we can use `contrast` as conversion to coef is not trivial -# see https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators -res <- lfcShrink(dds, contrast=contrast, res=res, type="ashr") +# use ashr so we can use `contrast` as conversion to coef is not trivial, see: +# https://bioconductor.org/packages/release/bioc/vignettes/DESeq2/inst/doc/DESeq2.html#extended-section-on-shrinkage-estimators +res <- lfcShrink( + dds, + contrast = contrast, + res = res, + type = "ashr" +) # sort by p-value -res <- res[order(res$padj),] +res <- res[order(res$padj), ] # TODO explore IHW usage # store results svg(snakemake@output[["ma_plot"]]) -plotMA(res, ylim=c(-2,2)) +plotMA(res, ylim = c(-2, 2)) dev.off() -write.table(data.frame("gene"=rownames(res),res), file=snakemake@output[["table"]], row.names=FALSE, sep='\t') +write.table( + data.frame( + "gene" = rownames(res), + res + ), + file = snakemake@output[["table"]], + row.names = FALSE, + sep = "\t" +) diff --git a/workflow/scripts/plot-pca.R b/workflow/scripts/plot-pca.R index 663ece5..fd145df 100644 --- a/workflow/scripts/plot-pca.R +++ b/workflow/scripts/plot-pca.R @@ -1,6 +1,6 @@ -log <- file(snakemake@log[[1]], open="wt") +log <- file(snakemake@log[[1]], open = "wt") sink(log) -sink(log, type="message") +sink(log, type = "message") library("DESeq2") @@ -10,5 +10,5 @@ dds <- readRDS(snakemake@input[[1]]) # obtain normalized counts counts <- rlog(dds, blind=FALSE) svg(snakemake@output[[1]]) -plotPCA(counts, intgroup=snakemake@params[["pca_labels"]]) +plotPCA(counts, intgroup = snakemake@wildcards[["variable"]]) dev.off()