Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/conda_ci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: conda_ci
dependencies:
- python>=3.11,<3.13
- python_abi
- anaconda-client
- conda-build
- conda-verify
1 change: 1 addition & 0 deletions docs/content/News.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ snakePipes 3.5.0
----------------

* Allelic-whatshap mode was added to DNAmapping. Requires --phasedVcf.
* --useSpikeInForNorm option to split bam files by host and spikein genome, and to generate spikein-normalized bamCoverage tracks, was added to DNAmapping.
* Allelic-whatshap mode now works with alignment-free mode and fromBAM in the mRNAseq workflow.


Expand Down
2 changes: 1 addition & 1 deletion docs/content/cookbook.rst
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ Process and analyze Cut&Tag data for chromatin binding. Fastq files are mapped t

.. code-block:: bash
DNAmapping --cutntag --trim --trimmerOptions ' -a nexteraF=CTGTCTCTTATA -A nexteraR=CTGTCTCTTATA ' --fastqc --dedup --mapq 3 -i $input_folder -o analysis_dedup customIndices/GRch38_dm6/GRCh38_g31_dm6.yaml
DNAmapping --cutntag --trim --trimmerOptions ' -a nexteraF=CTGTCTCTTATA -A nexteraR=CTGTCTCTTATA ' --fastqc --dedup --useSpikeInForNorm --mapq 3 -i $input_folder -o analysis_dedup customIndices/GRch38_dm6/GRCh38_g31_dm6.yaml
.. code-block:: bash
Expand Down
8 changes: 8 additions & 0 deletions docs/content/workflows/DNAmapping.rst
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,12 @@ There is a configuration file in ``snakePipes/workflows/DNAmapping/defaults.yaml
fragmentLength: 200
qualimap: false
verbose: false
#phased vcf
pvcf:
#arguments related to hybrid genome alignment
useSpikeInForNorm: False
spikeinExt: "_spikein"
spikein_bin_size: 1000

Many of these options can be more conveniently set on the command-line. However, you may need to change the ``reads:`` setting if your paired-end files are not denoted by ``sample_R1.fastq.gz`` and ``sample_R2.fastq.gz``, but rather ``sample_1.fastq.gz`` and ``sample_2.fastq.gz``.

Expand Down Expand Up @@ -119,6 +125,8 @@ A number of other directories may optionally be present if you specified read tr

A fair number of useful QC plots are or can be generated by the pipeline. These include correlation and PCA plots as well as the output from MultiQC.

Specifying `--useSpikeInForNorm` will cause the bam files aligned to a hybrid host and spikein genome to be split into correspondingly named parts. Split bam files will be found in the "split_bam" folder. Deeptools QC for the split bam files will be performed and the results will be found in the "split_deepTools_qc" folder. Spikein-scaled bam coverage tracks will be output in the bamCoverage folder.

.. image:: ../images/DNAmapping_correlation.png

Command line options
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ build = [
]

[tool.setuptools.package-data]
snakePipes = ["**/*.yaml", "**/*.R", "**/*.Rmd", "**/*snakefile", "**/*Snakefile", "**/*sh", "**/*txt"]
snakePipes = ["**/*.yaml", "**/*.R","**/*.py", "**/*.Rmd", "**/*snakefile", "**/*Snakefile", "**/*sh", "**/*txt"]

[project.scripts]
ATACseq = "snakePipes.workflows.ATACseq.ATACseq:main"
Expand Down
5 changes: 3 additions & 2 deletions snakePipes/shared/rules/deepTools_qc_allelic.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,14 +46,15 @@ rule multiBamSummary_allelic:
bams = expand("allelic_bams/{sample}.{suffix}.sorted.bam", sample=samples, suffix = ['genome1', 'genome2']),
bais = expand("allelic_bams/{sample}.{suffix}.sorted.bam.bai", sample=samples, suffix = ['genome1', 'genome2'])
output:
npz = "deepTools_qc/multiBamSummary/read_coverage_allelic.bins.npz"
npz = "deepTools_qc/multiBamSummary/read_coverage_allelic.bins.npz",
sf = "deepTools_qc/multiBamSummary/allelic.scaling_factors.txt"
params:
labels = " ".join(expand('{sample}.{suffix}', sample=samples, suffix = ['genome1', 'genome2'])),
blacklist = "--blackListFileName "+blacklist_bed if blacklist_bed
else "",
read_extension = "--extendReads" if pairedEnd
else "--extendReads " + str(fragmentLength),
scaling_factors = "",
scaling_factors = "--scalingFactors deepTools_qc/multiBamSummary/allelic.scaling_factors.txt",
binSize = "",
spikein_region = ""
benchmark:
Expand Down
41 changes: 41 additions & 0 deletions snakePipes/shared/rules/deepTools_qc_allelic_spikein.snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from pathlib import Path
tools_dir = Path(maindir) / "shared" / "tools"

def get_scaling_factor(sample, input):
    """Look up the scaling factor computed by multiBamSummary for *sample*.

    *input* is the path (relative to the global ``outdir``) of the
    tab-separated table written via ``--scalingFactors``: one header line
    followed by ``sample<TAB>factor`` rows. Returns 1.0 when the table
    does not exist yet (e.g. during a dry run); raises KeyError if the
    table exists but has no row for *sample*.
    """
    path = os.path.join(outdir, input)
    if not os.path.isfile(path):
        return 1.0
    sf_dict = {}
    with open(path) as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.rstrip("\n").split("\t")
            # guard against blank or malformed lines, which crashed the
            # previous index-based parsing with an IndexError
            if len(fields) >= 2:
                sf_dict[fields[0]] = fields[1]
    return float(sf_dict[sample])


# Allele-specific coverage tracks scaled by spike-in-derived factors:
# each genome1/genome2 allelic BAM is converted to a bigWig whose values
# are multiplied by that sample's factor from the spike-in scaling table.
rule bamCoverage_spikein:
    input:
        bam = "allelic_bams/{sample}.{suffix}.sorted.bam" ,
        bai = "allelic_bams/{sample}.{suffix}.sorted.bam.bai",
        # per-sample factors written by the spike-in multiBamSummary run
        scale_factors = "split_deepTools_qc/multiBamSummary/spikein.scaling_factors.txt"
    output:
        "bamCoverage/allele_specific/{sample}.{suffix}.host_scaled.BYspikein.bw"
    params:
        bwBinSize = bwBinSize,
        genome_size = int(genome_size),
        ignoreForNorm = "--ignoreForNormalization {}".format(ignoreForNormalization) if ignoreForNormalization else "",
        read_extension = "--extendReads" if pairedEnd
                         else "--extendReads {}".format(fragmentLength),
        blacklist = "--blackListFileName {}".format(blacklist_bed) if blacklist_bed
                    else "",
        scaling_factors = lambda wildcards,input: "--scaleFactor {}".format(get_scaling_factor(wildcards.sample,input.scale_factors)) ## subset for the one factor needed
    benchmark:
        "bamCoverage/allele_specific/.benchmark/bamCoverage.{sample}.{suffix}_BYspikein.benchmark"
    threads: lambda wildcards: 16 if 16<max_thread else max_thread # 4GB per core
    conda: CONDA_SHARED_ENV
    shell: bamcov_spikein_cmd
95 changes: 95 additions & 0 deletions snakePipes/shared/rules/split_bam_ops_DNA_spikein.snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
# Genome partitions of the hybrid alignment.
part=['host','spikein']
# Per-partition blacklist BED (value may be empty/None when unset).
blacklist_dict={"host": blacklist_bed,"spikein": spikein_blacklist_bed }
# Space-separated chromosome name lists used to slice BAMs by partition.
region_dict={"host": " ".join(host_chr.keys()),"spikein": " ".join(spikein_chr.keys())}


def get_scaling_factor(sample, input):
    """Look up the scaling factor computed by multiBamSummary for *sample*.

    *input* is the path (relative to the global ``outdir``) of the
    tab-separated table written via ``--scalingFactors``: one header line
    followed by ``sample<TAB>factor`` rows. Returns 1.0 when the table
    does not exist yet (e.g. during a dry run); raises KeyError if the
    table exists but has no row for *sample*.

    NOTE(review): this helper is duplicated in
    deepTools_qc_allelic_spikein.snakefile — consider moving it to a
    shared module.
    """
    path = os.path.join(outdir, input)
    if not os.path.isfile(path):
        return 1.0
    sf_dict = {}
    with open(path) as f:
        next(f, None)  # skip the header line
        for line in f:
            fields = line.rstrip("\n").split("\t")
            # guard against blank or malformed lines, which crashed the
            # previous index-based parsing with an IndexError
            if len(fields) >= 2:
                sf_dict[fields[0]] = fields[1]
    return float(sf_dict[sample])


# Slice each filtered BAM into a host-only or spike-in-only BAM by
# passing the partition's chromosome list to `sambamba slice`.
rule split_bamfiles_by_genome:
    input:
        bam = "filtered_bam/{sample}.filtered.bam",
        bai = "filtered_bam/{sample}.filtered.bam.bai"
    output:
        bam = "split_bam/{sample}_{part}.bam",
        bai = "split_bam/{sample}_{part}.bam.bai"
    params:
        # chromosome names belonging to this genome part (host or spikein)
        region = lambda wildcards: region_dict[wildcards.part]
    conda: CONDA_SAMBAMBA_ENV
    threads: 4
    shell: """
        sambamba slice -o {output.bam} {input.bam} {params.region};
        sambamba index -t {threads} {output.bam}
        """

# Per-partition coverage summary across all samples; also emits the
# per-sample scaling-factor table consumed by the bamCoverage rules.
rule multiBamSummary_by_part:
    input:
        bams = lambda wildcards: expand("split_bam/{sample}_{part}.bam", sample=samples,part=wildcards.part),
        bais = lambda wildcards: expand("split_bam/{sample}_{part}.bam.bai", sample=samples,part=wildcards.part)
    output:
        npz = "split_deepTools_qc/multiBamSummary/{part}_read_coverage.bins.npz",
        scale_factors = "split_deepTools_qc/multiBamSummary/{part}.scaling_factors.txt"
    params:
        labels = " ".join(samples),
        # partition-specific blacklist; omitted when none configured
        blacklist = lambda wildcards: "--blackListFileName {}".format(blacklist_dict[wildcards.part]) if blacklist_dict[wildcards.part] else "",
        read_extension = "--extendReads" if pairedEnd
                         else "--extendReads {}".format(fragmentLength),
        # {part} is expanded by Snakemake inside the shell command
        scaling_factors = "--scalingFactors split_deepTools_qc/multiBamSummary/{part}.scaling_factors.txt",
        # override the default bin size for the spike-in partition only
        binSize = lambda wildcards: " --binSize "+str(spikein_bin_size) if wildcards.part=="spikein" else "",
        # optionally restrict the spike-in summary to a configured region
        spikein_region = lambda wildcards: " --region "+spikein_region if ((wildcards.part=="spikein") and (spikein_region != "")) else ""
    benchmark:
        "split_deepTools_qc/.benchmark/{part}_multiBamSummary.benchmark"
    threads: lambda wildcards: 24 if 24<max_thread else max_thread
    conda: CONDA_SHARED_ENV
    shell: multiBamSummary_cmd


# Coverage tracks of the HOST alignments, scaled by {part}-derived
# factors. NOTE(review): the input BAM is always the host part — {part}
# only selects which scaling-factor table (host or spikein) is applied,
# matching the "host_scaled.BY{part}" output name; confirm intended.
rule bamCoverage_by_part:
    input:
        bam = "split_bam/{sample}_host.bam" ,
        bai = "split_bam/{sample}_host.bam.bai",
        scale_factors = "split_deepTools_qc/multiBamSummary/{part}.scaling_factors.txt"
    output:
        "bamCoverage/{sample}.host_scaled.BY{part}.bw"
    params:
        bwBinSize = bwBinSize,
        genome_size = int(genome_size),
        ignoreForNorm = "--ignoreForNormalization {}".format(ignoreForNormalization) if ignoreForNormalization else "",
        read_extension = "--extendReads" if pairedEnd
                         else "--extendReads {}".format(fragmentLength),
        # host blacklist regardless of {part}, since the track is host-based
        blacklist = "--blackListFileName {}".format(blacklist_bed) if blacklist_bed
                    else "",
        scaling_factors = lambda wildcards,input: "--scaleFactor {}".format(get_scaling_factor(wildcards.sample,input.scale_factors)) ## subset for the one factor needed
    benchmark:
        "bamCoverage/.benchmark/bamCoverage.{sample}.BY{part}.filtered.benchmark"
    threads: lambda wildcards: 16 if 16<max_thread else max_thread # 4GB per core
    conda: CONDA_SHARED_ENV
    shell: bamcov_spikein_cmd


# Fragment-size QC (deepTools bamPEFragmentSize) over all samples of one
# genome partition.
rule bamPE_fragment_size_by_part:
    input:
        bams = lambda wildcards: expand("split_bam/{sample}_{part}.bam", sample=samples,part=wildcards.part),
        bais = lambda wildcards: expand("split_bam/{sample}_{part}.bam.bai", sample=samples,part=wildcards.part)
    output:
        "split_deepTools_qc/bamPEFragmentSize/{part}.fragmentSize.metric.tsv"
    params:
        # skip the plot entirely when plotFormat is 'None'
        plotcmd = lambda wildcards: "" if plotFormat == 'None' else
            "-o split_deepTools_qc/bamPEFragmentSize/" + wildcards.part + ".fragmentSizes.{}".format(plotFormat)
    threads: lambda wildcards: 24 if 24<max_thread else max_thread
    conda: CONDA_SHARED_ENV
    shell: bamPEFragmentSize_cmd
10 changes: 8 additions & 2 deletions snakePipes/shared/rules/whatshap.snakefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,15 @@
def collect_bam():
    """Choose the BAM fed into whatshap haplotagging.

    When spike-in normalization is active in the DNAmapping pipeline,
    haplotag the host-only split BAM; otherwise use the filtered BAM.
    """
    if pipeline == "dnamapping" and useSpikeInForNorm:
        return "split_bam/{sample}_host.bam"
    return "filtered_bam/{sample}.filtered.bam"

rule whatshap_haplotag:
input:
ref = genome_fasta,
pvcf = pvcf,
bam = "filtered_bam/{sample}.filtered.bam",
bai = "filtered_bam/{sample}.filtered.bam.bai"
bam = collect_bam(),
bai = lambda wildcards: f"{collect_bam()}.bai"
output:
hbam = "allelic_bams/{sample}.allele_flagged.sorted.bam",
hlist = "allelic_bams/{sample}_haplotype_list.tsv"
Expand Down
20 changes: 15 additions & 5 deletions snakePipes/workflows/ChIPseq/internals.snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ from ruamel.yaml import YAML
import sys
import pandas as pd
import warnings
from itertools import chain

### Functions ##################################################################

Expand Down Expand Up @@ -230,18 +231,27 @@ if sampleSheet:
filtered_dict = filter_dict(sampleSheet,dict(zip(chip_samples_w_ctrl, [ get_control_name(x) for x in chip_samples_w_ctrl ])))
else:
filtered_dict = filter_dict(sampleSheet,dict(zip(chip_samples_wo_ctrl, [None]*len(chip_samples_wo_ctrl))))
print(filtered_dict)
genrichDict = cf.sampleSheetGroups(sampleSheet,isMultipleComparison)
if not isMultipleComparison:
for k in genrichDict.keys():
genrichDict[k]=[item for item in genrichDict[k] if item in chip_samples]
reordered_dict = {k: filtered_dict[k] for k in [item for sublist in genrichDict.values() for item in sublist]}
else:
#print(genrichDict)
print(genrichDict)
reordered_dict = {}
for g in genrichDict.keys():
for k in genrichDict[g].keys():
genrichDict[g][k]=[item for item in genrichDict[g][k] if item in chip_samples]
reordered_dict[g] = {k: filtered_dict[k] for k in [item for sublist in genrichDict[g].values() for item in sublist]}
#for g in genrichDict.keys():
# for k in genrichDict[g].keys():
# genrichDict[g][k]=[item for item in genrichDict[g][k] if item in chip_samples]
# reordered_dict[g] = {k: filtered_dict[k] for k in [item for sublist in genrichDict[g].values() for item in sublist]}
for g in genrichDict:
# filter each condition list to only chip samples
for k in genrichDict[g]:
genrichDict[g][k] = [item for item in genrichDict[g][k] if item in chip_samples]

# flatten the lists and build mapping, skipping missing keys just in case
flattened = chain.from_iterable(genrichDict[g].values())
reordered_dict[g] = {fk: filtered_dict[fk] for fk in flattened if fk in filtered_dict}
else:
genrichDict = {"all_samples": chip_samples}

Expand Down
12 changes: 11 additions & 1 deletion snakePipes/workflows/DNAmapping/DNAmapping.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ def parse_args(defaults={"verbose": False, "configFile": None,
"UMIDedup": False, "UMIDedupOpts": "",
"UMIDedupSep": "_", "UMIBarcode": False, "cutntag": False,
"bcPattern": "NNNNCCCCCCCC", "aligner":"Bowtie2",
"pvcf": None}):
"pvcf": None, "useSpikeInForNorm": False, "spikeinExt": None}):
"""
Parse arguments from the command line.
"""
Expand Down Expand Up @@ -119,6 +119,16 @@ def parse_args(defaults={"verbose": False, "configFile": None,
help="Phased vcf required for whatshap haplotagging. (default: '%(default)s')",
default=defaults["pvcf"])

optional.add_argument("--useSpikeInForNorm",
dest="useSpikeInForNorm",
action="store_true",
help="Split bam files by host and spikein genome, then scale bam coverage results by spikein-derived size factors.")

optional.add_argument("--spikeinExt",
dest="spikeinExt",
help="Extention of spikein chromosome names in the hybrid genome. Ignored if useSpikeInForNorm is False (default: '%(default)s') .",
default=defaults["spikeinExt"])

return parser


Expand Down
26 changes: 23 additions & 3 deletions snakePipes/workflows/DNAmapping/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,8 @@ include: os.path.join(maindir, "shared", "tools" , "deeptools_cmds.snakefile")
# deepTools QC
include: os.path.join(maindir, "shared", "rules", "deepTools_qc.snakefile")

# Qualimap BAM QC
include: os.path.join(maindir, "shared", "rules", "Qualimap_bamqc.snakefile")
if useSpikeInForNorm:
include: os.path.join(maindir, "shared", "rules", "split_bam_ops_DNA_spikein.snakefile")

## MultiQC
include: os.path.join(maindir, "shared", "rules", "multiQC.snakefile")
Expand Down Expand Up @@ -87,6 +87,8 @@ else:
if "allelic-whatshap" in mode:
include: os.path.join(maindir, "shared", "rules", "whatshap.snakefile")
include: os.path.join(maindir, "shared", "rules", "deepTools_qc_allelic.snakefile")
if useSpikeInForNorm:
include: os.path.join(maindir, "shared", "rules", "deepTools_qc_allelic_spikein.snakefile")

### conditional/optional rules #################################################
################################################################################
Expand All @@ -113,6 +115,23 @@ def run_computeGCBias(GCBias):
file_list = expand("deepTools_qc/computeGCBias/{sample}.filtered.GCBias.png", sample = samples)
return(file_list)

def run_splitBam_ops():
    """Collect the spike-in-normalization targets for rule all.

    Returns the split BAMs, scaling-factor tables, host-scaled coverage
    tracks and fragment-size QC files; adds allele-specific scaled
    tracks in allelic-whatshap mode. Empty when useSpikeInForNorm is off.
    """
    if not useSpikeInForNorm:
        return []
    targets = expand("split_bam/{sample}_{part}.bam",sample=samples,part=part)
    targets.append(
        [expand("split_deepTools_qc/multiBamSummary/{part}.scaling_factors.txt",part=part),
         expand("bamCoverage/{sample}.host_scaled.BY{part}.bw",sample=samples,part=part),
         expand("split_deepTools_qc/bamPEFragmentSize/{part}.fragmentSize.metric.tsv",part=part)]
    )
    if "allelic-whatshap" in mode:
        targets.append(
            [expand("bamCoverage/allele_specific/{sample}.{suffix}.host_scaled.BYspikein.bw",
                    sample=samples,suffix=['genome1', 'genome2'])]
        )
    return targets

def run_deepTools_qc():
file_list = ["deepTools_qc/bamPEFragmentSize/fragmentSize.metric.tsv"]
if len(samples) <= 20:
Expand All @@ -122,7 +141,7 @@ def run_deepTools_qc():
"deepTools_qc/plotCorrelation/correlation.pearson.read_coverage.tsv",
"deepTools_qc/plotCorrelation/correlation.spearman.read_coverage.tsv",
"deepTools_qc/plotPCA/PCA.read_coverage.tsv" ])
if 'allelic-mapping' in mode:
if 'allelic-mapping' in mode or 'allelic-whatshap' in mode:
file_list.append( [
"deepTools_qc/plotCorrelation/correlation.pearson.read_coverage_allelic.tsv",
"deepTools_qc/plotCorrelation/correlation.spearman.read_coverage_allelic.tsv",
Expand Down Expand Up @@ -209,6 +228,7 @@ rule all:
expand("bamCoverage/{sample}.filtered.seq_depth_norm.bw", sample = samples),
run_computeGCBias(GCBias),
run_deepTools_qc(),
run_splitBam_ops(),
make_nmasked_genome(),
run_allelesp_mapping(),
run_whatshap(),
Expand Down
4 changes: 4 additions & 0 deletions snakePipes/workflows/DNAmapping/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,7 @@ qualimap: false
verbose: false
#phased vcf
pvcf:
#arguments related to hybrid genome alignment
useSpikeInForNorm: False
spikeinExt: "_spikein"
spikein_bin_size: 1000
Loading
Loading