
Commit 847d661: Merge pull request #511 from broadinstitute/dp-scaffold
more scaffolding updates
dpark01 authored Feb 17, 2024
2 parents: eae3562 + 88ca4d1
Showing 9 changed files with 268 additions and 88 deletions.
4 changes: 2 additions & 2 deletions github_actions_ci/install-wdl.sh
@@ -12,8 +12,8 @@ fetch_jar_from_github () {
     ln -s $_jar_fname $_tool_name.jar
 }

-fetch_jar_from_github broadinstitute cromwell womtool 61
-fetch_jar_from_github broadinstitute cromwell cromwell 61
+fetch_jar_from_github broadinstitute cromwell womtool 86
+fetch_jar_from_github broadinstitute cromwell cromwell 86
 fetch_jar_from_github dnanexus dxWDL dxWDL v1.50

 TGZ=dx-toolkit-v0.311.0-ubuntu-20.04-amd64.tar.gz
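
For orientation, a hedged sketch of the shape the elided fetch_jar_from_github body plausibly takes, given the ln -s line and the call sites above. The parameter names and the release-URL pattern are assumptions, not the actual helper:

    # Hypothetical reconstruction -- the real function body is elided by this hunk.
    # Fetch <tool>-<version>.jar from a GitHub release, then symlink it to <tool>.jar.
    fetch_jar_from_github () {
        _org=$1; _repo=$2; _tool_name=$3; _version=$4
        _jar_fname="$_tool_name-$_version.jar"
        wget --quiet "https://github.com/$_org/$_repo/releases/download/$_version/$_jar_fname"
        ln -s $_jar_fname $_tool_name.jar
    }

Read this way, the change is a pure version bump: the womtool and cromwell calls now request release 86 instead of 61, and nothing else about the helper needs to change.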
37 changes: 26 additions & 11 deletions pipes/WDL/tasks/tasks_assembly.wdl
@@ -231,19 +231,31 @@ task scaffold {
     set +e +o pipefail
     grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length
     grep -v '^>' ~{sample_name}.intermediate_gapfill.fasta | tr -d '\nNn' | wc -c | tee assembly_preimpute_length_unambiguous
+    grep '^>' ~{sample_name}.intermediate_gapfill.fasta | wc -l | tee assembly_num_segments_recovered
+    grep '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | wc -l | tee reference_num_segments_required
+    grep -v '^>' ~{sample_name}.scaffolding_chosen_ref.fasta | tr -d '\n' | wc -c | tee reference_length
     set -e -o pipefail

-    #Input assembly/contigs, FASTA, already ordered oriented and merged with the reference gneome (FASTA)
-    assembly.py impute_from_reference \
-        ~{sample_name}.intermediate_gapfill.fasta \
-        ~{sample_name}.scaffolding_chosen_ref.fasta \
-        ~{sample_name}.scaffolded_imputed.fasta \
-        --newName ~{sample_name} \
-        ~{'--replaceLength=' + replace_length} \
-        ~{'--minLengthFraction=' + min_length_fraction} \
-        ~{'--minUnambig=' + min_unambig} \
-        ~{'--aligner=' + aligner} \
-        --loglevel=DEBUG
+    if ~{true='true' false='false' allow_incomplete_output} && ! cmp -s assembly_num_segments_recovered reference_num_segments_required
+    then
+      # draft assembly does not have enough segments--and that's okay (allow_incomplete_output=true)
+      file_utils.py rename_fasta_sequences \
+        ~{sample_name}.intermediate_gapfill.fasta \
+        ~{sample_name}.scaffolded_imputed.fasta \
+        "~{sample_name}" --suffix_always --loglevel=DEBUG
+    else
+      # draft assembly must have the right number of segments (fail if not)
+      assembly.py impute_from_reference \
+        ~{sample_name}.intermediate_gapfill.fasta \
+        ~{sample_name}.scaffolding_chosen_ref.fasta \
+        ~{sample_name}.scaffolded_imputed.fasta \
+        --newName ~{sample_name} \
+        ~{'--replaceLength=' + replace_length} \
+        ~{'--minLengthFraction=' + min_length_fraction} \
+        ~{'--minUnambig=' + min_unambig} \
+        ~{'--aligner=' + aligner} \
+        --loglevel=DEBUG
+    fi
   }

   output {
@@ -252,6 +264,9 @@ task scaffold {
     File intermediate_gapfill_fasta = "~{sample_name}.intermediate_gapfill.fasta"
     Int assembly_preimpute_length = read_int("assembly_preimpute_length")
     Int assembly_preimpute_length_unambiguous = read_int("assembly_preimpute_length_unambiguous")
+    Int assembly_num_segments_recovered = read_int("assembly_num_segments_recovered")
+    Int reference_num_segments_required = read_int("reference_num_segments_required")
+    Int reference_length = read_int("reference_length")
     Array[String] scaffolding_chosen_ref_names = read_lines("~{sample_name}.scaffolding_chosen_refs.txt")
     File scaffolding_chosen_ref = "~{sample_name}.scaffolding_chosen_ref.fasta"
     File scaffolding_stats = "~{sample_name}.scaffolding_stats.txt"
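
The new control flow in scaffold hinges on comparing the two single-number files written by the grep | tee lines with cmp. A minimal shell sketch of that decision, using made-up counts (a draft that recovered 2 of a reference's 3 segments):

    # Hypothetical counts standing in for the grep | tee output in the task.
    echo 2 > assembly_num_segments_recovered
    echo 3 > reference_num_segments_required

    # Stands in for the WDL interpolation ~{true='true' false='false' allow_incomplete_output}.
    allow_incomplete_output=true

    # cmp -s exits 0 only when the two files are byte-identical, so
    # "! cmp -s" is true exactly when the segment counts differ.
    if $allow_incomplete_output && ! cmp -s assembly_num_segments_recovered reference_num_segments_required
    then
        echo "counts differ, but incomplete output is allowed: rename the draft as-is"
    else
        echo "counts match (or incompleteness is not allowed): impute from the reference"
    fi

When allow_incomplete_output is false and the counts still differ, control also falls to the else branch, where impute_from_reference is the step expected to fail, matching the "fail if not" comment in the diff.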
104 changes: 96 additions & 8 deletions pipes/WDL/tasks/tasks_metagenomics.wdl
@@ -11,7 +11,7 @@ task krakenuniq {
     File krona_taxonomy_db_tgz # taxonomy.tab
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
   }

   Int disk_size = 750
@@ -143,7 +143,7 @@ task build_krakenuniq_db {
     Int? zstd_compression_level

     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0" #skip-global-version-pin
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
   }

   Int disk_size = 750
@@ -213,7 +213,7 @@ task kraken2 {
     Int? min_base_qual

     Int machine_mem_gb = 72
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   parameter_meta {
@@ -326,6 +326,94 @@ task kraken2 {
   }
 }

+task report_primary_kraken_taxa {
+  meta {
+    description: "Interprets a kraken (or kraken2 or krakenuniq) summary report file and emits the primary contributing taxa under a focal taxon of interest."
+  }
+  input {
+    File kraken_summary_report
+    String focal_taxon = "Viruses"
+
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
+  }
+  String out_basename = basename(kraken_summary_report, '.txt')
+  Int disk_size = 50
+  Int machine_mem_gb = 2
+
+  command <<<
+    set -e
+    metagenomics.py taxlevel_plurality "~{kraken_summary_report}" "~{focal_taxon}" "~{out_basename}.ranked_focal_report.tsv"
+    cat "~{out_basename}.ranked_focal_report.tsv" | head -2 | tail +2 > TOPROW
+    cut -f 2 TOPROW > NUM_FOCAL
+    cut -f 4 TOPROW > PCT_OF_FOCAL
+    cut -f 7 TOPROW > NUM_READS
+    cut -f 8 TOPROW > TAX_RANK
+    cut -f 9 TOPROW > TAX_ID
+    cut -f 10 TOPROW > TAX_NAME
+  >>>
+
+  output {
+    String focal_tax_name = focal_taxon
+    File ranked_focal_report = "~{out_basename}.ranked_focal_report.tsv"
+    Int total_focal_reads = read_int("NUM_FOCAL")
+    Float percent_of_focal = read_float("PCT_OF_FOCAL")
+    Int num_reads = read_int("NUM_READS")
+    String tax_rank = read_string("TAX_RANK")
+    String tax_id = read_string("TAX_ID")
+    String tax_name = read_string("TAX_NAME")
+  }
+
+  runtime {
+    docker: docker
+    memory: machine_mem_gb + " GB"
+    cpu: 1
+    disks: "local-disk " + disk_size + " LOCAL"
+    disk: disk_size + " GB" # TESs
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    preemptible: 2
+    maxRetries: 2
+  }
+}
+
+task filter_refs_to_found_taxa {
+  meta {
+    description: "Filters a taxid_to_ref_accessions_tsv to the set of taxa found in a focal_report."
+  }
+  input {
+    File taxid_to_ref_accessions_tsv
+    File focal_report_tsv
+    File taxdump_tgz
+    Int min_read_count = 100
+
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
+  }
+  String ref_basename = basename(taxid_to_ref_accessions_tsv, '.tsv')
+  String hits_basename = basename(focal_report_tsv, '.tsv')
+  Int disk_size = 50
+
+  command <<<
+    set -e
+    mkdir -p taxdump
+    read_utils.py extract_tarball "~{taxdump_tgz}" taxdump
+    metagenomics.py filter_taxids_to_focal_hits "~{taxid_to_ref_accessions_tsv}" "~{focal_report_tsv}" taxdump ~{min_read_count} "~{ref_basename}-~{hits_basename}.tsv"
+  >>>
+
+  output {
+    File filtered_taxid_to_ref_accessions_tsv = "~{ref_basename}-~{hits_basename}.tsv"
+  }
+
+  runtime {
+    docker: docker
+    memory: "2 GB"
+    cpu: 1
+    disks: "local-disk " + disk_size + " LOCAL"
+    disk: disk_size + " GB" # TESs
+    dx_instance_type: "mem1_ssd1_v2_x2"
+    preemptible: 2
+    maxRetries: 2
+  }
+}
+
 task build_kraken2_db {
   meta {
     description: "Builds a custom kraken2 database. Outputs tar.zst tarballs of kraken2 database, associated krona taxonomy db, and an ncbi taxdump.tar.gz. Requires live internet access if any standard_libraries are specified or if taxonomy_db_tgz is absent."
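
One idiom in report_primary_kraken_taxa deserves a note: head -2 | tail +2 keeps exactly the first data row under the header, and each cut then peels a single column into a file that WDL's read_int/read_float/read_string can consume. A sketch against a fabricated two-line report (the column layout here is illustrative; the real columns come from metagenomics.py taxlevel_plurality):

    # Fabricated report: one header row, one data row.
    printf 'name\tnum_focal\textra\tpct_of_focal\n' >  demo.ranked_focal_report.tsv
    printf 'Viruses\t875\t-\t87.5\n'                >> demo.ranked_focal_report.tsv

    # tail -n +2 is the portable spelling of the task's obsolete-form "tail +2".
    head -2 demo.ranked_focal_report.tsv | tail -n +2 > TOPROW
    cut -f 2 TOPROW > NUM_FOCAL      # "875"  -> read_int("NUM_FOCAL")
    cut -f 4 TOPROW > PCT_OF_FOCAL   # "87.5" -> read_float("PCT_OF_FOCAL")

If the report contained only a header, TOPROW would be empty and the downstream read_* calls would presumably fail the task, a behavior worth keeping in mind when the focal taxon has no hits.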
@@ -348,7 +436,7 @@ task build_kraken2_db {
     Int? zstd_compression_level

     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   Int disk_size = 750
@@ -490,7 +578,7 @@ task blastx {
     File krona_taxonomy_db_tgz

     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   parameter_meta {
@@ -580,7 +668,7 @@ task krona {
     Int? magnitude_column

     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   Int disk_size = 50
@@ -687,7 +775,7 @@ task filter_bam_to_taxa {
     String out_filename_suffix = "filtered"

     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix
@@ -774,7 +862,7 @@ task kaiju {
     File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   String input_basename = basename(reads_unmapped_bam, ".bam")
2 changes: 1 addition & 1 deletion pipes/WDL/tasks/tasks_reports.wdl
@@ -488,7 +488,7 @@ task aggregate_metagenomics_reports {
     String aggregate_taxlevel_focus = "species"
     Int aggregate_top_N_hits = 5
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }
   parameter_meta {
6 changes: 3 additions & 3 deletions pipes/WDL/tasks/tasks_taxon_filter.wdl
@@ -14,7 +14,7 @@ task deplete_taxa {

     Int? cpu=8
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   parameter_meta {
@@ -113,7 +113,7 @@ task filter_to_taxon {
     String? neg_control_prefixes_space_separated = "neg water NTC"

     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   # do this in two steps in case the input doesn't actually have "cleaned" in the name
@@ -172,7 +172,7 @@ task build_lastal_db {
     File sequences_fasta

     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-classify:2.2.3.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0"
   }

   String db_name = basename(sequences_fasta, ".fasta")
29 changes: 29 additions & 0 deletions pipes/WDL/tasks/tasks_utils.wdl
@@ -712,6 +712,35 @@ task s3_copy {
   }
 }

+task string_split {
+  meta {
+    description: "split a string by a delimiter"
+  }
+  input {
+    String joined_string
+    String delimiter
+  }
+  command <<<
+    set -e
+    python3<<CODE
+    with open('TOKENS', 'wt') as outf:
+        for token in "~{joined_string}".split("~{delimiter}"):
+            outf.write(token + '\n')
+    CODE
+  >>>
+  output {
+    Array[String] tokens = read_lines("TOKENS")
+  }
+  runtime {
+    docker: "python:slim"
+    memory: "1 GB"
+    cpu: 1
+    disks: "local-disk 50 SSD"
+    disk: "50 GB" # TES
+    maxRetries: 2
+  }
+}
+
 task filter_sequences_by_length {
   meta {
     description: "Filter sequences in a fasta file to enforce a minimum count of non-N bases."
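
For context, a hypothetical caller showing how the new string_split task reads from a workflow; the import path, alias, and workflow name are invented for this sketch:

    version 1.0
    import "tasks_utils.wdl" as utils   # path and alias assumed

    workflow string_split_demo {
      input {
        String samples_csv = "sampleA,sampleB,sampleC"
      }
      call utils.string_split {
        input:
          joined_string = samples_csv,
          delimiter = ","
      }
      output {
        # expected: ["sampleA", "sampleB", "sampleC"]
        Array[String] sample_names = string_split.tokens
      }
    }

Since the task writes one token per line and reads it back with read_lines, a delimiter that never occurs yields a single-element array, and an empty joined_string yields one empty token rather than an empty array.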
13 changes: 13 additions & 0 deletions pipes/WDL/workflows/classify_single.wdl
@@ -121,6 +121,10 @@ workflow classify_single {
             trim_clip_db = trim_clip_db,
             always_succeed = true
     }
+    call metagenomics.report_primary_kraken_taxa {
+        input:
+            kraken_summary_report = kraken2.kraken2_summary_report
+    }

     output {
         File cleaned_reads_unaligned_bam = deplete.bam_filtered_to_taxa
@@ -134,6 +138,15 @@

         File kraken2_summary_report = kraken2.kraken2_summary_report
         File kraken2_krona_plot = kraken2.krona_report_html
+        File kraken2_top_taxa_report = report_primary_kraken_taxa.ranked_focal_report
+        String kraken2_focal_taxon_name = report_primary_kraken_taxa.focal_tax_name
+        Int kraken2_focal_total_reads = report_primary_kraken_taxa.total_focal_reads
+        String kraken2_top_taxon_id = report_primary_kraken_taxa.tax_id
+        String kraken2_top_taxon_name = report_primary_kraken_taxa.tax_name
+        String kraken2_top_taxon_rank = report_primary_kraken_taxa.tax_rank
+        Int kraken2_top_taxon_num_reads = report_primary_kraken_taxa.num_reads
+        Float kraken2_top_taxon_pct_of_focal = report_primary_kraken_taxa.percent_of_focal
+
         File raw_fastqc = merge_raw_reads.fastqc
         File cleaned_fastqc = fastqc_cleaned.fastqc_html
         File spikein_report = spikein.report