Skip to content

Commit 3214ce4

Browse files
authored
Merge pull request #7368 from wm75/fastp-dedup
fastp: Add duplication analysis / deduplication options
2 parents aed2557 + 0fb786c commit 3214ce4

3 files changed

Lines changed: 78 additions & 11 deletions

File tree

tools/fastp/fastp.xml

Lines changed: 64 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<tool id="fastp" name="fastp" version="@TOOL_VERSION@+galaxy2" profile="23.1">
1+
<tool id="fastp" name="fastp" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.1">
22
<description>fast all-in-one preprocessing for FASTQ files</description>
33
<macros>
44
<import>macros.xml</import>
@@ -149,6 +149,14 @@ $filter_options.low_complexity_filter.enable_low_complexity_filter
149149
#end if
150150
151151
152+
## Duplicate analysis / deduplication
153+
154+
$duplicated_reads.handling_options.eval_dups
155+
#if not str($duplicated_reads.handling_options.eval_dups):
156+
$duplicated_reads.handling_options.dedup
157+
#end if
158+
159+
152160
## Read Modification Options
153161
154162
## PolyG tail trimming, useful for NextSeq/NovaSeq data
@@ -273,7 +281,18 @@ $read_mod_options.base_correction_options.correction
273281
<param name="complexity_threshold" argument="-Y" type="integer" optional="true" label="Complexity threshold" help="Threshold for low complexity filter (0~100). Default is 30, which means 30% complexity is required."/>
274282
</section>
275283
</section>
276-
284+
<section name="duplicated_reads" title="Duplicated Reads Options">
285+
<conditional name="handling_options">
286+
<param name="eval_dups" type="select" label="Enable duplicated reads analysis" help="If enabled, calculate and report read duplication statistics. Enabling this is also a prerequisite for optional deduplication of reads. Duplicate detection relies exclusively on exact identity between read sequences (both for SE and PE data). It also increases tool memory requirements and running time moderately. NOTE: the default (no duplication analysis) is different from the command-line tool.">
287+
<option value="">Enable</option>
288+
<option value="--dont_eval_duplication" selected="true">Disable (--dont_eval_duplication)</option>
289+
</param>
290+
<when value="--dont_eval_duplication" />
291+
<when value="">
292+
<param argument="--dedup" type="boolean" truevalue="--dedup" falsevalue="" label="Drop duplicate reads/pairs"/>
293+
</when>
294+
</conditional>
295+
</section>
277296
<!-- Read Modification Options -->
278297
<section name="read_mod_options" title="Read Modification Options">
279298
<conditional name="polyg_tail_trimming">
@@ -312,7 +331,7 @@ $read_mod_options.base_correction_options.correction
312331

313332
<section name="cutting_by_quality_options" title="Per read cutting by quality options" expanded="True">
314333
<conditional name="cut_front_select">
315-
<param argument="--cut_front" type="select" truevalue="--cut_front" falsevalue="" checked="false" label="Cut by quality in front (5')" help="Enable per read cutting by quality in front (5'), default is disabled (WARNING: this will interfere deduplication for both PE/SE data).">
334+
<param argument="--cut_front" type="select" label="Cut by quality in front (5')" help="Enable per read cutting by quality in front (5'). (WARNING: this will interfere with deduplication of both PE/SE data if performed with downstream tools.)">
316335
<option value="--cut_front">Yes</option>
317336
<option value="" selected="true">No</option>
318337
</param>
@@ -324,7 +343,7 @@ $read_mod_options.base_correction_options.correction
324343
</when>
325344
</conditional>
326345
<conditional name="cut_tail_select">
327-
<param argument="--cut_tail" type="select" truevalue="--cut_tail" falsevalue="" checked="false" label="Cut by quality in tail (3')" help="Enable per read cutting by quality in tail (3'), default is disabled (WARNING: this will interfere deduplication for SE data).">
346+
<param argument="--cut_tail" type="select" label="Cut by quality in tail (3')" help="Enable per read cutting by quality in tail (3'). (WARNING: this will interfere with deduplication of SE data if performed with downstream tools.)">
328347
<option value="--cut_tail">Yes</option>
329348
<option value="" selected="true">No</option>
330349
</param>
@@ -336,7 +355,7 @@ $read_mod_options.base_correction_options.correction
336355
</when>
337356
</conditional>
338357
<conditional name="cut_right_select">
339-
<param argument="--cut_right" type="select" truevalue="--cut_right" falsevalue="" checked="false" label="Cut by quality in tail (3')" help="Move a sliding window from front to tail, if meet one window with mean quality &lt; threshold, drop the bases in the window and the right part, and then stop.">
358+
<param argument="--cut_right" type="select" label="Cut right by quality (sliding window, front to tail)" help="Move a sliding window from front to tail, if meet one window with mean quality &lt; threshold, drop the bases in the window and the right part, and then stop. (WARNING: this will interfere with deduplication of SE data if performed with downstream tools.)">
340359
<option value="--cut_right">Yes</option>
341360
<option value="" selected="true">No</option>
342361
</param>
@@ -396,11 +415,13 @@ $read_mod_options.base_correction_options.correction
396415
<output name="report_html">
397416
<assert_contents>
398417
<has_text text="fastp report"/>
418+
<not_has_text text="duplication rate:"/>
399419
</assert_contents>
400420
</output>
401421
<output name="report_json">
402422
<assert_contents>
403423
<has_text text="fastp report"/>
424+
<not_has_text text="&quot;duplication&quot;:"/>
404425
</assert_contents>
405426
</output>
406427
</test>
@@ -421,6 +442,7 @@ $read_mod_options.base_correction_options.correction
421442
<output name="report_html">
422443
<assert_contents>
423444
<has_text text="fastp report"/>
445+
<not_has_text text="duplication rate:"/>
424446
</assert_contents>
425447
</output>
426448
<output_collection name="output_paired_coll" type="paired">
@@ -532,19 +554,28 @@ $read_mod_options.base_correction_options.correction
532554
</assert_contents>
533555
</output>
534556
</test>
535-
<!-- 8. Ensure JSON report output works -->
536-
<test expect_num_outputs="2">
557+
<!-- 8. Ensure enabling duplicate analysis works -->
558+
<test expect_num_outputs="3">
537559
<conditional name="single_paired">
538560
<param name="single_paired_selector" value="single"/>
539561
<param name="in1" ftype="fastqsanger" value="R1.fq"/>
540562
</conditional>
541-
<section name="output_options">
542-
<param name="report_html" value="False"/>
563+
<section name="duplicated_reads">
564+
<conditional name="handling_options">
565+
<param name="eval_dups" value=""/>
566+
</conditional>
543567
</section>
544568
<output name="out1" ftype="fastqsanger" file="out1.fq"/>
569+
<output name="report_html">
570+
<assert_contents>
571+
<has_text text="fastp report"/>
572+
<has_text text="duplication rate:"/>
573+
</assert_contents>
574+
</output>
545575
<output name="report_json">
546576
<assert_contents>
547577
<has_text text="fastp report"/>
578+
<has_text text="&quot;duplication&quot;:"/>
548579
</assert_contents>
549580
</output>
550581
</test>
@@ -792,6 +823,29 @@ $read_mod_options.base_correction_options.correction
792823
</assert_contents>
793824
</output>
794825
</test>
826+
<!-- 18. Ensure deduplication works -->
827+
<test expect_num_outputs="2">
828+
<conditional name="single_paired">
829+
<param name="single_paired_selector" value="single"/>
830+
<param name="in1" ftype="fastqsanger" value="R1_with_dup.fq"/>
831+
</conditional>
832+
<section name="duplicated_reads">
833+
<conditional name="handling_options">
834+
<param name="eval_dups" value=""/>
835+
<param name="dedup" value="true"/>
836+
</conditional>
837+
</section>
838+
<section name="output_options">
839+
<param name="report_html" value="false"/>
840+
</section>
841+
<output name="out1" ftype="fastqsanger" file="out1.fq"/>
842+
<output name="report_json">
843+
<assert_contents>
844+
<has_text text="fastp report"/>
845+
<has_text text="&quot;duplication&quot;:"/>
846+
</assert_contents>
847+
</output>
848+
</test>
795849
</tests>
796850
<help><![CDATA[
797851
.. class:: infomark
@@ -803,7 +857,7 @@ afford high performance.
803857
804858
*Features*
805859
806-
1. Filter out bad reads (too low quality, too short, or too many N...)
860+
1. Filter out bad (too low quality, too short, or too many N...) and/or duplicate reads
807861
808862
2. Cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster)
809863

tools/fastp/macros.xml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
<macros>
22
<token name="@TOOL_VERSION@">1.0.1</token>
3+
<token name="@VERSION_SUFFIX@">3</token>
34
<xml name="biotools">
45
<xrefs>
56
<xref type="bio.tools">
@@ -69,4 +70,4 @@
6970
help="The minimum length to detect polyG in the read tail. 10 by default."/>
7071
</xml>
7172

72-
</macros>
73+
</macros>
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
@NS500713:64:HFKJJBGXY:1:11101:1675:1101 1:N:0:TATAGCCT+GACCCCCA
2+
TAGGAGGCTTGGAGTACCAATAATAAAGTGAGCCCACCTTCCTGGTACCCAGACATTTCAGGAGGTCGGGAAATTTTTAAACCCAGGCAGCTTCCTGGCAGTGACATTTGGAGCATCAAAGTGGTAAATAAAATTTCATTTACATTAATAT
3+
+
4+
6AAAAAEEEEE/E/EA/E/AEA6EE//AEE66/AAE//EEE/E//E/AA/EEE/A/AEE/EEA//EEEEEEEE6EEAAA/E/A/6E/6//6<EAAEEE/EEEA/EA/EEEEEE/<<EEEE//A/EE<AEEEEE/</AA</E<AAAE/E<E/
5+
@NS500713:64:HFKJJBGXY:1:11101:17113:1101 1:N:0:TATAGCCT+GTTTCTTA
6+
TACAAAATGCACATCGCTGAAAGGGGTAAAGGAGAGAAATCGCTTTATAAAACCTTGAAAAGGAATATTCAAATATAAGCTGGGAAGGTATAAAAAACTCTGTACATCACAAGTAAACAAATGGAACCTGCAAAATATTAAACAAAGGATT
7+
+
8+
AAAAAEEEEE6EEAAAEEEEE6EEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEEEEEEEEEEEEE/EEEEEEE6EE<AAEEEAEEEEEEEEEEEEAEEEEEEEA<E/AAEEEAEEEEE/EEEEAAEEE
9+
@NS500713:64:HFKJJBGXY:1:11101:17114:1101 1:N:0:TATAGCCT+GTTTCTTA
10+
TACAAAATGCACATCGCTGAAAGGGGTAAAGGAGAGAAATCGCTTTATAAAACCTTGAAAAGGAATATTCAAATATAAGCTGGGAAGGTATAAAAAACTCTGTACATCACAAGTAAACAAATGGAACCTGCAAAATATTAAACAAAGGATT
11+
+
12+
AAAAAEEEEE6EEAAAEEEEE6EEEEEEEBBBBBBBBBEEEEEEEEEEEEEEEEEEEEEEEEAEEEEAEEEEEEEEEEEEEEEEEEEE/EEEEEEE6EE<AAEEEAEEEEEEEEEEEEAEEEEEEEA<E/AAEEEAEEEEE/EEEEAAEEE

0 commit comments

Comments
 (0)