sourmash_wrapper

galaxyproject · Dec 11, 2024 · d180bde · d180bde
1 parent 60eb96a
commit d180bde
Show file tree

Hide file tree

Showing 5 changed files with 710 additions and 0 deletions.
diff --git a/tools/sourmash/.shed.yml b/tools/sourmash/.shed.yml
@@ -0,0 +1,17 @@
+name: sourmash
+owner: iuc
+description: "Compute and compare MinHash signatures for DNA data sets."
+# Block-scalar content must be indented with spaces; YAML rejects tabs here.
+long_description: |
+  Quickly search, compare, and analyze genomic and metagenomic data sets
+homepage_url: https://sourmash.readthedocs.io/en/latest/
+# NOTE(review): this URL points at the upstream sourmash sources; Tool Shed
+# metadata normally expects the repository holding this wrapper - confirm.
+remote_repository_url: https://github.com/sourmash-bio/sourmash/tree/latest/src/sourmash
+type: unrestricted
+categories:
+  - Metagenomics
+auto_tool_repositories:
+  name_template: "{{ tool_id }}"
+  description_template: "Wrapper to sketch DNA, LCA classify and summarize : {{ tool_name }}"
+
+
+
+
diff --git a/tools/sourmash/dna.py b/tools/sourmash/dna.py
@@ -0,0 +1,106 @@
+"""create DNA signatures"""
+
+# Usage/help text for the 'dna' sub-command; passed as ``usage=`` to
+# ``add_parser`` in ``subparser()`` below.
+usage = """
+
+    sourmash sketch dna data/*.fna.gz
+
+The 'sketch dna' command reads in DNA sequences and outputs DNA
+sketches.
+
+By default, 'sketch dna' uses the parameter string 'k=31,scaled=1000,noabund'.
+
+This creates sketches with a k-mer size of 31, a scaled factor of
+1000, and no abundance tracking of k-mers.  You can specify one or
+more parameter strings of your own with -p, e.g.  'sourmash sketch dna
+-p k=31,noabund -p k=21,scaled=100,abund'. Note that a single `-p` parameter string can contain multiple ksize values, but only a single scaled value or abundance value, e.g. -p k=21,k=31,abund
+
+'sourmash sketch' takes input sequences in FASTA and FASTQ,
+uncompressed or gz/bz2 compressed.
+
+Please see the 'sketch' documentation for more details:
+  https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html
+"""
+
+import sourmash
+from sourmash.logging import notify, print_results, error
+
+from sourmash import command_sketch
+
+# Import-time sanity check: the usage text above quotes this exact default
+# parameter string, so fail loudly if command_sketch.DEFAULTS["dna"] no
+# longer matches it.
+assert command_sketch.DEFAULTS["dna"] == "k=31,scaled=1000,noabund"
+
+
+def subparser(subparsers):
+    subparser = subparsers.add_parser(
+        "dna", aliases=["rna", "nucleotide", "nt"], usage=usage
+    )
+    subparser.add_argument(
+        "--license",
+        default="CC0",
+        type=str,
+        help="signature license. Currently only CC0 is supported.",
+    )
+    subparser.add_argument(
+        "--check-sequence",
+        action="store_true",
+        help="complain if input sequence is invalid DNA",
+    )
+    subparser.add_argument(
+        "-p",
+        "--param-string",
+        default=[],
+        help="signature parameters to use.",
+        action="append",
+    )
+
+    subparser.add_argument("filenames", nargs="*", help="file(s) of sequences")
+    file_args = subparser.add_argument_group("File handling options")
+    file_args.add_argument(
+        "-f",
+        "--force",
+        action="store_true",
+        help="recompute signatures even if the file exists",
+    )
+    subparser.add_argument(
+        "--from-file", help="a text file containing a list of sequence files to load"
+    )
+    file_args.add_argument(
+        "-o", "--output", help="output computed signatures to this file"
+    )
+    file_args.add_argument(
+        "--set-name",
+        "--name",
+        "--merge",
+        dest="merge",
+        type=str,
+        default="",
+        metavar="FILE",
+        help="name the output sketch as specified; note, merges all input "
+        "files while sketching",
+    )
+    file_args.add_argument(
+        "--output-dir",
+        "--outdir",
+        help="output computed signatures to this directory",
+    )
+    file_args.add_argument(
+        "--singleton",
+        action="store_true",
+        help="compute a signature for each sequence record individually",
+    )
+    file_args.add_argument(
+        "--name-from-first",
+        action="store_true",
+        help="name the signature generated from each file after the first "
+        "record in the file",
+    )
+    file_args.add_argument(
+        "--randomize",
+        action="store_true",
+        help="shuffle the list of input filenames randomly",
+    )
+
+
+def main(args):
+    import sourmash.command_sketch
+
+    return sourmash.command_sketch.dna(args)
diff --git a/tools/sourmash/macros.xml b/tools/sourmash/macros.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<macros>
+    <token name="@TOOL_VERSION@">4.8.11</token>
+    <token name="@VERSION_SUFFIX@">0</token>
+    <token name="@PROFILE@">23.2</token>
+
+    <xml name="xrefs">
+        <xrefs>
+            <xref type="bio.tools">sourmash</xref>
+        </xrefs>
+    </xml>
+    <xml name="requirements">
+        <requirements>
+            <!-- sourmash is a Python tool packaged as "sourmash", not a Bioconductor package -->
+            <requirement type="package" version="@TOOL_VERSION@">sourmash</requirement>
+        </requirements>
+    </xml>
+    <xml name="citations">
+        <citations>
+            <citation type="doi">10.21105/joss.00027</citation>
+        </citations>
+    </xml>
+</macros>
diff --git a/tools/sourmash/sourmash.xml b/tools/sourmash/sourmash.xml
@@ -0,0 +1,64 @@
+<tool id="sourmash" name="sourmash" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>Compute and compare MinHash signatures for DNA data sets</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="xrefs"/>
+    <expand macro="requirements"/>
+    <!-- Calls the sourmash CLI directly: the sequence dataset is a positional
+         argument, and only -o/output is used (it cannot be combined with output-dir). -->
+    <command detect_errors="exit_code"><![CDATA[
+        ## Turn the comma-separated k-mer list (e.g. "21,31") into "k=21,k=31".
+        #set $ksizes = 'k=' + str($additional_options.k_mers).replace(',', ',k=')
+        sourmash sketch dna
+            --param-string '${ksizes},scaled=${additional_options.scaled},${additional_options.noabund}'
+            ## Boolean params render as their flag when checked and as nothing otherwise.
+            $additional_options.singleton
+            $additional_options.name_from_first
+            $additional_options.randomize
+            --output '$dna_sketch'
+            '$input_seq'
+    ]]></command>
+    <inputs>
+        <param name="input_seq" type="data" format="fasta,fasta.gz,fastqsanger,fastqsanger.gz" label="Input DNA Sequence"/>
+        <section name="additional_options" title="Additional Options" expanded="true">
+            <!-- text + regex validator: an integer param cannot hold a comma-separated list -->
+            <param name="k_mers" type="text" value="31" label="K-mer size" help="One or more k-mer sizes separated by ',' (e.g. 21,31). DEFAULT: 31">
+                <validator type="regex" message="Enter positive integers separated by commas, e.g. 21,31">^[0-9]+(,[0-9]+)*$</validator>
+            </param>
+            <param name="scaled" type="integer" value="1000" min="1" label="Scaled factor" help="DEFAULT: 1000"/>
+            <param name="noabund" type="select" label="Abundance tracking of k-mers" help="DEFAULT: noabund">
+                <option value="noabund" selected="true">No abundance tracking of k-mer</option>
+                <option value="abund">Abundance tracking of k-mer</option>
+            </param>
+            <param argument="--singleton" type="boolean" truevalue="--singleton" falsevalue="" checked="true" label="Compute a signature for each sequence record individually"/>
+            <param argument="--name-from-first" type="boolean" truevalue="--name-from-first" falsevalue="" checked="true" label="Name the signature generated from each file after the first record in the file"/>
+            <param argument="--randomize" type="boolean" truevalue="--randomize" falsevalue="" checked="true" label="Shuffle the list of input filenames randomly"/>
+        </section>
+    </inputs>
+    <outputs>
+        <!-- sourmash signatures are JSON documents -->
+        <data name="dna_sketch" format="json" label="${tool.name} on ${on_string}: DNA sketch"/>
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="input_seq" value="GCA_903797575.1_PARATYPHIC668_genomic (1).fna"/>
+            <section name="additional_options">
+                <param name="k_mers" value="45"/>
+                <param name="scaled" value="1040"/>
+                <param name="noabund" value="noabund"/>
+                <param name="singleton" value="true"/>
+                <param name="name_from_first" value="true"/>
+                <param name="randomize" value="true"/>
+            </section>
+            <output name="dna_sketch" ftype="json">
+                <assert_contents>
+                    <has_text text="sourmash_signature"/>
+                    <has_text_matching expression="&quot;ksize&quot;:\s*45"/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+**What it does**
+
+Quickly search, compare, and analyze genomic and metagenomic data sets.
+
+This tool runs ``sourmash sketch dna`` on the input sequences and writes the
+resulting DNA signature(s) as a JSON file. The k-mer size(s), scaled factor and
+abundance tracking are combined into a single parameter string, e.g.
+``k=21,k=31,scaled=1000,noabund``.
+    ]]></help>
+    <expand macro="citations"/>
+</tool>