Skip to content

Commit

Permalink
sourmash_wrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
renu-pal committed Dec 11, 2024
1 parent 60eb96a commit d180bde
Show file tree
Hide file tree
Showing 5 changed files with 710 additions and 0 deletions.
17 changes: 17 additions & 0 deletions tools/sourmash/.shed.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Tool Shed metadata for the sourmash Galaxy wrapper.
name: sourmash
owner: iuc
description: "Compute and compare MinHash signatures for DNA data sets."
long_description: |
  Quickly search, compare, and analyze genomic and metagenomic data sets.
# Upstream project documentation.
homepage_url: https://sourmash.readthedocs.io/en/latest/
# Location of the wrapper sources (this directory), not of the upstream
# sourmash code base.
# NOTE(review): assumes the wrapper lives in tools-iuc (owner is "iuc");
# adjust if it is developed elsewhere.
remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/main/tools/sourmash
type: unrestricted
categories:
  - Metagenomics
# One Tool Shed repository is generated per tool XML found in this directory.
auto_tool_repositories:
  name_template: "{{ tool_id }}"
  description_template: "Wrapper to sketch DNA, LCA classify and summarize: {{ tool_name }}"




106 changes: 106 additions & 0 deletions tools/sourmash/dna.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Create DNA signatures.

Command-line definition of the ``dna`` sketch sub-command: ``subparser``
declares the options and ``main`` hands the parsed arguments to
``sourmash.command_sketch.dna``.
"""

# Free-form usage text handed to ``add_parser(..., usage=usage)`` for the
# "dna" sub-command.  The default parameter string quoted in this text is
# checked against ``command_sketch.DEFAULTS["dna"]`` at import time.
usage = """
sourmash sketch dna data/*.fna.gz
The 'sketch dna' command reads in DNA sequences and outputs DNA
sketches.
By default, 'sketch dna' uses the parameter string 'k=31,scaled=1000,noabund'.
This creates sketches with a k-mer size of 31, a scaled factor of
1000, and no abundance tracking of k-mers. You can specify one or
more parameter strings of your own with -p, e.g. 'sourmash sketch dna
-p k=31,noabund -p k=21,scaled=100,abund'. Note that a single `-p` parameter string can contain multiple ksize values, but only a single scaled value or abundance value, e.g. -p k=21,k=31,abund
'sourmash sketch' takes input sequences in FASTA and FASTQ,
uncompressed or gz/bz2 compressed.
Please see the 'sketch' documentation for more details:
https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html
"""

import sourmash
from sourmash.logging import notify, print_results, error

from sourmash import command_sketch

# Guard: the usage text quotes this default parameter string verbatim, so
# fail at import time if the library default no longer matches the help text.
# (Being an ``assert``, the check is skipped when Python runs with -O.)
assert command_sketch.DEFAULTS["dna"] == "k=31,scaled=1000,noabund"


def subparser(subparsers):
    """Register the ``dna`` sub-command and its options on *subparsers*.

    *subparsers* is the object returned by
    ``argparse.ArgumentParser.add_subparsers()``.  The new parser is also
    reachable through the aliases ``rna``, ``nucleotide`` and ``nt`` and
    shows the module-level ``usage`` text.  Nothing is returned; the
    parser is attached to *subparsers* as a side effect.
    """
    dna_parser = subparsers.add_parser(
        "dna",
        aliases=["rna", "nucleotide", "nt"],
        usage=usage,
    )

    # Options describing the signature itself.
    dna_parser.add_argument(
        "--license",
        type=str,
        default="CC0",
        help="signature license. Currently only CC0 is supported.",
    )
    dna_parser.add_argument(
        "--check-sequence",
        action="store_true",
        help="complain if input sequence is invalid DNA",
    )
    # May be repeated; every occurrence is appended to the list.
    dna_parser.add_argument(
        "-p",
        "--param-string",
        action="append",
        default=[],
        help="signature parameters to use.",
    )

    # Positional inputs; zero files are accepted by the parser.
    dna_parser.add_argument("filenames", nargs="*", help="file(s) of sequences")

    file_group = dna_parser.add_argument_group("File handling options")
    file_group.add_argument(
        "-f",
        "--force",
        action="store_true",
        help="recompute signatures even if the file exists",
    )
    # Registered on the parser itself rather than on the file-handling group.
    dna_parser.add_argument(
        "--from-file",
        help="a text file containing a list of sequence files to load",
    )
    file_group.add_argument(
        "-o",
        "--output",
        help="output computed signatures to this file",
    )
    # Three spellings of the same option, all stored as ``args.merge``.
    file_group.add_argument(
        "--set-name",
        "--name",
        "--merge",
        dest="merge",
        metavar="FILE",
        type=str,
        default="",
        help="name the output sketch as specified; note, merges all input files while sketching",
    )
    file_group.add_argument(
        "--output-dir",
        "--outdir",
        help="output computed signatures to this directory",
    )

    # The remaining file-handling options are plain on/off switches.
    switches = (
        (
            "--singleton",
            "compute a signature for each sequence record individually",
        ),
        (
            "--name-from-first",
            "name the signature generated from each file after the first record in the file",
        ),
        (
            "--randomize",
            "shuffle the list of input filenames randomly",
        ),
    )
    for flag, description in switches:
        file_group.add_argument(flag, action="store_true", help=description)


def main(args):
    """Run the DNA sketch command for the parsed *args*.

    Hands *args* to ``sourmash.command_sketch.dna`` and returns whatever
    that call returns.  The import happens inside the function, so it is
    only performed when the command is actually run.

    NOTE(review): no ``if __name__ == "__main__"`` entry point is visible
    in this file, so executing the file as a script would not reach this
    function - confirm how it is meant to be invoked.
    """
    from sourmash.command_sketch import dna as sketch_dna

    return sketch_dna(args)
23 changes: 23 additions & 0 deletions tools/sourmash/macros.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?xml version="1.0"?>
<!-- Shared tokens and macro snippets for the sourmash Galaxy wrappers. -->
<macros>
    <!-- Version of the sourmash package to install. -->
    <token name="@TOOL_VERSION@">4.8.11</token>
    <!-- Wrapper revision; bump when the wrapper changes but sourmash does not. -->
    <token name="@VERSION_SUFFIX@">0</token>
    <!-- Galaxy profile the wrappers are written against. -->
    <token name="@PROFILE@">23.2</token>

    <xml name="xrefs">
        <xrefs>
            <xref type="bio.tools">sourmash</xref>
        </xrefs>
    </xml>
    <xml name="requirements">
        <requirements>
            <!-- sourmash is a Python package distributed as "sourmash";
                 it is not a Bioconductor package. -->
            <requirement type="package" version="@TOOL_VERSION@">sourmash</requirement>
        </requirements>
    </xml>
    <xml name="citations">
        <citations>
            <citation type="doi">10.21105/joss.00027</citation>
        </citations>
    </xml>
</macros>
64 changes: 64 additions & 0 deletions tools/sourmash/sourmash.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<tool id="sourmash" name="sourmash" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
    <description>Compute and compare MinHash signatures for DNA data sets</description>
    <macros>
        <import>macros.xml</import>
    </macros>
    <expand macro="xrefs"/>
    <expand macro="requirements"/>

    <!--
        Calls the `sourmash sketch dna` command line directly.
        * The k-mer sizes, scaled factor and abundance mode are combined into
          one parameter string of the form 'k=21,k=31,scaled=1000,noabund'.
        * The input dataset is passed as a positional sequence file.
        * The boolean parameters render either their flag or an empty string,
          because the underlying options take no value.
        * Only `-\-output` is used, so the sketch is written straight to the
          Galaxy output dataset.
    -->
    <command detect_errors="exit_code"><![CDATA[
        #set $ksizes = ','.join(['k=' + $k for $k in str($additional_options.k_mers).split(',')])
        sourmash sketch dna
            --param-string '${ksizes},scaled=${additional_options.scaled},${additional_options.noabund}'
            $additional_options.singleton
            $additional_options.name_from_first
            $additional_options.randomize
            --output '$dna_sketch'
            '$input_seq'
    ]]></command>
    <inputs>
        <param name="input_seq" type="data" format="fasta,fasta.gz,fastqsanger,fastqsanger.gz" label="Input DNA Sequence"/>
        <section name="additional_options" title="Additional Options" expanded="true">
            <!-- Text (not integer) so that several comma-separated sizes can be given. -->
            <param name="k_mers" type="text" value="31" label="K-mer size" help="One or more k-mer sizes separated by ',' (e.g. 21,31,51). DEFAULT: 31">
                <validator type="regex" message="Enter one or more positive integers separated by commas, e.g. 21,31">^[1-9][0-9]*(,[1-9][0-9]*)*$</validator>
            </param>
            <param name="scaled" type="integer" value="1000" min="1" label="Scaled factor" help="DEFAULT: 1000"/>
            <param name="noabund" type="select" label="Abundance tracking of k-mers" help="DEFAULT: noabund">
                <option value="noabund" selected="true">No abundance tracking of k-mer</option>
                <option value="abund">Abundance tracking of k-mer</option>
            </param>
            <param argument="--singleton" type="boolean" truevalue="--singleton" falsevalue="" checked="false" label="Compute a signature for each sequence record individually"/>
            <param argument="--name-from-first" type="boolean" truevalue="--name-from-first" falsevalue="" checked="false" label="Name the signature generated from each file after the first record in the file"/>
            <param argument="--randomize" type="boolean" truevalue="--randomize" falsevalue="" checked="false" label="Shuffle the list of input filenames randomly"/>
        </section>
    </inputs>
    <outputs>
        <!-- sourmash signatures are JSON documents. -->
        <data name="dna_sketch" format="json" label="${tool.name} on ${on_string}: DNA sketch"/>
    </outputs>
    <tests>
        <test expect_num_outputs="1">
            <param name="input_seq" value="GCA_903797575.1_PARATYPHIC668_genomic (1).fna"/>
            <section name="additional_options">
                <param name="k_mers" value="45"/>
                <param name="scaled" value="1040"/>
                <param name="noabund" value="noabund"/>
                <param name="singleton" value="true"/>
                <param name="name_from_first" value="true"/>
                <param name="randomize" value="true"/>
            </section>
            <output name="dna_sketch">
                <assert_contents>
                    <has_text text="sourmash_signature"/>
                    <has_size value="7373" delta="1000"/>
                </assert_contents>
            </output>
        </test>
    </tests>

    <help><![CDATA[
Quickly search, compare, and analyze genomic and metagenomic data sets.

This tool runs ``sourmash sketch dna``, which reads DNA sequences (FASTA or
FASTQ, uncompressed or gzip-compressed) and writes DNA sketches (signatures).

By default a k-mer size of 31, a scaled factor of 1000 and no abundance
tracking are used. Several k-mer sizes may be given as a comma-separated
list; only one scaled value and one abundance setting apply to all of them.

See https://sourmash.readthedocs.io/en/latest/sourmash-sketch.html for details.
    ]]></help>
    <expand macro="citations"/>
</tool>
Loading

0 comments on commit d180bde

Please sign in to comment.