galaxyproject · astrovsky01 · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024 · Nov 8, 2024
diff --git a/data_managers/data_manager_checkm2/.lint_skip b/data_managers/data_manager_checkm2/.lint_skip
@@ -0,0 +1 @@
+TestsMissing
diff --git a/data_managers/data_manager_checkm2/.shed.yml b/data_managers/data_manager_checkm2/.shed.yml
@@ -0,0 +1,10 @@
+categories:
+- Data Managers
+- Metagenomics
+homepage_url: https://github.com/chklovski/CheckM2
+description: Data manager for CheckM2 reference data
+long_description: Data manager for CheckM2 reference data
+name: checkm2_build_database
+owner: iuc
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/tree/master/data_managers/data_manager_checkm2
+type: unrestricted
diff --git a/data_managers/data_manager_checkm2/data_manager/checkm2_datamanager.xml b/data_managers/data_manager_checkm2/data_manager/checkm2_datamanager.xml
@@ -0,0 +1,62 @@
+<tool id="checkm2_build_database" name="CheckM2" tool_type="manage_data" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="@PROFILE@">
+    <description>database builder</description>
+    <macros>
+        <!-- on update run a local test setting `test` to something else than "true" -->
+        <token name="@TOOL_VERSION@">1.0.2</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+        <token name="@PROFILE@">23.1</token>
+    </macros>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">checkm2</requirement>
+    </requirements>
+    <version_command>checkm2 --version</version_command>
+    <command detect_errors="exit_code"><![CDATA[
+        mkdir -p '$out_file.extra_files_path' &&
+        #if $test != "true"
+            checkm2 database --download --path  '$out_file.extra_files_path'/ &&
+            mv '$out_file.extra_files_path'/CheckM2_database/uniref100.KO.1.dmnd '$out_file.extra_files_path' &&
+        #else
+            touch '$out_file.extra_files_path'/uniref100.KO.1.dmnd &&
+        #end if
+        cp '$dmjson' '$out_file'
+    ]]></command>
+    <configfiles>
+        <configfile name="dmjson"><![CDATA[
+{
+  "data_tables":{
+    "checkm2":[
+      {
+        "path":"uniref100.KO.1.dmnd",
+        "name":"CheckM2 diamond DB downloaded with version @TOOL_VERSION@",
+        "version":"@TOOL_VERSION@",
+        "value":"@TOOL_VERSION@"
+      }
+    ]
+  }
+}]]>
+        </configfile>
+    </configfiles>
+    <inputs>
+        <param name="test" type="hidden"/>
+    </inputs>
+    <outputs>
+        <data name="out_file" format="data_manager_json" />
+    </outputs>
+    <tests>
+        <test expect_num_outputs="1">
+            <param name="test" value="true"/>
+            <output name="out_file">
+                <assert_contents>
+                    <has_text text='"value":"@TOOL_VERSION@"'/>
+                    <has_text text='"name":"CheckM2 diamond DB downloaded with version @TOOL_VERSION@"'/>
+                </assert_contents>
+            </output>
+        </test>
+    </tests>
+    <help><![CDATA[
+Download and extract CheckM2 reference data.
+    ]]></help>
+    <citations>
+        <citation type="doi">10.1038/s41592-023-01940-w</citation>
+    </citations>
+</tool>
diff --git a/data_managers/data_manager_checkm2/data_manager_conf.xml b/data_managers/data_manager_checkm2/data_manager_conf.xml
@@ -0,0 +1,19 @@
+<data_managers>
+    <data_manager tool_file="data_manager/checkm2_datamanager.xml" id="checkm2_build_database">
+        <data_table name="checkm2">
+            <output>
+                <column name="value"/>
+                <column name="name"/>
+                <column name="version"/>
+                <column name="path" output_ref="out_file">
+                    <move type="file">
+                        <source>${path}</source>
+                        <target base="${GALAXY_DATA_MANAGER_DATA_PATH}">checkm2/${value}/${path}</target>
+                    </move>
+                    <value_translation>${GALAXY_DATA_MANAGER_DATA_PATH}/checkm2/${value}/${path}</value_translation>
+                    <value_translation type="function">abspath</value_translation>
+                </column>
+            </output>
+        </data_table>
+    </data_manager>
+</data_managers>
diff --git a/data_managers/data_manager_checkm2/test-data/checkm2.loc b/data_managers/data_manager_checkm2/test-data/checkm2.loc
@@ -0,0 +1,2 @@
+
+1.0.2	CheckM2 diamond DB downloaded with version 1.0.2	1.0.2	/tmp/tmpf_hplx2a/galaxy-dev/tool-data/checkm2/1.0.2/uniref100.KO.1.dmnd
diff --git a/data_managers/data_manager_checkm2/tool-data/checkm2.loc.sample b/data_managers/data_manager_checkm2/tool-data/checkm2.loc.sample
@@ -0,0 +1,18 @@
+#This is a sample file distributed with Galaxy that enables tools
+#to use a the checkm2 dabase.
+#You will need to create these data files using the following command
+
+#checkm2 database --download --path /custom/path/
+
+#and then create a checkm2_db_versioned.loc file similar to this one (store it in this
+#directory) that points to the directories in which those files are stored.
+
+#The checkm2_db_versioned.loc file has this format (longer white space
+#characters are TAB characters):
+#
+#<unique_build_id>	<display_name>	<version>	<file_base_path>
+
+#The <version> column indicates the checkm2 version that generated the database
+
+#
+#diamond_db_1.0.2	Diamond database	1.0.2	/mnt/galaxyIndices/Checkm2_database/uniref100.KO.1.dmnd
diff --git a/data_managers/data_manager_checkm2/tool_data_table_conf.xml.sample b/data_managers/data_manager_checkm2/tool_data_table_conf.xml.sample
@@ -0,0 +1,6 @@
+<tables>
+    <table name="checkm2" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, version, path</columns>
+        <file path="tool-data/checkm2.loc" />
+    </table>
+</tables>
diff --git a/data_managers/data_manager_checkm2/tool_data_table_conf.xml.test b/data_managers/data_manager_checkm2/tool_data_table_conf.xml.test
@@ -0,0 +1,7 @@
+<tables>
+    <!-- Location of rnastar indexes for testing -->
+    <table name="checkm2" comment_char="#" allow_duplicate_entries="False">
+        <columns>value, dbkey, name, version, path</columns>
+        <file path="${__HERE__}/test-data/checkm2.loc" />
+    </table>
+</tables>
diff --git a/tools/checkm2/.shed.yml b/tools/checkm2/.shed.yml
@@ -0,0 +1,9 @@
+name: checkm2
+owner: iuc
+description: Rapid assessment of genome bin quality using machine learning
+long_description: Enhanced version of checkm, using machine learning models for greater speed and accuracy
+homepage_url: https://github.com/chklovski/CheckM2
+remote_repository_url: https://github.com/galaxyproject/tools-iuc/blob/main/tools/checkm2/
+categories:
+  - Metagenomics
+type: unrestricted
diff --git a/tools/checkm2/checkm2.xml b/tools/checkm2/checkm2.xml
@@ -0,0 +1,177 @@
+<tool id="checkm2" name="checkm2" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="23.1">
+    <description>Rapid assessment of genome bin quality using machine learning</description>
+    <macros>
+        <!-- enable tests and output assertions on update -->
+        <token name="@TOOL_VERSION@">1.0.2</token>
+        <token name="@VERSION_SUFFIX@">0</token>
+        <token name="@IDX_DATA_TABLE@">checkm2</token>
+    </macros>
+    <xrefs>
+        <xref type="bio.tools">checkm</xref>
+    </xrefs>
+    <requirements>
+        <requirement type="package" version="@TOOL_VERSION@">checkm2</requirement>
+    </requirements>
+    <command detect_errors="exit_code"><![CDATA[
+    #import re
+    mkdir input_dir &&
+    #for $i, $file in enumerate($input):
+        #set $cleaned =  re.sub('[^\s\w\-\\.]', '_', str($file.element_identifier))
+        ln -s '$file' 'input_dir/${cleaned}.dat' &&
+    #end for
+    checkm2 predict
+    --input input_dir
+    $model
+    $genes
+    #if $ttable:
+        --ttable $ttable
+    #end if
+    -x .dat
+    --threads "\${GALAXY_SLOTS:-1}"
+    --database_path '$database.fields.path'
+    --output-directory output
+    ]]></command>
+    <inputs>
+        <param name="input" type="data" format="fasta" label="Input MAG/SAG datasets" multiple="true"/>
+        <param name="database" type="select" label="Select reference genome" help="Checkm2 Diamond database">
+            <options from_data_table="@IDX_DATA_TABLE@">
+                <filter type="sort_by" column="2"/>
+            </options>
+            <validator type="no_options" message="No databases are available for this version of Checkm2. Please contact the Galaxy adminstrators to request one be installed."/>
+        </param>
+
+        <param argument="genes" type="boolean" truevalue="--genes" falsevalue="" label="Treat input files as protein files"/>
+        <param name="model" type="select" label="Model options">
+            <option value="">None</option>
+            <option value="--general">Force the use of the general quality prediction model (gradient boost)</option>
+            <option value="--specific">Force the use of the specific quality prediction model (neural network)</option>
+            <option value="--allmodels">Output quality prediction for both models for each genome.</option>
+        </param>
+        <!-- It's not all numbers and there's a check internally if it's in a specific list, so it had to be spelled out -->
+        <param argument="ttable" type="select" label="Prodigal table" optional="true">
+            <option value="1">1. The Standard Code</option>
+            <option value="2">2. The Vertebrate Mitochondrial Code</option>
+            <option value="3">3. The Yeast Mitochondrial Code</option>
+            <option value="4">4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the Mycoplasma/Spiroplasma Code</option>
+            <option value="5">5. The Invertebrate Mitochondrial Code</option>
+            <option value="6">6. The Ciliate, Dasycladacean and Hexamita Nuclear Code</option>
+            <option value="9">9. The Echinoderm and Flatworm Mitochondrial Code</option>
+            <option value="10">10. The Euplotid Nuclear Code</option>
+            <option value="11">11. The Bacterial, Archaeal and Plant Plastid Code</option>
+            <option value="12">12. The Alternative Yeast Nuclear Code</option>
+            <option value="13">13. The Ascidian Mitochondrial Code</option>
+            <option value="14">14. The Alternative Flatworm Mitochondrial Code</option>
+            <option value="16">16. Chlorophycean Mitochondrial Code</option>
+            <option value="21">21. Trematode Mitochondrial Code</option>
+            <option value="22">22. Scenedesmus obliquus Mitochondrial Code</option>
+            <option value="23">23. Thraustochytrium Mitochondrial Code</option>
+            <option value="24">24. Rhabdopleuridae Mitochondrial Code</option>
+            <option value="25">25. Candidate Division SR1 and Gracilibacteria Code</option>
+            <option value="26">26. Pachysolen tannophilus Nuclear Code</option>
+            <option value="27">27. Karyorelict Nuclear Code</option>
+            <option value="28">28. Condylostoma Nuclear Code</option>
+            <option value="29">29. Mesodinium Nuclear Code</option>
+            <option value="30">30. Peritrich Nuclear Code</option>
+            <option value="31">31. Blastocrithidia Nuclear Code</option>
+            <option value="33">33. Cephalodiscidae Mitochondrial UAA-Tyr Code</option>
+        </param>
+    </inputs>
+    <outputs>
+        <data name="quality" label="${tool.name} on ${on_string}: Quality report" format="tabular" from_work_dir="output/quality_report.tsv"/>
+        <collection name="protein_files" label="${tool.name} on ${on_string}: protein files" type="list">
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.faa" format="fasta" directory="output/protein_files"/>
+        </collection>
+        <collection name="diamond_files" label="${tool.name} on ${on_string}: Tabular diamond output" type="list">
+            <discover_datasets pattern="(?P&lt;designation&gt;.*)\.tsv" format="tabular" directory="output/diamond_output"/>
+        </collection>
+    </outputs>
+    <tests>
+        <!-- These cannot run without a multi-gb db and will therefore fail. See README for details -->
+        <test expect_exit_code="1" expect_failure="true">
+            <param name="input" value="test1.faa,test2.faa"/>
+            <param name="model" value="--allmodels"/>
+            <param name="genes" value="--genes"/>
+            <param name="ttable" value="13"/>
+            <!-- <output name="quality">
+                <assert_contents>
+                    <has_n_columns n="6"/>
+                    <has_n_lines n="3"/>
+                </assert_contents>
+            </output>
+            <output_collection name="protein_files" count="2">
+                <element name="test1">
+                    <assert_contents>
+                        <has_n_lines n="4942"/>
+                    </assert_contents>
+                </element>
+                <element name="test2">
+                    <assert_contents>
+                        <has_n_lines n="4873"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="diamond_files" count="1">
+                <element name="DIAMOND_RESULTS">
+                    <assert_contents>
+                        <has_n_columns n="12"/>
+                        <has_n_lines n="818"/>
+                    </assert_contents>
+                </element>
+            </output_collection> -->
+            <assert_command>
+                <has_text text="checkm2 predict --input input_dir"/>
+                <has_text text="--allmodels --genes --ttable 13 -x .dat"/>
+                <has_text text="--output-directory output"/>
+            </assert_command>
+        </test>
+        <test expect_exit_code="1" expect_failure="true">
+            <param name="input" value="test1.tst,test2.tst"/>
+            <param name="model" value="--specific"/>
+            <!-- <output name="quality">
+                <assert_contents>
+                    <has_n_columns n="13"/>
+                    <has_n_lines n="3"/>
+                </assert_contents>
+            </output>
+            <output_collection name="protein_files" count="2">
+                <element name="test1.tst">
+                    <assert_contents>
+                        <has_n_lines n="5090"/>
+                    </assert_contents>
+                </element>
+                <element name="test2.tst">
+                    <assert_contents>
+                        <has_n_lines n="4868"/>
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="diamond_files" count="1">
+                <element name="DIAMOND_RESULTS">
+                    <assert_contents>
+                        <has_n_columns n="12"/>
+                        <has_n_lines n="915"/>
+                    </assert_contents>
+                </element>
+            </output_collection> -->
+            <assert_command>
+                <has_text text="checkm2 predict --input input_dir"/>
+                <has_text text="--specific  -x .dat"/>
+                <has_text text="--output-directory output"/>
+            </assert_command>
+        </test>
+    </tests>
+    <help><![CDATA[
+    Unlike CheckM1, CheckM2 has universally trained machine learning models it applies regardless of taxonomic lineage to predict the completeness and contamination of genomic bins. 
+    This allows it to incorporate many lineages in its training set that have few - or even just one - high-quality genomic representatives, by putting it in the context of all other organisms in the training set. 
+    As a result of this machine learning framework, CheckM2 is also highly accurate on organisms with reduced genomes or unusual biology, such as the Nanoarchaeota or Patescibacteria.
+
+    CheckM2 uses two distinct machine learning models to predict genome completeness. The 'general' gradient boost model is able to generalize well and is intended to be used on organisms not well 
+    represented in GenBank or RefSeq (roughly, when an organism is novel at the level of order, class or phylum). The 'specific' neural network model is more accurate when predicting completeness 
+    of organisms more closely related to the reference training set (roughly, when an organism belongs to a known species, genus or family). CheckM2 uses a cosine similarity calculation to automatically 
+    determine the appropriate completeness model for each input genome, but you can also force the use of a particular completeness model, or get the prediction outputs for both. There is only one contamination 
+    model (based on gradient boost) which is applied regardless of taxonomic novelty and works well across all cases.
+    ]]></help>
+        <citations>
+            <citation type="doi">10.1038/s41592-023-01940-w</citation>
+        </citations>
+</tool>
diff --git a/tools/checkm2/test-data/checkm2.loc b/tools/checkm2/test-data/checkm2.loc
@@ -0,0 +1,3 @@
+##Checkm2 versioned indexes
+#build_id	display_name	path	version
+001	test_db	1.0.2	${__HERE__}/Checkm2_database/uniref100.KO.1.dmnd
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@

		1.0.2 CheckM2 diamond DB downloaded with version 1.0.2 1.0.2 /tmp/tmpf_hplx2a/galaxy-dev/tool-data/checkm2/1.0.2/uniref100.KO.1.dmnd