Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update dada2 reference data #6593

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 76 additions & 3 deletions data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
<?xml version="1.0"?>
<tool id="dada2_fetcher" name="dada2 data manager" tool_type="manage_data" version="0.1.1" profile="23.0">
<tool id="dada2_fetcher" name="dada2 data manager" tool_type="manage_data" version="0.1.2" profile="23.0">
<description>Download reference databases</description>
<requirements>
<requirement type="package" version="3.7">python</requirement>
Expand All @@ -16,6 +16,7 @@
<param name="db_select" type="select" label="Taxonomic database">
<option value="silva">Silva</option>
<option value="rdp">RDP</option>
<option value="greengenes2">GreenGenes2</option>
<option value="greengenes">GreenGenes</option>
<option value="unite">UNITE Fungi: General Fasta</option>
<!-- UNITE Eukaryotes not yet supported https://github.com/benjjneb/dada2/issues/702 -->
Expand All @@ -27,17 +28,27 @@
</param>
<when value="silva">
<param name="version_select" type="select" label="Database version">
<option value="138.2_toGenus">138.2 to Genus (for short read 16S)</option>
<option value="138.2_toSpecies">138.2 to Species (for long read 16S)</option>
<option value="138">138</option>
<option value="132">132</option>
<option value="128">128</option>
</param>
</when>
<when value="rdp">
<param name="version_select" type="select" label="Database version">
<option value="19_toGenus">19 to Genus (for short read 16S)</option>
<option value="19_toSpecies">19 to Species (for long read 16S)</option>
<option value="16">16</option>
<option value="14">14</option>
</param>
</when>
<when value="greengenes2">
<param name="version_select" type="select" label="Database version">
<option value="2024.09_toGenus">2024.09 to Genus (for short read 16S)</option>
<option value="2024.09_toSpecies">2024.09 to Species (for long read 16S)</option>
</param>
</when>
<when value="greengenes">
<param name="version_select" type="select" label="Database version">
<option value="13.84">13.84</option>
Expand Down Expand Up @@ -80,6 +91,29 @@
<data name="out_file" format="data_manager_json" />
</outputs>
<tests>
<test>
<param name="db_cond|db_select" value="silva"/>
<param name="db_cond|version_select" value="138.2_toGenus"/>
<output name="out_file">
<assert_contents>
<has_text text="Silva version 138.2 to Genus (for short read 16S)"/>
</assert_contents>
</output>
</test>
<test>
<param name="db_cond|db_select" value="silva"/>
<param name="db_cond|version_select" value="138.2_toSpecies"/>
<output name="out_file">
<assert_contents>
<has_text text="Silva version 138.2 to Species (for long read 16S)"/>
</assert_contents>
</output>
</test>
<test>
<param name="db_cond|db_select" value="silva"/>
<param name="db_cond|version_select" value="138"/>
<output name="out_file" file="silva138_json"/>
</test>
<test>
<param name="db_cond|db_select" value="silva"/>
<param name="db_cond|version_select" value="138"/>
Expand All @@ -90,14 +124,50 @@
<param name="db_cond|version_select" value="132"/>
<output name="out_file" file="silva132_json"/>
</test>
<test>
<param name="db_cond|db_select" value="rdp"/>
<param name="db_cond|version_select" value="19_toSpecies"/>
<output name="out_file">
<assert_contents>
<has_text text="RDP trainset 19 to Species (for long read 16S)"/>
</assert_contents>
</output>
</test>
<test>
<param name="db_cond|db_select" value="rdp"/>
<param name="db_cond|version_select" value="19_toGenus"/>
<output name="out_file">
<assert_contents>
<has_text text="RDP trainset 19 to Genus (for short read 16S)"/>
</assert_contents>
</output>
</test>
<test>
<param name="db_cond|db_select" value="rdp"/>
<param name="db_cond|version_select" value="16"/>
<output name="out_file" file="rdp16_json"/>
</test>
<test>
<param name="db_cond|db_select" value="greengenes2"/>
<param name="db_cond|version_select" value="2024.09_toGenus"/>
<output name="out_file">
<assert_contents>
<has_text text="GreenGenes2 release 2024.09 to Genus (for short read 16S)"/>
</assert_contents>
</output>
</test>
<test>
<param name="db_cond|db_select" value="greengenes2"/>
<param name="db_cond|version_select" value="2024.09_toSpecies"/>
<output name="out_file">
<assert_contents>
<has_text text="GreenGenes2 release 2024.09 to Species (for long read 16S)"/>
</assert_contents>
</output>
</test>
<test>
<param name="db_cond|db_select" value="greengenes"/>
<param name="db_cond|version_select" value="13.84"/>
<param name="db_cond|version_select" value=""/>
<output name="out_file" file="greengenes13.84_json"/>
</test>
<test>
Expand Down Expand Up @@ -144,7 +214,8 @@ The following reference databases which are described as maintained by the DADA2

- Silva (https://www.arb-silva.de/)
- RDP
- GreenGenes (http://greengenes.secondgenome.com/)
- GreenGenes (https://web.archive.org/web/20240703054246/https://greengenes.secondgenome.com/)
- Greengenes2 (https://greengenes2.ucsd.edu/)
- UNITE general FASTA (https://unite.ut.ee/repository.php)

While Silva and RDP contain reference databases for taxonomy and species assignment, the GreenGenes and UNITE databases only contain a reference database for taxonomy assignment.
Expand Down Expand Up @@ -176,6 +247,8 @@ More detailed information on the reference databases can be found on the DADA2
<citation type="doi">10.1093/nar/gks1219</citation>
<!-- rdp -->
<citation type="doi">10.1093/nar/gkt1244</citation>
<!-- greengenes2 -->
<citation type="doi">10.1038/s41587-023-01845-1</citation>
<!-- greengenes -->
<citation type="doi">10.1128/AEM.03006-05</citation>
<!-- unite -->
Expand Down
23 changes: 19 additions & 4 deletions data_managers/data_manager_dada2/data_manager/data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,18 @@
DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species"

FILE2NAME = {
"silva_138.2_toGenus": "Silva version 138.2 to Genus (for short read 16S)",
"silva_138.2_toSpecies": "Silva version 138.2 to Species (for long read 16S)",
"silva_138": "Silva version 138",
"silva_132": "Silva version 132",
"silva_128": "Silva version 128",
"rdp_19_toGenus": "RDP trainset 19 to Genus (for short read 16S)",
"rdp_19_toSpecies": "RDP trainset 19 to Species (for long read 16S)",
"rdp_16": "RDP trainset 16",
"rdp_14": "RDP trainset 14",
"greengenes_13.84": "GreenGenes version 13.84",
"greengenes2_2024.09_toGenus": "GreenGenes2 release 2024.09 to Genus (for short read 16S)",
"greengenes2_2024.09_toSpecies": "GreenGenes2 release 2024.09 to Species (for long read 16S)",
"unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi",
"unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons",
"RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)",
Expand All @@ -27,14 +33,21 @@
}

FILE2TAXURL = {
# see https://github.com/benjjneb/dada2/issues/2053 for the distinction of the datasets
"silva_138.2_toGenus": "https://zenodo.org/records/14169026/files/silva_nr99_v138.2_toGenus_trainset.fa.gz?download=1",
"silva_138.2_toSpecies": "https://zenodo.org/records/14169026/files/silva_nr99_v138.2_toSpecies_trainset.fa.gz?download=1",
"silva_138": "https://zenodo.org/record/3731176/files/silva_nr_v138_train_set.fa.gz?download=1",
"silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1",
"silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1",
"rdp_19_toGenus": "https://zenodo.org/records/14168771/files/rdp_19_toGenus_trainset.fa.gz?download=1",
"rdp_19_toSpecies": "https://zenodo.org/records/14168771/files/rdp_19_toSpecies_trainset.fa.gz?download=1",
"rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1",
"rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1",
"unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip",
"unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip",
"unite_8.0_fungi": "https://s3.hpc.ut.ee/plutof-public/original/9f7b41c3-825b-4db8-9c52-74a4603a860a.zip",
"unite_8.0_fungi_singletons": "https://s3.hpc.ut.ee/plutof-public/original/53dfc9ce-9cb5-4205-84bb-f47faff26462.zip",
"greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1",
"greengenes2_2024.09_toGenus": "https://zenodo.org/records/14169078/files/gg2_2024_09_toGenus_trainset.fa.gz?download=1",
"greengenes2_2024.09_toSpecies": "https://zenodo.org/records/14169078/files/gg2_2024_09_toSpecies_trainset.fa.gz?download=1",
"RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1",
"gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1",
"hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1",
Expand All @@ -43,15 +56,17 @@
}

# Maps a dataset key to the download URL of its species-assignment FASTA
# (the "assignSpecies" / "species_assignment" files hosted on Zenodo).
# Only Silva and RDP releases have entries in this table.
# NOTE(review): the "silva_138.2" key carries no _toGenus/_toSpecies suffix,
# whereas the corresponding FILE2NAME and FILE2TAXURL keys are
# "silva_138.2_toGenus" and "silva_138.2_toSpecies" -- confirm that the lookup
# code maps both 138.2 variants onto this key, otherwise the species file is
# never downloaded for them.
FILE2SPECIESURL = {
"silva_138.2": "https://zenodo.org/records/14169026/files/silva_v138.2_assignSpecies.fa.gz?download=1",
"silva_138": "https://zenodo.org/record/3731176/files/silva_species_assignment_v138.fa.gz?download=1",
"silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1",
"silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1",
# rdp_19 not available for assignSpecies https://github.com/benjjneb/dada2/issues/2053#issuecomment-2512185929
"rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1",
"rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1"
}

FILE2TAXLEVELS = {
"PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species"
"PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species",
}


Expand All @@ -78,7 +93,7 @@ def url_download(url, fname, workdir):
if src:
src.close()

# special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta)
# special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta)
if fname.startswith("unite"):
import glob
import gzip
Expand Down
Loading