diff --git a/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml b/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml index bb63c5837ac..2c14a7138b5 100644 --- a/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml +++ b/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml @@ -1,5 +1,5 @@ - + Download reference databases python @@ -16,6 +16,7 @@ + @@ -27,6 +28,8 @@ + + @@ -34,10 +37,18 @@ + + + + + + + + @@ -80,6 +91,29 @@ + + + + + + + + + + + + + + + + + + + + + + + @@ -90,14 +124,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + @@ -144,7 +214,8 @@ The following refrence databases which are describes as maintained by the DADA2 - Silva (https://www.arb-silva.de/) - RDP -- GreenGenes (http://greengenes.secondgenome.com/) +- GreenGenes (https://web.archive.org/web/20240703054246/https://greengenes.secondgenome.com/) +- Greengenes2 (https://greengenes2.ucsd.edu/) - UNITE general FASTA (https://unite.ut.ee/repository.php) While Silva and RDP contain reference databases for taxonomy and species assignment, the greengenes and UNITE databases only contains a reference database for taxonomy assignment. @@ -176,6 +247,8 @@ More detailed informations in the reference data bases can be found on the DADA2 10.1093/nar/gks1219 10.1093/nar/gkt1244 + + 10.1038/s41587-023-01845-1 10.1128/AEM.03006-05 diff --git a/data_managers/data_manager_dada2/data_manager/data_manager.py b/data_managers/data_manager_dada2/data_manager/data_manager.py index 1a2c3cb84ae..d0ac47d8625 100644 --- a/data_managers/data_manager_dada2/data_manager/data_manager.py +++ b/data_managers/data_manager_dada2/data_manager/data_manager.py @@ -11,12 +11,18 @@ DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species" FILE2NAME = { + "silva_138.2_toGenus": "Silva version 138.2 to Genus (for short read 16S)", + "silva_138.2_toSpecies": "Silva version 138.2 to Species (for long read 16S)", "silva_138": "Silva version 138", "silva_132": "Silva version 132", "silva_128": "Silva version 128", + "rdp_19_toGenus": "RDP trainset 19 to Genus (for short read 16S)", + "rdp_19_toSpecies": "RDP trainset 19 to Species (for long read 16S)", "rdp_16": "RDP trainset 16", "rdp_14": "RDP trainset 14", "greengenes_13.84": "GreenGenes version 13.84", + "greengenes2_2024.09_toGenus": "GreenGenes2 release 2024.09 to Genus (for short read 16S)", + "greengenes2_2024.09_toSpecies": "GreenGenes2 release 2024.09 to Species (for long read 16S)", "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", @@ -27,14 +33,21 @@ } FILE2TAXURL = { + # see https://github.com/benjjneb/dada2/issues/2053 for the distinction of the datasets + "silva_138.2_toGenus": "https://zenodo.org/records/14169026/files/silva_nr99_v138.2_toGenus_trainset.fa.gz?download=1", + "silva_138.2_toSpecies": "https://zenodo.org/records/14169026/files/silva_nr99_v138.2_toSpecies_trainset.fa.gz?download=1", "silva_138": "https://zenodo.org/record/3731176/files/silva_nr_v138_train_set.fa.gz?download=1", "silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", "silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", + "rdp_19_toGenus": "https://zenodo.org/records/14168771/files/rdp_19_toGenus_trainset.fa.gz?download=1", + "rdp_19_toSpecies": "https://zenodo.org/records/14168771/files/rdp_19_toSpecies_trainset.fa.gz?download=1", "rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", "rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", - "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", - "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", + "unite_8.0_fungi": "https://s3.hpc.ut.ee/plutof-public/original/9f7b41c3-825b-4db8-9c52-74a4603a860a.zip", + "unite_8.0_fungi_singletons": "https://s3.hpc.ut.ee/plutof-public/original/53dfc9ce-9cb5-4205-84bb-f47faff26462.zip", "greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", + "greengenes2_2024.09_toGenus": "https://zenodo.org/records/14169078/files/gg2_2024_09_toGenus_trainset.fa.gz?download=1", + "greengenes2_2024.09_toSpecies": "https://zenodo.org/records/14169078/files/gg2_2024_09_toSpecies_trainset.fa.gz?download=1", "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", @@ -43,15 +56,17 @@ } FILE2SPECIESURL = { + "silva_138.2": "https://zenodo.org/records/14169026/files/silva_v138.2_assignSpecies.fa.gz?download=1", "silva_138": "https://zenodo.org/record/3731176/files/silva_species_assignment_v138.fa.gz?download=1", "silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", "silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", + # rdp_19 not available for assignSpecies https://github.com/benjjneb/dada2/issues/2053#issuecomment-2512185929 "rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", "rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" } FILE2TAXLEVELS = { - "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" + "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species", } @@ -78,7 +93,7 @@ def url_download(url, fname, workdir): if src: src.close() -# special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta) + # special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta) if fname.startswith("unite"): import glob import gzip