From b417e38d9182a1c03175fca1c44e4a3d8852a2b2 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Wed, 27 Nov 2024 18:00:16 +0100 Subject: [PATCH 1/3] update dada2 reference data --- .../data_manager/dada2_fetcher.xml | 14 ++++++++++- .../data_manager/data_manager.py | 23 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml b/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml index bb63c5837ac..fa8c132d125 100644 --- a/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml +++ b/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml @@ -1,5 +1,5 @@ - + Download reference databases python @@ -16,6 +16,7 @@ + @@ -27,6 +28,8 @@ + + @@ -34,10 +37,16 @@ + + + + + + @@ -51,11 +60,14 @@ + + + diff --git a/data_managers/data_manager_dada2/data_manager/data_manager.py b/data_managers/data_manager_dada2/data_manager/data_manager.py index 1a2c3cb84ae..467b85e8927 100644 --- a/data_managers/data_manager_dada2/data_manager/data_manager.py +++ b/data_managers/data_manager_dada2/data_manager/data_manager.py @@ -11,30 +11,51 @@ DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species" FILE2NAME = { + "silva_138.2": "Silva version 138.2", + "silva_138.1": "Silva version 138.1", "silva_138": "Silva version 138", "silva_132": "Silva version 132", "silva_128": "Silva version 128", + "rdp_19": "RDP trainset 19", "rdp_16": "RDP trainset 16", "rdp_14": "RDP trainset 14", "greengenes_13.84": "GreenGenes version 13.84", + "greengenes2_2024.09": "GreenGenes2 release 2024.09 ", "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", + + # # v4.5 https://zenodo.org/records/13984843 (contains no update on RefSeq_RDP) + # "gtdb_2024_10": "GTDB: Genome Taxonomy Database 220 (Bacteria & Archaea) (10/2024)", + + # # v4.3 https://zenodo.org/records/10403693 + # "gtdb_2023_12": "GTDB: Genome Taxonomy Database 214 (Bacteria & Archaea) (12/2023)", + # "RefSeq_RDP_2023_12": "NCBI RefSeq 16S rRNA database supplemented by RDP (12/2023)", + + # v1 https://zenodo.org/records/2541239 "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", + "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)", "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S", "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" } FILE2TAXURL = { + "silva_138.2": "https://zenodo.org/records/14169026/files/silva_nr99_v138.2_toGenus_trainset.fa.gz?download=1", # using the one wo species info https://github.com/benjjneb/dada2/issues/2053#issuecomment-2478617791 + "silva_138.1": "https://zenodo.org/records/4587955/files/silva_nr99_v138.1_train_set.fa.gz?download=1", # - " - "silva_138": "https://zenodo.org/record/3731176/files/silva_nr_v138_train_set.fa.gz?download=1", "silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", "silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", + "rdp_19": "https://zenodo.org/records/14168771/files/rdp_19_toGenus_trainset.fa.gz?download=1", "rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", "rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", "greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", + "greengenes2_2024.09": "https://zenodo.org/records/14169078/files/gg2_2024_09_toGenus_trainset.fa.gz?download=1", + # "gtdb_220_4.5": "https://zenodo.org/records/13984843/files/GTDB_bac120_arc53_ssu_r220_genus.fa.gz?download=1", + # "gtdb_214_4.4": "https://zenodo.org/records/10403693/files/GTDB_bac120_arc53_ssu_r214_genus.fa.gz?download=1", + # "RefSeq_RDP_2023_12": "https://zenodo.org/records/10403693/files/RefSeq_16S_6-11-20_RDPv16_Genus.fa.gz?download=1", "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", @@ -43,6 +64,8 @@ } FILE2SPECIESURL = { + "silva_138.2": "https://zenodo.org/records/14169026/files/silva_v138.2_assignSpecies.fa.gz?download=1", + "silva_138.1": "https://zenodo.org/records/4587955/files/silva_species_assignment_v138.1.fa.gz?download=1", "silva_138": "https://zenodo.org/record/3731176/files/silva_species_assignment_v138.fa.gz?download=1", "silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", "silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", From 61a42bc01574b97aa7f97bd069d6c378ae9f891a Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 3 Dec 2024 11:25:22 +0100 Subject: [PATCH 2/3] only include updates for Maintained DBs - Silva - GG2 - RDP according to https://github.com/benjjneb/dada2/issues/2053#issuecomment-2512185929 --- .../data_manager/dada2_fetcher.xml | 18 +++++---- .../data_manager/data_manager.py | 40 ++++++++----------- 2 files changed, 26 insertions(+), 32 deletions(-) diff --git a/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml b/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml index fa8c132d125..a98fd8c5b33 100644 --- a/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml +++ b/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml @@ -28,8 +28,8 @@ - - + + @@ -37,14 +37,16 @@ - + + - + + @@ -60,14 +62,11 @@ - - - @@ -156,7 +155,8 @@ The following refrence databases which are describes as maintained by the DADA2 - Silva (https://www.arb-silva.de/) - RDP -- GreenGenes (http://greengenes.secondgenome.com/) +- GreenGenes (https://web.archive.org/web/20240703054246/https://greengenes.secondgenome.com/) +- Greengenes2 (https://greengenes2.ucsd.edu/) - UNITE general FASTA (https://unite.ut.ee/repository.php) While Silva and RDP contain reference databases for taxonomy and species assignment, the greengenes and UNITE databases only contains a reference database for taxonomy assignment. @@ -188,6 +188,8 @@ More detailed informations in the reference data bases can be found on the DADA2 10.1093/nar/gks1219 10.1093/nar/gkt1244 + + 10.1038/s41587-023-01845-1 10.1128/AEM.03006-05 diff --git a/data_managers/data_manager_dada2/data_manager/data_manager.py b/data_managers/data_manager_dada2/data_manager/data_manager.py index 467b85e8927..511916f4525 100644 --- a/data_managers/data_manager_dada2/data_manager/data_manager.py +++ b/data_managers/data_manager_dada2/data_manager/data_manager.py @@ -11,51 +11,43 @@ DEFAULT_TAXLEVELS = "Kingdom,Phylum,Class,Order,Family,Genus,Species" FILE2NAME = { - "silva_138.2": "Silva version 138.2", - "silva_138.1": "Silva version 138.1", + "silva_138.2_toGenus": "Silva version 138.2 to Genus (for short read 16S)", + "silva_138.2_toSpecies": "Silva version 138.2 to Species (for long read 16S)", "silva_138": "Silva version 138", "silva_132": "Silva version 132", "silva_128": "Silva version 128", - "rdp_19": "RDP trainset 19", + "rdp_19_toGenus": "RDP trainset 19 to Genus (for short read 16S)", + "rdp_19_toSpecies": "RDP trainset 19 to Species (for long read 16S)", "rdp_16": "RDP trainset 16", "rdp_14": "RDP trainset 14", "greengenes_13.84": "GreenGenes version 13.84", - "greengenes2_2024.09": "GreenGenes2 release 2024.09 ", + "greengenes2_2024.09_toGenus": "GreenGenes2 release 2024.09 to Genus (for short read 16S)", + "greengenes2_2024.09_toSpecies": "GreenGenes2 release 2024.09 to Species (for long read 16S)", "unite_8.0_fungi": "UNITE: General Fasta release 8.0 for Fungi", "unite_8.0_fungi_singletons": "UNITE: General Fasta release 8.0 for Fungi including global and 97% singletons", - - # # v4.5 https://zenodo.org/records/13984843 (contains no update on RefSeq_RDP) - # "gtdb_2024_10": "GTDB: Genome Taxonomy Database 220 (Bacteria & Archaea) (10/2024)", - - # # v4.3 https://zenodo.org/records/10403693 - # "gtdb_2023_12": "GTDB: Genome Taxonomy Database 214 (Bacteria & Archaea) (12/2023)", - # "RefSeq_RDP_2023_12": "NCBI RefSeq 16S rRNA database supplemented by RDP (12/2023)", - - # v1 https://zenodo.org/records/2541239 "RefSeq_RDP_2018_05": "NCBI RefSeq 16S rRNA database supplemented by RDP (05/2018)", "gtdb_2018_11": "GTDB: Genome Taxonomy Database (Bacteria & Archaea) (11/2018)", - "hitdb_1": "HitDB version 1 (Human InTestinal 16S rRNA)", "silva_euk_18S_132": "Silva version 132 Eukaryotic 18S", "PR2_4.11.1": "Protist Ribosomal Reference database (PR2) 4.11.1" } FILE2TAXURL = { - "silva_138.2": "https://zenodo.org/records/14169026/files/silva_nr99_v138.2_toGenus_trainset.fa.gz?download=1", # using the one wo species info https://github.com/benjjneb/dada2/issues/2053#issuecomment-2478617791 - "silva_138.1": "https://zenodo.org/records/4587955/files/silva_nr99_v138.1_train_set.fa.gz?download=1", # - " - + # see https://github.com/benjjneb/dada2/issues/2053 for the distinction of the datasets + "silva_138.2_toGenus": "https://zenodo.org/records/14169026/files/silva_nr99_v138.2_toGenus_trainset.fa.gz?download=1", + "silva_138.2_toSpecies": "https://zenodo.org/records/14169026/files/silva_nr99_v138.2_toSpecies_trainset.fa.gz?download=1", "silva_138": "https://zenodo.org/record/3731176/files/silva_nr_v138_train_set.fa.gz?download=1", "silva_132": "https://zenodo.org/record/1172783/files/silva_nr_v132_train_set.fa.gz?download=1", "silva_128": "https://zenodo.org/record/824551/files/silva_nr_v128_train_set.fa.gz?download=1", - "rdp_19": "https://zenodo.org/records/14168771/files/rdp_19_toGenus_trainset.fa.gz?download=1", + "rdp_19_toGenus": "https://zenodo.org/records/14168771/files/rdp_19_toGenus_trainset.fa.gz?download=1", + "rdp_19_toSpecies": "https://zenodo.org/records/14168771/files/rdp_19_toSpecies_trainset.fa.gz?download=1", "rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", "rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", "greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", - "greengenes2_2024.09": "https://zenodo.org/records/14169078/files/gg2_2024_09_toGenus_trainset.fa.gz?download=1", - # "gtdb_220_4.5": "https://zenodo.org/records/13984843/files/GTDB_bac120_arc53_ssu_r220_genus.fa.gz?download=1", - # "gtdb_214_4.4": "https://zenodo.org/records/10403693/files/GTDB_bac120_arc53_ssu_r214_genus.fa.gz?download=1", - # "RefSeq_RDP_2023_12": "https://zenodo.org/records/10403693/files/RefSeq_16S_6-11-20_RDPv16_Genus.fa.gz?download=1", + "greengenes2_2024.09_toGenus": "https://zenodo.org/records/14169078/files/gg2_2024_09_toGenus_trainset.fa.gz?download=1", + "greengenes2_2024.09_toSpecies": "https://zenodo.org/records/14169078/files/gg2_2024_09_toSpecies_trainset.fa.gz?download=1", "RefSeq_RDP_2018_05": "https://zenodo.org/record/2541239/files/RefSeq-RDP16S_v2_May2018.fa.gz?download=1", "gtdb_2018_11": "https://zenodo.org/record/2541239/files/GTDB_bac-arc_ssu_r86.fa.gz?download=1", "hitdb_1": "https://zenodo.org/record/159205/files/hitdb_v1.00.fa.gz?download=1", @@ -65,16 +57,16 @@ FILE2SPECIESURL = { "silva_138.2": "https://zenodo.org/records/14169026/files/silva_v138.2_assignSpecies.fa.gz?download=1", - "silva_138.1": "https://zenodo.org/records/4587955/files/silva_species_assignment_v138.1.fa.gz?download=1", "silva_138": "https://zenodo.org/record/3731176/files/silva_species_assignment_v138.fa.gz?download=1", "silva_132": "https://zenodo.org/record/1172783/files/silva_species_assignment_v132.fa.gz?download=1", "silva_128": "https://zenodo.org/record/824551/files/silva_species_assignment_v128.fa.gz?download=1", + # rdp_19 not available for assignSpecies https://github.com/benjjneb/dada2/issues/2053#issuecomment-2512185929 "rdp_16": "https://zenodo.org/record/801828/files/rdp_species_assignment_16.fa.gz?download=1", "rdp_14": "https://zenodo.org/record/158955/files/rdp_species_assignment_14.fa.gz?download=1" } FILE2TAXLEVELS = { - "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species" + "PR2_4.11.1": "Kingdom,Supergroup,Division,Class,Order,Family,Genus,Species", } @@ -101,7 +93,7 @@ def url_download(url, fname, workdir): if src: src.close() -# special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta) + # special treatment of UNITE DBs: they are zip files containing two fasta (xyz.fasta and developer/xyz.fasta) if fname.startswith("unite"): import glob import gzip From dc403a1b279c6de4c0c89f8629383983b09c3323 Mon Sep 17 00:00:00 2001 From: Matthias Bernt Date: Tue, 3 Dec 2024 12:30:34 +0100 Subject: [PATCH 3/3] add tests --- .../data_manager/dada2_fetcher.xml | 61 ++++++++++++++++++- .../data_manager/data_manager.py | 4 +- 2 files changed, 62 insertions(+), 3 deletions(-) diff --git a/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml b/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml index a98fd8c5b33..2c14a7138b5 100644 --- a/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml +++ b/data_managers/data_manager_dada2/data_manager/dada2_fetcher.xml @@ -91,6 +91,29 @@ + + + + + + + + + + + + + + + + + + + + + + + @@ -101,14 +124,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + - + diff --git a/data_managers/data_manager_dada2/data_manager/data_manager.py b/data_managers/data_manager_dada2/data_manager/data_manager.py index 511916f4525..d0ac47d8625 100644 --- a/data_managers/data_manager_dada2/data_manager/data_manager.py +++ b/data_managers/data_manager_dada2/data_manager/data_manager.py @@ -43,8 +43,8 @@ "rdp_19_toSpecies": "https://zenodo.org/records/14168771/files/rdp_19_toSpecies_trainset.fa.gz?download=1", "rdp_16": "https://zenodo.org/record/801828/files/rdp_train_set_16.fa.gz?download=1", "rdp_14": "https://zenodo.org/record/158955/files/rdp_train_set_14.fa.gz?download=1", - "unite_8.0_fungi": "https://files.plutof.ut.ee/public/orig/EB/0C/EB0CCB3A871B77EA75E472D13926271076904A588D2E1C1EA5AFCF7397D48378.zip", - "unite_8.0_fungi_singletons": "https://files.plutof.ut.ee/doi/06/A2/06A2C86256EED64085670EB0C54B7115F6DAC8F311C656A9CB33E386CFABA0D0.zip", + "unite_8.0_fungi": "https://s3.hpc.ut.ee/plutof-public/original/9f7b41c3-825b-4db8-9c52-74a4603a860a.zip", + "unite_8.0_fungi_singletons": "https://s3.hpc.ut.ee/plutof-public/original/53dfc9ce-9cb5-4205-84bb-f47faff26462.zip", "greengenes_13.84": "https://zenodo.org/record/158955/files/gg_13_8_train_set_97.fa.gz?download=1", "greengenes2_2024.09_toGenus": "https://zenodo.org/records/14169078/files/gg2_2024_09_toGenus_trainset.fa.gz?download=1", "greengenes2_2024.09_toSpecies": "https://zenodo.org/records/14169078/files/gg2_2024_09_toSpecies_trainset.fa.gz?download=1",