def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat:
    """
    Gather the files of several loci directories into a single one.

    Every entry found directly inside each input directory is passed to
    ``duplicate`` so that it appears, under its own file name, in a newly
    created LociDirectoryFormat. If ``duplicate`` raises FileExistsError
    for an entry, that entry is skipped and a warning is issued instead.

    Parameters:
    - loci: An iterable of LociDirectoryFormat objects (the partitions)
        whose files should be gathered.
    Returns:
    - The new LociDirectoryFormat holding the gathered files.
    """
    merged = LociDirectoryFormat()
    for partition in loci:
        for source_fp in partition.path.iterdir():
            target_fp = merged.path / source_fp.name
            try:
                duplicate(source_fp, target_fp)
            except FileExistsError:
                # The file that arrived first is kept; this one is dropped.
                warnings.warn(
                    f"Skipping {source_fp}. File already exists "
                    f"in the destination directory."
                )
    return merged
ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/data/uncollated_loci_1/loci2.gff b/q2_types/genome_data/tests/data/uncollated_loci_1/loci2.gff new file mode 100644 index 00000000..ab7c1087 --- /dev/null +++ b/q2_types/genome_data/tests/data/uncollated_loci_1/loci2.gff @@ -0,0 +1,5 @@ +##gff-version 3 +# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480" +# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1 +k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21; +k129_5480 
Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21; diff --git a/q2_types/genome_data/tests/data/uncollated_loci_2/loci3.gff b/q2_types/genome_data/tests/data/uncollated_loci_2/loci3.gff new file mode 100644 index 00000000..ab7c1087 --- /dev/null +++ b/q2_types/genome_data/tests/data/uncollated_loci_2/loci3.gff @@ -0,0 +1,5 @@ +##gff-version 3 +# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480" +# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1 +k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21; +k129_5480 Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21; diff --git a/q2_types/genome_data/tests/data/uncollated_loci_2/loci4.gff b/q2_types/genome_data/tests/data/uncollated_loci_2/loci4.gff new file mode 100644 index 00000000..77304f5c --- /dev/null +++ b/q2_types/genome_data/tests/data/uncollated_loci_2/loci4.gff @@ -0,0 +1,10 @@ +##gff-version 3 +#!gff-spec-version 1.21 +#!processor NCBI annotwriter +#!genome-build ASM19595v2 +#!genome-build-accession NCBI_Assembly:GCA_000195955.2 +##sequence-region AL123456.3 1 4411532 +##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332 +AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis +AL123456.3 EMBL gene 1 1524 . + . 
ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001 +AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11 diff --git a/q2_types/genome_data/tests/test_methods.py b/q2_types/genome_data/tests/test_methods.py index 2aff71ad..754bd5f1 100644 --- a/q2_types/genome_data/tests/test_methods.py +++ b/q2_types/genome_data/tests/test_methods.py @@ -7,11 +7,14 @@ # ---------------------------------------------------------------------------- import filecmp import os +import warnings from qiime2.plugin.testing import TestPluginBase from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \ partition_orthologs, OrthologAnnotationDirFmt, collate_ortholog_annotations +from q2_types.genome_data import LociDirectoryFormat +from q2_types.genome_data._methods import collate_loci class TestOrthologsPartitionCollating(TestPluginBase): @@ -33,6 
def test_collate_loci(self):
    """Files from every input directory are present after collating."""
    loci_list = [
        LociDirectoryFormat(self.get_data_path(name), mode="r")
        for name in ("uncollated_loci_1", "uncollated_loci_2")
    ]

    collated_loci = collate_loci(loci_list)

    # Check each file on its own so a failure names the missing file.
    for no in (1, 2, 3, 4):
        fp = collated_loci.path / f"loci{no}.gff"
        self.assertTrue(os.path.exists(fp), f"{fp} was not collated")

def test_collate_loci_file_exists(self):
    """Collating the same directory twice warns and keeps one copy."""
    p1 = self.get_data_path("uncollated_loci_1")
    loci_list = [
        LociDirectoryFormat(p1, mode="r"),
        LociDirectoryFormat(p1, mode="r")
    ]

    with warnings.catch_warnings(record=True) as caught:
        # Record every warning regardless of the filters that are active
        # when the test runs; otherwise the list may stay empty.
        warnings.simplefilter("always")
        collated_loci = collate_loci(loci_list)

    # Look at all recorded warnings rather than only the last one, so an
    # unrelated warning emitted afterwards cannot break the test.
    duplicates = [
        w for w in caught if "File already exists" in str(w.message)
    ]
    self.assertTrue(
        duplicates, "expected a 'File already exists' warning"
    )

    for no in (1, 2):
        fp = collated_loci.path / f"loci{no}.gff"
        self.assertTrue(os.path.exists(fp), f"{fp} was not collated")
artifact.", +) + importlib.import_module('q2_types.bowtie2._deferred_setup') importlib.import_module('q2_types.distance_matrix._deferred_setup') importlib.import_module('q2_types.feature_data._deferred_setup') diff --git a/setup.py b/setup.py index a71fae39..5fe83b7b 100644 --- a/setup.py +++ b/setup.py @@ -87,6 +87,8 @@ 'data/genes_samples/sample2/*', 'data/loci-invalid/*', 'data/loci/*', + 'data/uncollated_loci_1/*', + 'data/uncollated_loci_2/*', 'data/genome-sequences/*', 'data/ortholog/*', 'data/ortholog-annotation-extra/*',