Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: collate_loci action added #348

Open
wants to merge 4 commits into
base: dev
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 28 additions & 1 deletion q2_types/genome_data/_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,34 @@
import numpy as np
from qiime2.util import duplicate

from q2_types.genome_data import SeedOrthologDirFmt, OrthologAnnotationDirFmt
from q2_types.genome_data import (SeedOrthologDirFmt, OrthologAnnotationDirFmt,
LociDirectoryFormat)


def collate_loci(loci: LociDirectoryFormat) -> LociDirectoryFormat:
    """
    Gather the files of several partitioned loci directories into one.

    Parameters:
    - loci: An iterable of LociDirectoryFormat objects; every entry found
      in each object's directory is duplicated into the result.
    Returns:
    - collated_loci: A newly created LociDirectoryFormat holding the
      duplicated files. When a file name was already placed in the
      result, the later file is skipped and a warning is issued.
    """
    # NOTE(review): a collaborator asked whether these loops could be
    # simplified to resemble the other collating actions in this plugin
    # (one example lives at the bottom of this file).
    collated_loci = LociDirectoryFormat()
    destination = collated_loci.path

    for partition in loci:
        source = partition.path
        for name in os.listdir(source):
            src_fp = os.path.join(source, name)
            dst_fp = os.path.join(destination, name)
            try:
                duplicate(src_fp, dst_fp)
            except FileExistsError:
                # Same file name seen in an earlier partition: keep the
                # first copy and tell the user about the skipped one.
                warnings.warn(
                    f"Skipping {name}. File already exists "
                    f"in the destination directory."
                )

    return collated_loci


def collate_orthologs(orthologs: SeedOrthologDirFmt) -> SeedOrthologDirFmt:
Expand Down
10 changes: 10 additions & 0 deletions q2_types/genome_data/tests/data/uncollated_loci_1/loci1.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
##gff-version 3
#!gff-spec-version 1.21
#!processor NCBI annotwriter
#!genome-build ASM19595v2
#!genome-build-accession NCBI_Assembly:GCA_000195955.2
##sequence-region AL123456.3 1 4411532
##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332
AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis
AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001
AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11
5 changes: 5 additions & 0 deletions q2_types/genome_data/tests/data/uncollated_loci_1/loci2.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
##gff-version 3
# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480"
# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1
k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21;
k129_5480 Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21;
5 changes: 5 additions & 0 deletions q2_types/genome_data/tests/data/uncollated_loci_2/loci3.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
##gff-version 3
# Sequence Data: seqnum=1;seqlen=1713046;seqhdr="k129_5480"
# Model Data: version=Prodigal.v2.6.3;run_type=Metagenomic;model="49|_Nostoc_azollae__0708|B|38.5|11|1";gc_cont=38.40;transl_table=11;uses_sd=1
k129_5480 Prodigal_v2.6.3 CDS 3 1988 255.7 - 0 ID=1_1;partial=10;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.442;conf=99.99;score=255.03;cscore=250.12;sscore=4.92;rscore=0.92;uscore=0.44;tscore=4.21;
k129_5480 Prodigal_v2.6.3 CDS 2150 2623 63.6 + 0 ID=1_2;partial=00;start_type=ATG;rbs_motif=GGA/GAG/AGG;rbs_spacer=5-10bp;gc_cont=0.426;conf=100.00;score=63.62;cscore=60.65;sscore=2.97;rscore=0.92;uscore=-2.15;tscore=4.21;
10 changes: 10 additions & 0 deletions q2_types/genome_data/tests/data/uncollated_loci_2/loci4.gff
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
##gff-version 3
#!gff-spec-version 1.21
#!processor NCBI annotwriter
#!genome-build ASM19595v2
#!genome-build-accession NCBI_Assembly:GCA_000195955.2
##sequence-region AL123456.3 1 4411532
##species https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=83332
AL123456.3 EMBL region 1 4411532 . + . ID=AL123456.3:1..4411532;Dbxref=taxon:83332;gbkey=Src;mol_type=genomic DNA;strain=H37Rv;type-material=type strain of Mycobacterium tuberculosis
AL123456.3 EMBL gene 1 1524 . + . ID=gene-Rv0001;Name=dnaA;gbkey=Gene;gene=dnaA;gene_biotype=protein_coding;locus_tag=Rv0001
AL123456.3 EMBL CDS 1 1524 . + 0 ID=cds-CCP42723.1;Parent=gene-Rv0001;Dbxref=EnsemblGenomes-Gn:Rv0001,EnsemblGenomes-Tr:CCP42723,GOA:P9WNW3,InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR010921,InterPro:IPR013159,InterPro:IPR013317,InterPro:IPR018312,InterPro:IPR020591,InterPro:IPR027417,NCBI_GP:CCP42723.1;Name=CCP42723.1;Note=Rv0001%2C (MT0001%2C MTV029.01%2C P49993)%2C len: 507 aa. dnaA%2C chromosomal replication initiator protein (see citations below)%2C equivalent to other Mycobacterial chromosomal replication initiator proteins. Also highly similar to others except in N-terminus e.g. Q9ZH75|DNAA_STRCH chromosomal replication initiator protein from Streptomyces chrysomallus (624 aa). Contains PS00017 ATP/GTP-binding site motif A (P-loop) and PS01008 DnaA protein signature. Belongs to the DnaA family. Note that the first base of this gene has been taken as base 1 of the Mycobacterium tuberculosis H37Rv genomic sequence.;experiment=EXISTENCE: identified in proteomics study;gbkey=CDS;gene=dnaA;inference=protein motif:PROSITE:PS01008;locus_tag=Rv0001;product=Chromosomal replication initiator protein DnaA;protein_id=CCP42723.1;transl_table=11
29 changes: 29 additions & 0 deletions q2_types/genome_data/tests/test_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,14 @@
# ----------------------------------------------------------------------------
import filecmp
import os
import warnings

from qiime2.plugin.testing import TestPluginBase

from q2_types.genome_data import SeedOrthologDirFmt, collate_orthologs, \
partition_orthologs, OrthologAnnotationDirFmt, collate_ortholog_annotations
from q2_types.genome_data import LociDirectoryFormat
from q2_types.genome_data._methods import collate_loci


class TestOrthologsPartitionCollating(TestPluginBase):
Expand All @@ -33,6 +36,32 @@ def test_collate_orthologs(self):
collated_orthologs.path / "2.emapper.seed_orthologs")
)

def test_collate_loci(self):
    """Collating two partitions yields all four GFF files."""
    partitions = [
        LociDirectoryFormat(self.get_data_path(dir_name), mode="r")
        for dir_name in ("uncollated_loci_1", "uncollated_loci_2")
    ]

    collated_loci = collate_loci(partitions)

    # loci1/loci2 come from the first fixture directory, loci3/loci4
    # from the second one.
    found = [
        os.path.exists(collated_loci.path / f"loci{no}.gff")
        for no in (1, 2, 3, 4)
    ]
    self.assertTrue(all(found))

def test_collate_loci_file_exists(self):
    """Collating the same partition twice warns and keeps one copy.

    The same fixture directory is passed in twice, so every file of the
    second pass collides with a file that is already in the collated
    directory. The action is expected to warn about the collision and
    still deliver both GFF files.
    """
    p1 = self.get_data_path("uncollated_loci_1")
    loci_list = [
        LociDirectoryFormat(p1, mode="r"),
        LociDirectoryFormat(p1, mode="r")
    ]

    with warnings.catch_warnings(record=True) as w:
        # Record every warning regardless of the filters configured by
        # the test runner; otherwise `w` may stay empty and indexing it
        # would crash with IndexError instead of failing cleanly.
        warnings.simplefilter("always")
        collated_loci = collate_loci(loci_list)

    # Look at all recorded warnings rather than only the last one, so an
    # unrelated warning emitted afterwards cannot mask the expected one.
    messages = [str(caught.message) for caught in w]
    self.assertTrue(
        any("File already exists" in msg for msg in messages),
        f"Expected a 'File already exists' warning, got: {messages}"
    )

    self.assertTrue(all(os.path.exists(
        collated_loci.path / f"loci{no}.gff") for no in [1, 2]))

def test_partition_orthologs(self):
p = self.get_data_path("collated_orthologs")
orthologs = SeedOrthologDirFmt(path=p, mode="r")
Expand Down
14 changes: 13 additions & 1 deletion q2_types/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
from q2_types.feature_data_mag import MAG
from q2_types.per_sample_sequences import MAGs
from q2_types.feature_data import FeatureData
from q2_types.genome_data import Orthologs, GenomeData, NOG
from q2_types.genome_data import Orthologs, GenomeData, NOG, Loci
from q2_types.genome_data._methods import collate_loci
from q2_types.sample_data import SampleData


Expand Down Expand Up @@ -136,6 +137,17 @@
"and collates them into a single artifact.",
)

# Expose collate_loci as a plugin method: it accepts a list of
# GenomeData[Loci] artifacts and produces a single GenomeData[Loci].
plugin.methods.register_function(
    function=collate_loci,
    name="Collate loci",
    description="Takes a collection of GenomeData[Loci]'s "
                "and collates them into a single artifact.",
    inputs={"loci": List[GenomeData[Loci]]},
    input_descriptions={"loci": "A collection of loci to be collated."},
    parameters={},
    outputs={"collated_loci": GenomeData[Loci]},
)

importlib.import_module('q2_types.bowtie2._deferred_setup')
importlib.import_module('q2_types.distance_matrix._deferred_setup')
importlib.import_module('q2_types.feature_data._deferred_setup')
Expand Down
2 changes: 2 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@
'data/genes_samples/sample2/*',
'data/loci-invalid/*',
'data/loci/*',
'data/uncollated_loci_1/*',
'data/uncollated_loci_2/*',
'data/genome-sequences/*',
'data/ortholog/*',
'data/ortholog-annotation-extra/*',
Expand Down
Loading