-
Notifications
You must be signed in to change notification settings - Fork 25
/
AmrFinderPlusIO.py
136 lines (122 loc) · 5.4 KB
/
AmrFinderPlusIO.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
#!/usr/bin/env python
import csv
import warnings
import re
from .Interfaces import hAMRonizedResultIterator
from hAMRonization.constants import (
NUCLEOTIDE_VARIANT,
AMINO_ACID_VARIANT,
GENE_PRESENCE,
)
required_metadata = [
"analysis_software_version",
"reference_database_version",
"input_file_name",
]
class AmrFinderPlusIterator(hAMRonizedResultIterator):
def __init__(self, source, metadata):
metadata["analysis_software_name"] = "amrfinderplus"
metadata["reference_database_name"] = "NCBI Reference Gene Database"
self.metadata = metadata
# check source for whether AMFP has been run in protein or nt mode
nucleotide_field_mapping = {
"Protein identifier": None,
"Contig id": "input_sequence_id",
"Start": "input_gene_start",
"Stop": "input_gene_stop",
"Strand": "strand_orientation",
"Gene symbol": "gene_symbol",
"Sequence name": "gene_name",
"Scope": None,
"Element type": None,
"Element subtype": None,
"Class": "drug_class",
"Subclass": "antimicrobial_agent",
"Method": None,
"Target length": "input_protein_length",
"Reference sequence length": "reference_protein_length",
"% Coverage of reference sequence": "coverage_percentage",
"% Identity to reference sequence": "sequence_identity",
"Alignment length": None,
"Accession of closest sequence": "reference_accession",
"Name of closest sequence": None,
"HMM id": None,
"HMM description": None,
"AA Mutation": "amino_acid_mutation",
"Nucleotide Mutation": "nucleotide_mutation",
"genetic_variation_type": "genetic_variation_type",
}
protein_field_mapping = {
"Protein identifier": "input_sequence_id",
"Gene symbol": "gene_symbol",
"Sequence name": "gene_name",
"Scope": None,
"Element": None,
"Element subtype": None,
"Class": "drug_class",
"Subclass": "antimicrobial_agent",
"Method": None,
"Target length": "input_protein_length",
"Reference sequence length": "reference_protein_length",
"% Coverage of reference sequence": "coverage_percentage",
"% Identity to reference sequence": "sequence_identity",
"Alignment length": None,
"Accession of closest sequence": "reference_accession",
"Name of closest sequence": None,
"HMM id": None,
"HMM description": None,
"AA Mutation": "amino_acid_mutation",
"genetic_variation_type": "genetic_variation_type",
}
with open(source) as fh:
_ = next(fh)
try:
first_result = next(fh)
if first_result.strip().split("\t")[0] == "NA":
self.field_mapping = nucleotide_field_mapping
else:
self.field_mapping = protein_field_mapping
except StopIteration:
# doesn't really matter which mapping as this error indicates
# this is an empty results file
self.field_mapping = nucleotide_field_mapping
super().__init__(source, self.field_mapping, self.metadata)
def parse(self, handle):
"""
Read each and return it
"""
skipped_truncated = 0
reader = csv.DictReader(handle, delimiter="\t")
for result in reader:
# replace NA value with None for consitency
for field, value in result.items():
if value == "NA":
result[field] = None
# AFP reports partial hits so to avoid misleadingly listing these
# as present skip results with INTERNAL_STOP
# recommended by developers
if "INTERNAL_STOP" in result['Method']:
skipped_truncated += 1
continue
# "POINT" indicates mutational resistance
# amrfinderplus has no special fields but the mutation itself is
# appended to the symbol name so we want to split this
result["AA Mutation"] = None
result["Nucleotide Mutation"] = None
result["genetic_variation_type"] = GENE_PRESENCE
if result["Element subtype"] == "POINT":
gene_symbol, mutation = result["Gene symbol"].rsplit("_", 1)
result["Gene symbol"] = gene_symbol
_, ref, pos, alt, _ = re.split(r"(\D+)(\d+)(\D+)", mutation)
# this means it is a protein mutation
if result["Method"] in ["POINTX", "POINTP"]:
result["AA Mutation"] = f"p.{ref}{pos}{alt}"
result["genetic_variation_type"] = AMINO_ACID_VARIANT
elif result["Method"] == "POINTN":
# e.g., 23S_G2032G ampC_C-11C -> c.2032G>G
result["Nucleotide Mutation"] = f"c.{pos}{ref}>{alt}"
result["genetic_variation_type"] = NUCLEOTIDE_VARIANT
yield self.hAMRonize(result, self.metadata)
if skipped_truncated > 0:
warnings.warn(f"Skipping {skipped_truncated} records with INTERNAL_STOP "
f"from {self.metadata['input_file_name']}")