-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathAnnotateClusters.py
167 lines (122 loc) · 6.79 KB
/
AnnotateClusters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/usr/local/bin/python
#Created on 10/2/15
__author__ = 'Juan A. Ugalde'
def annotate_cluster(protein_list, feature_type, annotation):
from collections import defaultdict
th = None # Top hits
tc = None # Total conflicts
uc = None # Unresolved conflicts
summary_annotation = defaultdict(int)
for protein_id in protein_list:
#First check that the protein is in the annotation
if not protein_id in annotation:
continue
else:
#Check that it has COG annotation
if feature_type in annotation[protein_id].keys():
protein_info = annotation[protein_id][feature_type]
summary_annotation[protein_info] += 1
if len(summary_annotation) == 0:
pass
elif len(summary_annotation) == 1:
th = summary_annotation.keys()[0]
else:
tc = summary_annotation
#Get the total number of values
total_annotations = sum(summary_annotation.itervalues())
for hit in summary_annotation:
if summary_annotation[hit] / float(total_annotations) >= float(0.5):
th = hit
#If no decision was made
if th is None:
uc = summary_annotation
return th, tc, uc
if __name__ == '__main__':
import os
import argparse
from collections import defaultdict
from tools.data_input import AnnotationData, GenomeData
from tools.AnnotationInfo import cog_definitions
program_description = "This script annotates a cluster generated by the Summarize OrthoMCLResults script"
parser = argparse.ArgumentParser(description=program_description)
#Arguments
parser.add_argument("-l", "--genome_list_index", type=str,
help="File with the genome list. Format GenomeID, FullName, ShortName", required=True)
parser.add_argument("-a", "--annotation_folder", type=str,
help="Folder with the annotation files from JGI", required=True)
parser.add_argument("-c", "--cluster_file", type=str,
help="Cluster file", required=True)
parser.add_argument("-o", "--output_directory", type=str,
help="Output folder", required=True)
args = parser.parse_args()
#Create the output directory
if not os.path.exists(args.output_directory):
os.makedirs(args.output_directory)
#####Read the genome list
genome_id_dictionary, genome_count = GenomeData.read_genome_list(args.genome_list_index)
###Read the annotation information
protein_annotation, function_definitions = \
AnnotationData.parse_annotation_folder(genome_id_dictionary.keys(), args.annotation_folder)
##Read the cluster information
total_clusters = AnnotationData.get_cluster_information(args.cluster_file)
##Print log file
logfile = open(args.output_directory + "/logfile.txt", 'w')
##Total number of clusters
logfile.write("Total number of analyzed clusters: %d" % len(total_clusters) + "\n")
features_to_annotate = ["COG", "KO", "PFAM", "Product"]
#Get the COG definitions
cog_one_letter, desc_cog_letter, desc_cog_number = cog_definitions()
#Create the output files
for feature in features_to_annotate:
output_top_hit = open(args.output_directory + "/" + feature + "_clusters.txt", 'w')
output_conflicts = open(args.output_directory + "/" + feature + "_conflicts.txt", 'w')
for cluster in total_clusters:
protein_id_list = [id_tag.split("|")[1] for id_tag in total_clusters[cluster].split(",")]
function_description = None
#Get the annotations
top_hit, total_conflicts, unresolved_conflicts = \
annotate_cluster(protein_id_list, feature, protein_annotation)
#Print the output file
if total_conflicts is None and unresolved_conflicts is None and top_hit is not None:
if feature == "Product":
output_top_hit.write(cluster + "\t" + top_hit + "\n")
else:
if top_hit in function_definitions:
function_description = function_definitions[top_hit]
if feature == "COG":
letter = cog_one_letter[top_hit]
try:
definition = desc_cog_letter[letter[0]]
except IndexError:
definition = None
print cluster, letter, top_hit
output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description)
+ "\t" + letter[0] + "\t" + definition + "\n")
else:
output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description) + "\n")
elif total_conflicts is not None and top_hit is not None and unresolved_conflicts is None:
if feature == "Product":
output_top_hit.write(cluster + "\t" + top_hit + "\t" +
" ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")
else:
if top_hit in function_definitions:
function_description = function_definitions[top_hit]
if feature == "COG":
letter = cog_one_letter[top_hit]
definition = desc_cog_letter[letter[0]]
output_top_hit.write(cluster + "\t" + top_hit + "\t" + function_description +
"\t" + letter + "\t" + definition + "\t" +
" ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")
else:
output_top_hit.write(cluster + "\t" + top_hit + "\t" + str(function_description) + "\t" +
" ; ".join(["%s=%s" % (k, v) for k, v in total_conflicts.items()]) + "\n")
elif top_hit is None and total_conflicts is not None and unresolved_conflicts is not None:
if feature == "Product":
output_conflicts.write(cluster + "\t" + " ; ".join(["%s=%s" % (k, v) for k, v in
unresolved_conflicts.items()]) + "\n")
else:
output_conflicts.write(cluster + "\t" + " ; ".join(["%s, %s=%s" % (k, function_definitions[k], v)
if k in function_definitions else "%s, None=%s" % (k, v) for k, v in
unresolved_conflicts.items()]) + "\n")
output_top_hit.close()
output_conflicts.close()