-
Notifications
You must be signed in to change notification settings - Fork 4
/
make_ortholog_stats_table.py
executable file
·98 lines (80 loc) · 3.68 KB
/
make_ortholog_stats_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# make_ortholog_stats_table.py v1 2022-11-15
"""
make_ortholog_stats_table.py v1.0 last modified 2022-12-02
~/git/supermatrix/make_ortholog_stats_table.py -i clusters_lactobacillus_v4 > lactobacillus_cluster_stats_v4.tab
or can glob with:
~/git/supermatrix/make_ortholog_stats_table.py -i clusters_lactobacillus_v4/* > lactobacillus_cluster_stats_v4.tab
"""
import sys
import os
import argparse
import time
from Bio import SeqIO
from glob import iglob
def parse_gene_names_from_db(proteinfile):
"""from a fasta file, extract ID and gene name, description, or anything from the fasta header,
and return a dict, where key is ID and value is gene name, where available"""
accession_to_gene = {}
return accession_to_gene
def find_intermediate_ids(proteinfile):
"""from a fasta file, get IDs and accessions, and return a dict of ID to accession"""
prot_id_to_accession = {}
return prot_id_to_accession
def get_gene_from_members(memberlist, accession_to_gene, prot_id_to_accession):
"""for each gene, find the best or most common gene name"""
usable_gene_names = {}
best_name = ""
return best_name
def main(argv, wayout):
if not len(argv):
argv.append('-h')
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-i','--input', nargs="*", help="input files, or folder containing fasta files from makehomologs.py")
parser.add_argument('-v','--verbose', action="store_true", help="display more output to stderr")
parser.add_argument('--no-header', action="store_true", help="omit header line")
args = parser.parse_args(argv)
file_counter = 0
protein_counter = 0
duplicate_counter = 0
accession_to_gene_dict = {}
prot_id_to_accession_dict = {}
if os.path.isdir(args.input[0]): #
clust_list = os.path.join(args.input[0], "*")
clust_iter_obj = iglob(clust_list)
sys.stderr.write("# Starting parsing on clusters in folder {} {}\n".format( args.input[0], time.asctime() ) )
elif os.path.isfile(args.input[0]): # first item is a file, hopefully the rest are
clust_iter_obj = list(args.input)
sys.stderr.write("# Starting parsing on clusters, first is {} {}\n".format( args.input[0], time.asctime() ) )
if not args.no_header:
headerline = "cluster_ID\tgene_name\tnotes\tN_seqs\tmin_len\tmean_len\tmax_len\tmember_list\n"
sys.stdout.write(headerline)
for clustfile in clust_iter_obj:
file_counter += 1
# reset parameters for cluster
best_gene_name = "NA"
user_notes = ""
seq_counter = 0
min_length = 0
max_length = 0
member_list = {} # key is seqID, value is length
for seqrec in SeqIO.parse(clustfile, "fasta"):
protein_counter += 1
seqlen = len(seqrec.seq)
if seqrec.id not in member_list:
member_list[seqrec.id] = seqlen
else:
duplicate_counter += 1
sys.stderr.write( "WARNING: duplicate seq ID {} in cluster {}\n".format(seqrec.id, clustfile) )
if member_list:
min_length = min(member_list.values())
max_length = max(member_list.values())
mean_length = sum(member_list.values())/len(member_list.values())
#best_gene_name = get_gene_from_members( list(member_list.keys()), accession_to_gene_dict, prot_id_to_accession_dict )
outline = "{}\t{}\t{}\t{}\t{}\t{:.1f}\t{}\t{}\n".format(clustfile, best_gene_name, user_notes, len(member_list.values()), min_length, mean_length, max_length, " ; ".join( list(member_list.keys()) ) )
sys.stdout.write(outline)
sys.stderr.write("# Parsed {} clusters with {} proteins {}\n".format( file_counter, protein_counter, time.asctime() ) )
if duplicate_counter:
sys.stderr.write("# Counted {} proteins with duplicate names\n".format( duplicate_counter ) )
if __name__ == "__main__":
main(sys.argv[1:], sys.stdout)