-
Notifications
You must be signed in to change notification settings - Fork 4
/
reorder_matrix_by_cov.py
executable file
·150 lines (135 loc) · 7.02 KB
/
reorder_matrix_by_cov.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
#
# reorder_matrix_by_cov.py created 2017-10-24
# v1.1 python3 update 2023-01-19
'''reorder_matrix_by_cov.py last modified 2023-01-19
reorder_matrix_by_cov.py -a matrix.phy -p partitions.txt -o reordered_matrix.phy
for partitioned alignments, formats (-f) include:
clustal, fasta, nexus, phylip, phylip-relaxed, stockholm
to take the highest coverage genes stopping after N sites, use -m
reorder_matrix_by_cov.py -a matrix.phy -p partitions.txt -o reordered_matrix.phy -m 5000
add gene names in RAxML-type partition format, using --pair-stats
pair stats file is the output of align_pair_stats.py, or any file as:
1-1000 GENE
or can include species or SwissProt ID
Genus_species_1-1000 sp|123456|GENE
'''
import sys
import os
import argparse
import time
import gzip
from collections import Counter
from Bio import AlignIO
def get_partitions(partitionfile):
'''read comma-delimited partition information and return a list of tuples'''
partitions = [] # list of tuples of intervals
for line in open(partitionfile,'r'):
line = line.strip()
if line:
if line.count("=") > 0: # for RAxML type format
# WAG, homologs_134_11213 = 1-221
block = line.split("=",1)[-1].strip() # should be "1-221"
alignindex = tuple( int(i) for i in block.split("-") ) # split "1-221" into ( 1,221 )
partitions.append(alignindex)
else: # for short format
blocks = line.split(",") # split "1:136,137:301,..." into ['1:136', '137:301',...]
for block in blocks:
alignindex = tuple( int(i) for i in block.split(":") ) # split '1:136' into ( 1,136 )
partitions.append(alignindex)
print( "# read {} partitions from {} {}".format(len(partitions), partitionfile, time.asctime() ), file=sys.stderr )
return partitions
def parts_to_genes(pairstatsfile):
'''read tabular pair-wise gene stats, return a dict where key is partition and value is gene'''
part_to_gene = {}
sys.stderr.write( "# reading partitions and gene names from {} {}\n".format(pairstatsfile, time.asctime() ) )
for line in open(pairstatsfile,'r'):
line = line.strip()
if line:
lsplits = line.split("\t")
if lsplits[0]=="partition": # skip first line
continue
gene = lsplits[1].split("|")[-1]
partition = lsplits[0].split("_")[-1]
part_to_gene[partition] = gene
sys.stderr.write( "# found {} gene names".format( len(part_to_gene), time.asctime() ) )
return part_to_gene
def reorder_alignments(fullalignment, alignformat, partitions, maxsites, genenamedict, reverse_sort):
'''read alignment, and return a new alignment where partitions are reordered from highest coverage to lowest'''
if fullalignment.rsplit('.',1)[1]=="gz": # autodetect gzip format
opentype = gzip.open
sys.stderr.write( "# reading alignment {} as gzipped {}\n".format(fullalignment, time.asctime() ) )
else: # otherwise assume normal open
opentype = open
sys.stderr.write( "# reading alignment {} {}\n".format(fullalignment, time.asctime() ) )
alignedseqs = AlignIO.read(opentype(fullalignment), alignformat)
num_species = len(alignedseqs)
newalign = alignedseqs[:,0:0] # start with blank alignment
newpartitions = []
sys.stderr.write( "# alignment has {} taxa with {} positions {}\n".format(num_species, alignedseqs.get_alignment_length(), time.asctime() ) )
newnamedict = {} # key is new partition, value is gene name from old partition
alignparts = {} # key is old partition, value is alignment part
aligncompleteness = {} # key is old partition, value is coverage
sys.stderr.write( "# scoring partitions {}\n".format( time.asctime() ) )
for part in partitions:
alignpart = alignedseqs[:, part[0]-1:part[1] ] # alignment of each partition only
gaplist = []
#completecounter = {0:0, 1:0, 2:0}
for i,seqrec in enumerate(alignpart):
species = seqrec.id
seqlen = len(seqrec.seq)
lettercounts = Counter(str(seqrec.seq).replace("X","-"))
gapcount = lettercounts.get("-",0) + lettercounts.get("?",0)
occupancyscore = 100 - 100*gapcount/seqlen
gaplist.append(occupancyscore)
#occupancyscore = 2 # by default is present, reassign if absent or partial
#if lettercounts["-"] == seqlen or lettercounts["?"] == seqlen: # seq is all gaps, so no seq
# occupancyscore = 0 # set to 0 if all gaps
#elif lettercounts["-"] >= seqlen * 0.8: # partial means 20% or more of sequence is gaps
# occupancyscore = 1 # set to 1 if partial
#completecounter[occupancyscore] += 1
#covscore = sum( k*v for k,v in completecounter.items() )
covscore = sum( gaplist )
#sys.stderr.write( part, covscore
aligncompleteness[part] = covscore
alignparts[part] = alignpart
if reverse_sort:
sys.stderr.write( "# sorting partitions, by highest coverage {}\n".format( time.asctime() ) )
else:
sys.stderr.write( "# sorting partitions, taking lowest coverage {}\n".format( time.asctime() ) )
for part, score in sorted(aligncompleteness.items(), key=lambda x: x[1], reverse=reverse_sort):
#sys.stderr.write( part, score
newindices = "{}:{}".format( newalign.get_alignment_length()+1, newalign.get_alignment_length()+part[1]-part[0]+1 )
newpartitions.append( newindices )
newalign += alignparts[part]
if genenamedict: # transfer name to new indices
newnamedict[newindices] = genenamedict.get("{}-{}".format(*part),None)
if maxsites is not None and newalign.get_alignment_length() >= maxsites:
break
sys.stderr.write( "# alignment has {} partitions with {} sites {}\n".format( len(newpartitions), newalign.get_alignment_length() , time.asctime() ) )
return newalign, newpartitions, newnamedict
def main(argv, wayout):
if not len(argv):
argv.append('-h')
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-a','--alignment', help="supermatrix alignment")
parser.add_argument('-f','--format', default="fasta", help="alignment format [fasta]")
parser.add_argument('-m','--max-sites', type=int, help="optional limit to stop after N sites")
parser.add_argument('-o','--output', help="output name for new alignment", required=True)
parser.add_argument('-p','--partition', help="partition file for splitting large alignments")
parser.add_argument('--invert', action="store_false", help="instead sort by lowest coverage first")
parser.add_argument('--pair-stats', help="pair stats file for gene names, to use with RAxML style partition format")
args = parser.parse_args(argv)
partitions = get_partitions(args.partition)
genenames = parts_to_genes(args.pair_stats) if args.pair_stats else None
sortedalignment, partitionlist, genenames = reorder_alignments(args.alignment, args.format, partitions, args.max_sites, genenames, args.invert)
AlignIO.write(sortedalignment, args.output, args.format)
sys.stderr.write( "# Supermatrix written to {} {}\n".format(args.output, time.asctime() ) )
with open("{}.partition.txt".format(args.output),'w') as pf:
if args.pair_stats:
for part in partitionlist:
print("model, {} = {}-{}".format( genenames[part], *part.split(":") ), file= pf )
else:
print( ",".join(partitionlist), file= pf )
if __name__ == "__main__":
main(sys.argv[1:], sys.stdout)