forked from ckidner/Targeted_enrichment
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Concat_nexus.py
131 lines (93 loc) · 3.55 KB
/
Concat_nexus.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
''' Concatenate interleaved nexus files into a single interleaved nexus
cannot cope with missing data. Assumes if data is missing for a locus for an accession there is a files of Ns replacing it'''
#read in file lists
#nexusfile_list = "nexus_files"
#accession_list_file = "acc_names"
# Read in the seq data to a holding file
# when read "MATRIX": next line is the beginning of the data
# read in until ";" into a string
def get_block(nexus_file):
new_block = []
with open(nexus_file) as input_data:
# Skips text before the beginning of the interesting block:
for line in input_data:
if line.strip() == 'MATRIX': # Or whatever test is needed
break
# Reads text until the end of the block:
for line in input_data: # This keeps reading the file
if line.strip() == ';':
break
line = line.rstrip("\n")
new_block.append(line)
return(new_block)
# Count the resdidues in the file
def count_bp(block, name):
counter = ''
for line in block:
line = str(line)
if line.startswith(name):
# clean up line to just nucleotides and append to a counting list
line = line.lstrip(name)
line = line.lstrip()
line = line.rstrip()
counter = counter + str(line)
return len(counter) - counter.count(' ')
nexus_list_file = input("Which list of nexus files (full file name please)?\n")
accession_list_file = input("Which file for names of accessions?\n")
#open files for each locus (for reading and writing) ready to feed in the nexuses
if accession_list_file != None:
acc_list = open(accession_list_file)
acc_name_list = []
for name in acc_list:
name = name.rstrip("\n")
acc_name_list.append(name)
picked = acc_name_list[0]
n_acc = len(acc_name_list)
print(str(picked))
all_blocks = []
char_block = []
bp_beg = 1
if nexus_list_file != None:
nexus_file_list = open(nexus_list_file)
for nexus_file in nexus_file_list:
nexus_file = nexus_file.rstrip()
new_block = get_block(nexus_file)
locus_name = nexus_file.rstrip(".nex")
# all_blocks.append("[" + str(locus_name) +"]")
all_blocks = all_blocks + new_block
locus_count = count_bp(new_block, picked)
bp_end = bp_beg + locus_count - 1
print("Locus count is " + str(locus_count))
char_line = "charset " + str(locus_name) + " = " + str(bp_beg) + " - " + str(bp_end) + ";"
char_block.append(char_line)
bp_beg = bp_end + 1
print("Accumulated bp is " + str(bp_end))
total_bp = count_bp(all_blocks, picked)
print ("Total_bp is " + str(total_bp))
# rebuild the header
header_1 = "#NEXUS\n"
header_2 = "begin data;"
header_3 = "dimensions ntax=" + str(n_acc) + " nchar=" + str(bp_end) + ";"
header_4 = "format datatype=DNA interleave=yes gap=-;\n"
header_5 = "matrix"
header_list = [header_1, header_2, header_3, header_4, header_5]
# don't need this bit as the acc_data stuff is added by trimal to individual nexus files
#acc_data = []
#for acc_name in acc_name_list:
# line = "[Name: " + str(acc_name) + " Len: " + str(bp_end) + " Check: 0]"
# acc_data.append(line)
#acc_data.append("")
#acc_data.append("MATRIX")
footer_1 = ";"
footer_2 = "end;"
footer_3 = ""
footer_list = [footer_1, footer_2, footer_3]
char_block.insert(0,"begin sets;")
char_block.append("end;")
#new_nexus_list = header_list + acc_data + all_blocks + footer_list + char_block
new_nexus_list = header_list + all_blocks + footer_list + char_block
new_nexus = '\n'.join(new_nexus_list)
# print (new_nexus)
outfile = open("concat.nex", "w")
for item in new_nexus_list:
outfile.write("%s\n" % item)