-
Notifications
You must be signed in to change notification settings - Fork 1
/
aggregate_size_and_features.py
executable file
·111 lines (97 loc) · 4.31 KB
/
aggregate_size_and_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#!/usr/bin/env python
#
# aggregate_size_and_features.py
'''aggregate_size_and_features.py last updated 2021-01-02
aggregate size information and features counts from RefSeq
to make an easily-plotted table of genome size vs. gene count
aggregate_size_and_features.py -a *assembly_stats.txt -f *feature_count.txt > assembly_w_features.tab
~/git/genome-size/aggregate_size_and_features.py -a assembly-stats-2019-12-12/*assembly_stats.txt -f feature-count-2019-12-12/*_feature_count.txt > bacteria_assembly_stats_2019-12-12_w_features.tab
'''
import sys
import argparse
# assembly stats and feature counts downloaded from NCBI Assembly
# https://www.ncbi.nlm.nih.gov/assembly/
# for example:
#
# for Archaea
# https://www.ncbi.nlm.nih.gov/assembly/?term=txid2157%5BOrganism%3Aexp%5D
# clicking on the "Download Assemblies" button
# selecting the "Assembly Statistics Report"
#
# produces a .tar file, including a folder of .txt files, one for each genome
# and a file report.txt, which states
text_report = '''HistoryId: NCID_1_30605502_130.14.18.97_5555_1576144714_2477331710_0MetA0_S_HStore
QueryKey: 16
ReleaseType: RefSeq
FileType: ASM_STATS
Flat: true
Query title: Search txid2157[Organism:exp] AND latest[filter] AND "representative genome"[filter] AND ( all[filter] NOT "derived from surveillance project"[filter] AND all[filter] NOT anomalous[filter] ) Sort by: SORTORDER
Search results count: 257
Filtered out 1 entries that do not have the requested ReleaseType, or are suppressed.
Entries to download: 256
ASM_STATS files in archive: 256
Total size (bytes): 1090685
Total time: 6.38 seconds
'''
#
def count_features(featurefile):
# 0 1 2 3 4 5 6
# Feature Class Full Assembly Assembly-unit accession Assembly-unit name Unique Ids Placements
#CDS with_protein GCF_900100385.1 GCF_900100384.1 Primary Assembly 3117 3119
featuredict = {} # key is assembly accession, value is sum of relevant features
featurecount = 0
for line in open(featurefile,'r'):
line = line.strip()
if line and line[0]!="#":
lsplits = line.split("\t")
feature = lsplits[0]
fclass = lsplits[1]
accession = lsplits[2]
if (feature=="CDS" and fclass=="with_protein") or feature=="rRNA" or feature=="tRNA":
placements = int(lsplits[6])
featurecount += placements
featuredict[accession] = featurecount
return featuredict
def get_assembly_stats(assemblyfile, countsdict):
# Organism name: Halovenus aranensis (euryarchaeotes)
for line in open(assemblyfile,'r'):
line = line.strip()
if line:
if line[0]=="#":
if line.find("Organism")>1:
species = line.split(":")[1].strip()
species = species.replace("=","_").replace('"','').replace("[","").replace("]","").replace("'","")
elif line.find("RefSeq assembly accession")>1:
accession = line.split(":")[1].strip()
else:
# 0 1 2 3 4 5
# unit-name molecule-name molecule-type/loc sequence-type statistic value
lsplits = line.split("\t")
unitname = lsplits[0]
molname = lsplits[1]
seqtype = lsplits[3]
if unitname=="Primary Assembly" and molname=="all" and seqtype=="all":
statistic = lsplits[4]
if statistic=="total-length":
basecount = lsplits[5]
genecount = countsdict[accession]
outline = "{}\t{}\t{}\t{}\n".format( accession, species, basecount, genecount )
sys.stdout.write(outline)
# no return
def main(argv, wayout):
if not len(argv):
argv.append("-h")
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__doc__)
parser.add_argument('-a','--assembly-stats', nargs='*', help="assembly stats files")
parser.add_argument('-f','--feature-counts', nargs='*', help="feature counts files")
args = parser.parse_args(argv)
featurecounts = {}
sys.stderr.write("# Reading features for {} files\n".format( len(args.feature_counts) ) )
for featurefile in args.feature_counts:
featurecounts.update( count_features(featurefile) )
sys.stderr.write("# Counted features for {} assemblies\n".format( len(featurecounts) ) )
sys.stderr.write("# Reading assembly size for {} files\n".format( len(args.assembly_stats) ) )
for assemblyfile in args.assembly_stats:
get_assembly_stats(assemblyfile, featurecounts)
if __name__ == "__main__":
main(sys.argv[1:],sys.stdout)