-
Notifications
You must be signed in to change notification settings - Fork 0
/
gb2ptt_bacteria.py
117 lines (91 loc) · 3.54 KB
/
gb2ptt_bacteria.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#!/usr/bin/env python
"""
Convert genbank to ptt.
USAGE:
gb2ptt_bacteria.py yourfile.gb > file.ptt
NOTE:
It's designed to work with gb files coming from GenBank. gene is used as gene_id and transcript_id (locus_tag if gene not present).
Only entries having types in allowedTypes = ['CDS'] are stored in GTF.
Based on a script by Leszek Pryszcz ([email protected])
AUTHOR
Andreas Sjodin
Version 0.2
"""
import os, sys
from datetime import datetime
from Bio import SeqIO
import sys, re
def grep(l,s):
return [i for i in l if s in i]
def gb2gtf( source='gb2gtf',allowedTypes=set(['CDS']) ):
"""
"""
header = '%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ("Location", "Strand", "Length", "PID", "Gene", "Synonym", "Code", "COG", "Product")
print header
for gb in SeqIO.parse( sys.argv[1],'gb' ):
acc = gb.id #gb.name #gb.description # #
skipped = 0
skippedTypes = set()
for f in gb.features:
#process only gene and CDS entries
if f.type not in allowedTypes:
skipped += 1
skippedTypes.add( f.type )
continue
#generate comments field
if 'locus_tag' in f.qualifiers:
#use locul tag as gene_id/transcript_id
gene_id = f.qualifiers['locus_tag'][0]
transcript_id = 'T' + gene_id
elif 'gene' in f.qualifiers:
gene_id = f.qualifiers['gene'][0]
transcript_id = 'T' + gene_id
elif 'label' in f.qualifiers:
gene_id = f.qualifiers['label'][0].replace(" ", ".")
transcript_id = 'T' + gene_id
comments = 'gene_id "%s"; transcript_id "%s"' % ( gene_id,transcript_id )
if 'gene' in f.qualifiers:
gene_name = f.qualifiers['gene'][0]
comments += '; gene_name "%s"' % f.qualifiers['gene'][0]
comments += '; transcript_name "%s-1"' % f.qualifiers['gene'][0]
elif 'label' in f.qualifiers:
gene_name = f.qualifiers['label'][0]
comments += '; gene_name "%s"' % f.qualifiers['label'][0].replace(" ", ".")
comments += '; transcript_name "%s-1"' % f.qualifiers['label'][0].replace(" ", ".")
if 'protein_id' in f.qualifiers:
comments += '; protein_id "%s"' % f.qualifiers['protein_id'][0]
if 'function' in f.qualifiers:
annotation = f.qualifiers['function'][0]
#add external IDs
if 'db_xref' in f.qualifiers:
db_xref= f.qualifiers['db_xref']
if len(grep(db_xref,'GI')) != 0 :
PID = grep(db_xref,'GI')[0].replace('GI:', '')
else:
PID = ''
#code strand as +/- (in genbank 1 or -1)
if int(f.strand)>0: strand = '+'
else: strand = '-'
#define gb
"""
Location - start and stop of feature
strand - Valid entries include '+', '-', or '.' (for don't know/don't care).
Length - length of feature
PID - protein identifier
Gene - name of gene
Synonym - gene identifier
Code -
COG -
Product - Annotation
"""
protlength = ((f.location.end.position-(f.location.start.position))/3 ) -1
COG = '-'
code = '-'
gtf = '%s..%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s' % ( f.location.start.position+1, f.location.end.position, strand, protlength, PID, gene_name, gene_id, code, COG, annotation ) #f.frame,
print gtf
sys.stderr.write( "%s\tSkipped %s entries having types: %s.\n" % ( gb.id,skipped,', '.join(skippedTypes) ) )
if __name__=='__main__':
t0=datetime.now()
gb2gtf()
dt=datetime.now()-t0
sys.stderr.write( "#Time elapsed: %s\n" % dt )