1
1
from __future__ import print_function
2
+
2
3
import collections
3
4
import csv
4
5
import logging
5
6
import pickle
6
7
import re
7
- import string
8
8
from cStringIO import StringIO
9
9
10
10
import swalign
11
-
12
- from common import read_fasta , GTFRecord , trans
13
-
14
- # Standard Genetic Code from NCBI
15
- amino = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
16
- base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
17
- base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
18
- base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
19
- genetic_code = {'' .join ([b1 , b2 , b3 ]): aa
20
- for aa , b1 , b2 , b3 in zip (amino , base1 , base2 , base3 )}
11
+ from common import read_fasta , trans , BEDPE , translate
21
12
22
13
23
14
def get_transcriptome_data (infile ):
@@ -49,56 +40,53 @@ def get_transcriptome_data(infile):
49
40
return transcript_cds , gene_transcripts
50
41
51
42
52
- def get_exons ( genome_file , annotation_file ):
43
+ def rna_gene_in_bedpe ( record ):
53
44
"""
54
- Generates list of GTFRecord objects for each transcript
45
+ Determine if one of the two candidates in a BEDPE line is an rna gene.
55
46
56
- :param file genome_file: Reference genome FASTA file
57
- :param file annotation_file: Genome annotation file (GTF)
58
- :return: GTFRecord exons
59
- :rtype: dict
47
+ :param BEDPE record: A BEDPE line from the input file
48
+ :returns: True if one of the candidates is an RNA gene and False if not
49
+ :rtype: bool
60
50
"""
61
- chroms = {}
62
- exons = collections .defaultdict (list )
63
- for header , comment , seq in read_fasta (genome_file , 'ACGTN' ):
64
- chroms [header ] = seq
51
+ # We will accept fusions that have an RP11- (lncRNA) 3' partner since they can still be
52
+ # translated. This is a heuristic.
53
+ return 'RP11-' in record .hugo1
65
54
66
- for line in annotation_file :
67
- if line .startswith ('#' ):
68
- continue
69
- else :
70
- gtf = GTFRecord (line )
71
- if gtf .feature == 'exon' :
72
- gtf .sequence = chroms [gtf .seqname ][gtf .start - 1 : gtf .end ]
73
- exons [gtf .transcript_id ].append (gtf )
74
- return exons
75
55
76
-
77
- def translate (seq ):
56
+ def readthrough_in_bedpe (record , annotation , rt_threshold ):
78
57
"""
79
- Translates DNA sequence into protein sequence using globally defined genetic code
80
-
81
- :param str seq: DNA sequence
82
- :returns: Translated sequence
83
- :rtype: str
84
-
85
- >>> translate('ATGTTTCGTT')
86
- 'MFR'
58
+ Determine if the two genes in the record are within `rt_threshold` bp of each other on the same
59
+ chromosome.
60
+
61
+ :param BEDPE record: A BEDPE line from the input file
62
+ :param dict(str, GTFRecord) annotation: see `read_fusions:gene_annotations`
63
+ :param rt_threshold: The genomic distance on the same chromosome below which we will call a
64
+ candidate fusion a readthrough.
65
+ :returns: True if the pair is considered a readthrough and False if not
66
+ :rtype: bool
87
67
"""
88
- start = 0
89
- n = len ( seq )
90
- codons = ( seq [ i : i + 3 ] for i in range ( start , n - n % 3 , 3 ))
91
- protein = [ genetic_code [ codon ] for codon in codons ]
92
- return '' . join ( protein )
68
+ return ( record . chrom1 == record . chrom2 and
69
+ (( annotation [ record . hugo1 ]. start <= annotation [ record . hugo2 ]. start <=
70
+ annotation [ record . hugo1 ]. end + rt_threshold ) or
71
+ ( annotation [ record . hugo2 ]. start <= annotation [ record . hugo1 ]. start <=
72
+ annotation [ record . hugo2 ]. end + rt_threshold )) )
93
73
94
74
95
- def read_fusions (fusion_file ):
75
+ def read_fusions (fusion_file , gene_annotations , filter_mt , filter_ig , filter_rg , filter_rt ,
76
+ rt_threshold , out_bedpe ):
96
77
"""
97
78
Reads in gene fusion predictions in modified BEDPE format.
98
79
In addition to the basic BEDPE features, this function requires the fusion
99
80
junction sequences and HUGO names for the donor and acceptor genes.
100
81
101
82
:param file fusion_file: Fusion calls in BEDPE format
83
+ :param dict(str, GTFRecord) gene_annotations: The gene annotations from the gtf
84
+ :param bool filter_mt: Filter mitochondrial events?
85
+ :param bool filter_ig: Filter immunoglobulin pairs?
86
+ :param bool filter_rg: Filter RNA-Gene events?
87
+ :param bool filter_rt: Filter transcriptional read-throughs?
88
+ :param int rt_threshold: Distance threshold to call a readthrough
89
+ :param file out_bedpe: A file handle to an output BEDPE file
102
90
:returns: list of BEDPE namedtuples
103
91
:rtype: list
104
92
@@ -118,31 +106,49 @@ def read_fusions(fusion_file):
118
106
hugo1: HUGO name for first feature
119
107
hugo2: HUGO name for second feature
120
108
"""
121
- BEDPE = collections .namedtuple ('BEDPE' ,
122
- 'chrom1, start1, end1, '
123
- 'chrom2, start2, end2, '
124
- 'name, score, '
125
- 'strand1, strand2, '
126
- 'junctionSeq1, junctionSeq2, '
127
- 'hugo1, hugo2' )
128
109
129
110
calls = []
111
+
130
112
for line in csv .reader (fusion_file , delimiter = '\t ' ):
131
113
if line [0 ].startswith ('#' ):
114
+ print ('\t ' .join (line ), file = out_bedpe )
132
115
continue
133
116
try :
134
- calls .append (BEDPE (* line ))
135
-
117
+ record = BEDPE (* line )
136
118
except TypeError :
137
119
raise ValueError ("ERROR: fusion file is malformed.\n {}" .format (read_fusions .__doc__ ))
138
120
121
+ if filter_mt and 'M' in record .chrom1 or 'M' in record .chrom2 :
122
+ logging .warning ("Rejecting %s-%s for containing a Mitochondrial gene." , record .hugo1 ,
123
+ record .hugo2 )
124
+ continue
125
+ elif filter_ig and record .hugo1 .startswith ('IG' ) and record .hugo2 .startswith ('IG' ):
126
+ # This will drop some Insulin-like growth factor (IGF) proteins but they have a lot of
127
+ # homology too so its ok.
128
+ logging .warning ("Rejecting %s-%s an an Immunoglobulin gene pair." , record .hugo1 ,
129
+ record .hugo2 )
130
+ continue
131
+ elif filter_rg and rna_gene_in_bedpe (record ):
132
+ logging .warning ("Rejecting %s-%s for containing a 5' RNA gene." , record .hugo1 ,
133
+ record .hugo2 )
134
+ continue
135
+ elif filter_rt and readthrough_in_bedpe (record , gene_annotations , rt_threshold ):
136
+ logging .warning ("Rejecting %s-%s as a potential readthrough." , record .hugo1 ,
137
+ record .hugo2 )
138
+ continue
139
+ else :
140
+ logging .info ("Accepting %s-%s for further study." , record .hugo1 , record .hugo2 )
141
+ print ('\t ' .join (line ), file = out_bedpe )
142
+ calls .append (record )
143
+
139
144
return calls
140
145
141
146
# Namedtuple for storing alignment metrics
142
- # Neeeds to be global for pickling
147
+ # Needs to be global for pickling
143
148
AlignStats = collections .namedtuple ('AlignStats' ,
144
149
'qstart, qstop, rstart, rstop, insertions, deletions' )
145
150
151
+
146
152
def align_filter (ref , query , mode , mismatches_per_kb = 1 ):
147
153
"""
148
154
Aligns query to reference CDS sequence using the Smith-Waterman algorithm.
0 commit comments