Skip to content

Commit fd2f299

Browse files
committed
Filter fusion artifacts (resolves #29)
resolves #29 related to BD2KGenomics/protect#237 Fusion calling often has FPS related to homology, especially among mitochondrial and (lnc,etc) RNA genes. Furthermore, transcriptional readthroughs are usually oncologically irrelevant. This PR adds filtering flags for mitochondrial, Immunoglobulin, RP11- (lncRNA) and transcriptional readthrough filtering.
1 parent 5cf7d66 commit fd2f299

File tree

5 files changed

+210
-92
lines changed

5 files changed

+210
-92
lines changed

common.py

+83-1
Original file line numberDiff line numberDiff line change
@@ -180,4 +180,86 @@ def __eq__(self, other):
180180
# The number of reads at the position
181181
'coverage',
182182
# The variant allele frequency
183-
'vaf'))
183+
'vaf'))
184+
185+
# Standard Genetic Code from NCBI
186+
amino = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
187+
base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
188+
base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
189+
base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
190+
genetic_code = {''.join([b1, b2, b3]): aa
191+
for aa, b1, b2, b3 in zip(amino, base1, base2, base3)}
192+
193+
194+
BEDPE = collections.namedtuple('BEDPE',
195+
'chrom1, start1, end1, '
196+
'chrom2, start2, end2, '
197+
'name, '
198+
'score, '
199+
'strand1, strand2, '
200+
'junctionSeq1, junctionSeq2, '
201+
'hugo1, hugo2')
202+
203+
204+
def get_exons(genome_file, annotation_file, genes_of_interest):
205+
"""
206+
Generates list of GTFRecord objects for each transcript
207+
208+
:param file genome_file: Reference genome FASTA file
209+
:param file annotation_file: Genome annotation file (GTF)
210+
:param set(str) genes_of_interest: The genes in this sample that might need translation.
211+
:return: GTFRecord exons
212+
:rtype: dict
213+
"""
214+
annotation_file.seek(0)
215+
chroms = {}
216+
exons = collections.defaultdict(list)
217+
for header, comment, seq in read_fasta(genome_file, 'ACGTN'):
218+
chroms[header] = seq
219+
220+
for line in annotation_file:
221+
if line.startswith('#'):
222+
continue
223+
else:
224+
gtf = GTFRecord(line)
225+
if gtf.feature == 'exon' and gtf.gene_name in genes_of_interest:
226+
gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end]
227+
exons[gtf.transcript_id].append(gtf)
228+
return exons
229+
230+
231+
def read_genes_from_gtf(gtf_file):
232+
"""
233+
Read the gene annotations into a dict
234+
235+
:param file gtf_file: A file handle ot the annotation file.
236+
:returns: A dict of a gtf record for each gene
237+
:rtype: dict(string, GTFRecord)
238+
"""
239+
gene_annotations = {}
240+
for line in gtf_file:
241+
if line.startswith('#'):
242+
continue
243+
else:
244+
gtf = GTFRecord(line)
245+
if gtf.feature == 'gene':
246+
gene_annotations[gtf.gene_name] = gtf
247+
return gene_annotations
248+
249+
250+
def translate(seq):
251+
"""
252+
Translates DNA sequence into protein sequence using globally defined genetic code
253+
254+
:param str seq: DNA sequence
255+
:returns: Translated sequence
256+
:rtype: str
257+
258+
>>> translate('ATGTTTCGTT')
259+
'MFR'
260+
"""
261+
start = 0
262+
n = len(seq)
263+
codons = (seq[i: i+3] for i in range(start, n - n % 3, 3))
264+
protein = [genetic_code[codon] for codon in codons]
265+
return ''.join(protein)

fusion.py

+62-56
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,14 @@
11
from __future__ import print_function
2+
23
import collections
34
import csv
45
import logging
56
import pickle
67
import re
7-
import string
88
from cStringIO import StringIO
99

1010
import swalign
11-
12-
from common import read_fasta, GTFRecord, trans
13-
14-
# Standard Genetic Code from NCBI
15-
amino = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
16-
base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
17-
base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
18-
base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
19-
genetic_code = {''.join([b1, b2, b3]): aa
20-
for aa, b1, b2, b3 in zip(amino, base1, base2, base3)}
11+
from common import read_fasta, trans, BEDPE, translate
2112

2213

2314
def get_transcriptome_data(infile):
@@ -49,56 +40,53 @@ def get_transcriptome_data(infile):
4940
return transcript_cds, gene_transcripts
5041

5142

52-
def get_exons(genome_file, annotation_file):
43+
def rna_gene_in_bedpe(record):
5344
"""
54-
Generates list of GTFRecord objects for each transcript
45+
Determine if one of the two candidates in a BEDPE line is an rna gene.
5546
56-
:param file genome_file: Reference genome FASTA file
57-
:param file annotation_file: Genome annotation file (GTF)
58-
:return: GTFRecord exons
59-
:rtype: dict
47+
:param BEDPE record: A BEDPE line from the input file
48+
:returns: True if one of the candidates is an RNA gene and False if not
49+
:rtype: bool
6050
"""
61-
chroms = {}
62-
exons = collections.defaultdict(list)
63-
for header, comment, seq in read_fasta(genome_file, 'ACGTN'):
64-
chroms[header] = seq
51+
# We will accept fusions that have an RP11- (lncRNA) 3' partner since they can still be
52+
# translated. This is a heuristic.
53+
return 'RP11-' in record.hugo1
6554

66-
for line in annotation_file:
67-
if line.startswith('#'):
68-
continue
69-
else:
70-
gtf = GTFRecord(line)
71-
if gtf.feature == 'exon':
72-
gtf.sequence = chroms[gtf.seqname][gtf.start - 1: gtf.end]
73-
exons[gtf.transcript_id].append(gtf)
74-
return exons
7555

76-
77-
def translate(seq):
56+
def readthrough_in_bedpe(record, annotation, rt_threshold):
7857
"""
79-
Translates DNA sequence into protein sequence using globally defined genetic code
80-
81-
:param str seq: DNA sequence
82-
:returns: Translated sequence
83-
:rtype: str
84-
85-
>>> translate('ATGTTTCGTT')
86-
'MFR'
58+
Determine if the two genes in the record are within `rt_threshold` bp of each other on the same
59+
chromosome.
60+
61+
:param BEDPE record: A BEDPE line from the input file
62+
:param dict(str, GTFRecord) annotation: see `read_fusions:gene_annotations`
63+
:param rt_threshold: The genomic distance on the same chromosome below which we will call a
64+
candidate fusion a readthrough.
65+
:returns: True if the pair is considered a readthrough and False if not
66+
:rtype: bool
8767
"""
88-
start = 0
89-
n = len(seq)
90-
codons = (seq[i: i+3] for i in range(start, n - n % 3, 3))
91-
protein = [genetic_code[codon] for codon in codons]
92-
return ''.join(protein)
68+
return (record.chrom1 == record.chrom2 and
69+
((annotation[record.hugo1].start <= annotation[record.hugo2].start <=
70+
annotation[record.hugo1].end + rt_threshold) or
71+
(annotation[record.hugo2].start <= annotation[record.hugo1].start <=
72+
annotation[record.hugo2].end + rt_threshold)))
9373

9474

95-
def read_fusions(fusion_file):
75+
def read_fusions(fusion_file, gene_annotations, filter_mt, filter_ig, filter_rg, filter_rt,
76+
rt_threshold, out_bedpe):
9677
"""
9778
Reads in gene fusion predictions in modified BEDPE format.
9879
In addition to the basic BEDPE features, this function requires the fusion
9980
junction sequences and HUGO names for the donor and acceptor genes.
10081
10182
:param file fusion_file: Fusion calls in BEDPE format
83+
:param dict(str, GTFRecord) gene_annotations: The gene annotations from the gtf
84+
:param bool filter_mt: Filter mitochondrial events?
85+
:param bool filter_ig: Filter immunoglobulin pairs?
86+
:param bool filter_rg: Filter RNA-Gene events?
87+
:param bool filter_rt: Filter transcriptional read-throughs?
88+
:param int rt_threshold: Distance threshold to call a readthrough
89+
:param file out_bedpe: A file handle to an output BEDPE file
10290
:returns: list of BEDPE namedtuples
10391
:rtype: list
10492
@@ -118,31 +106,49 @@ def read_fusions(fusion_file):
118106
hugo1: HUGO name for first feature
119107
hugo2: HUGO name for second feature
120108
"""
121-
BEDPE = collections.namedtuple('BEDPE',
122-
'chrom1, start1, end1, '
123-
'chrom2, start2, end2, '
124-
'name, score, '
125-
'strand1, strand2, '
126-
'junctionSeq1, junctionSeq2, '
127-
'hugo1, hugo2')
128109

129110
calls = []
111+
130112
for line in csv.reader(fusion_file, delimiter='\t'):
131113
if line[0].startswith('#'):
114+
print('\t'.join(line), file=out_bedpe)
132115
continue
133116
try:
134-
calls.append(BEDPE(*line))
135-
117+
record = BEDPE(*line)
136118
except TypeError:
137119
raise ValueError("ERROR: fusion file is malformed.\n{}".format(read_fusions.__doc__))
138120

121+
if filter_mt and 'M' in record.chrom1 or 'M' in record.chrom2:
122+
logging.warning("Rejecting %s-%s for containing a Mitochondrial gene.", record.hugo1,
123+
record.hugo2)
124+
continue
125+
elif filter_ig and record.hugo1.startswith('IG') and record.hugo2.startswith('IG'):
126+
# This will drop some Insulin-like growth factor (IGF) proteins but they have a lot of
127+
# homology too so its ok.
128+
logging.warning("Rejecting %s-%s an an Immunoglobulin gene pair.", record.hugo1,
129+
record.hugo2)
130+
continue
131+
elif filter_rg and rna_gene_in_bedpe(record):
132+
logging.warning("Rejecting %s-%s for containing a 5' RNA gene.", record.hugo1,
133+
record.hugo2)
134+
continue
135+
elif filter_rt and readthrough_in_bedpe(record, gene_annotations, rt_threshold):
136+
logging.warning("Rejecting %s-%s as a potential readthrough.", record.hugo1,
137+
record.hugo2)
138+
continue
139+
else:
140+
logging.info("Accepting %s-%s for further study.", record.hugo1, record.hugo2)
141+
print('\t'.join(line), file=out_bedpe)
142+
calls.append(record)
143+
139144
return calls
140145

141146
# Namedtuple for storing alignment metrics
142-
# Neeeds to be global for pickling
147+
# Needs to be global for pickling
143148
AlignStats = collections.namedtuple('AlignStats',
144149
'qstart, qstop, rstart, rstop, insertions, deletions')
145150

151+
146152
def align_filter(ref, query, mode, mismatches_per_kb=1):
147153
"""
148154
Aligns query to reference CDS sequence using the Smith-Waterman algorithm.

test/test_input/test_fusions.bedpe

+3
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@ chr8 . 42968214 chr10 43116584 . ENSG00000168172-ENSG00000165731 1_1 + + TTGGAGG
44
chr21 . 41494381 chr7 13935844 . ENSG00000184012-ENSG00000006468 1_1 - - CCGGAAAACCCCTATCCCGCACAGCCCACTGTGGTCCCCACTGTCTACGAGGTGCATCCGGCTCAGTACTACCCGTCCCCCGTGCCCCAGTACGCCCCGAGGGTCCTGACGCAGGCTTCCAACCCCGTCGTCTGCACGCAGCCCAAATCC CCATCCAGCACGCCAGTGTCCCCACTGCATCATGCATCTCCAAACTCAACTCATACACCGAAACCTGACCGGGCCTTCCCAGCTCACCTCCCTCCATCGCAGTCCATACCAGATAGCAGCTACCCCATGGACCACAGATTTCGCCGCCAGCTTTCTGAACCCTGTAACTCCTTTCCTCCTTTGCCGACGATGCCAAGGGAAGGACGTCCTATGTACCAACGCCAGATGTCTGAGCCAAACATCCCCTTCCCACCACAAGGCTTTAAGCAGGAGTACCACGACCCAGTGTATGAACACAACACCATGGTTGGCAGTGCGGCCAGCCAAAGCTTTCCCCCTCCTCTGATGATTAAACAGGAACCCAGAGATTTTGCATATGACTCAGAAGTGCCTAGCTGCCACTCCATTTATATGAGGCAAGAAGGCTTCCTGGCTCATCCCAGCAGAACAGAAGGCTGTATGTTTGAAAAGGGCCCCAGGCAGTTTTATGATGACACCTGTGTTGTCCCAGAAAAATTCGATGGAGACATCAAACAAGAGCCAGGAATGTATCGGGAAGGACCCACATACCAACGGCGAGGATCACTTCAGCTCTGGCAGTTTTTGGTAGCTCTTCTGGATGACCCTTCAAATTCTCATTTTATTGCCTGGACTGGTCGAGGCATGGAATTTAAACTGATTGAGCCTGAAGAGGTGGCCCGACGTTGGGGCATTCAGAAAAACAGGCCAGCTATGAACTATGATAAACTTAGCCGTTCACTCCGCTATTACTATGAGAAAGGAATTATGCAAAAGGTGGCTGGAGAGAGATATGTCTACAAGTTTGTGTGTGATCCAGAAGCCCTTTTCTCCATGGCCTTTCCAGATAATCAGCGTCCACTGCTGAAGACAGACATGGAACGTCACATCAACGAGGAGGACACAGTGCCTCTTTCTCACTTTGATGAGAGCATGGCCTACATGCCGGAAGGGGGCTGCTGCAACCCCCACCCCTACAACGAAGGCTACGTGTATTAACACAAGTGACAGTCAAGCAGGGCGTT TMPRSS2 ETV1
55
chr21 . 41494381 chr7 13935844 . ENSG00000184012-ENSG00000006468 1_1 - - CCGGAAAACCCCTATCCCGCACAGCCCACTGTGGTCCCCACTGTCTACGAGGTGCATCCGGCTCAGTACTACCCGTCCCCCGTGCCCCAGTACGCCCCGAGGGTCCTGACGCAGGCTTCCAACCCCGTCGTCTGCACGCAGCCCAAATCC CCATCCAGCACGCCAGTGTCCCCACTGCATCATGCATCTCCAAACTCAACTCATACACCGAAACCTGACCGGGCCTTCCCAGCTCACCTCCCTCCATCGCAGTCCATACCAGATAGCAGCTACCCCATGGACCACAGATTTCGCCGCCAGCTTTCTGAACCCTGTAACTCCTTTCCTCCTTTGCCGACGATGCCAAGGGAAGGACGTCCTATGTACCAACGCCAGATGTCTGAGCCAAACATCCCCTTCCCACCACAAGGCTTTAAGCAGGAGTACCACGACCCAGTGTATGAACACAACACCATGGTTGGCAGTGCGGCCA TMPRSS2 ETV1
66
chr21 . 42870046 chr21 39817544 . ENSG00000184012-ENSG00000157554 1_1 - - . . TMPRSS2 ERG
7+
chr21 . 46493768 chr21 46511596 . ENSG00000197381-ENSG00000268805 1_1 - - THIS_IS_A_READTHROUGH . ADARB1 PRED57
8+
chr21 . 16904219 chr21 16914235 . ENSG00000230481-ENSG00000253691 1_1 - - THIS_IS_AN_IG . IGKV1OR22-5 IGKV2OR22-4
9+
chr21 . 10397644 chr21 39817544 . ENSG00000278106-ENSG00000157554 1_1 - - THIS_IS_AN_RP11 . RP11-589M2.1 ERG

test/test_transgene.py

+3
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,9 @@ def _test_transgene(self, use_RNA=False, use_DNA=False, fusions=False):
107107
'annotation.gtf')) if fusions else None
108108
params.genome_file = open(self._get_input_path('test_input/chr21.fa')) if fusions else None
109109

110+
params.filter_mt = params.filter_rg = params.filter_ig = params.filter_rt = True
111+
params.rt_threshold = 100000
112+
110113
params.cores = 3
111114
transgene.main(params)
112115
output = {'9mer': {'fasta': 'unit_test_tumor_9_mer_snpeffed.faa',

0 commit comments

Comments
 (0)