203
203
// Consequence-type bit flags, one distinct bit per consequence (bits 19-24 are visible here).
// NOTE(review): csq_strings[] lists "coding_sequence", "feature_elongation", "feature_truncation",
// "start_retained" in the same order as bits 21-24 — presumably the table is indexed by bit
// position, so a new flag needs a matching new string at the same slot; confirm at the lookup site.
#define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string
204
204
#define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf
205
205
#define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence
206
- #define CSQ_ELONGATION (1<<22) // symbolic insertion
207
- #define CSQ_START_RETAINED (1<<23)
206
+ #define CSQ_ELONGATION (1<<22) // symbolic insertion or complex elongating variant
207
+ #define CSQ_TRUNCATION (1<<23) // complex truncating variant; test_symbolic_alt also assigns it to symbolic <DEL alleles
208
+ #define CSQ_START_RETAINED (1<<24) // shifted from bit 23 to bit 24 so that CSQ_TRUNCATION can take bit 23
208
209
209
210
// Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345
210
211
#define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \
211
212
CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \
212
213
CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \
213
- CSQ_UPSTREAM_STOP|CSQ_START_RETAINED)
214
+ CSQ_UPSTREAM_STOP|CSQ_START_RETAINED|CSQ_ELONGATION|CSQ_TRUNCATION )
214
215
// Union of the stop_* and start_* consequence bits (lost, gained, retained).
#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST|CSQ_START_RETAINED)
215
216
216
// CSQ_PRN_STRAND(csq): non-zero when csq carries at least one CSQ_COMPOUND bit and none of the
// excluded bits listed in the second operand (splice bits; the '+' version also excludes
// CSQ_ELONGATION and CSQ_TRUNCATION).
// NOTE(review): presumably gates printing of the strand field — confirm at the use site.
- #define CSQ_PRN_STRAND (csq ) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
217
+ #define CSQ_PRN_STRAND (csq ) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION|CSQ_ELONGATION|CSQ_TRUNCATION )))
217
218
// Mask with every bit set except CSQ_INTRON and CSQ_NON_CODING.
#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
218
219
// Same mask value as CSQ_PRN_TSCRIPT.
#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING))
219
220
// Alias of the single CSQ_NON_CODING bit.
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
@@ -248,6 +249,7 @@ const char *csq_strings[] =
248
249
NULL ,
249
250
"coding_sequence" ,
250
251
"feature_elongation" ,
252
+ "feature_truncation" ,
251
253
"start_retained"
252
254
};
253
255
@@ -1032,7 +1034,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
1032
1034
static inline void csq_stage_splice (args_t * args , bcf1_t * rec , gf_tscript_t * tr , uint32_t type , int ial )
1033
1035
{
1034
1036
#if XDBG
1035
- fprintf (stderr ,"csq_stage_splice %d: type=%d\n" ,(int )rec -> pos + 1 ,type );
1037
+ fprintf (stderr ,"csq_stage_splice %d: type=%d ial=%d \n" ,(int )rec -> pos + 1 ,type , ial );
1036
1038
#endif
1037
1039
if ( !type ) return ;
1038
1040
csq_t csq ;
@@ -1456,6 +1458,7 @@ static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg
1456
1458
fprintf (stderr ,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n" , splice -> vcf .ref ,splice -> vcf .alt ,ex_beg ,ex_end ,splice -> ref_beg ,splice -> ref_end ,splice -> tbeg ,splice -> tend ,splice -> check_utr ,splice -> check_start ,splice -> check_stop ,splice -> check_region_beg ,splice -> check_region_end );
1457
1459
#endif
1458
1460
1461
+ int ret = SPLICE_INSIDE ;
1459
1462
if ( splice -> ref_beg < ex_beg ) // the part before the exon
1460
1463
{
1461
1464
if ( splice -> check_region_beg )
@@ -1484,6 +1487,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
1484
1487
{
1485
1488
splice -> tbeg = splice -> ref_beg - splice -> vcf .pos ;
1486
1489
splice -> ref_beg = ex_beg ;
1490
+ ret = SPLICE_OVERLAP ;
1487
1491
}
1488
1492
}
1489
1493
if ( ex_end < splice -> ref_end ) // the part after the exon
@@ -1514,6 +1518,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
1514
1518
{
1515
1519
splice -> tend = splice -> vcf .rlen - (splice -> ref_end - splice -> vcf .pos + 1 );
1516
1520
splice -> ref_end = ex_end ;
1521
+ ret = SPLICE_OVERLAP ;
1517
1522
}
1518
1523
}
1519
1524
if ( splice -> ref_end < ex_beg || splice -> ref_beg > ex_end )
@@ -1537,11 +1542,18 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
1537
1542
if ( splice -> set_refalt )
1538
1543
{
1539
1544
splice -> vcf .rlen -= splice -> tbeg + splice -> tend ;
1545
+ splice -> vcf .alen -= splice -> tbeg + splice -> tend ;
1540
1546
splice -> kref .l = 0 ; kputsn (splice -> vcf .ref + splice -> tbeg , splice -> vcf .rlen , & splice -> kref );
1541
- splice -> kalt .l = 0 ; kputsn (splice -> vcf .alt + splice -> tbeg , splice -> vcf .rlen , & splice -> kalt );
1547
+ splice -> kalt .l = 0 ; kputsn (splice -> vcf .alt + splice -> tbeg , splice -> vcf .alen , & splice -> kalt );
1542
1548
}
1543
1549
csq_stage_splice (args , splice -> vcf .rec , splice -> tr , splice -> csq , splice -> vcf .ial );
1544
- return SPLICE_INSIDE ;
1550
+ return ret ;
1551
+ }
1552
+ static inline int splice_csq_complex (args_t * args , splice_t * splice , uint32_t ex_beg , uint32_t ex_end )
1553
+ {
1554
+ splice -> csq |= splice -> vcf .rlen > splice -> vcf .alen ? CSQ_TRUNCATION : CSQ_ELONGATION ;
1555
+ int ret = splice_csq_mnp (args , splice , ex_beg , ex_end );
1556
+ return ret ;
1545
1557
}
1546
1558
static inline int splice_csq (args_t * args , splice_t * splice , uint32_t ex_beg , uint32_t ex_end )
1547
1559
{
@@ -1565,9 +1577,14 @@ static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, ui
1565
1577
}
1566
1578
splice -> tbeg = i ;
1567
1579
1580
+ int rtrim = splice -> vcf .rlen - splice -> tbeg - splice -> tend ;
1581
+ int atrim = splice -> vcf .alen - splice -> tbeg - splice -> tend ;
1582
+ if ( splice -> vcf .alt [0 ]== '<' ) rtrim = atrim = 0 ;
1583
+
1568
1584
// The mnp, ins and del code was split into near-identical functions for clarity and debugging;
1569
1585
// possible todo: generalize once stable
1570
1586
if ( splice -> vcf .rlen == splice -> vcf .alen ) return splice_csq_mnp (args , splice , ex_beg , ex_end );
1587
+ if ( rtrim > 1 && atrim > 1 ) return splice_csq_complex (args , splice , ex_beg , ex_end );
1571
1588
if ( splice -> vcf .rlen < splice -> vcf .alen ) return splice_csq_ins (args , splice , ex_beg , ex_end );
1572
1589
if ( splice -> vcf .rlen > splice -> vcf .alen ) return splice_csq_del (args , splice , ex_beg , ex_end );
1573
1590
@@ -1976,12 +1993,15 @@ void tscript_splice_ref(gf_tscript_t *tr)
1976
1993
int csq_push (args_t * args , csq_t * csq , bcf1_t * rec )
1977
1994
{
1978
1995
#if XDBG
1979
- fprintf (stderr ,"csq_push: %d .. %d \n" ,(int )rec -> pos + 1 ,csq -> type .type );
1996
+ fprintf (stderr ,"csq_push: pos= %d .. type=%d ial=%d \n" ,(int )rec -> pos + 1 ,csq -> type .type , csq -> type . vcf_ial );
1980
1997
#endif
1981
1998
khint_t k = kh_get (pos2vbuf , args -> pos2vbuf , (int )csq -> pos );
1982
1999
vbuf_t * vbuf = (k == kh_end (args -> pos2vbuf )) ? NULL : kh_val (args -> pos2vbuf , k );
1983
2000
if ( !vbuf ) error ("This should not happen. %s:%d %s\n" ,bcf_seqname (args -> hdr ,rec ),csq -> pos + 1 ,csq -> type .vstr .s );
1984
2001
2002
+ if ( csq -> type .type & CSQ_INFRAME_INSERTION && csq -> type .type & CSQ_ELONGATION ) csq -> type .type &= ~CSQ_INFRAME_INSERTION ;
2003
+ if ( csq -> type .type & CSQ_INFRAME_DELETION && csq -> type .type & CSQ_TRUNCATION ) csq -> type .type &= ~CSQ_INFRAME_DELETION ;
2004
+
1985
2005
int i ;
1986
2006
for (i = 0 ; i < vbuf -> n ; i ++ )
1987
2007
if ( vbuf -> vrec [i ]-> line == rec ) break ;
@@ -3350,6 +3370,7 @@ int test_utr(args_t *args, bcf1_t *rec)
3350
3370
{
3351
3371
if ( rec -> d .allele [i ][0 ]== '<' || rec -> d .allele [i ][0 ]== '*' ) { continue ; }
3352
3372
splice .vcf .alt = rec -> d .allele [i ];
3373
+ splice .vcf .ial = i ;
3353
3374
splice .csq = 0 ;
3354
3375
int splice_ret = splice_csq (args , & splice , utr -> beg , utr -> end );
3355
3376
if ( splice_ret != SPLICE_INSIDE && splice_ret != SPLICE_OVERLAP ) continue ;
@@ -3394,6 +3415,7 @@ int test_splice(args_t *args, bcf1_t *rec)
3394
3415
{
3395
3416
// Skip symbolic ('<...>') and '*' alleles. The check must look at the allele currently being
// processed (index i, the one assigned to splice.vcf.alt below), not always at the first ALT.
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
3396
3417
splice .vcf .alt = rec -> d .allele [i ];
3418
+ splice .vcf .ial = i ;
3397
3419
splice .csq = 0 ;
3398
3420
splice_csq (args , & splice , exon -> beg , exon -> end );
3399
3421
if ( splice .csq ) ret = 1 ;
@@ -3420,6 +3442,7 @@ int test_tscript(args_t *args, bcf1_t *rec)
3420
3442
{
3421
3443
if ( rec -> d .allele [i ][0 ]== '<' || rec -> d .allele [i ][0 ]== '*' ) { continue ; }
3422
3444
splice .vcf .alt = rec -> d .allele [i ];
3445
+ splice .vcf .ial = i ;
3423
3446
splice .csq = 0 ;
3424
3447
int splice_ret = splice_csq (args , & splice , tr -> beg , tr -> end );
3425
3448
if ( splice_ret != SPLICE_INSIDE && splice_ret != SPLICE_OVERLAP ) continue ; // SPLICE_OUTSIDE or SPLICE_REF
@@ -3455,7 +3478,10 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
3455
3478
// only insertions atm
3456
3479
int beg = rec -> pos + 1 ;
3457
3480
int end = beg ;
3458
- int csq_class = CSQ_ELONGATION ;
3481
+ int csq_class ;
3482
+ if ( !strncasecmp ("<INS" ,rec -> d .allele [1 ],4 ) ) csq_class = CSQ_ELONGATION ;
3483
+ else if ( !strncasecmp ("<DEL" ,rec -> d .allele [1 ],4 ) ) csq_class = CSQ_TRUNCATION ;
3484
+ else return ;
3459
3485
3460
3486
int hit = 0 ;
3461
3487
if ( regidx_overlap (args -> idx_cds ,chr_gff ,beg ,end , args -> itr ) )
@@ -3472,6 +3498,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
3472
3498
csq .type .strand = tr -> strand ;
3473
3499
csq .type .trid = tr -> id ;
3474
3500
csq .type .gene = tr -> gene -> name ;
3501
+ csq .type .vcf_ial = 1 ;
3475
3502
csq_stage (args , & csq , rec );
3476
3503
hit = 1 ;
3477
3504
}
@@ -3490,6 +3517,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
3490
3517
csq .type .strand = tr -> strand ;
3491
3518
csq .type .trid = tr -> id ;
3492
3519
csq .type .gene = tr -> gene -> name ;
3520
+ csq .type .vcf_ial = 1 ;
3493
3521
csq_stage (args , & csq , rec );
3494
3522
hit = 1 ;
3495
3523
}
@@ -3508,6 +3536,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
3508
3536
splice .check_region_beg = splice .tr -> beg == exon -> beg ? 0 : 1 ;
3509
3537
splice .check_region_end = splice .tr -> end == exon -> end ? 0 : 1 ;
3510
3538
splice .vcf .alt = rec -> d .allele [1 ];
3539
+ splice .vcf .ial = 1 ;
3511
3540
splice .csq = csq_class ;
3512
3541
splice_csq (args , & splice , exon -> beg , exon -> end );
3513
3542
if ( splice .csq ) hit = 1 ;
@@ -3524,6 +3553,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
3524
3553
memset (& csq , 0 , sizeof (csq_t ));
3525
3554
gf_tscript_t * tr = splice .tr = regitr_payload (args -> itr , gf_tscript_t * );
3526
3555
splice .vcf .alt = rec -> d .allele [1 ];
3556
+ splice .vcf .ial = 1 ;
3527
3557
splice .csq = csq_class ;
3528
3558
int splice_ret = splice_csq (args , & splice , tr -> beg , tr -> end );
3529
3559
if ( splice_ret != SPLICE_INSIDE && splice_ret != SPLICE_OVERLAP ) continue ; // SPLICE_OUTSIDE or SPLICE_REF
@@ -3609,10 +3639,6 @@ static void process(args_t *args, bcf1_t **rec_ptr)
3609
3639
int call_csq = 1 ;
3610
3640
if ( rec -> n_allele < 2 ) call_csq = 0 ; // no alternate allele
3611
3641
else if ( rec -> n_allele == 2 && (rec -> d .allele [1 ][0 ]== '*' || rec -> d .allele [1 ][1 ]== '*' ) ) call_csq = 0 ; // gVCF, not an alt allele
3612
- else if ( rec -> d .allele [1 ][0 ]== '<' )
3613
- {
3614
- if ( strncmp ("<INS" ,rec -> d .allele [1 ], 4 ) ) call_csq = 0 ; // only <INS[:.*]> is supported at the moment
3615
- }
3616
3642
if ( call_csq && args -> filter )
3617
3643
{
3618
3644
call_csq = filter_test (args -> filter , rec , NULL );
0 commit comments