Skip to content

Commit 8c1deea

Browse files
committed
Add support for complex substitutions, such as AC>TAA
1 parent 458f97a commit 8c1deea

File tree

11 files changed

+224
-17
lines changed

11 files changed

+224
-17
lines changed

NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ Changes affecting specific commands:
1313

1414
- Fix a bug which prevented reading fasta files containing empty lines in their entirety (#2424)
1515

16+
* bcftools csq
17+
18+
- Add support for complex substitutions, such as AC>TAA
19+
1620
* bcftools gtcheck
1721

1822
- The program is now able to process gVCF blocks. Newly, monoallelic sites are excluded only

csq.c

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -203,17 +203,18 @@
203203
#define CSQ_UPSTREAM_STOP (1<<19) // adds * in front of the csq string
204204
#define CSQ_INCOMPLETE_CDS (1<<20) // to remove START/STOP in incomplete CDS, see ENSG00000173376/synon.vcf
205205
#define CSQ_CODING_SEQUENCE (1<<21) // cannot tell exactly what it is, but it does affect the coding sequence
206-
#define CSQ_ELONGATION (1<<22) // symbolic insertion
207-
#define CSQ_START_RETAINED (1<<23)
206+
#define CSQ_ELONGATION (1<<22) // symbolic insertion or complex elongating variant
207+
#define CSQ_TRUNCATION (1<<23) // complex truncating variant
208+
#define CSQ_START_RETAINED (1<<24)
208209

209210
// Haplotype-aware consequences, printed in one vcf record only, the rest has a reference @12345
210211
#define CSQ_COMPOUND (CSQ_SYNONYMOUS_VARIANT|CSQ_MISSENSE_VARIANT|CSQ_STOP_LOST|CSQ_STOP_GAINED| \
211212
CSQ_INFRAME_DELETION|CSQ_INFRAME_INSERTION|CSQ_FRAMESHIFT_VARIANT| \
212213
CSQ_START_LOST|CSQ_STOP_RETAINED|CSQ_INFRAME_ALTERING|CSQ_INCOMPLETE_CDS| \
213-
CSQ_UPSTREAM_STOP|CSQ_START_RETAINED)
214+
CSQ_UPSTREAM_STOP|CSQ_START_RETAINED|CSQ_ELONGATION|CSQ_TRUNCATION)
214215
#define CSQ_START_STOP (CSQ_STOP_LOST|CSQ_STOP_GAINED|CSQ_STOP_RETAINED|CSQ_START_LOST|CSQ_START_RETAINED)
215216

216-
#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION)))
217+
#define CSQ_PRN_STRAND(csq) ((csq)&CSQ_COMPOUND && !((csq)&(CSQ_SPLICE_ACCEPTOR|CSQ_SPLICE_DONOR|CSQ_SPLICE_REGION|CSQ_ELONGATION|CSQ_TRUNCATION)))
217218
#define CSQ_PRN_TSCRIPT (~(CSQ_INTRON|CSQ_NON_CODING))
218219
#define CSQ_PRN_NMD (~(CSQ_INTRON|CSQ_NON_CODING))
219220
#define CSQ_PRN_BIOTYPE CSQ_NON_CODING
@@ -248,6 +249,7 @@ const char *csq_strings[] =
248249
NULL,
249250
"coding_sequence",
250251
"feature_elongation",
252+
"feature_truncation",
251253
"start_retained"
252254
};
253255

@@ -1032,7 +1034,7 @@ static inline int csq_stage_utr(args_t *args, regitr_t *itr, bcf1_t *rec, uint32
10321034
static inline void csq_stage_splice(args_t *args, bcf1_t *rec, gf_tscript_t *tr, uint32_t type, int ial)
10331035
{
10341036
#if XDBG
1035-
fprintf(stderr,"csq_stage_splice %d: type=%d\n",(int)rec->pos+1,type);
1037+
fprintf(stderr,"csq_stage_splice %d: type=%d ial=%d\n",(int)rec->pos+1,type,ial);
10361038
#endif
10371039
if ( !type ) return;
10381040
csq_t csq;
@@ -1456,6 +1458,7 @@ static inline int splice_csq_mnp(args_t *args, splice_t *splice, uint32_t ex_beg
14561458
fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_utr=%d start,stop,beg,end=%d,%d,%d,%d\n", splice->vcf.ref,splice->vcf.alt,ex_beg,ex_end,splice->ref_beg,splice->ref_end,splice->tbeg,splice->tend,splice->check_utr,splice->check_start,splice->check_stop,splice->check_region_beg,splice->check_region_end);
14571459
#endif
14581460

1461+
int ret = SPLICE_INSIDE;
14591462
if ( splice->ref_beg < ex_beg ) // the part before the exon
14601463
{
14611464
if ( splice->check_region_beg )
@@ -1484,6 +1487,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
14841487
{
14851488
splice->tbeg = splice->ref_beg - splice->vcf.pos;
14861489
splice->ref_beg = ex_beg;
1490+
ret = SPLICE_OVERLAP;
14871491
}
14881492
}
14891493
if ( ex_end < splice->ref_end ) // the part after the exon
@@ -1514,6 +1518,7 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
15141518
{
15151519
splice->tend = splice->vcf.rlen - (splice->ref_end - splice->vcf.pos + 1);
15161520
splice->ref_end = ex_end;
1521+
ret = SPLICE_OVERLAP;
15171522
}
15181523
}
15191524
if ( splice->ref_end < ex_beg || splice->ref_beg > ex_end )
@@ -1537,11 +1542,18 @@ fprintf(stderr,"mnp: %s>%s .. ex=%d,%d beg,end=%d,%d tbeg,tend=%d,%d check_ut
15371542
if ( splice->set_refalt )
15381543
{
15391544
splice->vcf.rlen -= splice->tbeg + splice->tend;
1545+
splice->vcf.alen -= splice->tbeg + splice->tend;
15401546
splice->kref.l = 0; kputsn(splice->vcf.ref + splice->tbeg, splice->vcf.rlen, &splice->kref);
1541-
splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.rlen, &splice->kalt);
1547+
splice->kalt.l = 0; kputsn(splice->vcf.alt + splice->tbeg, splice->vcf.alen, &splice->kalt);
15421548
}
15431549
csq_stage_splice(args, splice->vcf.rec, splice->tr, splice->csq, splice->vcf.ial);
1544-
return SPLICE_INSIDE;
1550+
return ret;
1551+
}
1552+
static inline int splice_csq_complex(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
1553+
{
1554+
splice->csq |= splice->vcf.rlen > splice->vcf.alen ? CSQ_TRUNCATION : CSQ_ELONGATION;
1555+
int ret = splice_csq_mnp(args, splice, ex_beg, ex_end);
1556+
return ret;
15451557
}
15461558
static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, uint32_t ex_end)
15471559
{
@@ -1565,9 +1577,14 @@ static inline int splice_csq(args_t *args, splice_t *splice, uint32_t ex_beg, ui
15651577
}
15661578
splice->tbeg = i;
15671579

1580+
int rtrim = splice->vcf.rlen - splice->tbeg - splice->tend;
1581+
int atrim = splice->vcf.alen - splice->tbeg - splice->tend;
1582+
if ( splice->vcf.alt[0]=='<' ) rtrim = atrim = 0;
1583+
15681584
// The mnp, ins and del code was split into near-identical functions for clarity and debugging;
15691585
// possible todo: generalize once stable
15701586
if ( splice->vcf.rlen==splice->vcf.alen ) return splice_csq_mnp(args, splice, ex_beg, ex_end);
1587+
if ( rtrim>1 && atrim>1 ) return splice_csq_complex(args, splice, ex_beg, ex_end);
15711588
if ( splice->vcf.rlen < splice->vcf.alen ) return splice_csq_ins(args, splice, ex_beg, ex_end);
15721589
if ( splice->vcf.rlen > splice->vcf.alen ) return splice_csq_del(args, splice, ex_beg, ex_end);
15731590

@@ -1976,12 +1993,15 @@ void tscript_splice_ref(gf_tscript_t *tr)
19761993
int csq_push(args_t *args, csq_t *csq, bcf1_t *rec)
19771994
{
19781995
#if XDBG
1979-
fprintf(stderr,"csq_push: %d .. %d\n",(int)rec->pos+1,csq->type.type);
1996+
fprintf(stderr,"csq_push: pos=%d .. type=%d ial=%d\n",(int)rec->pos+1,csq->type.type,csq->type.vcf_ial);
19801997
#endif
19811998
khint_t k = kh_get(pos2vbuf, args->pos2vbuf, (int)csq->pos);
19821999
vbuf_t *vbuf = (k == kh_end(args->pos2vbuf)) ? NULL : kh_val(args->pos2vbuf, k);
19832000
if ( !vbuf ) error("This should not happen. %s:%d %s\n",bcf_seqname(args->hdr,rec),csq->pos+1,csq->type.vstr.s);
19842001

2002+
if ( csq->type.type&CSQ_INFRAME_INSERTION && csq->type.type&CSQ_ELONGATION ) csq->type.type &= ~CSQ_INFRAME_INSERTION;
2003+
if ( csq->type.type&CSQ_INFRAME_DELETION && csq->type.type&CSQ_TRUNCATION ) csq->type.type &= ~CSQ_INFRAME_DELETION;
2004+
19852005
int i;
19862006
for (i=0; i<vbuf->n; i++)
19872007
if ( vbuf->vrec[i]->line==rec ) break;
@@ -3350,6 +3370,7 @@ int test_utr(args_t *args, bcf1_t *rec)
33503370
{
33513371
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
33523372
splice.vcf.alt = rec->d.allele[i];
3373+
splice.vcf.ial = i;
33533374
splice.csq = 0;
33543375
int splice_ret = splice_csq(args, &splice, utr->beg, utr->end);
33553376
if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue;
@@ -3394,6 +3415,7 @@ int test_splice(args_t *args, bcf1_t *rec)
33943415
{
33953416
if ( rec->d.allele[1][0]=='<' || rec->d.allele[1][0]=='*' ) { continue; }
33963417
splice.vcf.alt = rec->d.allele[i];
3418+
splice.vcf.ial = i;
33973419
splice.csq = 0;
33983420
splice_csq(args, &splice, exon->beg, exon->end);
33993421
if ( splice.csq ) ret = 1;
@@ -3420,6 +3442,7 @@ int test_tscript(args_t *args, bcf1_t *rec)
34203442
{
34213443
if ( rec->d.allele[i][0]=='<' || rec->d.allele[i][0]=='*' ) { continue; }
34223444
splice.vcf.alt = rec->d.allele[i];
3445+
splice.vcf.ial = i;
34233446
splice.csq = 0;
34243447
int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
34253448
if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
@@ -3455,7 +3478,10 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
34553478
// only insertions atm
34563479
int beg = rec->pos + 1;
34573480
int end = beg;
3458-
int csq_class = CSQ_ELONGATION;
3481+
int csq_class;
3482+
if ( !strncasecmp("<INS",rec->d.allele[1],4) ) csq_class = CSQ_ELONGATION;
3483+
else if ( !strncasecmp("<DEL",rec->d.allele[1],4) ) csq_class = CSQ_TRUNCATION;
3484+
else return;
34593485

34603486
int hit = 0;
34613487
if ( regidx_overlap(args->idx_cds,chr_gff,beg,end, args->itr) )
@@ -3472,6 +3498,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
34723498
csq.type.strand = tr->strand;
34733499
csq.type.trid = tr->id;
34743500
csq.type.gene = tr->gene->name;
3501+
csq.type.vcf_ial = 1;
34753502
csq_stage(args, &csq, rec);
34763503
hit = 1;
34773504
}
@@ -3490,6 +3517,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
34903517
csq.type.strand = tr->strand;
34913518
csq.type.trid = tr->id;
34923519
csq.type.gene = tr->gene->name;
3520+
csq.type.vcf_ial = 1;
34933521
csq_stage(args, &csq, rec);
34943522
hit = 1;
34953523
}
@@ -3508,6 +3536,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
35083536
splice.check_region_beg = splice.tr->beg==exon->beg ? 0 : 1;
35093537
splice.check_region_end = splice.tr->end==exon->end ? 0 : 1;
35103538
splice.vcf.alt = rec->d.allele[1];
3539+
splice.vcf.ial = 1;
35113540
splice.csq = csq_class;
35123541
splice_csq(args, &splice, exon->beg, exon->end);
35133542
if ( splice.csq ) hit = 1;
@@ -3524,6 +3553,7 @@ void test_symbolic_alt(args_t *args, bcf1_t *rec)
35243553
memset(&csq, 0, sizeof(csq_t));
35253554
gf_tscript_t *tr = splice.tr = regitr_payload(args->itr, gf_tscript_t*);
35263555
splice.vcf.alt = rec->d.allele[1];
3556+
splice.vcf.ial = 1;
35273557
splice.csq = csq_class;
35283558
int splice_ret = splice_csq(args, &splice, tr->beg, tr->end);
35293559
if ( splice_ret!=SPLICE_INSIDE && splice_ret!=SPLICE_OVERLAP ) continue; // SPLICE_OUTSIDE or SPLICE_REF
@@ -3609,10 +3639,6 @@ static void process(args_t *args, bcf1_t **rec_ptr)
36093639
int call_csq = 1;
36103640
if ( rec->n_allele < 2 ) call_csq = 0; // no alternate allele
36113641
else if ( rec->n_allele==2 && (rec->d.allele[1][0]=='*' || rec->d.allele[1][1]=='*') ) call_csq = 0; // gVCF, not an alt allele
3612-
else if ( rec->d.allele[1][0]=='<' )
3613-
{
3614-
if ( strncmp("<INS",rec->d.allele[1], 4) ) call_csq = 0; // only <INS[:.*]> is supported at the moment
3615-
}
36163642
if ( call_csq && args->filter )
36173643
{
36183644
call_csq = filter_test(args->filter, rec, NULL);

test/csq.1.out

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
129 A C @128,splice_region|XYZ|ENST00000000001|protein_coding
3030

3131
140 TA AACG inframe_insertion|XYZ|ENST00000000001|protein_coding|+|8LR>7QRR|140TA>AACG+142C>CC,splice_region|XYZ|ENST00000000001|protein_coding
32-
140 TA AACG inframe_insertion|XYZ|ENST00000000001|protein_coding|+|8LR>7QRR|140TA>AACG+142C>CC,splice_region|XYZ|ENST00000000001|protein_coding
32+
140 TA AACG feature_elongation|XYZ|ENST00000000001|protein_coding|+|8LR>7QRR|140TA>AACG+142C>CC,splice_region&feature_elongation|XYZ|ENST00000000001|protein_coding
3333

3434
142 C CC @140,splice_region|XYZ|ENST00000000001|protein_coding
3535
142 C CC @140,splice_region|XYZ|ENST00000000001|protein_coding

test/csq.vcf.ascii-art.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
2+
3+
110 120 130 140 150
4+
01234567890123456789012345678901--2-345678901234
5+
e.........eeeeeeeeeee.........ee--e-eeeeeeee....
6+
CGTACGTACGTACGTACGTACGTACGTACGTA--C-GTACGTACGTACGTACGT
7+
CC AACGcC
8+
9+
10+

test/csq/ENST00000263103-INS/insertions.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,6 @@
1616
315771 A <INS:ME:ALU> coding_sequence&feature_elongation|BICC1|ENST00000373886|protein_coding
1717
315771 A <INS:ME:ALU> coding_sequence&feature_elongation|BICC1|ENST00000373886|protein_coding
1818

19-
315772 G <INS:ME:ALU> feature_elongation|BICC1|ENST00000373886|protein_coding
20-
315772 G <INS:ME:ALU> feature_elongation|BICC1|ENST00000373886|protein_coding
19+
315772 G <DEL> feature_truncation|BICC1|ENST00000373886|protein_coding
20+
315772 G <DEL> feature_truncation|BICC1|ENST00000373886|protein_coding
2121

test/csq/ENST00000263103-INS/insertions.vcf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@
1212
10 315641 . G <INS:ME:ALU> . . END=315641;type=ENST00000373886:60588520-G-<INS:ME:ALU>;EXP=splice_region&coding_sequence&feature_elongation|BICC1|ENST00000373886|protein_coding
1313
10 315642 . A <INS:ME:ALU> . . END=315642;type=ENST00000373886:60588521-A-<INS:ME:ALU>;EXP=splice_region&coding_sequence&feature_elongation|BICC1|ENST00000373886|protein_coding
1414
10 315771 . A <INS:ME:ALU> . . END=315771;type=ENST00000373886:60588651-A-<INS:ME:ALU>;EXP=coding_sequence&feature_elongation|BICC1|ENST00000373886|protein_coding
15-
10 315772 . G <INS:ME:ALU> . . END=315772;type=ENST00000373886:60588651-G-<INS:ME:ALU>;EXP=feature_elongation|BICC1|ENST00000373886|protein_coding
15+
10 315772 . G <DEL> . . END=315775;type=ENST00000373886:60588651-G-<DEL>;EXP=feature_truncation|BICC1|ENST00000373886|protein_coding

0 commit comments

Comments
 (0)