Skip to content

Commit 37eea22

Browse files
committed
Careful when merging symbolic alleles
Don't merge symbolic alleles unless they have not just the same type, eg. <DEL>, but also length, i.e the INFO/END coordinate Fixes #2362
1 parent c4ff82b commit 37eea22

File tree

6 files changed

+73
-13
lines changed

6 files changed

+73
-13
lines changed

NEWS

+3
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,9 @@ Changes affecting specific commands:
4242
- Make `--merge both` work with indel-only records; for example, the multiallelic
4343
site G>GT,T should be merged with G>GT (#2339)
4444

45+
- Do not merge symbolic alleles unless they have not just the same type, eg. <DEL>,
46+
but also length, i.e the INFO/END coordinate (#2362)
47+
4548
* bcftools norm
4649

4750
- Print the number of removed duplicate sites in the final statistics (#2346)

test/merge.symbolic.1.1.out

+8
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
##fileformat=VCFv4.1
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=chr1,length=248956422>
4+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of structural variation">
5+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
6+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample Other
7+
chr1 147022730 SV1 N <DEL> . . END=147593064 GT 0/1 ./.
8+
chr1 147022730 SV2 N <DEL> . . END=148013144 GT ./. 1/1

test/merge.symbolic.1.a.vcf

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
##fileformat=VCFv4.1
2+
##contig=<ID=chr1,length=248956422>
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of structural variation">
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Sample
6+
chr1 147022730 SV1 N <DEL> . . END=147593064 GT 0/1

test/merge.symbolic.1.b.vcf

+6
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
##fileformat=VCFv4.1
2+
##contig=<ID=chr1,length=248956422>
3+
##INFO=<ID=END,Number=1,Type=Integer,Description="End position of structural variation">
4+
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
5+
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT Other
6+
chr1 147022730 SV2 N <DEL> . . END=148013144 GT 1/1

test/test.pl

+1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@
6161
run_test(\&test_vcf_isec,$opts,in=>['isec-miss.1.1','isec-miss.1.2','isec-miss.1.3'],out=>'isec-miss.1.1.out',args=>'-R {PATH}/isec-miss.1.regs.txt -n +1');
6262
run_test(\&test_vcf_isec,$opts,in=>['isec-miss.2.1','isec-miss.2.2','isec-miss.2.3'],out=>'isec-miss.2.1.out',args=>'-n +1 -r 20:100,20:140,12:55,20:140,20:100');
6363
run_test(\&test_vcf_isec,$opts,in=>['isec-miss.2.1','isec-miss.2.2','isec-miss.2.3'],out=>'isec-miss.2.1.out',args=>'-R {PATH}/isec-miss.1.regs.txt -n +1');
64+
run_test(\&test_vcf_merge,$opts,in=>['merge.symbolic.1.a','merge.symbolic.1.b'],out=>'merge.symbolic.1.1.out',args=>'');
6465
run_test(\&test_vcf_merge,$opts,in=>['merge.multiallelics.1.a','merge.multiallelics.1.b'],out=>'merge.multiallelics.1.1.out',args=>'--merge none');
6566
run_test(\&test_vcf_merge,$opts,in=>['merge.multiallelics.1.a','merge.multiallelics.1.b'],out=>'merge.multiallelics.1.1.out',args=>'--merge both');
6667
run_test(\&test_vcf_merge,$opts,in=>['merge.12.a','merge.12.b'],out=>'merge.12.1.out',args=>'--merge none');

vcfmerge.c

+49-13
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ typedef struct
132132
int mrec; // allocated size of buf
133133
maux1_t *rec; // buffer to keep reader's lines
134134
bcf1_t **lines; // source buffer: either gvcf or readers' buffer
135+
bcf_hdr_t *hdr; // this reader's header
135136
int var_types; // reader's variant types in the active [beg,end] window
136137
}
137138
buffer_t;
@@ -871,7 +872,10 @@ maux_t *maux_init(args_t *args)
871872
ma->smpl_nGsize = (int*) malloc(n_smpl*sizeof(int));
872873
ma->buf = (buffer_t*) calloc(ma->n,sizeof(buffer_t));
873874
for (i=0; i<ma->n; i++)
875+
{
874876
ma->buf[i].rid = -1;
877+
ma->buf[i].hdr = files->readers[i].header;
878+
}
875879
ma->str = (kstring_t*) calloc(n_smpl,sizeof(kstring_t));
876880
if ( args->local_alleles )
877881
{
@@ -2925,23 +2929,48 @@ static const int
29252929
indel_mask = (VCF_INDEL<<1),
29262930
ins_mask = VCF_INS<<1,
29272931
del_mask = VCF_DEL<<1,
2928-
ref_mask = 1;
2932+
ref_mask = 1,
2933+
other_mask = VCF_OTHER<<1;
2934+
2935+
typedef struct
2936+
{
2937+
int types, // selected types, see the *_mask(s) above
2938+
end; // if symbolic allele is involved, the END coordinate of the first record
2939+
bcf1_t *rec; // the first record selected
2940+
}
2941+
selected_t;
29292942

29302943
// Can these types be merged given the -m settings? Despite the function's name, its focus is on
29312944
// excluding incompatible records, there will be a finer matching later in stage_line()
2932-
static inline int types_compatible(args_t *args, int selected_types, buffer_t *buf, int irec)
2945+
static inline int types_compatible(args_t *args, selected_t *selected, buffer_t *buf, int irec)
29332946
{
29342947
int k;
29352948
maux_t *maux = args->maux;
29362949
bcf1_t *rec = buf->lines[irec];
29372950
int rec_types = buf->rec[irec].var_types;
29382951

2939-
assert( selected_types ); // this is trivially true, set in can_merge()
2952+
int end = -1;
2953+
if ( rec_types&other_mask )
2954+
{
2955+
int32_t *itmp = NULL, nitmp = 0;
2956+
bcf_get_info_int32(buf->hdr,rec,"END",&itmp,&nitmp);
2957+
end = nitmp==1 ? itmp[0] : -1;
2958+
free(itmp);
2959+
}
2960+
2961+
// First time here?
2962+
if ( !selected->types )
2963+
{
2964+
selected->end = end;
2965+
selected->rec = rec;
2966+
selected->types = rec_types;
2967+
return 1;
2968+
}
29402969

29412970
if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything
29422971

29432972
// REF and gVCF_REF with no other alleles present can be merged with anything
2944-
if ( (selected_types&ref_mask) && !(selected_types&(~ref_mask)) ) return 1;
2973+
if ( (selected->types&ref_mask) && !(selected->types&(~ref_mask)) ) return 1;
29452974
if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
29462975

29472976
if ( args->collapse!=COLLAPSE_NONE )
@@ -2952,23 +2981,23 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b
29522981
// - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
29532982
if ( args->collapse&(COLLAPSE_SNPS|COLLAPSE_SNP_INS_DEL) )
29542983
{
2955-
if ( (rec_types&snp_mask) && (selected_types&snp_mask) ) return 1;
2984+
if ( (rec_types&snp_mask) && (selected->types&snp_mask) ) return 1;
29562985
}
29572986
if ( args->collapse&COLLAPSE_INDELS )
29582987
{
2959-
if ( (rec_types&indel_mask) && (selected_types&indel_mask) ) return 1;
2988+
if ( (rec_types&indel_mask) && (selected->types&indel_mask) ) return 1;
29602989
}
29612990
if ( args->collapse&COLLAPSE_SNP_INS_DEL )
29622991
{
2963-
if ( (rec_types&ins_mask) && (selected_types&ins_mask) ) return 1;
2964-
if ( (rec_types&del_mask) && (selected_types&del_mask) ) return 1;
2992+
if ( (rec_types&ins_mask) && (selected->types&ins_mask) ) return 1;
2993+
if ( (rec_types&del_mask) && (selected->types&del_mask) ) return 1;
29652994
}
29662995
// Whatever is left, allow to match if the alleles match exactly
29672996
}
29682997

29692998
// The -m none mode or exact matching requested
29702999
// Simple test first: are the variants of the same type?
2971-
int x = selected_types;
3000+
int x = selected->types;
29723001
int y = rec_types;
29733002
if ( !(x&y) ) return 0; // no matching type
29743003
if ( (x&y)!=x && (x&y)!=y ) return 0; // not a subset
@@ -2980,6 +3009,13 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b
29803009
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;
29813010
}
29823011
if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k]
3012+
3013+
if ( selected->types&other_mask && rec_types&other_mask )
3014+
{
3015+
// both records have symbolic alleles and the alleles are the same
3016+
if ( selected->end!=end ) return 0;
3017+
}
3018+
29833019
return 1; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF
29843020
}
29853021

@@ -3106,7 +3142,7 @@ int can_merge(args_t *args)
31063142
}
31073143
if ( !ntodo ) return 0;
31083144

3109-
int selected_types = 0;
3145+
selected_t selected = {0,0,NULL};
31103146

31113147
// In this loop we select from each reader compatible candidate lines.
31123148
// (i.e. SNPs or indels). Go through all files and all lines at this
@@ -3121,7 +3157,7 @@ int can_merge(args_t *args)
31213157
gaux[i].line->d.allele[0][0] = ref;
31223158
gaux[i].line->pos = maux->pos;
31233159
maux_update_alleles(args, i, buf->beg);
3124-
selected_types |= ref_mask;
3160+
selected.types |= ref_mask;
31253161
continue;
31263162
}
31273163
for (j=buf->beg; j<buf->end; j++)
@@ -3136,7 +3172,7 @@ int can_merge(args_t *args)
31363172
{
31373173
if ( strcmp(id,line->d.id) ) continue; // matching by ID and it does not match the selected record
31383174
}
3139-
else if ( selected_types && !types_compatible(args,selected_types,buf,j) ) continue;
3175+
else if ( !types_compatible(args,&selected,buf,j) ) continue;
31403176

31413177
// This is not a good code. It makes the incorrect assumption of always having a SNP record available.
31423178
// However, that is not always the case and prevents the merging of G>GT,T with G>GT (see test/merge.multiallelics.1.*.vcf).
@@ -3154,7 +3190,7 @@ int can_merge(args_t *args)
31543190
// ) continue;
31553191
// }
31563192

3157-
selected_types |= line_types;
3193+
selected.types |= line_types;
31583194

31593195
buf->rec[j].skip = 0; // the j-th record from i-th reader can be included. Final decision will be made in stage_line
31603196
maux_update_alleles(args, i, j);

0 commit comments

Comments
 (0)