Skip to content

Commit c4ff82b

Browse files
committed
Preserve the original alleles in --old-rec-tag
with `--check-ref s` Resolves #2357
1 parent 02b7246 commit c4ff82b

File tree

7 files changed

+105
-51
lines changed

7 files changed

+105
-51
lines changed

NEWS

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ Changes affecting specific commands:
4646

4747
- Print the number of removed duplicate sites in the final statistics (#2346)
4848

49+
- Preserve the original alleles in `--old-rec-tag` when `--check-ref s` requested (#2357)
50+
4951
* plot-vcfstats
5052

5153
- Make the option `-s, --sample-names` functional again (#2353)

test/norm.2.out

+4-3
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
##fileformat=VCFv4.2
22
##FILTER=<ID=PASS,Description="All filters passed">
33
##contig=<ID=1,length=249250621>
4+
##INFO=<ID=XX,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
45
#CHROM POS ID REF ALT QUAL FILTER INFO
56
1 10 . G . . . .
6-
1 10 . g gTC . . .
7-
1 10 . g gtC . . .
7+
1 10 . g gTC . . XX=1|11|T|TCT
8+
1 10 . g gtC . . XX=1|12|C|CTC
89
1 13 . T . . . .
9-
1 14 . a aGC . . .
10+
1 14 . a aGC . . XX=1|14|B|BGC

test/norm.3.2.out

+2-1
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
##FILTER=<ID=PASS,Description="All filters passed">
33
##reference=norm.3.fa
44
##contig=<ID=1,length=4411532>
5+
##INFO=<ID=XX,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
56
#CHROM POS ID REF ALT QUAL FILTER INFO
6-
1 9 . TCA T . . .
7+
1 9 . TCA T . . XX=1|10|NAC|N
78
1 11 . N C . . .

test/norm.3.out

+3-2
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
##FILTER=<ID=PASS,Description="All filters passed">
33
##reference=norm.3.fa
44
##contig=<ID=1,length=4411532>
5+
##INFO=<ID=XX,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
56
#CHROM POS ID REF ALT QUAL FILTER INFO
6-
1 9 . TCA T . . .
7-
1 11 . A C . . .
7+
1 9 . TCA T . . XX=1|10|NAC|N
8+
1 11 . A C . . XX=1|11|N|C

test/norm.iupac.out

+2-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
##fileformat=VCFv4.2
22
##FILTER=<ID=PASS,Description="All filters passed">
33
##contig=<ID=1,length=249250621>
4+
##INFO=<ID=XX,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
45
#CHROM POS ID REF ALT QUAL FILTER INFO
5-
1 8 . N NA . . .
6+
1 8 . N NA . . XX=1|8|BA|BAA

test/test.pl

+4-4
Original file line numberDiff line numberDiff line change
@@ -316,10 +316,10 @@
316316
run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.2',out=>'norm.rmdup.2.2.out',args=>'-d snps');
317317
run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.3',fai=>'norm.rmdup.3',out=>'norm.rmdup.3.1.out',args=>'-d exact');
318318
run_test(\&test_vcf_norm,$opts,in=>'norm.rmdup.3',fai=>'norm.rmdup.3',out=>'norm.rmdup.3.2.out',args=>'-d all');
319-
run_test(\&test_vcf_norm,$opts,in=>'norm.2',fai=>'norm.2',out=>'norm.2.out',args=>'-c s -a');
320-
run_test(\&test_vcf_norm,$opts,in=>'norm.iupac',fai=>'norm.iupac',out=>'norm.iupac.out',args=>'-c s');
321-
run_test(\&test_vcf_norm,$opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.out',args=>'-c s');
322-
run_test(\&test_vcf_norm,$opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.2.out',args=>q[-c s -i'alt="N"']);
319+
run_test(\&test_vcf_norm,$opts,in=>'norm.2',fai=>'norm.2',out=>'norm.2.out',args=>'-c s -a --old-rec-tag XX');
320+
run_test(\&test_vcf_norm,$opts,in=>'norm.iupac',fai=>'norm.iupac',out=>'norm.iupac.out',args=>'-c s --old-rec-tag XX');
321+
run_test(\&test_vcf_norm,$opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.out',args=>'-c s --old-rec-tag XX');
322+
run_test(\&test_vcf_norm,$opts,in=>'norm.3',fai=>'norm.3',out=>'norm.3.2.out',args=>q[-c s -i'alt="N"' --old-rec-tag XX]);
323323
run_test(\&test_vcf_norm,$opts,in=>'atomize.split.1',out=>'atomize.split.1.0.out',args=>['-a --old-rec-tag OLD_REC','-m -any --force']);
324324
run_test(\&test_vcf_norm,$opts,in=>'atomize.split.1',out=>'atomize.split.1.1.out',args=>['-m -any --old-rec-tag OLD_REC --force','-a']);
325325
run_test(\&test_vcf_norm,$opts,in=>'atomize.split.1',out=>'atomize.split.1.1.out',args=>'-m -any --old-rec-tag OLD_REC --force -a');

vcfnorm.c

+88-40
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ typedef struct
9393
int32_t *int32_arr;
9494
int ntmp_arr1, ntmp_arr2, nint32_arr;
9595
kstring_t *tmp_str;
96-
kstring_t *tmp_als, *tmp_sym, tmp_kstr;
96+
kstring_t *tmp_als, *tmp_sym, tmp_kstr, old_rec_tag_kstr;
9797
int ntmp_als, ntmp_sym;
9898
rbuf_t rbuf;
9999
int buf_win; // maximum distance between two records to consider
@@ -127,6 +127,42 @@ typedef struct
127127
}
128128
args_t;
129129

130+
static void old_rec_tag_init(args_t *args, bcf1_t *line)
131+
{
132+
if ( !args->old_rec_tag ) return;
133+
134+
args->old_rec_tag_kstr.l = 0;
135+
ksprintf(&args->old_rec_tag_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,line),line->pos+1,line->d.allele[0]);
136+
int i;
137+
for (i=1; i<line->n_allele; i++)
138+
{
139+
kputs(line->d.allele[i],&args->old_rec_tag_kstr);
140+
if ( i+1<line->n_allele ) kputc(',',&args->old_rec_tag_kstr);
141+
}
142+
}
143+
static void old_rec_tag_set(args_t *args, bcf1_t *line, int ialt)
144+
{
145+
if ( !args->old_rec_tag || !args->old_rec_tag_kstr.l ) return;
146+
147+
// only update if the tag is not present already, there can be multiple normalization steps
148+
int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
149+
bcf_unpack(line, BCF_UN_INFO);
150+
for (i=0; i<line->n_info; i++)
151+
{
152+
bcf_info_t *inf = &line->d.info[i];
153+
if ( inf && inf->key == id ) return;
154+
}
155+
156+
if ( ialt>0 )
157+
{
158+
kputc('|',&args->old_rec_tag_kstr);
159+
kputw(ialt,&args->old_rec_tag_kstr);
160+
}
161+
if ( (bcf_update_info_string(args->out_hdr, line, args->old_rec_tag, args->old_rec_tag_kstr.s))!=0 )
162+
error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
163+
args->old_rec_tag_kstr.l = 0;
164+
}
165+
130166
static inline int replace_iupac_codes(char *seq, int nseq)
131167
{
132168
// Replace ambiguity codes with N for now, it awaits to be seen what the VCF spec codifies in the end
@@ -159,7 +195,8 @@ static void seq_to_upper(char *seq, int len)
159195
for (i=0; seq[i]; i++) seq[i] = nt_to_upper(seq[i]);
160196
}
161197

162-
static void fix_ref(args_t *args, bcf1_t *line)
198+
// returns 0 when no fix was needed, 1 otherwise
199+
static int fix_ref(args_t *args, bcf1_t *line)
163200
{
164201
bcf_unpack(line, BCF_UN_STR);
165202
int reflen = strlen(line->d.allele[0]);
@@ -177,7 +214,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
177214
args->nref.tot++;
178215

179216
// is the REF different? If not, we are done
180-
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
217+
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 0; }
181218

182219
// is the REF allele missing?
183220
if ( reflen==1 && line->d.allele[0][0]=='.' )
@@ -186,11 +223,11 @@ static void fix_ref(args_t *args, bcf1_t *line)
186223
args->nref.set++;
187224
free(ref);
188225
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
189-
return;
226+
return 1;
190227
}
191228

192229
// does REF or ALT contain non-standard bases?
193-
int has_non_acgtn = 0;
230+
int ret = 0, has_non_acgtn = 0;
194231
for (i=0; i<line->n_allele; i++)
195232
{
196233
if ( line->d.allele[i][0]=='<' ) continue;
@@ -200,7 +237,8 @@ static void fix_ref(args_t *args, bcf1_t *line)
200237
{
201238
args->nref.set++;
202239
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
203-
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
240+
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return 1; }
241+
ret = 1;
204242
}
205243

206244
// does the REF allele contain N's ?
@@ -221,12 +259,12 @@ static void fix_ref(args_t *args, bcf1_t *line)
221259
}
222260
if ( fix )
223261
{
262+
ret = 1;
224263
args->nref.set++;
225264
bcf_update_alleles(args->out_hdr,line,(const char**)line->d.allele,line->n_allele);
226-
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return; }
265+
if ( !strncasecmp(line->d.allele[0],ref,reflen) ) { free(ref); return ret; }
227266
}
228267

229-
230268
// is it swapped?
231269
for (i=1; i<line->n_allele; i++)
232270
{
@@ -237,6 +275,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
237275
kstring_t str = {0,0,0};
238276
if ( i==line->n_allele ) // none of the alternate alleles matches the reference
239277
{
278+
ret = 1;
240279
args->nref.set++;
241280
kputsn(ref,reflen,&str);
242281
for (i=1; i<line->n_allele; i++)
@@ -247,7 +286,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
247286
bcf_update_alleles_str(args->out_hdr,line,str.s);
248287
free(ref);
249288
free(str.s);
250-
return;
289+
return ret;
251290
}
252291

253292
// one of the alternate alleles matches the reference, assume it's a simple swap
@@ -289,6 +328,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
289328
ac[i-1] = ni;
290329
bcf_update_info_int32(args->out_hdr, line, "AC", ac, nac);
291330
}
331+
return 1;
292332
}
293333

294334
static void fix_dup_alt(args_t *args, bcf1_t *line)
@@ -334,34 +374,35 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
334374
if ( changed ) bcf_update_genotypes(args->out_hdr,line,gts,ngts);
335375
}
336376

337-
static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
338-
{
339-
if ( !args->old_rec_tag ) return;
340-
341-
// only update if the tag is not present already, there can be multiple normalization steps
342-
int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
343-
bcf_unpack(dst, BCF_UN_INFO);
344-
for (i=0; i<dst->n_info; i++)
345-
{
346-
bcf_info_t *inf = &dst->d.info[i];
347-
if ( inf && inf->key == id ) return;
348-
}
349-
350-
args->tmp_kstr.l = 0;
351-
ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
352-
for (i=1; i<src->n_allele; i++)
353-
{
354-
kputs(src->d.allele[i],&args->tmp_kstr);
355-
if ( i+1<src->n_allele ) kputc(',',&args->tmp_kstr);
356-
}
357-
if ( ialt>0 )
358-
{
359-
kputc('|',&args->tmp_kstr);
360-
kputw(ialt,&args->tmp_kstr);
361-
}
362-
if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
363-
error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
364-
}
377+
// static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
378+
// {
379+
// fprintf(stderr,"remove me\n");
380+
// if ( !args->old_rec_tag ) return;
381+
//
382+
// // only update if the tag is not present already, there can be multiple normalization steps
383+
// int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
384+
// bcf_unpack(dst, BCF_UN_INFO);
385+
// for (i=0; i<dst->n_info; i++)
386+
// {
387+
// bcf_info_t *inf = &dst->d.info[i];
388+
// if ( inf && inf->key == id ) return;
389+
// }
390+
//
391+
// args->tmp_kstr.l = 0;
392+
// ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
393+
// for (i=1; i<src->n_allele; i++)
394+
// {
395+
// kputs(src->d.allele[i],&args->tmp_kstr);
396+
// if ( i+1<src->n_allele ) kputc(',',&args->tmp_kstr);
397+
// }
398+
// if ( ialt>0 )
399+
// {
400+
// kputc('|',&args->tmp_kstr);
401+
// kputw(ialt,&args->tmp_kstr);
402+
// }
403+
// if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
404+
// error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
405+
// }
365406

366407
static int is_left_align(args_t *args, bcf1_t *line)
367408
{
@@ -523,6 +564,7 @@ static hts_pos_t realign_right(args_t *args, bcf1_t *line)
523564
static int realign(args_t *args, bcf1_t *line)
524565
{
525566
bcf_unpack(line, BCF_UN_STR);
567+
old_rec_tag_init(args,line);
526568

527569
// Sanity check REF
528570
int i, nref, reflen = strlen(line->d.allele[0]);
@@ -655,7 +697,7 @@ static int realign(args_t *args, bcf1_t *line)
655697
}
656698
if ( new_pos==line->pos && !strcasecmp(line->d.allele[0],als[0].s) ) return ERR_OK;
657699

658-
set_old_rec_tag(args, line, line, 0);
700+
old_rec_tag_set(args, line, 0);
659701

660702
// Create new block of alleles and update
661703
args->tmp_kstr.l = 0;
@@ -1247,6 +1289,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
12471289
if ( !args->tmp_lines[i] ) args->tmp_lines[i] = bcf_init1();
12481290
bcf1_t *dst = args->tmp_lines[i];
12491291
bcf_clear(dst);
1292+
old_rec_tag_init(args,line);
12501293

12511294
dst->rid = line->rid;
12521295
dst->pos = line->pos;
@@ -1271,7 +1314,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
12711314
else if ( type==BCF_HT_FLAG ) split_info_flag(args, line, info, i, dst);
12721315
else split_info_string(args, line, info, i, dst);
12731316
}
1274-
set_old_rec_tag(args, dst, line, i + 1); // 1-based indexes
1317+
old_rec_tag_set(args, dst, i + 1); // 1-based indexes
12751318

12761319
dst->n_sample = line->n_sample;
12771320
for (j=0; j<line->n_fmt; j++)
@@ -2246,6 +2289,7 @@ static void destroy_data(args_t *args)
22462289
free(args->tmp_als);
22472290
free(args->tmp_sym);
22482291
free(args->tmp_kstr.s);
2292+
free(args->old_rec_tag_kstr.s);
22492293
if ( args->tmp_str )
22502294
{
22512295
for (i=0; i<bcf_hdr_nsamples(args->hdr); i++) free(args->tmp_str[i].s);
@@ -2269,7 +2313,11 @@ static void normalize_line(args_t *args, bcf1_t *line)
22692313
{
22702314
if ( args->fai )
22712315
{
2272-
if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) ) fix_ref(args, line);
2316+
if ( args->filter_pass && (args->check_ref & CHECK_REF_FIX) )
2317+
{
2318+
old_rec_tag_init(args,line);
2319+
if ( fix_ref(args,line) ) old_rec_tag_set(args,line,0);
2320+
}
22732321
if ( args->do_indels )
22742322
{
22752323
int ret = args->filter_pass ? realign(args, line) : ERR_OK;

0 commit comments

Comments
 (0)