@@ -93,7 +93,7 @@ typedef struct
93
93
int32_t * int32_arr ;
94
94
int ntmp_arr1 , ntmp_arr2 , nint32_arr ;
95
95
kstring_t * tmp_str ;
96
- kstring_t * tmp_als , * tmp_sym , tmp_kstr ;
96
+ kstring_t * tmp_als , * tmp_sym , tmp_kstr , old_rec_tag_kstr ;
97
97
int ntmp_als , ntmp_sym ;
98
98
rbuf_t rbuf ;
99
99
int buf_win ; // maximum distance between two records to consider
@@ -127,6 +127,42 @@ typedef struct
127
127
}
128
128
args_t ;
129
129
130
+ static void old_rec_tag_init (args_t * args , bcf1_t * line ) // Caches a textual snapshot of the record (CHROM|POS|REF|ALT1,ALT2,...) in args->old_rec_tag_kstr; old_rec_tag_set() later writes it to INFO
131
+ { // NOTE(review): reads line->d.allele directly, so the caller is assumed to have unpacked the record (BCF_UN_STR) already - confirm for every call site
132
+ if ( !args -> old_rec_tag ) return ; // no tag name configured, nothing to record
133
+
134
+ args -> old_rec_tag_kstr .l = 0 ; // reset the buffer length, discarding any previous snapshot (allocated memory is reused)
135
+ ksprintf (& args -> old_rec_tag_kstr ,"%s|%" PRIhts_pos "|%s|" ,bcf_seqname (args -> hdr ,line ),line -> pos + 1 ,line -> d .allele [0 ]); // sequence name, position printed as pos+1, then allele 0 (REF), each followed by a pipe
136
+ int i ;
137
+ for (i = 1 ; i < line -> n_allele ; i ++ ) // append the remaining alleles (index 1 and up, i.e. the ALTs)
138
+ {
139
+ kputs (line -> d .allele [i ],& args -> old_rec_tag_kstr );
140
+ if ( i + 1 < line -> n_allele ) kputc (',' ,& args -> old_rec_tag_kstr ); // comma between alleles, none after the last one
141
+ }
142
+ }
143
+ static void old_rec_tag_set (args_t * args , bcf1_t * line , int ialt ) // Writes the snapshot cached by old_rec_tag_init() into INFO/<old_rec_tag> of line; a positive ialt is appended to the value as an extra pipe-separated field
144
+ {
145
+ if ( !args -> old_rec_tag || !args -> old_rec_tag_kstr .l ) return ; // nothing to do when the tag is disabled or no snapshot is pending
146
+
147
+ // Only set the tag if it is not already present on the record, since several normalization steps may run on the same record
148
+ int i , id = bcf_hdr_id2int (args -> out_hdr , BCF_DT_ID , args -> old_rec_tag ); // numeric key of the tag as looked up in the output header
149
+ bcf_unpack (line , BCF_UN_INFO ); // make sure the INFO fields are unpacked before scanning them
150
+ for (i = 0 ; i < line -> n_info ; i ++ )
151
+ {
152
+ bcf_info_t * inf = & line -> d .info [i ];
153
+ if ( inf && inf -> key == id ) return ; // tag already set on this record: keep it; note the cached buffer is left untouched on this path
154
+ }
155
+
156
+ if ( ialt > 0 ) // visible callers pass 0 for whole-record changes and a 1-based allele index when splitting multiallelic records
157
+ {
158
+ kputc ('|' ,& args -> old_rec_tag_kstr );
159
+ kputw (ialt ,& args -> old_rec_tag_kstr ); // the index is appended to the cached buffer itself, not to a copy
160
+ }
161
+ if ( (bcf_update_info_string (args -> out_hdr , line , args -> old_rec_tag , args -> old_rec_tag_kstr .s ))!= 0 ) // any non-zero return is reported through error()
162
+ error ("An error occurred while updating INFO/%s\n" ,args -> old_rec_tag );
163
+ args -> old_rec_tag_kstr .l = 0 ; // mark the snapshot as consumed: a further call without a new old_rec_tag_init() returns early at the top
164
+ }
165
+
130
166
static inline int replace_iupac_codes (char * seq , int nseq )
131
167
{
132
168
// Replace ambiguity codes with N for now, it awaits to be seen what the VCF spec codifies in the end
@@ -159,7 +195,8 @@ static void seq_to_upper(char *seq, int len)
159
195
for (i = 0 ; seq [i ]; i ++ ) seq [i ] = nt_to_upper (seq [i ]);
160
196
}
161
197
162
- static void fix_ref (args_t * args , bcf1_t * line )
198
+ // returns 0 when no fix was needed, 1 otherwise
199
+ static int fix_ref (args_t * args , bcf1_t * line )
163
200
{
164
201
bcf_unpack (line , BCF_UN_STR );
165
202
int reflen = strlen (line -> d .allele [0 ]);
@@ -177,7 +214,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
177
214
args -> nref .tot ++ ;
178
215
179
216
// is the REF different? If not, we are done
180
- if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return ; }
217
+ if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return 0 ; }
181
218
182
219
// is the REF allele missing?
183
220
if ( reflen == 1 && line -> d .allele [0 ][0 ]== '.' )
@@ -186,11 +223,11 @@ static void fix_ref(args_t *args, bcf1_t *line)
186
223
args -> nref .set ++ ;
187
224
free (ref );
188
225
bcf_update_alleles (args -> out_hdr ,line ,(const char * * )line -> d .allele ,line -> n_allele );
189
- return ;
226
+ return 1 ;
190
227
}
191
228
192
229
// does REF or ALT contain non-standard bases?
193
- int has_non_acgtn = 0 ;
230
+ int ret = 0 , has_non_acgtn = 0 ;
194
231
for (i = 0 ; i < line -> n_allele ; i ++ )
195
232
{
196
233
if ( line -> d .allele [i ][0 ]== '<' ) continue ;
@@ -200,7 +237,8 @@ static void fix_ref(args_t *args, bcf1_t *line)
200
237
{
201
238
args -> nref .set ++ ;
202
239
bcf_update_alleles (args -> out_hdr ,line ,(const char * * )line -> d .allele ,line -> n_allele );
203
- if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return ; }
240
+ if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return 1 ; }
241
+ ret = 1 ;
204
242
}
205
243
206
244
// does the REF allele contain N's ?
@@ -221,12 +259,12 @@ static void fix_ref(args_t *args, bcf1_t *line)
221
259
}
222
260
if ( fix )
223
261
{
262
+ ret = 1 ;
224
263
args -> nref .set ++ ;
225
264
bcf_update_alleles (args -> out_hdr ,line ,(const char * * )line -> d .allele ,line -> n_allele );
226
- if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return ; }
265
+ if ( !strncasecmp (line -> d .allele [0 ],ref ,reflen ) ) { free (ref ); return ret ; }
227
266
}
228
267
229
-
230
268
// is it swapped?
231
269
for (i = 1 ; i < line -> n_allele ; i ++ )
232
270
{
@@ -237,6 +275,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
237
275
kstring_t str = {0 ,0 ,0 };
238
276
if ( i == line -> n_allele ) // none of the alternate alleles matches the reference
239
277
{
278
+ ret = 1 ;
240
279
args -> nref .set ++ ;
241
280
kputsn (ref ,reflen ,& str );
242
281
for (i = 1 ; i < line -> n_allele ; i ++ )
@@ -247,7 +286,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
247
286
bcf_update_alleles_str (args -> out_hdr ,line ,str .s );
248
287
free (ref );
249
288
free (str .s );
250
- return ;
289
+ return ret ;
251
290
}
252
291
253
292
// one of the alternate alleles matches the reference, assume it's a simple swap
@@ -289,6 +328,7 @@ static void fix_ref(args_t *args, bcf1_t *line)
289
328
ac [i - 1 ] = ni ;
290
329
bcf_update_info_int32 (args -> out_hdr , line , "AC" , ac , nac );
291
330
}
331
+ return 1 ;
292
332
}
293
333
294
334
static void fix_dup_alt (args_t * args , bcf1_t * line )
@@ -334,34 +374,35 @@ static void fix_dup_alt(args_t *args, bcf1_t *line)
334
374
if ( changed ) bcf_update_genotypes (args -> out_hdr ,line ,gts ,ngts );
335
375
}
336
376
337
- static void set_old_rec_tag (args_t * args , bcf1_t * dst , bcf1_t * src , int ialt )
338
- {
339
- if ( !args -> old_rec_tag ) return ;
340
-
341
- // only update if the tag is not present already, there can be multiple normalization steps
342
- int i , id = bcf_hdr_id2int (args -> out_hdr , BCF_DT_ID , args -> old_rec_tag );
343
- bcf_unpack (dst , BCF_UN_INFO );
344
- for (i = 0 ; i < dst -> n_info ; i ++ )
345
- {
346
- bcf_info_t * inf = & dst -> d .info [i ];
347
- if ( inf && inf -> key == id ) return ;
348
- }
349
-
350
- args -> tmp_kstr .l = 0 ;
351
- ksprintf (& args -> tmp_kstr ,"%s|%" PRIhts_pos "|%s|" ,bcf_seqname (args -> hdr ,src ),src -> pos + 1 ,src -> d .allele [0 ]);
352
- for (i = 1 ; i < src -> n_allele ; i ++ )
353
- {
354
- kputs (src -> d .allele [i ],& args -> tmp_kstr );
355
- if ( i + 1 < src -> n_allele ) kputc (',' ,& args -> tmp_kstr );
356
- }
357
- if ( ialt > 0 )
358
- {
359
- kputc ('|' ,& args -> tmp_kstr );
360
- kputw (ialt ,& args -> tmp_kstr );
361
- }
362
- if ( (bcf_update_info_string (args -> out_hdr , dst , args -> old_rec_tag , args -> tmp_kstr .s ))!= 0 )
363
- error ("An error occurred while updating INFO/%s\n" ,args -> old_rec_tag );
364
- }
377
+ // static void set_old_rec_tag(args_t *args, bcf1_t *dst, bcf1_t *src, int ialt)
378
+ // {
379
+ // fprintf(stderr,"remove me\n");
380
+ // if ( !args->old_rec_tag ) return;
381
+ //
382
+ // // only update if the tag is not present already, there can be multiple normalization steps
383
+ // int i, id = bcf_hdr_id2int(args->out_hdr, BCF_DT_ID, args->old_rec_tag);
384
+ // bcf_unpack(dst, BCF_UN_INFO);
385
+ // for (i=0; i<dst->n_info; i++)
386
+ // {
387
+ // bcf_info_t *inf = &dst->d.info[i];
388
+ // if ( inf && inf->key == id ) return;
389
+ // }
390
+ //
391
+ // args->tmp_kstr.l = 0;
392
+ // ksprintf(&args->tmp_kstr,"%s|%"PRIhts_pos"|%s|",bcf_seqname(args->hdr,src),src->pos+1,src->d.allele[0]);
393
+ // for (i=1; i<src->n_allele; i++)
394
+ // {
395
+ // kputs(src->d.allele[i],&args->tmp_kstr);
396
+ // if ( i+1<src->n_allele ) kputc(',',&args->tmp_kstr);
397
+ // }
398
+ // if ( ialt>0 )
399
+ // {
400
+ // kputc('|',&args->tmp_kstr);
401
+ // kputw(ialt,&args->tmp_kstr);
402
+ // }
403
+ // if ( (bcf_update_info_string(args->out_hdr, dst, args->old_rec_tag, args->tmp_kstr.s))!=0 )
404
+ // error("An error occurred while updating INFO/%s\n",args->old_rec_tag);
405
+ // }
365
406
366
407
static int is_left_align (args_t * args , bcf1_t * line )
367
408
{
@@ -523,6 +564,7 @@ static hts_pos_t realign_right(args_t *args, bcf1_t *line)
523
564
static int realign (args_t * args , bcf1_t * line )
524
565
{
525
566
bcf_unpack (line , BCF_UN_STR );
567
+ old_rec_tag_init (args ,line );
526
568
527
569
// Sanity check REF
528
570
int i , nref , reflen = strlen (line -> d .allele [0 ]);
@@ -655,7 +697,7 @@ static int realign(args_t *args, bcf1_t *line)
655
697
}
656
698
if ( new_pos == line -> pos && !strcasecmp (line -> d .allele [0 ],als [0 ].s ) ) return ERR_OK ;
657
699
658
- set_old_rec_tag (args , line , line , 0 );
700
+ old_rec_tag_set (args , line , 0 );
659
701
660
702
// Create new block of alleles and update
661
703
args -> tmp_kstr .l = 0 ;
@@ -1247,6 +1289,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
1247
1289
if ( !args -> tmp_lines [i ] ) args -> tmp_lines [i ] = bcf_init1 ();
1248
1290
bcf1_t * dst = args -> tmp_lines [i ];
1249
1291
bcf_clear (dst );
1292
+ old_rec_tag_init (args ,line );
1250
1293
1251
1294
dst -> rid = line -> rid ;
1252
1295
dst -> pos = line -> pos ;
@@ -1271,7 +1314,7 @@ static void split_multiallelic_to_biallelics(args_t *args, bcf1_t *line)
1271
1314
else if ( type == BCF_HT_FLAG ) split_info_flag (args , line , info , i , dst );
1272
1315
else split_info_string (args , line , info , i , dst );
1273
1316
}
1274
- set_old_rec_tag (args , dst , line , i + 1 ); // 1-based indexes
1317
+ old_rec_tag_set (args , dst , i + 1 ); // 1-based indexes
1275
1318
1276
1319
dst -> n_sample = line -> n_sample ;
1277
1320
for (j = 0 ; j < line -> n_fmt ; j ++ )
@@ -2246,6 +2289,7 @@ static void destroy_data(args_t *args)
2246
2289
free (args -> tmp_als );
2247
2290
free (args -> tmp_sym );
2248
2291
free (args -> tmp_kstr .s );
2292
+ free (args -> old_rec_tag_kstr .s );
2249
2293
if ( args -> tmp_str )
2250
2294
{
2251
2295
for (i = 0 ; i < bcf_hdr_nsamples (args -> hdr ); i ++ ) free (args -> tmp_str [i ].s );
@@ -2269,7 +2313,11 @@ static void normalize_line(args_t *args, bcf1_t *line)
2269
2313
{
2270
2314
if ( args -> fai )
2271
2315
{
2272
- if ( args -> filter_pass && (args -> check_ref & CHECK_REF_FIX ) ) fix_ref (args , line );
2316
+ if ( args -> filter_pass && (args -> check_ref & CHECK_REF_FIX ) )
2317
+ {
2318
+ old_rec_tag_init (args ,line );
2319
+ if ( fix_ref (args ,line ) ) old_rec_tag_set (args ,line ,0 );
2320
+ }
2273
2321
if ( args -> do_indels )
2274
2322
{
2275
2323
int ret = args -> filter_pass ? realign (args , line ) : ERR_OK ;
0 commit comments