@@ -132,6 +132,7 @@ typedef struct
132132 int mrec ; // allocated size of buf
133133 maux1_t * rec ; // buffer to keep reader's lines
134134 bcf1_t * * lines ; // source buffer: either gvcf or readers' buffer
135+ bcf_hdr_t * hdr ; // this reader's header
135136 int var_types ; // reader's variant types in the active [beg,end] window
136137}
137138buffer_t ;
@@ -871,7 +872,10 @@ maux_t *maux_init(args_t *args)
871872 ma -> smpl_nGsize = (int * ) malloc (n_smpl * sizeof (int ));
872873 ma -> buf = (buffer_t * ) calloc (ma -> n ,sizeof (buffer_t ));
873874 for (i = 0 ; i < ma -> n ; i ++ )
875+ {
874876 ma -> buf [i ].rid = -1 ;
877+ ma -> buf [i ].hdr = files -> readers [i ].header ;
878+ }
875879 ma -> str = (kstring_t * ) calloc (n_smpl ,sizeof (kstring_t ));
876880 if ( args -> local_alleles )
877881 {
@@ -2925,23 +2929,48 @@ static const int
29252929 indel_mask = (VCF_INDEL <<1 ),
29262930 ins_mask = VCF_INS <<1 ,
29272931 del_mask = VCF_DEL <<1 ,
2928- ref_mask = 1 ;
2932+ ref_mask = 1 ,
2933+ other_mask = VCF_OTHER <<1 ;
2934+
2935+ typedef struct
2936+ {
2937+ int types , // selected types, see the *_mask(s) above
2938+ end ; // if symbolic allele is involved, the END coordinate of the first record
2939+ bcf1_t * rec ; // the first record selected
2940+ }
2941+ selected_t ;
29292942
29302943// Can these types be merged given the -m settings? Despite the function's name, its focus is on
29312944// excluding incompatible records, there will be a finer matching later in stage_line()
2932- static inline int types_compatible (args_t * args , int selected_types , buffer_t * buf , int irec )
2945+ static inline int types_compatible (args_t * args , selected_t * selected , buffer_t * buf , int irec )
29332946{
29342947 int k ;
29352948 maux_t * maux = args -> maux ;
29362949 bcf1_t * rec = buf -> lines [irec ];
29372950 int rec_types = buf -> rec [irec ].var_types ;
29382951
2939- assert ( selected_types ); // this is trivially true, set in can_merge()
2952+ int end = -1 ;
2953+ if ( rec_types & other_mask )
2954+ {
2955+ int32_t * itmp = NULL , nitmp = 0 ;
2956+ bcf_get_info_int32 (buf -> hdr ,rec ,"END" ,& itmp ,& nitmp );
2957+ end = nitmp == 1 ? itmp [0 ] : -1 ;
2958+ free (itmp );
2959+ }
2960+
2961+ // First time here?
2962+ if ( !selected -> types )
2963+ {
2964+ selected -> end = end ;
2965+ selected -> rec = rec ;
2966+ selected -> types = rec_types ;
2967+ return 1 ;
2968+ }
29402969
29412970 if ( args -> collapse & COLLAPSE_ANY ) return 1 ; // can merge anything with anything
29422971
29432972 // REF and gVCF_REF with no other alleles present can be merged with anything
2944- if ( (selected_types & ref_mask ) && !(selected_types & (~ref_mask )) ) return 1 ;
2973+ if ( (selected -> types & ref_mask ) && !(selected -> types & (~ref_mask )) ) return 1 ;
29452974 if ( (rec_types & ref_mask ) && !(rec_types & (~ref_mask )) ) return 1 ;
29462975
29472976 if ( args -> collapse != COLLAPSE_NONE )
@@ -2952,23 +2981,23 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b
29522981 // - rec has indel, we already have an indel, and -m both,indels,snp-ins-del
29532982 if ( args -> collapse & (COLLAPSE_SNPS |COLLAPSE_SNP_INS_DEL ) )
29542983 {
2955- if ( (rec_types & snp_mask ) && (selected_types & snp_mask ) ) return 1 ;
2984+ if ( (rec_types & snp_mask ) && (selected -> types & snp_mask ) ) return 1 ;
29562985 }
29572986 if ( args -> collapse & COLLAPSE_INDELS )
29582987 {
2959- if ( (rec_types & indel_mask ) && (selected_types & indel_mask ) ) return 1 ;
2988+ if ( (rec_types & indel_mask ) && (selected -> types & indel_mask ) ) return 1 ;
29602989 }
29612990 if ( args -> collapse & COLLAPSE_SNP_INS_DEL )
29622991 {
2963- if ( (rec_types & ins_mask ) && (selected_types & ins_mask ) ) return 1 ;
2964- if ( (rec_types & del_mask ) && (selected_types & del_mask ) ) return 1 ;
2992+ if ( (rec_types & ins_mask ) && (selected -> types & ins_mask ) ) return 1 ;
2993+ if ( (rec_types & del_mask ) && (selected -> types & del_mask ) ) return 1 ;
29652994 }
29662995 // Whatever is left, allow to match if the alleles match exactly
29672996 }
29682997
29692998 // The -m none mode or exact matching requested
29702999 // Simple test first: are the variants of the same type?
2971- int x = selected_types ;
3000+ int x = selected -> types ;
29723001 int y = rec_types ;
29733002 if ( !(x & y ) ) return 0 ; // no matching type
29743003 if ( (x & y )!= x && (x & y )!= y ) return 0 ; // not a subset
@@ -2980,6 +3009,13 @@ static inline int types_compatible(args_t *args, int selected_types, buffer_t *b
29803009 if ( vcmp_find_allele (args -> vcmp ,maux -> als + 1 ,maux -> nals - 1 ,rec -> d .allele [k ])>=0 ) break ;
29813010 }
29823011 if ( k == rec -> n_allele ) return 0 ; // this record has a new allele rec->d.allele[k]
3012+
3013+ if ( selected -> types & other_mask && rec_types & other_mask )
3014+ {
3015+ // both records have symbolic alleles and the alleles are the same
3016+ if ( selected -> end != end ) return 0 ;
3017+ }
3018+
29833019 return 1 ; // all alleles in rec are also in the records selected thus far, perhaps save for gVCF_REF
29843020}
29853021
@@ -3106,7 +3142,7 @@ int can_merge(args_t *args)
31063142 }
31073143 if ( !ntodo ) return 0 ;
31083144
3109- int selected_types = 0 ;
3145+ selected_t selected = { 0 , 0 , NULL } ;
31103146
31113147 // In this loop we select from each reader compatible candidate lines.
31123148 // (i.e. SNPs or indels). Go through all files and all lines at this
@@ -3121,7 +3157,7 @@ int can_merge(args_t *args)
31213157 gaux [i ].line -> d .allele [0 ][0 ] = ref ;
31223158 gaux [i ].line -> pos = maux -> pos ;
31233159 maux_update_alleles (args , i , buf -> beg );
3124- selected_types |= ref_mask ;
3160+ selected . types |= ref_mask ;
31253161 continue ;
31263162 }
31273163 for (j = buf -> beg ; j < buf -> end ; j ++ )
@@ -3136,7 +3172,7 @@ int can_merge(args_t *args)
31363172 {
31373173 if ( strcmp (id ,line -> d .id ) ) continue ; // matching by ID and it does not match the selected record
31383174 }
3139- else if ( selected_types && !types_compatible (args ,selected_types ,buf ,j ) ) continue ;
3175+ else if ( !types_compatible (args ,& selected ,buf ,j ) ) continue ;
31403176
31413177 // This is not a good code. It makes the incorrect assumption of always having a SNP record available.
31423178 // However, that is not always the case and prevents the merging of G>GT,T with G>GT (see test/merge.multiallelics.1.*.vcf).
@@ -3154,7 +3190,7 @@ int can_merge(args_t *args)
31543190 // ) continue;
31553191 // }
31563192
3157- selected_types |= line_types ;
3193+ selected . types |= line_types ;
31583194
31593195 buf -> rec [j ].skip = 0 ; // the j-th record from i-th reader can be included. Final decision will be made in stage_line
31603196 maux_update_alleles (args , i , j );
0 commit comments