Skip to content

Commit 0f3cb9e

Browse files
committed
Make the mpileup BAM_CREF_SKIP filter optional.
Mpileup removes alignments using the cigar ref skip operator ("N"). This was originally added in 2011 in samtools/samtools#d1643d6 with the commit message of "fixed a bug in indel calling related to unmapped and refskip reads". Unfortunately I don't know what that bug was, but removing the code shows it still works (at least for some data!). We need better understanding of what's going on and why it was added, but this PR makes it optional, keeping the default as before. Note there appears to be no filtering of BAM_CREF_SKIP in indels-2.0 so the option would be a nop there. This is an alternative PR to samtools#2281. I've leave it to the project maintainer as to what is preferable: removing the (no longer needed?) filtering, or keeping the default behaviour identical and adding a new option instead (which is safer, but possibly leads to accidental bad calls due to not noticing a new option has appeared). Fixes samtools#2277
1 parent 9d31418 commit 0f3cb9e

File tree

5 files changed

+38
-16
lines changed

5 files changed

+38
-16
lines changed

Diff for: bam2bcf.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,7 @@ plp_cd_t;
108108

109109

110110
typedef struct __bcf_callaux_t {
111-
int fmt_flag, ambig_reads;
111+
int fmt_flag, ambig_reads, ref_skip_reads;
112112
int capQ, min_baseQ, max_baseQ, delta_baseQ;
113113
int openQ, extQ, tandemQ; // for indels
114114
uint32_t min_support, max_support; // for collecting indel candidates

Diff for: bam2bcf_edlib.c

+10-7
Original file line numberDiff line numberDiff line change
@@ -1559,18 +1559,21 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
15591559
}
15601560
}
15611561

1562-
int qbeg, qpos, qend, tbeg, tend, kk;
1562+
int qbeg, qpos, qend, tbeg, tend;
15631563
uint8_t *seq = bam_get_seq(p->b);
1564-
uint32_t *cigar = bam_get_cigar(p->b);
15651564
if (p->b->core.flag & BAM_FUNMAP) continue;
15661565

15671566
// FIXME: the following loop should be better moved outside;
15681567
// nonetheless, realignment should be much slower anyway.
1569-
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
1570-
if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
1571-
break;
1572-
if (kk < p->b->core.n_cigar)
1573-
continue;
1568+
if (!bca->ref_skip_reads) {
1569+
uint32_t *cigar = bam_get_cigar(p->b);
1570+
int kk;
1571+
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
1572+
if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
1573+
break;
1574+
if (kk < p->b->core.n_cigar)
1575+
continue;
1576+
}
15741577

15751578
// determine the start and end of sequences for alignment
15761579
int left2 = left, right2 = right;

Diff for: bam2bcf_indel.c

+10-7
Original file line numberDiff line numberDiff line change
@@ -845,18 +845,21 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
845845
}
846846
}
847847

848-
int qbeg, qpos, qend, tbeg, tend, kk;
848+
int qbeg, qpos, qend, tbeg, tend;
849849
uint8_t *seq = bam_get_seq(p->b);
850-
uint32_t *cigar = bam_get_cigar(p->b);
851850
if (p->b->core.flag & BAM_FUNMAP) continue;
852851

853852
// FIXME: the following loop should be better moved outside;
854853
// nonetheless, realignment should be much slower anyway.
855-
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
856-
if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
857-
break;
858-
if (kk < p->b->core.n_cigar)
859-
continue;
854+
if (!bca->ref_skip_reads) {
855+
uint32_t *cigar = bam_get_cigar(p->b);
856+
int kk;
857+
for (kk = 0; kk < p->b->core.n_cigar; ++kk)
858+
if ((cigar[kk]&BAM_CIGAR_MASK) == BAM_CREF_SKIP)
859+
break;
860+
if (kk < p->b->core.n_cigar)
861+
continue;
862+
}
860863

861864
// determine the start and end of sequences for alignment
862865
// FIXME: loops over CIGAR multiple times

Diff for: doc/bcftools.txt

+8
Original file line numberDiff line numberDiff line change
@@ -2503,6 +2503,14 @@ INFO/DPR .. Deprecated in favor of INFO/AD; Number of high-quality bases for
25032503
exclude from calling and increment the first value of the AD counter
25042504
('incAD0') ['drop']
25052505

2506+
*--ref-skip-reads*::
2507+
Include reads with CIGAR 'N' (ref-skip) operators in their alignment.
2508+
By default these are filtered out. Enabling this option may be
2509+
necessary for calling on some RNASeq pipelines. This works with
2510+
the default caller and --indels-cns. The experimental -indels-2.0
2511+
option does not filter out alignments with ref-skips so this option
2512+
is unnecessary there.
2513+
25062514
*-e, --ext-prob* 'INT'::
25072515
Phred-scaled gap extension sequencing error probability. Reducing 'INT'
25082516
leads to longer indels [20]

Diff for: mpileup.c

+9-1
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ typedef struct _mplp_pileup_t mplp_pileup_t;
6868
// Data shared by all bam files
6969
typedef struct {
7070
int min_mq, flag, min_baseQ, max_baseQ, delta_baseQ, capQ_thres, max_depth,
71-
max_indel_depth, max_read_len, ambig_reads;
71+
max_indel_depth, max_read_len, ambig_reads, ref_skip_reads;
7272
uint32_t fmt_flag;
7373
int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type;
7474
int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
@@ -882,6 +882,7 @@ static int mpileup(mplp_conf_t *conf)
882882
conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;
883883
conf->bca->fmt_flag = conf->fmt_flag;
884884
conf->bca->ambig_reads = conf->ambig_reads;
885+
conf->bca->ref_skip_reads = conf->ref_skip_reads;
885886
conf->bca->indel_win_size = conf->indel_win_size;
886887
conf->bca->indels_v20 = conf->indels_v20;
887888
conf->bca->edlib = conf->edlib;
@@ -1291,6 +1292,8 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
12911292
" -p, --per-sample-mF Apply -m and -F per-sample for increased sensitivity\n"
12921293
" -P, --platforms STR Comma separated list of platforms for indels [all]\n"
12931294
" --ar, --ambig-reads STR What to do with ambiguous indel reads: drop,incAD,incAD0 [drop]\n");
1295+
fprintf(fp,
1296+
" --ref-skip-reads Use reads with N CIGAR op [discard by default]\n");
12941297
fprintf(fp,
12951298
" --indel-bias FLOAT Raise to favour recall over precision [%.2f]\n", mplp->indel_bias);
12961299
fprintf(fp,
@@ -1381,6 +1384,7 @@ int main_mpileup(int argc, char *argv[])
13811384
mplp.fmt_flag = B2B_INFO_BQBZ|B2B_INFO_IDV|B2B_INFO_IMF|B2B_INFO_MQ0F|B2B_INFO_MQBZ|B2B_INFO_MQSBZ|B2B_INFO_RPBZ|B2B_INFO_SCBZ|B2B_INFO_SGB|B2B_INFO_VDB|B2B_FMT_AD;
13821385
mplp.max_read_len = 500;
13831386
mplp.ambig_reads = B2B_DROP;
1387+
mplp.ref_skip_reads = 0;
13841388
mplp.indel_win_size = 110;
13851389
mplp.poly_mqual = 0;
13861390
mplp.seqQ_offset = 120;
@@ -1458,6 +1462,7 @@ int main_mpileup(int argc, char *argv[])
14581462
{"seed", required_argument, NULL, 13},
14591463
{"ambig-reads", required_argument, NULL, 14},
14601464
{"ar", required_argument, NULL, 14},
1465+
{"ref-skip-reads", no_argument, NULL, 29},
14611466
{"write-index",optional_argument,NULL,'W'},
14621467
{"del-bias", required_argument, NULL, 23},
14631468
{"poly-mqual", no_argument, NULL, 24},
@@ -1620,6 +1625,9 @@ int main_mpileup(int argc, char *argv[])
16201625
if (mplp.seqQ_offset > 200)
16211626
mplp.seqQ_offset = 200;
16221627
break;
1628+
case 29:
1629+
mplp.ref_skip_reads = 1;
1630+
break;
16231631
case 23: mplp.del_bias = atof(optarg); break;
16241632
case 24: mplp.poly_mqual = 1; break;
16251633
case 26: mplp.poly_mqual = 0; break;

0 commit comments

Comments
 (0)