Skip to content

Commit 9b352b7

Browse files
committed
Support auto indexing during writing BCF and VCF.gz via new --write-index option
1 parent 9655089 commit 9b352b7

32 files changed

+845
-875
lines changed

NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
## Release a.b
22

3+
Changes affecting the whole of bcftools, or multiple commands:
4+
5+
* Support automatic indexing while writing BCF and VCF.gz output via the new `--write-index` option
6+
7+
38
Changes affecting specific commands:
49

510
* bcftools annotate

bcftools.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* bcftools.h -- utility function declarations.
22
3-
Copyright (C) 2013-2022 Genome Research Ltd.
3+
Copyright (C) 2013-2023 Genome Research Ltd.
44
55
Author: Petr Danecek <[email protected]>
66
@@ -49,6 +49,9 @@ void error(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2
4949
// newline will be added by the function.
5050
void error_errno(const char *format, ...) HTS_NORETURN HTS_FORMAT(HTS_PRINTF_FMT, 1, 2);
5151

52+
// For on-the-fly index creation with --write-index
53+
int init_index(htsFile *fh, bcf_hdr_t *hdr, char *fname, char **idx_fname);
54+
5255
void bcf_hdr_append_version(bcf_hdr_t *hdr, int argc, char **argv, const char *cmd);
5356
const char *hts_bcf_wmode(int file_type);
5457
const char *hts_bcf_wmode2(int file_type, const char *fname);

csq.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -574,6 +574,8 @@ typedef struct _args_t
574574
// text tab-delimited output (out) or vcf/bcf output (out_fh)
575575
FILE *out;
576576
htsFile *out_fh;
577+
char *index_fn;
578+
int write_index;
577579

578580
// vcf
579581
bcf_srs_t *sr;
@@ -1536,6 +1538,7 @@ void init_data(args_t *args)
15361538
if ( args->hdr_nsmpl )
15371539
bcf_hdr_printf(args->hdr,"##FORMAT=<ID=%s,Number=.,Type=Integer,Description=\"Bitmask of indexes to INFO/BCSQ, with interleaved first/second haplotype. Use \\\"bcftools query -f'[%%CHROM\\t%%POS\\t%%SAMPLE\\t%%TBCSQ\\n]'\\\" to translate.\">",args->bcsq_tag);
15381540
if ( bcf_hdr_write(args->out_fh, args->hdr)!=0 ) error("[%s] Error: cannot write the header to %s\n", __func__,args->output_fname?args->output_fname:"standard output");
1541+
if ( args->write_index && init_index(args->out_fh,args->hdr,args->output_fname,&args->index_fn)<0 ) error("Error: failed to initialise index for %s\n",args->output_fname);
15391542
}
15401543
if ( args->verbosity > 0 ) fprintf(stderr,"Calling...\n");
15411544
}
@@ -1571,7 +1574,18 @@ void destroy_data(args_t *args)
15711574
if ( args->smpl ) smpl_ilist_destroy(args->smpl);
15721575
int ret;
15731576
if ( args->out_fh )
1577+
{
1578+
if ( args->write_index )
1579+
{
1580+
if ( bcf_idx_save(args->out_fh)<0 )
1581+
{
1582+
if ( hts_close(args->out_fh)!=0 ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
1583+
error("Error: cannot write to index %s\n", args->index_fn);
1584+
}
1585+
free(args->index_fn);
1586+
}
15741587
ret = hts_close(args->out_fh);
1588+
}
15751589
else
15761590
ret = fclose(args->out);
15771591
if ( ret ) error("Error: close failed .. %s\n", args->output_fname?args->output_fname:"stdout");
@@ -4272,6 +4286,7 @@ static const char *usage(void)
42724286
" --targets-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [0]\n"
42734287
" --threads INT Use multithreading with <int> worker threads [0]\n"
42744288
" -v, --verbose INT Verbosity level 0-2 [1]\n"
4289+
" --write-index Automatically index the output files [off]\n"
42754290
"\n"
42764291
"Example:\n"
42774292
" bcftools csq -f hs37d5.fa -g Homo_sapiens.GRCh37.82.gff3.gz in.vcf\n"
@@ -4321,6 +4336,7 @@ int main_csq(int argc, char *argv[])
43214336
{"targets-file",1,0,'T'},
43224337
{"targets-overlap",required_argument,NULL,5},
43234338
{"no-version",no_argument,NULL,3},
4339+
{"write-index",no_argument,NULL,6},
43244340
{0,0,0,0}
43254341
};
43264342
int c, targets_is_file = 0, regions_is_file = 0;
@@ -4409,6 +4425,7 @@ int main_csq(int argc, char *argv[])
44094425
targets_overlap = parse_overlap_option(optarg);
44104426
if ( targets_overlap < 0 ) error("Could not parse: --targets-overlap %s\n",optarg);
44114427
break;
4428+
case 6 : args->write_index = 1; break;
44124429
case 'h':
44134430
case '?': error("%s",usage());
44144431
default: error("The option not recognised: %s\n\n", optarg); break;

doc/bcftools.1

Lines changed: 107 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
'\" t
22
.\" Title: bcftools
33
.\" Author: [see the "AUTHOR(S)" section]
4-
.\" Generator: Asciidoctor 2.0.16.dev
5-
.\" Date: 2023-02-21
4+
.\" Generator: Asciidoctor 2.0.16
5+
.\" Date: 2023-03-10
66
.\" Manual: \ \&
77
.\" Source: \ \&
88
.\" Language: English
99
.\"
10-
.TH "BCFTOOLS" "1" "2023-02-21" "\ \&" "\ \&"
10+
.TH "BCFTOOLS" "1" "2023-03-10" "\ \&" "\ \&"
1111
.ie \n(.g .ds Aq \(aq
1212
.el .ds Aq '
1313
.ss \n[.ss] 0
@@ -51,10 +51,10 @@ standard input (stdin) and outputs to the standard output (stdout). Several
5151
commands can thus be combined with Unix pipes.
5252
.SS "VERSION"
5353
.sp
54-
This manual page was last updated \fB2023\-02\-21\fP and refers to bcftools git version \fB1.17\fP.
54+
This manual page was last updated \fB2023\-03\-10 08:27 GMT\fP and refers to bcftools git version \fB1.17\-10\-g9655089+\fP.
5555
.SS "BCF1"
5656
.sp
57-
The BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
57+
The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
5858
compatible with this version of bcftools. To read BCF1 files one can use
5959
the view command from old versions of bcftools packaged with samtools
6060
versions <= 0.1.19 to convert to VCF, which can then be read by
@@ -75,6 +75,9 @@ done with \fIbcftools view\fP. Users are now required to choose between the old
7575
samtools calling model (\fI\-c/\-\-consensus\-caller\fP) and the new multiallelic
7676
calling model (\fI\-m/\-\-multiallelic\-caller\fP). The multiallelic calling model
7777
is recommended for most tasks.
78+
.SS "FILTERING EXPRESSIONS"
79+
.sp
80+
See \fBFILTERING EXPRESSIONS\fP
7881
.SH "LIST OF COMMANDS"
7982
.sp
8083
For a full list of available commands, run \fBbcftools\fP without arguments. For a full
@@ -344,6 +347,17 @@ Some helper scripts are bundled with the bcftools code.
344347
. sp -1
345348
. IP \(bu 2.3
346349
.\}
350+
\fBgff2gff\fP .. converts a GFF file to the format required by \fBcsq\fP
351+
.RE
352+
.sp
353+
.RS 4
354+
.ie n \{\
355+
\h'-04'\(bu\h'+03'\c
356+
.\}
357+
.el \{\
358+
. sp -1
359+
. IP \(bu 2.3
360+
.\}
347361
\fBplot\-vcfstats\fP .. plots the output of \fBstats\fP
348362
.RE
349363
.SH "COMMANDS AND OPTIONS"
@@ -597,6 +611,11 @@ Same as \fB\-\-regions\-overlap\fP but for \fB\-t/\-T\fP.
597611
Use multithreading with \fIINT\fP worker threads. The option is currently used only for the compression of the
598612
output stream, only when \fI\-\-output\-type\fP is \fIb\fP or \fIz\fP. Default: 0.
599613
.RE
614+
.sp
615+
\fB\-\-write\-index\fP
616+
.RS 4
617+
Automatically index the output files. Can be used only for compressed BCF and VCF output.
618+
.RE
600619
.SS "bcftools annotate \fI[OPTIONS]\fP \fIFILE\fP"
601620
.sp
602621
Add or remove annotations.
@@ -881,6 +900,11 @@ except GT. To remove all INFO tags except "FOO" and "BAR", use
881900
"INFO" can be abbreviated to "INF" and "FORMAT" to "FMT".
882901
.RE
883902
.sp
903+
\fB\-\-write\-index\fP
904+
.RS 4
905+
Automatically index the output file
906+
.RE
907+
.sp
884908
\fBExamples:\fP
885909
.sp
886910
.if n .RS 4
@@ -1017,6 +1041,11 @@ see \fBCommon Options\fP
10171041
.RS 4
10181042
see \fBCommon Options\fP
10191043
.RE
1044+
.sp
1045+
\fB\-\-write\-index\fP
1046+
.RS 4
1047+
Automatically index the output file
1048+
.RE
10201049
.SS "Input/output options:"
10211050
.sp
10221051
\fB\-A, \-\-keep\-alts\fP
@@ -1401,6 +1430,11 @@ see \fBCommon Options\fP
14011430
.RS 4
14021431
see \fBCommon Options\fP
14031432
.RE
1433+
.sp
1434+
\fB\-\-write\-index\fP
1435+
.RS 4
1436+
Automatically index the output file
1437+
.RE
14041438
.SS "bcftools consensus \fI[OPTIONS]\fP \fIFILE\fP"
14051439
.sp
14061440
Create consensus sequence by applying VCF variants to a reference fasta file.
@@ -1617,6 +1651,11 @@ see \fBCommon Options\fP
16171651
.RS 4
16181652
see \fBCommon Options\fP
16191653
.RE
1654+
.sp
1655+
\fB\-\-write\-index\fP
1656+
.RS 4
1657+
Automatically index the output file
1658+
.RE
16201659
.SS "VCF output options:"
16211660
.sp
16221661
\fB\-\-no\-version\fP
@@ -1987,6 +2026,7 @@ transcripts in malformatted GFFs with incorrect phase
19872026
.RS 4
19882027
GFF3 annotation file (required), such as \c
19892028
.URL "ftp://ftp.ensembl.org/pub/current_gff3/homo_sapiens" "" "."
2029+
The script \fBgff2gff\fP can help with conversion from non\-standard GFF formats.
19902030
An example of a minimal working GFF file:
19912031
.RE
19922032
.sp
@@ -2137,6 +2177,11 @@ see \fBCommon Options\fP
21372177
see \fBCommon Options\fP
21382178
.RE
21392179
.sp
2180+
\fB\-\-write\-index\fP
2181+
.RS 4
2182+
Automatically index the output file
2183+
.RE
2184+
.sp
21402185
\fBExamples:\fP
21412186
.sp
21422187
.if n .RS 4
@@ -2366,6 +2411,11 @@ see \fBCommon Options\fP
23662411
.RS 4
23672412
see \fBCommon Options\fP
23682413
.RE
2414+
.sp
2415+
\fB\-\-write\-index\fP
2416+
.RS 4
2417+
Automatically index the output file
2418+
.RE
23692419
.SS "bcftools gtcheck [\fIOPTIONS\fP] [\fB\-g\fP \fIgenotypes.vcf.gz\fP] \fIquery.vcf.gz\fP"
23702420
.sp
23712421
Checks sample identity. The program can operate in two modes. If the \fB\-g\fP
@@ -2676,6 +2726,11 @@ see \fBCommon Options\fP
26762726
list of input files to output given as 1\-based indices. With \fB\-p\fP and no
26772727
\fB\-w\fP, all files are written.
26782728
.RE
2729+
.sp
2730+
\fB\-\-write\-index\fP
2731+
.RS 4
2732+
Automatically index the output file. This is done automatically with the \fB\-p\fP option.
2733+
.RE
26792734
.SS "Examples:"
26802735
.sp
26812736
Create intersection and complements of two sets saving the output in dir/*
@@ -2876,6 +2931,11 @@ see \fBCommon Options\fP
28762931
.RS 4
28772932
see \fBCommon Options\fP
28782933
.RE
2934+
.sp
2935+
\fB\-\-write\-index\fP
2936+
.RS 4
2937+
Automatically index the output file
2938+
.RE
28792939
.SS "bcftools mpileup [\fIOPTIONS\fP] \fB\-f\fP \fIref.fa\fP \fIin.bam\fP [\fIin2.bam\fP [...]]"
28802940
.sp
28812941
Generate VCF or BCF containing genotype likelihoods for one or multiple
@@ -3209,6 +3269,11 @@ BQB.
32093269
.fi
32103270
.if n .RE
32113271
.RE
3272+
.sp
3273+
\fB\-\-write\-index\fP
3274+
.RS 4
3275+
Automatically index the output file
3276+
.RE
32123277
.SS "Options for SNP/INDEL genotype likelihood computation"
32133278
.sp
32143279
\fB\-X, \-\-config\fP \fISTR\fP
@@ -3528,6 +3593,11 @@ see \fBCommon Options\fP
35283593
maximum distance between two records to consider when locally
35293594
sorting variants which changed position during the realignment
35303595
.RE
3596+
.sp
3597+
\fB\-\-write\-index\fP
3598+
.RS 4
3599+
Automatically index the output file
3600+
.RE
35313601
.SS "bcftools [plugin \fINAME\fP|+\fINAME\fP] \fI[OPTIONS]\fP \fIFILE\fP \(em \fI[PLUGIN OPTIONS]\fP"
35323602
.sp
35333603
A common framework for various utilities. The plugins can be used
@@ -3601,6 +3671,11 @@ see \fBCommon Options\fP
36013671
.RS 4
36023672
see \fBCommon Options\fP
36033673
.RE
3674+
.sp
3675+
\fB\-\-write\-index\fP
3676+
.RS 4
3677+
Automatically index the output file
3678+
.RE
36043679
.SS "Plugin options:"
36053680
.sp
36063681
\fB\-h, \-\-help\fP
@@ -4725,6 +4800,11 @@ see \fBCommon Options\fP
47254800
.RS 4
47264801
Use this directory to store temporary files
47274802
.RE
4803+
.sp
4804+
\fB\-\-write\-index\fP
4805+
.RS 4
4806+
Automatically index the output file
4807+
.RE
47284808
.SS "bcftools stats [\fIOPTIONS\fP] \fIA.vcf.gz\fP [\fIB.vcf.gz\fP]"
47294809
.sp
47304810
Parses VCF or BCF and produces text file stats which is suitable for machine
@@ -4943,6 +5023,11 @@ see \fBCommon Options\fP
49435023
.RS 4
49445024
see \fBCommon Options\fP
49455025
.RE
5026+
.sp
5027+
\fB\-\-write\-index\fP
5028+
.RS 4
5029+
Automatically index the output file
5030+
.RE
49465031
.SS "Subset options:"
49475032
.sp
49485033
\fB\-a, \-\-trim\-alt\-alleles\fP
@@ -5137,7 +5222,7 @@ important libraries used by bcftools.
51375222
.SS "bcftools [\fI\-\-version\-only\fP]"
51385223
.sp
51395224
Display the full bcftools version number in a machine\-readable format.
5140-
.SH "EXPRESSIONS"
5225+
.SH "FILTERING EXPRESSIONS"
51415226
.sp
51425227
These filtering expressions are accepted by most of the commands.
51435228
.sp
@@ -5919,7 +6004,18 @@ bcftools view \-i \*(Aq%ID!="." & MAF[0]<0.01\*(Aq
59196004
.if n .RE
59206005
.sp
59216006
Please refer to the documentation of your shell for details.
5922-
.SH "SCRIPTS AND OPTIONS"
6007+
.SH "SCRIPTS"
6008+
.SS "gff2gff"
6009+
.sp
6010+
Attempts to fix a GFF file so that it can be correctly parsed by \fBcsq\fP.
6011+
.sp
6012+
.if n .RS 4
6013+
.nf
6014+
.fam C
6015+
zcat in.gff.gz | gff2gff | gzip \-c > out.gff.gz
6016+
.fam
6017+
.fi
6018+
.if n .RE
59236019
.SS "plot\-vcfstats [\fIOPTIONS\fP] \fIfile.vchk\fP [...]"
59246020
.sp
59256021
Script for processing output of \fBbcftools stats\fP. It can merge
@@ -6013,8 +6109,10 @@ Please report any bugs you encounter on the github website: \c
60136109
.sp
60146110
Heng Li from the Sanger Institute wrote the original C version of htslib,
60156111
samtools and bcftools. Bob Handsaker from the Broad Institute implemented the
6016-
BGZF library. Petr Danecek, Shane McCarthy and John Marshall are maintaining
6017-
and further developing bcftools. Many other people contributed to the program
6112+
BGZF library. Petr Danecek is maintaining and further developing bcftools, together
6113+
with the rest of the \c
6114+
.URL "https://www.sanger.ac.uk/tool/samtools\-bcftools\-htslib" "samtools team" "."
6115+
Many other people contributed to the program
60186116
and to the file format specifications, both directly and indirectly by
60196117
providing patches, testing and reporting bugs. We thank them all.
60206118
.SH "RESOURCES"

0 commit comments

Comments
 (0)