Skip to content

Commit 487cecf

Browse files
committed
Merge branch 'develop' of github.com:samtools/bcftools into develop
2 parents f9649a6 + 2d84e07 commit 487cecf

17 files changed

+748
-155
lines changed

NEWS

+8
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,14 @@ Changes affecting specific commands:
3434
- Check the input GFF for features outside transcript boundaries and extend the transcript
3535
to contain the feature fully (#2323)
3636

37+
- Add experimental support for alternative genetic code tables, accessible via
38+
a new option `-C, --genetic-code` (#2368)
39+
40+
- Change in the `--unify-chr-names` option, no automatic sequence name modification
41+
is attempted anymore, the prefixes to trim must be given explicitly. For example,
42+
if run with `--unify-chr-names chr,Chromosome,-`, the program will trim the "chr"
43+
prefix in the VCF, "Chromosome" in the GFF, leaving the fasta unchanged (#2378)
44+
3745
* bcftools +fill-tags
3846

3947
- Thanks to the extension of filtering expressions with Fisher's exact test, the plugin

csq.c

+359-121
Large diffs are not rendered by default.

doc/bcftools.txt

+12-7
Original file line numberDiff line numberDiff line change
@@ -1360,15 +1360,18 @@ Symbolic alleles are not supported. They will remain unannotated in the
13601360
output VCF and are ignored for the prediction analysis.
13611361

13621362

1363-
*-c, --custom-tag* 'STRING'::
1364-
use this custom tag to store consequences rather than the default BCSQ tag
1365-
13661363
*-B, --trim-protein-seq* 'INT'::
13671364
abbreviate protein-changing predictions to maximum of INT aminoacids.
13681365
For example, instead of writing the whole modified protein sequence with potentially hundreds of
13691366
aminoacids, with *-B 1* only an abbreviated version such as '25E..329>25G..94' will be
13701367
written.
13711368

1369+
*-C, --genetic-code* 'INT'|'l'::
1370+
specify the genetic code table to use, 'l' to print a list of supported tables
1371+
1372+
*-c, --custom-tag* 'STRING'::
1373+
use this custom tag to store consequences rather than the default BCSQ tag
1374+
13721375
*--dump-gff* 'FILE'::
13731376
dump the parsed GFF into a gzipped FILE. Intended for debugging purposes,
13741377
shows how the input GFF is viewed by the program.
@@ -1500,10 +1503,12 @@ output VCF and are ignored for the prediction analysis.
15001503
*--targets-overlap* '0'|'1'|'2'::
15011504
see *<<common_options,Common Options>>*
15021505

1503-
*--unify-chr-names* '0'|'1'::
1504-
Automatically detect and unify chromosome naming conventions in the GFF, fasta
1505-
and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match
1506-
that of the input VCF. The default is to attempt the automatic translation.
1506+
*--unify-chr-names* '0'|'LIST'::
1507+
unify chromosome naming by stripping a prefix in VCF, GFF, fasta, respectively.
1508+
For example, if the LIST is "chr,Chromosome,-", the program will trim the "chr" prefix
1509+
in the VCF, "Chromosome" in the GFF, and the fasta will be left unchanged.
1510+
The chromosome names in the output VCF will match that of the input VCF. The default is
1511+
not to modify the sequence names.
15071512

15081513
*-v, --verbose* 'INT'::
15091514
verbosity level (0-2)

gff.c

+13-14
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* The MIT License
22
3-
Copyright (c) 2023-2024 Genome Research Ltd.
3+
Copyright (c) 2023-2025 Genome Research Ltd.
44
55
Author: Petr Danecek <[email protected]>
66
@@ -88,7 +88,6 @@ typedef struct
8888
kh_int2tscript_t *id2tr;
8989

9090
// sequences
91-
void *seq2int; // str2int hash
9291
char **seq;
9392
int nseq, mseq;
9493

@@ -111,13 +110,16 @@ struct gff_t_
111110
// index iterator
112111
regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
113112

113+
// str2int hash with parsed sequence names
114+
void *seq2int;
115+
114116
// temporary structures, deleted after initialization
115117
aux_t init;
116118

117119
// mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
118120
id_tbl_t tscript_ids;
119121

120-
int strip_chr_names, verbosity;
122+
int verbosity;
121123
int force; // force run under various conditions. Currently only to skip out-of-phase transcripts
122124

123125
struct {
@@ -158,12 +160,6 @@ int gff_set(gff_t *gff, gff_opt_t key, ...)
158160
va_end(args);
159161
return 0;
160162

161-
case strip_chr_names:
162-
va_start(args, key);
163-
gff->strip_chr_names = va_arg(args,int);
164-
va_end(args);
165-
return 0;
166-
167163
case verbosity:
168164
va_start(args, key);
169165
gff->verbosity = va_arg(args,int);
@@ -216,12 +212,12 @@ static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
216212
char tmp = chr_end[1];
217213
chr_end[1] = 0;
218214
int iseq;
219-
if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
215+
if ( khash_str2int_get(gff->seq2int, chr_beg, &iseq)!=0 )
220216
{
221217
char *new_chr = strdup(chr_beg);
222218
hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
223219
aux->seq[aux->nseq] = new_chr;
224-
iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
220+
iseq = khash_str2int_inc(gff->seq2int, aux->seq[aux->nseq]);
225221
aux->nseq++;
226222
assert( aux->nseq < 1<<29 ); // see gf_gene_t.iseq and ftr_t.iseq
227223
}
@@ -239,7 +235,6 @@ static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, c
239235
char *se = (char*) line;
240236
while ( *se && *se!='\t' ) se++;
241237
if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
242-
if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3;
243238
*chr_beg = (char*) line;
244239
*chr_end = se-1;
245240
}
@@ -974,7 +969,7 @@ int gff_parse(gff_t *gff)
974969
if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname);
975970

976971
aux_t *aux = &gff->init;
977-
aux->seq2int = khash_str2int_init(); // chrom's numeric id
972+
gff->seq2int = khash_str2int_init(); // chrom's numeric id
978973
aux->gid2gene = kh_init(int2gene); // gene id to gf_gene_t, for idx_gene
979974
aux->id2tr = kh_init(int2tscript); // transcript id to tscript_t
980975
gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
@@ -1085,7 +1080,6 @@ int gff_parse(gff_t *gff)
10851080

10861081
free(aux->seq);
10871082
free(aux->ftr);
1088-
khash_str2int_destroy_free(aux->seq2int);
10891083
// keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
10901084
kh_destroy(int2tscript,aux->id2tr);
10911085
gff_id_destroy(&aux->gene_ids);
@@ -1119,7 +1113,12 @@ void gff_destroy(gff_t *gff)
11191113
regidx_destroy(gff->idx_exon);
11201114
regidx_destroy(gff->idx_tscript);
11211115

1116+
khash_str2int_destroy_free(gff->seq2int);
11221117
gff_id_destroy(&gff->tscript_ids);
11231118
free(gff);
11241119
}
1120+
int gff_has_seq(gff_t *gff, const char *seq)   // query gff->seq2int (the str2int hash with parsed sequence names) for the key `seq`
1121+
{
1122+
return khash_str2int_has_key(gff->seq2int, seq);   // the lookup result is passed back to the caller unchanged
1123+
}
11251124

gff.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* The MIT License
22
3-
Copyright (c) 2023-2024 Genome Research Ltd.
3+
Copyright (c) 2023-2025 Genome Research Ltd.
44
55
Author: Petr Danecek <[email protected]>
66
@@ -289,7 +289,6 @@ typedef enum
289289
{
290290
// write options
291291
verbosity, // int, 0-2
292-
strip_chr_names, // int, 0 to leave as is, 1 to strip 'chr' prefix
293292
force_out_of_phase, // int, 1 to proceed even CDS exon out of expected phase
294293
dump_fname, // const char*, dump the parsed GFF into this file, for debugging purposes
295294

@@ -314,4 +313,6 @@ void *gff_get(gff_t *gff, gff_opt_t key);
314313
const char *gff_id2string(gff_t *gff, id_type_t type, int id);
315314
const char *gf_type2gff_string(int type);
316315

316+
int gff_has_seq(gff_t *gff, const char *chr);
317+
317318
#endif

0 commit comments

Comments
 (0)