samtools
diff --git a/‎NEWS
+8 b/‎NEWS
+8
diff --git a/‎csq.c
+359-121 b/‎csq.c
+359-121
diff --git a/‎doc/bcftools.txt
+12-7 b/‎doc/bcftools.txt
+12-7
diff --git a/‎gff.c
+13-14 b/‎gff.c
+13-14
diff --git a/‎gff.h
+3-2 b/‎gff.h
+3-2
@@ -34,6 +34,14 @@ Changes affecting specific commands:
     - Check the input GFF for features outside transcript boundaries and extend the transcript
       to contain the feature fully (#2323)
 
+    - Add experimental support for alternative genetic code tables, accessible via
+      a new option `-C, --genetic-code` (#2368)
+
+    - Change in the `--unify-chr-names` option, no automatic sequence name modification
+      is attempted anymore, the prefixes to trim must be given explicitly. For example,
+      if run with `--unify-chr-names chr,Chromosome,-`, the program will trim the "chr"
+      prefix in the VCF, "Chromosome" in the GFF, leaving the fasta unchanged (#2378)
+
 * bcftools +fill-tags
 
     - Thanks to the extension of filtering expressions with Fisher's exact test, the plugin
 
@@ -1360,15 +1360,18 @@ Symbolic alleles are not supported. They will remain unannotated in the
 output VCF and are ignored for the prediction analysis.
 
 
-*-c, --custom-tag* 'STRING'::
-    use this custom tag to store consequences rather than the default BCSQ tag
-
 *-B, --trim-protein-seq* 'INT'::
     abbreviate protein-changing predictions to maximum of INT aminoacids.
     For example, instead of writing the whole modified protein sequence with potentially hundreds of
     aminoacids, with *-B 1* only an abbreviated version such as '25E..329>25G..94' will be
     written.
 
+*-C, --genetic-code* 'INT'|'l'::
+    specify the genetic code table to use, 'l' to print a list of supported tables
+
+*-c, --custom-tag* 'STRING'::
+    use this custom tag to store consequences rather than the default BCSQ tag
+
 *--dump-gff* 'FILE'::
     dump the parsed GFF into a gzipped FILE. Intended for debugging purposes,
     shows how is the input GFF viewed by the program.
@@ -1500,10 +1503,12 @@ output VCF and are ignored for the prediction analysis.
 *--targets-overlap* '0'|'1'|'2'::
     see *<<common_options,Common Options>>*
 
-*--unify-chr-names* '0'|'1'::
-    Automatically detect and unify chromosome naming conventions in the GFF, fasta
-    and VCF, such as "chrX" vs "X". The chromosome names in the output VCF will match
-    that of the input VCF. The default is to attempt the automatic translation.
+*--unify-chr-names* '0'|'LIST'::
+    unify chromosome naming by stripping a prefix from the sequence names in the VCF, GFF and fasta, respectively.
+    For example, if the LIST is "chr,Chromosome,-", the program will trim the "chr" prefix
+    in the VCF, "Chromosome" in the GFF, and the fasta will be left unchanged.
+    The chromosome names in the output VCF will match that of the input VCF. The default is
+    not to modify the sequence names.
 
 *-v, --verbose* 'INT'::
     verbosity level (0-2)
 
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2023-2024 Genome Research Ltd.
+   Copyright (c) 2023-2025 Genome Research Ltd.
 
    Author: Petr Danecek <[email protected]>
 
@@ -88,7 +88,6 @@ typedef struct
     kh_int2tscript_t *id2tr;
 
     // sequences
-    void *seq2int;  // str2int hash
     char **seq;
     int nseq, mseq;
 
@@ -111,13 +110,16 @@ struct gff_t_
     // index iterator
     regidx_t *idx_cds, *idx_utr, *idx_exon, *idx_tscript;
 
+    // str2int hash with parsed sequence names
+    void *seq2int;
+
     // temporary structures, deleted after initializtion
     aux_t init;
 
     // mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
     id_tbl_t tscript_ids;
 
-    int strip_chr_names, verbosity;
+    int verbosity;
     int force;      // force run under various conditions. Currently only to skip out-of-phase transcripts
 
     struct {
@@ -158,12 +160,6 @@ int gff_set(gff_t *gff, gff_opt_t key, ...)
             va_end(args);
             return 0;
 
-        case strip_chr_names:
-            va_start(args, key);
-            gff->strip_chr_names = va_arg(args,int);
-            va_end(args);
-            return 0;
-
         case verbosity:
             va_start(args, key);
             gff->verbosity = va_arg(args,int);
@@ -216,12 +212,12 @@ static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
     char tmp = chr_end[1];
     chr_end[1] = 0;
     int iseq;
-    if ( khash_str2int_get(aux->seq2int, chr_beg, &iseq)!=0 )
+    if ( khash_str2int_get(gff->seq2int, chr_beg, &iseq)!=0 )
     {
         char *new_chr = strdup(chr_beg);
         hts_expand(char*, aux->nseq+1, aux->mseq, aux->seq);
         aux->seq[aux->nseq] = new_chr;
-        iseq = khash_str2int_inc(aux->seq2int, aux->seq[aux->nseq]);
+        iseq = khash_str2int_inc(gff->seq2int, aux->seq[aux->nseq]);
         aux->nseq++;
         assert( aux->nseq < 1<<29 );  // see gf_gene_t.iseq and ftr_t.iseq
     }
@@ -239,7 +235,6 @@ static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, c
     char *se = (char*) line;
     while ( *se && *se!='\t' ) se++;
     if ( !*se ) error("[%s:%d %s] Could not parse the line: %s\n",__FILE__,__LINE__,__FUNCTION__,line);
-    if ( gff->strip_chr_names && !strncasecmp("chr",line,3) ) line += 3;
     *chr_beg = (char*) line;
     *chr_end = se-1;
 }
@@ -974,7 +969,7 @@ int gff_parse(gff_t *gff)
     if ( gff->verbosity > 0 ) fprintf(stderr,"Parsing %s ...\n", gff->fname);
 
     aux_t *aux = &gff->init;
-    aux->seq2int   = khash_str2int_init();   // chrom's numeric id
+    gff->seq2int   = khash_str2int_init();   // chrom's numeric id
     aux->gid2gene  = kh_init(int2gene);      // gene id to gf_gene_t, for idx_gene
     aux->id2tr     = kh_init(int2tscript);   // transcript id to tscript_t
     gff->idx_tscript = regidx_init(NULL, NULL, regidx_free_tscript, sizeof(gf_tscript_t*), NULL);
@@ -1085,7 +1080,6 @@ int gff_parse(gff_t *gff)
 
     free(aux->seq);
     free(aux->ftr);
-    khash_str2int_destroy_free(aux->seq2int);
     // keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
     kh_destroy(int2tscript,aux->id2tr);
     gff_id_destroy(&aux->gene_ids);
@@ -1119,7 +1113,12 @@ void gff_destroy(gff_t *gff)
     regidx_destroy(gff->idx_exon);
     regidx_destroy(gff->idx_tscript);
 
+    khash_str2int_destroy_free(gff->seq2int);
     gff_id_destroy(&gff->tscript_ids);
     free(gff);
 }
+int gff_has_seq(gff_t *gff, const char *seq)    // query whether the sequence name `seq` is a key in gff->seq2int, the str2int hash with parsed sequence names
+{    // NOTE(review): gff->seq2int is created in gff_parse(); presumably this must only be called after parsing - confirm
+    return khash_str2int_has_key(gff->seq2int, seq);    // result of the hash lookup is passed through unchanged; presumably non-zero when the key exists - confirm against khash_str2int.h
+}
 
@@ -1,6 +1,6 @@
 /* The MIT License
 
-   Copyright (c) 2023-2024 Genome Research Ltd.
+   Copyright (c) 2023-2025 Genome Research Ltd.
 
    Author: Petr Danecek <[email protected]>
 
@@ -289,7 +289,6 @@ typedef enum
 {
     // write options
     verbosity,          // int, 0-2
-    strip_chr_names,    // int, 0 to leave as is, 1 to strip 'chr' prefix
     force_out_of_phase, // int, 1 to proceed even CDS exon out of expected phase
     dump_fname,         // const char*, dump the parsed GFF into this file, for debugging purposes
 
@@ -314,4 +313,6 @@ void *gff_get(gff_t *gff, gff_opt_t key);
 const char *gff_id2string(gff_t *gff, id_type_t type, int id);
 const char *gf_type2gff_string(int type);
 
+int gff_has_seq(gff_t *gff, const char *chr);   // look up the sequence name `chr` among the sequence names stored by the GFF parser; presumably non-zero if present - confirm
+
 #endif