1
1
/* The MIT License
2
2
3
- Copyright (c) 2023-2024 Genome Research Ltd.
3
+ Copyright (c) 2023-2025 Genome Research Ltd.
4
4
5
5
Author: Petr Danecek <[email protected] >
6
6
@@ -88,7 +88,6 @@ typedef struct
88
88
kh_int2tscript_t * id2tr ;
89
89
90
90
// sequences
91
- void * seq2int ; // str2int hash
92
91
char * * seq ;
93
92
int nseq , mseq ;
94
93
@@ -111,13 +110,16 @@ struct gff_t_
111
110
// index iterator
112
111
regidx_t * idx_cds , * idx_utr , * idx_exon , * idx_tscript ;
113
112
113
+ // str2int hash with parsed sequence names
114
+ void * seq2int ;
115
+
114
116
// temporary structures, deleted after initialization
115
117
aux_t init ;
116
118
117
119
// mapping between transcript id (eg. Zm00001d027245_T001) and a numeric idx
118
120
id_tbl_t tscript_ids ;
119
121
120
- int strip_chr_names , verbosity ;
122
+ int verbosity ;
121
123
int force ; // force run under various conditions. Currently only to skip out-of-phase transcripts
122
124
123
125
struct {
@@ -158,12 +160,6 @@ int gff_set(gff_t *gff, gff_opt_t key, ...)
158
160
va_end (args );
159
161
return 0 ;
160
162
161
- case strip_chr_names :
162
- va_start (args , key );
163
- gff -> strip_chr_names = va_arg (args ,int );
164
- va_end (args );
165
- return 0 ;
166
-
167
163
case verbosity :
168
164
va_start (args , key );
169
165
gff -> verbosity = va_arg (args ,int );
@@ -216,12 +212,12 @@ static inline int feature_set_seq(gff_t *gff, char *chr_beg, char *chr_end)
216
212
char tmp = chr_end [1 ];
217
213
chr_end [1 ] = 0 ;
218
214
int iseq ;
219
- if ( khash_str2int_get (aux -> seq2int , chr_beg , & iseq )!= 0 )
215
+ if ( khash_str2int_get (gff -> seq2int , chr_beg , & iseq )!= 0 )
220
216
{
221
217
char * new_chr = strdup (chr_beg );
222
218
hts_expand (char * , aux -> nseq + 1 , aux -> mseq , aux -> seq );
223
219
aux -> seq [aux -> nseq ] = new_chr ;
224
- iseq = khash_str2int_inc (aux -> seq2int , aux -> seq [aux -> nseq ]);
220
+ iseq = khash_str2int_inc (gff -> seq2int , aux -> seq [aux -> nseq ]);
225
221
aux -> nseq ++ ;
226
222
assert ( aux -> nseq < 1 <<29 ); // see gf_gene_t.iseq and ftr_t.iseq
227
223
}
@@ -239,7 +235,6 @@ static inline void gff_parse_chr(gff_t *gff, const char *line, char **chr_beg, c
239
235
char * se = (char * ) line ;
240
236
while ( * se && * se != '\t' ) se ++ ;
241
237
if ( !* se ) error ("[%s:%d %s] Could not parse the line: %s\n" ,__FILE__ ,__LINE__ ,__FUNCTION__ ,line );
242
- if ( gff -> strip_chr_names && !strncasecmp ("chr" ,line ,3 ) ) line += 3 ;
243
238
* chr_beg = (char * ) line ;
244
239
* chr_end = se - 1 ;
245
240
}
@@ -974,7 +969,7 @@ int gff_parse(gff_t *gff)
974
969
if ( gff -> verbosity > 0 ) fprintf (stderr ,"Parsing %s ...\n" , gff -> fname );
975
970
976
971
aux_t * aux = & gff -> init ;
977
- aux -> seq2int = khash_str2int_init (); // chrom's numeric id
972
+ gff -> seq2int = khash_str2int_init (); // chrom's numeric id
978
973
aux -> gid2gene = kh_init (int2gene ); // gene id to gf_gene_t, for idx_gene
979
974
aux -> id2tr = kh_init (int2tscript ); // transcript id to tscript_t
980
975
gff -> idx_tscript = regidx_init (NULL , NULL , regidx_free_tscript , sizeof (gf_tscript_t * ), NULL );
@@ -1085,7 +1080,6 @@ int gff_parse(gff_t *gff)
1085
1080
1086
1081
free (aux -> seq );
1087
1082
free (aux -> ftr );
1088
- khash_str2int_destroy_free (aux -> seq2int );
1089
1083
// keeping only to destroy the genes at the end: kh_destroy(int2gene,aux->gid2gene);
1090
1084
kh_destroy (int2tscript ,aux -> id2tr );
1091
1085
gff_id_destroy (& aux -> gene_ids );
@@ -1119,7 +1113,12 @@ void gff_destroy(gff_t *gff)
1119
1113
regidx_destroy (gff -> idx_exon );
1120
1114
regidx_destroy (gff -> idx_tscript );
1121
1115
1116
+ khash_str2int_destroy_free (gff -> seq2int );
1122
1117
gff_id_destroy (& gff -> tscript_ids );
1123
1118
free (gff );
1124
1119
}
1120
// gff_has_seq: look up the sequence name `seq` in gff->seq2int (described in
// gff_t as the str2int hash with parsed sequence names) and return the result
// of khash_str2int_has_key() unchanged.
// NOTE(review): gff->seq2int is only seen being initialized in gff_parse(), so
// this presumably must be called after parsing - confirm how
// khash_str2int_has_key() behaves on an uninitialized (NULL) hash.
+ int gff_has_seq (gff_t * gff , const char * seq )
1121
+ {
1122
+ return khash_str2int_has_key (gff -> seq2int , seq );
1123
+ }
1125
1124
0 commit comments