Skip to content

Commit eaed385

Browse files
committed
refactor ragel-specific re-tokenization into tokens.c
1 parent efe1fbc commit eaed385

File tree

4 files changed

+126
-103
lines changed

4 files changed

+126
-103
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ y.tab.h: y.tab.c
1515
y.tab.c: ere.y
1616
$(YACC) -d $(YFLAGS) $<
1717

18-
re2r: main.o lexer.o y.tab.o sblist.o hsearch.o
18+
re2r: main.o lexer.o y.tab.o sblist.o hsearch.o tokens.o
1919
$(CC) -o $@ $^ $(LDFLAGS)
2020

2121
clean:

main.c

Lines changed: 4 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
#include "lexer.h"
99
#include "sblist.h"
1010
#include "hsearch.h"
11+
#include "tokens.h"
1112

1213
extern int yyerror(const char*);
1314

@@ -29,102 +30,6 @@ static char* replace(const char*s, const char* needle, const char* repl) {
2930
return repl_buf;
3031
}
3132

32-
struct list_item {
33-
enum lex_context type;
34-
size_t so, eo;
35-
};
36-
37-
static sblist *lex_to_list() {
38-
int c;
39-
size_t pos;
40-
struct list_item li;
41-
sblist *ret = sblist_new(sizeof li, 32);
42-
while((c = yylex()) != EOF) {
43-
enum lex_context ctx = lex_getcontext();
44-
pos = lex_getpos()-1;
45-
46-
switch(ctx) {
47-
case CTX_DUP:
48-
do { c = yylex(); } while (lex_getcontext() == CTX_DUP);
49-
assert(c == '}');
50-
li.type = CTX_DUP;
51-
li.so = pos;
52-
li.eo = lex_getpos();
53-
sblist_add(ret, &li);
54-
break;
55-
case CTX_BRACKET:
56-
do { c = yylex(); } while (lex_getcontext() == CTX_BRACKET);
57-
assert(c == ']');
58-
li.type = CTX_BRACKET;
59-
li.so = pos;
60-
li.eo = lex_getpos();
61-
sblist_add(ret, &li);
62-
break;
63-
default:
64-
li.type = ctx;
65-
if (c == QUOTED_CHAR) {
66-
li.so = pos-1;
67-
li.eo = pos+1;
68-
} else {
69-
li.so = pos;
70-
li.eo = pos+1;
71-
}
72-
sblist_add(ret, &li);
73-
break;
74-
}
75-
}
76-
return ret;
77-
}
78-
79-
static void list_transform_dupchars(sblist* tokens, const char* org_regex) {
80-
size_t i;
81-
for(i=0; i<sblist_getsize(tokens); i++) {
82-
struct list_item *li= sblist_get(tokens, i);
83-
if(li->type == CTX_NONE) switch(org_regex[li->so]) {
84-
case '?': case '*': case '+':
85-
li->type = CTX_DUP;
86-
break;
87-
}
88-
}
89-
}
90-
91-
static sblist* list_join_literals(sblist* tokens, const char* org_regex) {
92-
sblist *new = sblist_new(sizeof(struct list_item), sblist_getsize(tokens));
93-
size_t i,j;
94-
for(i=0; i<sblist_getsize(tokens); i++) {
95-
size_t pcnt = 0;
96-
for(j=i; j<sblist_getsize(tokens); ++j) {
97-
struct list_item *li= sblist_get(tokens, j);
98-
if(li->type != CTX_NONE) break;
99-
switch(org_regex[li->so]) {
100-
case '"':
101-
case '^':
102-
case '.':
103-
case '[':
104-
case '$':
105-
case '(':
106-
case ')':
107-
case '|':
108-
case '{':
109-
goto break_loop;
110-
default:
111-
pcnt += li->eo-li->so;
112-
}
113-
continue;
114-
break_loop:; break;
115-
}
116-
struct list_item ins = *((struct list_item *)sblist_get(tokens, i));
117-
if(j > i) {
118-
ins.type = 0xff;
119-
ins.eo = ins.so+pcnt;
120-
i = j-1;
121-
}
122-
sblist_add(new, &ins);
123-
}
124-
sblist_free(tokens);
125-
return new;
126-
}
127-
12833
static void print_token(struct list_item *li, const char *org_regex) {
12934
if(li->type == 0xff) {
13035
fprintf(yyout, " \"%.*s\" ", (int) (li->eo-li->so), org_regex+li->so);
@@ -174,13 +79,11 @@ static inline void* sblist_pop(sblist *l) {
17479
return 0;
17580
}
17681

177-
static void dump_ragel_parser(const char *machinename, const char* org_regex, int *maxgroups) {
82+
static void dump_ragel_parser(const char *machinename, const char* org_regex, const char* org_regex_end, int *maxgroups) {
17883
FILE *f = fopen("ragel.tmpl", "r");
17984
char buf[4096];
18085
int groups, cgroup = 0;
181-
sblist *tokens = lex_to_list();
182-
list_transform_dupchars(tokens, org_regex);
183-
tokens = list_join_literals(tokens, org_regex);
86+
sblist *tokens = lex_and_transform(org_regex, org_regex_end);
18487
groups = count_groups(tokens, org_regex);
18588
if(groups > *maxgroups) *maxgroups = groups;
18689
sblist *group_order = sblist_new(sizeof (int), groups) ;
@@ -292,8 +195,7 @@ int main(int argc, char**argv) {
292195
if(yyparse() == 0) {
293196
htab_insert(remap, strdup(p), HTV_P(strdup(buf)));
294197
/* syntax check OK */
295-
lex_init(p, pe, LEXFLAG_SILENT);
296-
dump_ragel_parser(buf, p, &maxgroups);
198+
dump_ragel_parser(buf, p, pe, &maxgroups);
297199
} else {
298200
++err;
299201
size_t errpos = lex_errpos();

tokens.c

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
#include "y.tab.h"
2+
#include "tokens.h"
3+
#include "lexer.h"
4+
#include "sblist.h"
5+
#include <stdio.h>
6+
#include <assert.h>
7+
8+
static sblist *lex_to_list() {
9+
int c;
10+
size_t pos;
11+
struct list_item li;
12+
sblist *ret = sblist_new(sizeof li, 32);
13+
while((c = yylex()) != EOF) {
14+
enum lex_context ctx = lex_getcontext();
15+
pos = lex_getpos()-1;
16+
17+
switch(ctx) {
18+
case CTX_DUP:
19+
do { c = yylex(); } while (lex_getcontext() == CTX_DUP);
20+
assert(c == '}');
21+
li.type = CTX_DUP;
22+
li.so = pos;
23+
li.eo = lex_getpos();
24+
sblist_add(ret, &li);
25+
break;
26+
case CTX_BRACKET:
27+
do { c = yylex(); } while (lex_getcontext() == CTX_BRACKET);
28+
assert(c == ']');
29+
li.type = CTX_BRACKET;
30+
li.so = pos;
31+
li.eo = lex_getpos();
32+
sblist_add(ret, &li);
33+
break;
34+
default:
35+
li.type = ctx;
36+
if (c == QUOTED_CHAR) {
37+
li.so = pos-1;
38+
li.eo = pos+1;
39+
} else {
40+
li.so = pos;
41+
li.eo = pos+1;
42+
}
43+
sblist_add(ret, &li);
44+
break;
45+
}
46+
}
47+
return ret;
48+
}
49+
50+
/*
 * Reclassify the single-character repetition operators.
 *
 * Every item that is still typed CTX_NONE and whose first character in
 * org_regex is '?', '*' or '+' has its type changed to CTX_DUP. The list is
 * modified in place; the so/eo offsets are not touched.
 */
static void list_transform_dupchars(sblist* tokens, const char* org_regex) {
	size_t idx, total = sblist_getsize(tokens);

	for(idx = 0; idx < total; ++idx) {
		struct list_item *item = sblist_get(tokens, idx);
		char first;

		if(item->type != CTX_NONE) continue;
		first = org_regex[item->so];
		if(first == '?' || first == '*' || first == '+')
			item->type = CTX_DUP;
	}
}
61+
62+
/* Non-zero if ch is one of the characters that may not be part of a joined
 * literal run and therefore terminates it. */
static int ends_literal_run(char ch) {
	return ch == '"' || ch == '^' || ch == '.' || ch == '[' || ch == '$'
	    || ch == '(' || ch == ')' || ch == '|' || ch == '{';
}

/*
 * Merge each maximal run of consecutive plain-literal items into one item.
 *
 * A run consists of adjacent CTX_NONE items whose first character is not one
 * of the characters rejected by ends_literal_run(). The merged item starts at
 * the first item's so, spans the summed lengths of the run, and gets the
 * marker type 0xff (print_token() in main.c prints such items as a quoted
 * string). Items that cannot start a run are copied unchanged.
 *
 * The input list is freed; the newly built list is returned.
 */
static sblist* list_join_literals(sblist* tokens, const char* org_regex) {
	size_t count = sblist_getsize(tokens);
	sblist *joined = sblist_new(sizeof(struct list_item), count);
	size_t first = 0;

	while(first < count) {
		size_t next = first, run_len = 0;
		struct list_item out;

		/* extend the run as far as plain literal characters reach */
		while(next < count) {
			struct list_item *cur = sblist_get(tokens, next);
			if(cur->type != CTX_NONE || ends_literal_run(org_regex[cur->so]))
				break;
			run_len += cur->eo - cur->so;
			++next;
		}

		out = *(struct list_item *) sblist_get(tokens, first);
		if(next > first) {
			/* at least one literal: emit the whole run as one item */
			out.type = 0xff;
			out.eo = out.so + run_len;
			first = next;
		} else {
			/* not a literal: pass the item through as is */
			++first;
		}
		sblist_add(joined, &out);
	}

	sblist_free(tokens);
	return joined;
}
98+
99+
/*
 * Tokenize the regex in [re, re_end) and post-process the token list.
 *
 * Re-initializes the lexer in silent mode, collects the tokens, retags the
 * '?', '*' and '+' operators as CTX_DUP, and finally joins runs of literal
 * characters into single items.
 *
 * Returns the resulting list of struct list_item; the intermediate list is
 * released by list_join_literals().
 */
sblist *lex_and_transform(const char *re, const char *re_end) {
	sblist *raw;

	lex_init(re, re_end, LEXFLAG_SILENT);
	raw = lex_to_list();
	list_transform_dupchars(raw, re);
	return list_join_literals(raw, re);
}

tokens.h

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#ifndef TOKENS_H
2+
#define TOKENS_H
3+
4+
#include <unistd.h>
5+
#include "sblist.h"
6+
#include "lexer.h"
7+
8+
/*
 * One entry of the token list produced by lex_and_transform(): a token (or a
 * joined run of tokens) described as a range of the original regex string.
 */
struct list_item {
9+
/* lexer context of the token; tokens.c additionally stores the value 0xff
 * here to mark a joined run of literal characters */
enum lex_context type;
10+
/* so: offset of the first character; eo: offset one past the last character
 * (eo - so is the length of the item) */
size_t so, eo;
11+
};
12+
13+
sblist *lex_and_transform(const char *re, const char *re_end);
14+
15+
#endif
16+

0 commit comments

Comments
 (0)