Skip to content

Commit d2dd5c1

Browse files
committed
emit ragel parser
mission accomplished.
1 parent 81788d6 commit d2dd5c1

File tree

7 files changed

+424
-10
lines changed

7 files changed

+424
-10
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ y.tab.h: y.tab.c
1515
y.tab.c: ere.y
1616
$(YACC) -d $(YFLAGS) $<
1717

18-
re2r: main.o lexer.o y.tab.o
18+
re2r: main.o lexer.o y.tab.o sblist.o
1919
$(CC) -o $@ $^ $(LDFLAGS)
2020

2121
clean:

lexer.c

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,9 @@ extern int yyerror(const char*);
7272
extern YYSTYPE yylval;
7373
extern FILE* yyin;
7474

75-
enum context {
76-
CTX_NONE=0,
77-
CTX_DUP,
78-
CTX_BRACKET,
79-
};
80-
8175
struct lex_state {
8276
int flags;
83-
enum context ctx;
77+
enum lex_context ctx;
8478
size_t line_pos;
8579
size_t brack_pos;
8680
int brack_neg;
@@ -96,6 +90,10 @@ int yyerror(const char* s) {
9690
return 1;
9791
}
9892

93+
enum lex_context lex_getcontext(void) {
94+
return lex_state.ctx;
95+
}
96+
9997
size_t lex_errpos(void) {
10098
return lex_state.line_pos;
10199
}
@@ -107,7 +105,11 @@ void lex_init(const char *p, const char *pe, int flags) {
107105
lex_state.flags = flags;
108106
}
109107

110-
static int ctxup(enum context newctx) {
108+
size_t lex_getpos(void) {
109+
return lex_state.p - lex_state.ps;
110+
}
111+
112+
static int ctxup(enum lex_context newctx) {
111113
if(newctx != CTX_NONE && lex_state.ctx != CTX_NONE) {
112114
yyerror("invalid context switch");
113115
return 0;

lexer.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,17 @@
55

66
#define LEXFLAG_SILENT (1<<0)
77

8+
/* Context the lexer is in while scanning a regex. */
enum lex_context {
	CTX_NONE = 0,    /* not inside any special construct */
	CTX_DUP = 1,     /* inside a construct closed by '}' */
	CTX_BRACKET = 2, /* inside a construct closed by ']' */
};
13+
14+
enum lex_context lex_getcontext(void);
815
void lex_init(const char *p, const char *pe, int flags);
916
size_t lex_errpos(void);
10-
17+
size_t lex_getpos(void);
18+
int yylex(void);
1119

1220
#endif
1321

main.c

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
#include <stdio.h>
22
#include <string.h>
3+
#include <assert.h>
34

45
#include "y.tab.h"
56
#include "yydefs.h"
67
#include "lexer.h"
8+
#include "sblist.h"
79

810
extern int yyerror(const char*);
911

@@ -12,20 +14,232 @@ FILE *yyin;
1214
extern int yyparse();
1315
extern int yydebug;
1416

17+
/*
 * Return a copy of `s` in which every occurrence of `needle` is replaced
 * by `repl`.
 *
 * The result lives in a static buffer that is overwritten by the next call,
 * so it must be consumed (or copied) before calling replace() again.
 * Output that does not fit into the buffer is truncated; the result is
 * always NUL-terminated.
 *
 * The input is scanned once, left to right: replaced text is never
 * re-examined, so a `repl` that contains `needle` cannot cause an endless
 * loop. If `needle` is empty or does not occur, a plain copy of `s` is
 * returned.
 */
static char *replace(const char *s, const char *needle, const char *repl) {
	static char repl_buf[4096+128];
	const size_t cap = sizeof repl_buf - 1; /* keep room for the NUL */
	const size_t needle_len = strlen(needle);
	const size_t repl_len = strlen(repl);
	size_t out = 0;

	while(*s && out < cap) {
		if(needle_len && strncmp(s, needle, needle_len) == 0) {
			size_t room = cap - out;
			size_t n = repl_len < room ? repl_len : room;
			memcpy(repl_buf + out, repl, n);
			out += n;
			s += needle_len;
		} else {
			repl_buf[out++] = *s++;
		}
	}
	repl_buf[out] = '\0';
	return repl_buf;
}
29+
30+
struct list_item {
31+
enum lex_context type;
32+
size_t so, eo;
33+
};
34+
35+
static sblist *lex_to_list() {
36+
int c;
37+
size_t pos;
38+
struct list_item li;
39+
sblist *ret = sblist_new(sizeof li, 32);
40+
while((c = yylex()) != EOF) {
41+
enum lex_context ctx = lex_getcontext();
42+
pos = lex_getpos()-1;
43+
44+
switch(ctx) {
45+
case CTX_DUP:
46+
do { c = yylex(); } while (lex_getcontext() == CTX_DUP);
47+
assert(c == '}');
48+
li.type = CTX_DUP;
49+
li.so = pos;
50+
li.eo = lex_getpos();
51+
sblist_add(ret, &li);
52+
break;
53+
case CTX_BRACKET:
54+
do { c = yylex(); } while (lex_getcontext() == CTX_BRACKET);
55+
assert(c == ']');
56+
li.type = CTX_BRACKET;
57+
li.so = pos;
58+
li.eo = lex_getpos();
59+
sblist_add(ret, &li);
60+
break;
61+
default:
62+
li.type = ctx;
63+
if (c == QUOTED_CHAR) {
64+
li.so = pos-1;
65+
li.eo = pos+1;
66+
} else {
67+
li.so = pos;
68+
li.eo = pos+1;
69+
}
70+
sblist_add(ret, &li);
71+
break;
72+
}
73+
}
74+
return ret;
75+
}
76+
77+
/*
 * Retag single character repetition operators: every CTX_NONE token whose
 * first byte in `org_regex` is '?', '*' or '+' gets type CTX_DUP.
 * The list is modified in place.
 */
static void list_transform_dupchars(sblist* tokens, const char* org_regex) {
	size_t idx = 0;
	while(idx < sblist_getsize(tokens)) {
		struct list_item *tok = sblist_get(tokens, idx++);
		char first;
		if(tok->type != CTX_NONE)
			continue;
		first = org_regex[tok->so];
		if(first == '?' || first == '*' || first == '+')
			tok->type = CTX_DUP;
	}
}
88+
89+
/* Return 1 if `ch` starts a token that must never become part of a literal run. */
static int is_regex_special(char ch) {
	switch(ch) {
	case '^': case '.': case '[': case '$':
	case '(': case ')': case '|': case '{':
		return 1;
	default:
		return 0;
	}
}

/*
 * Build a new token list in which runs of adjacent plain-character tokens
 * (type CTX_NONE, not starting with a special character) are merged into a
 * single entry of type 0xff ("joined literal") spanning the whole run.
 * All other tokens are copied unchanged.
 *
 * When a run of two or more characters is directly followed by a CTX_DUP
 * token, the last character is left out of the run and emitted as its own
 * literal: the repetition applies to that character only ("ab*" is 'a'
 * followed by 'b*'), whereas a joined "ab" followed by '*' would repeat the
 * whole string.
 *
 * `tokens` is freed; the caller owns the returned list.
 */
static sblist* list_join_literals(sblist* tokens, const char* org_regex) {
	size_t ntokens = sblist_getsize(tokens);
	sblist *joined = sblist_new(sizeof(struct list_item), ntokens);
	size_t i, j;

	for(i=0; i<ntokens; i++) {
		struct list_item ins = *((struct list_item *)sblist_get(tokens, i));
		size_t pcnt = 0;     /* bytes covered by the run starting at i */
		size_t last_len = 0; /* bytes of the last token in the run */

		for(j=i; j<ntokens; ++j) {
			struct list_item *li = sblist_get(tokens, j);
			if(li->type != CTX_NONE || is_regex_special(org_regex[li->so]))
				break;
			last_len = li->eo - li->so;
			pcnt += last_len;
		}

		/* keep the atom a repetition operator binds to out of the run */
		if(j - i > 1 && j < ntokens) {
			struct list_item *next = sblist_get(tokens, j);
			if(next->type == CTX_DUP) {
				--j;
				pcnt -= last_len;
			}
		}

		if(j > i) {
			ins.type = 0xff; /* marker for a joined literal */
			ins.eo = ins.so+pcnt;
			i = j-1;         /* resume after the run */
		}
		sblist_add(joined, &ins);
	}
	sblist_free(tokens);
	return joined;
}
124+
125+
static void print_token(struct list_item *li, const char *org_regex) {
126+
if(li->type == 0xff) {
127+
printf(" \"%.*s\" ", (int) (li->eo-li->so), org_regex+li->so);
128+
return;
129+
} else if(li->type == CTX_BRACKET) {
130+
/* ragel doesn't like leading/trailing dash in bracket expression */
131+
if(org_regex[li->so+1] == '-') {
132+
printf("('-'|[%.*s)", (int) (li->eo-li->so-2), org_regex+li->so+2);
133+
return;
134+
} else if(org_regex[li->eo-2] == '-') {
135+
printf("('-'|%.*s])", (int) (li->eo-li->so-2), org_regex+li->so);
136+
return;
137+
}
138+
}
139+
printf("%.*s", (int) (li->eo-li->so), org_regex+li->so);
140+
}
141+
142+
/*
 * Count the tokens of type CTX_NONE whose first byte in `org_regex`
 * is '(' - i.e. the number of groups opened in the regex.
 */
static int count_groups(sblist *tokens, const char* org_regex) {
	int ngroups = 0;
	size_t idx = 0;
	while(idx < sblist_getsize(tokens)) {
		struct list_item *tok = sblist_get(tokens, idx);
		if(tok->type == CTX_NONE && org_regex[tok->so] == '(')
			ngroups++;
		idx++;
	}
	return ngroups;
}
151+
152+
/*
 * Print the template line `buf` once per group, numbered 0 .. groups-1,
 * with every "%GROUPNR%" placeholder substituted by the group number.
 * Prints nothing when `groups` is 0 or negative.
 */
static void expand_groups(char *buf, int groups) {
	char number[16];
	int nr = 0;
	while(nr < groups) {
		snprintf(number, sizeof number, "%d", nr);
		fputs(replace(buf, "%GROUPNR%", number), stdout);
		++nr;
	}
}
160+
161+
/*
 * Remove the last element of `l` and return a pointer to it, or 0 when the
 * list is empty.
 *
 * The pointer is obtained before the element is deleted, so it refers to a
 * slot that is no longer part of the list: read the value immediately and
 * do not keep the pointer across further list operations.
 * NOTE(review): this relies on sblist_delete() leaving the storage of the
 * removed element readable - confirm against the sblist implementation.
 */
static inline void* sblist_pop(sblist *l) {
	size_t count = sblist_getsize(l);
	void *last;

	if(count == 0)
		return 0;

	last = sblist_get(l, count-1);
	sblist_delete(l, count-1);
	return last;
}
170+
171+
static void dump_ragel_parser(const char *machinename, const char* org_regex, int *maxgroups) {
172+
FILE *f = fopen("ragel.tmpl", "r");
173+
char buf[4096];
174+
int groups, cgroup = 0;
175+
sblist *tokens = lex_to_list();
176+
list_transform_dupchars(tokens, org_regex);
177+
tokens = list_join_literals(tokens, org_regex);
178+
groups = count_groups(tokens, org_regex);
179+
if(groups > *maxgroups) *maxgroups = groups;
180+
sblist *group_order = sblist_new(sizeof (int), groups) ;
181+
182+
while(fgets(buf, sizeof buf, f)) {
183+
char *p;
184+
if((p = strstr(buf, "%MACHINENAME%"))) {
185+
printf("%s", replace(buf, "%MACHINENAME%", machinename));
186+
} else if((p = strstr(buf, "%GROUPNR%"))) {
187+
expand_groups(buf, groups);
188+
} else if ((p = strstr(buf, "%MACHINEDEF%"))) {
189+
printf("%.*s", (int)(p-buf), buf);
190+
size_t i;
191+
/* insert group match actions */
192+
for(i=0; i<sblist_getsize(tokens); i++) {
193+
struct list_item *li= sblist_get(tokens, i);
194+
if(li->type == CTX_NONE && org_regex[li->so] == '(') {
195+
sblist_add(group_order, &cgroup);
196+
++cgroup;
197+
print_token(li, org_regex);
198+
} else if(li->type == CTX_NONE && org_regex[li->so] == ')') {
199+
struct list_item *next;
200+
int groupno = *((int*)sblist_pop(group_order));
201+
if(i+1 < sblist_getsize(tokens) && (next = sblist_get(tokens, i+1)) && next->type == CTX_DUP) {
202+
print_token(li, org_regex);
203+
print_token(next, org_regex);
204+
printf(" >A%d %%E%d ", groupno, groupno);
205+
++i;
206+
} else {
207+
print_token(li, org_regex);
208+
printf(" >A%d %%E%d ", groupno, groupno);
209+
}
210+
} else {
211+
print_token(li, org_regex);
212+
}
213+
}
214+
printf("%s", p+sizeof("%MACHINEDEF%")-1);
215+
} else {
216+
printf("%s", buf);
217+
}
218+
}
219+
fclose(f);
220+
sblist_free(group_order);
221+
sblist_free(tokens);
222+
}
223+
15224
int main() {
16225
#ifdef YYDEBUG
17226
yydebug = 1;
18227
#endif
19228
char buf[4096];
20229
size_t lineno = 0;
21230
yyin = stdin;
231+
int maxgroups = 0;
22232
while(fgets(buf, sizeof buf, yyin)) {
23233
++lineno;
24234
const char* p = buf, *pe = strrchr(buf, '\n');
25235
if(!pe) pe = buf + strlen(p);
26236
lex_init(p, pe, LEXFLAG_SILENT);
27237
if(yyparse() == 0) {
28238
/* syntax check OK */
239+
char nbuf[128];
240+
snprintf(nbuf, sizeof nbuf, "machine_%04zu", lineno);
241+
lex_init(p, pe, LEXFLAG_SILENT);
242+
dump_ragel_parser(nbuf, p, &maxgroups);
29243
} else {
30244
size_t errpos = lex_errpos();
31245
fprintf(stderr, "parse error @%zu:%zu\n", lineno, errpos);

ragel.tmpl

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
%%{
machine %MACHINENAME%;
action A%GROUPNR% { if(%GROUPNR%+1 < nmatch) matches[%GROUPNR%+1].rm_so = p-start; }
action E%GROUPNR% { if(%GROUPNR%+1 < nmatch) matches[%GROUPNR%+1].rm_eo = p-start; }
main := %MACHINEDEF% ;
}%%

/*
 * Match the NUL-terminated string p against the machine defined above.
 * Returns 0 on a match and -1 otherwise. At most nmatch entries of
 * matches[] are written: entry 0 receives the whole input, the following
 * entries the offsets recorded by the group actions; unset entries are -1.
 */
static int rematch_%MACHINENAME%(const char *p, size_t nmatch, regmatch_t matches[]) {
	const char *start = p, *pe = p + strlen(p), *eof = pe;
	size_t i;
	/* ragel's current-state variable has to be an integer */
	int cs;
	for(i=0;i<nmatch;++i) matches[i] = (regmatch_t){.rm_so = -1, .rm_eo = -1};
	%% write data;
	%% write init;
	%% write exec;
	if(cs < %%{ write first_final; }%% ) return -1;
	if(nmatch > 0) matches[0] = (regmatch_t){.rm_so = 0, .rm_eo = eof-start};
	return 0;
}

0 commit comments

Comments
 (0)