Skip to content

Commit 3bfe6cd

Browse files
committed
C program to sort (and check) the database; chkdb gets too slow
1 parent 8e443d9 commit 3bfe6cd

File tree

2 files changed

+370
-1
lines changed

2 files changed

+370
-1
lines changed

Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,12 @@
1-
# $MirOS: wtf/Makefile,v 1.4 2015/11/14 21:12:30 tg Exp $
1+
# $MirOS: wtf/Makefile,v 1.5 2019/08/15 01:49:59 tg Exp $
22

33
SCRIPTS= wtf
44
MAN= wtf.1
55

6+
PROG= sortdb
7+
DPADD+= ${LIBMBFUN}
8+
LDADD+= -lmbfun
9+
610
realinstall:
711
cd ${.CURDIR}; install -c -o ${BINOWN} -g ${BINGRP} -m ${BINMODE} \
812
${SCRIPTS} ${DESTDIR}${BINDIR}/

sortdb.c

Lines changed: 365 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,365 @@
1+
/*-
2+
* Copyright © 2019
3+
* mirabilos <[email protected]>
4+
*
5+
* Provided that these terms and disclaimer and all copyright notices
6+
* are retained or reproduced in an accompanying document, permission
7+
* is granted to deal in this work without restriction, including un‐
8+
* limited rights to use, publicly perform, distribute, sell, modify,
9+
* merge, give away, or sublicence.
10+
*
11+
* This work is provided “AS IS” and WITHOUT WARRANTY of any kind, to
12+
* the utmost extent permitted by applicable law, neither express nor
13+
* implied; without malicious intent or gross negligence. In no event
14+
* may a licensor, author or contributor be held liable for indirect,
15+
* direct, other damage, loss, or other issues arising in any way out
16+
* of dealing in the work, even if advised of the possibility of such
17+
* damage or existence of a defect, except proven that it results out
18+
* of said person’s immediate fault when using the work as intended.
19+
*/
20+
21+
#define _ALL_SOURCE
22+
#include <sys/param.h>
23+
#include <sys/mman.h>
24+
#include <sys/stat.h>
25+
#include <err.h>
26+
#include <fcntl.h>
27+
#include <mbfun.h>
28+
#include <unistd.h>
29+
#include <wchar.h>
30+
#include <wctype.h>
31+
32+
__RCSID("$MirOS: wtf/sortdb.c,v 1.1 2019/08/15 01:49:59 tg Exp $");
33+
34+
/*
 * Case conversion table: one lower/upper pair per entry. main() fills
 * it from the first database line and sorts it; acro_toupper() then
 * searches it with bsearch(3).
 */
#define MAXCASECONV 512
struct cconv {
	wchar_t lower;
	wchar_t upper;
};
struct cconv caseconv[MAXCASECONV];
size_t ncaseconv = 0;

/*
 * Output records: "literal" is the wide string that gets printed,
 * "compare" is the multibyte key handed to strcmp() while sorting.
 */
#define MAXLINES 1048576
struct line {
	wchar_t *literal;
	char *compare;
};
struct line lines[MAXLINES];
size_t nlines = 0;

/* input lines as read from the file, converted to wide strings */
wchar_t *ilines[MAXLINES];
size_t nilines = 0;

/* scratch buffer for the upper-cased acronym of the current line */
#define MAXACRO 128
wchar_t acro[MAXACRO];

/* scratch buffer for the [tags] collected from the current line */
#define MAXTAGS 1024
wchar_t tags[MAXTAGS];
56+
57+
/*
 * Fetch the input octet at offset ofs, or 0 if ofs is at or past the
 * end of the buffer. Relies on "ibuf" and "len" being in scope where
 * the macro is used; ofs is evaluated exactly once, so get(cp++) is safe.
 */
#define get(ofs) __extension__({				\
	const size_t get_pos = (ofs);				\
								\
	(get_pos < len ? ibuf[get_pos] : (uint8_t)0);		\
})
62+
63+
/*
 * wcsdup(3) wrapper that never yields NULL: on allocation failure the
 * program is terminated via err(1, ...). Evaluates to the new copy.
 */
#define xwcsdup(p) __extension__({				\
	wchar_t *xwcsdup_copy;					\
								\
	if ((xwcsdup_copy = wcsdup(p)) == NULL)			\
		err(1, "out of memory");			\
	(xwcsdup_copy);						\
})
70+
71+
static int
72+
line_compar(const void *aa, const void *bb)
73+
{
74+
const struct line *a = (const struct line *)aa;
75+
const struct line *b = (const struct line *)bb;
76+
77+
return (strcmp(a->compare, b->compare));
78+
}
79+
80+
static int
81+
cconv_compar(const void *aa, const void *bb)
82+
{
83+
const struct cconv *a = (const struct cconv *)aa;
84+
const struct cconv *b = (const struct cconv *)bb;
85+
86+
if (a->lower < b->lower)
87+
return (-1);
88+
if (a->lower > b->lower)
89+
return (1);
90+
if (!a->upper || !b->upper)
91+
return (0);
92+
if (a->upper < b->upper)
93+
return (-1);
94+
if (a->upper > b->upper)
95+
return (1);
96+
return (0);
97+
}
98+
99+
static wchar_t
100+
acro_toupper(wchar_t wc)
101+
{
102+
struct cconv *match;
103+
struct cconv test;
104+
105+
if (wc < 32 && wc != 9 && wc != 10)
106+
errx(2, "acronym contains control character %02X", wc);
107+
if (wc >= L'a' && wc <= L'z')
108+
return (wc + L'A' - L'a');
109+
test.lower = wc;
110+
test.upper = 0;
111+
if ((match = (struct cconv *)bsearch(&test, caseconv, ncaseconv,
112+
sizeof(struct cconv), cconv_compar)) == NULL)
113+
return (wc);
114+
if (!match->upper)
115+
errx(99, "match.upper for %04X (%lc) is WNUL",
116+
(unsigned int)wc, wc);
117+
return (match->upper);
118+
}
119+
120+
int
121+
main(int argc, char *argv[])
122+
{
123+
wchar_t *cwp, cw, *dwp, *twp;
124+
uint8_t *ibuf, c;
125+
size_t len, bp, cp, tp;
126+
int fd, rv = 0;
127+
struct stat sb;
128+
129+
if (argc != 2) {
130+
fprintf(stderr, "Syntax: %s acronyms\n",
131+
argv[0] ? argv[0] : "sortdb");
132+
return (1);
133+
}
134+
135+
if ((fd = open(argv[1], O_RDONLY | O_SHLOCK)) < 0)
136+
err(1, "open");
137+
if (fstat(fd, &sb))
138+
err(1, "stat");
139+
if (sb.st_size > (off_t)SSIZE_MAX)
140+
errx(1, "input too large");
141+
len = (size_t)sb.st_size;
142+
if ((ibuf = mmap(NULL, len, PROT_READ, MAP_FILE, fd,
143+
(off_t)0)) == MAP_FAILED)
144+
err(1, "mmap");
145+
146+
cp = 0;
147+
nextiline:
148+
if (nilines == MAXLINES)
149+
errx(2, "raise %s and recompile", "MAXLINES");
150+
bp = cp;
151+
while ((c = get(cp++)) && c != '\n')
152+
/* nothing */;
153+
if (!c)
154+
errx(2, "NUL at offset %zu", cp - 1);
155+
if (cp - 1 == bp)
156+
errx(2, "empty line at offset %zu", cp - 1);
157+
switch (get(cp - 1)) {
158+
case 0x09:
159+
case 0x0C:
160+
case 0x0D:
161+
case 0x20:
162+
warnx("line %zu ends with whitespace at offset %zu",
163+
nilines + 1, cp - 1);
164+
rv = 3;
165+
break;
166+
}
167+
ilines[nilines++] = ambsntowcs((void *)(ibuf + bp), cp - bp - 1);
168+
if (cp < len)
169+
goto nextiline;
170+
fprintf(stderr, "I: %zu input lines\n", nilines);
171+
munmap(ibuf, len);
172+
close(fd);
173+
if (nilines < 3)
174+
errx(2, "file likely too short");
175+
176+
cwp = ilines[0];
177+
if (*cwp++ != L' ')
178+
errx(2, "first line does not start with a space: %ls",
179+
ilines[0]);
180+
do {
181+
wchar_t cl, cu, clu, cul;
182+
183+
if (cwp[0] != L' ' || !cwp[1] || cwp[2] != L'/' || !cwp[3])
184+
errx(2, "error in caseconv pair: %ls", cwp);
185+
cl = cwp[1];
186+
cu = cwp[3];
187+
if (cl == L'ℒ' && cu == L'ℓ')
188+
goto caseconv_checks_done;
189+
clu = towupper(cl);
190+
cul = towlower(cu);
191+
192+
if (!iswlower(cl))
193+
errx(2, "caseconv pair %lc/%lc lower is not lower",
194+
cl, cu);
195+
if (!iswupper(cu))
196+
errx(2, "caseconv pair %lc/%lc upper is not upper",
197+
cl, cu);
198+
if (clu != cu)
199+
errx(2, "caseconv pair %lc/%lc LOWER %lc is not upper",
200+
cl, cu, clu);
201+
if (cul != cl &&
202+
!(cl == L'ς' && cu == L'Σ' && cul == L'σ'))
203+
warnx("caseconv pair %lc/%lc upper %lc is not lower",
204+
cl, cu, cul);
205+
caseconv_checks_done:
206+
207+
caseconv[ncaseconv].lower = cl;
208+
caseconv[ncaseconv].upper = cu;
209+
if (++ncaseconv == MAXCASECONV)
210+
errx(2, "raise %s and recompile", "MAXCASECONV");
211+
cwp += 4;
212+
} while (*cwp);
213+
214+
if (mergesort(caseconv, ncaseconv, sizeof(struct cconv), cconv_compar))
215+
err(1, "mergesort caseconv");
216+
if ((cwp = calloc(1 + 4 * ncaseconv + 1, sizeof(wchar_t))) == NULL)
217+
err(1, "out of memory");
218+
cwp[0] = L' ';
219+
for (bp = 0; bp < ncaseconv; ++bp) {
220+
cwp[1 + bp * 4] = L' ';
221+
cwp[1 + bp * 4 + 1] = caseconv[bp].lower;
222+
cwp[1 + bp * 4 + 2] = L'/';
223+
cwp[1 + bp * 4 + 3] = caseconv[bp].upper;
224+
}
225+
/* NUL already there from calloc */
226+
goto firstline;
227+
228+
while (nlines < nilines) {
229+
if ((cwp = wcschr(ilines[nlines], L'\t')) == NULL) {
230+
cwp = ilines[nlines];
231+
/* comment line (no TAB) */
232+
if (cwp[0] != L' ') {
233+
warnx("comment line %zu does not begin with space: %ls",
234+
nlines + 1, cwp);
235+
rv = 3;
236+
}
237+
firstline:
238+
lines[nlines].literal = cwp;
239+
lines[nlines].compare = awcstombs(cwp);
240+
++nlines;
241+
continue;
242+
}
243+
if (wcschr(cwp + 1, L'\t') != NULL) {
244+
warnx("line %zu tab in expansion: %ls",
245+
nlines + 1, ilines[nlines]);
246+
rv = 3;
247+
}
248+
cwp = ilines[nlines];
249+
cp = 0;
250+
while ((cw = *cwp++) != L'\t') {
251+
if (cw == L'.' && cp > 0 &&
252+
acro[cp - 1] >= L'A' && acro[cp - 1] <= L'Z') {
253+
/* skip period after upper-cased latin */
254+
continue;
255+
}
256+
acro[cp++] = acro_toupper(cw);
257+
if (cp == MAXACRO)
258+
errx(2, "raise %s and recompile", "MAXACRO");
259+
}
260+
acro[cp] = L'\0';
261+
tp = 0;
262+
parse_line:
263+
if (!(cw = *cwp++))
264+
goto end_of_line;
265+
if (iswspace(cw))
266+
goto parse_line;
267+
if (cw == L'[' && wcschr(cwp, L']')) {
268+
/* leading tag */
269+
if (tp) {
270+
/* space stuffing between tags */
271+
--cwp;
272+
cw = L' ';
273+
}
274+
stuff_tag:
275+
tags[tp++] = cw;
276+
if (tp == MAXTAGS)
277+
errx(2, "raise %s and recompile", "MAXTAGS");
278+
if (cw == L']')
279+
goto parse_line;
280+
if (!(cw = *cwp++))
281+
errx(2, "EOL inmidst a tag? line %zu",
282+
nlines + 1);
283+
goto stuff_tag;
284+
}
285+
/* not a leading tag nor whitespace nor EOL */
286+
--cwp;
287+
/* find end of string handling trailing tags and whitespace */
288+
twp = cwp + wcslen(cwp);
289+
dwp = twp - 1;
290+
291+
check_trailing:
292+
while (dwp > cwp && iswspace(*dwp))
293+
--dwp;
294+
if (dwp > cwp && *dwp == L']' && wcschr(cwp, L'[')) {
295+
while (dwp > cwp && *dwp != L'[')
296+
--dwp;
297+
twp = dwp--;
298+
goto check_trailing;
299+
}
300+
if (*twp) {
301+
if (tp) {
302+
stuff_trailing_tag:
303+
cw = L' ';
304+
} else {
305+
stuff_trt_content:
306+
cw = *twp++;
307+
}
308+
tags[tp++] = cw;
309+
if (tp == MAXTAGS)
310+
errx(2, "raise %s and recompile", "MAXTAGS");
311+
if (cw != L']')
312+
goto stuff_trt_content;
313+
while (iswspace(*twp))
314+
++twp;
315+
if (*twp /* == L'[' */)
316+
goto stuff_trailing_tag;
317+
}
318+
/* no trailing tags or whitespace */
319+
*++dwp = L'\0';
320+
bp = dwp - cwp;
321+
if (0)
322+
end_of_line:
323+
bp = wcslen(cwp);
324+
tags[tp] = L'\0';
325+
if (!bp) {
326+
warnx("line %zu has no content, only tags: %ls",
327+
nlines + 1, ilines[nlines]);
328+
rv = 3;
329+
}
330+
331+
lines[nlines].literal = calloc(cp + 1 + bp + 1 + tp + 1,
332+
sizeof(wchar_t));
333+
dwp = calloc(cp + 1 + bp + 1 + tp + 1,
334+
sizeof(wchar_t));
335+
memcpy(lines[nlines].literal, acro, cp * sizeof(wchar_t));
336+
memcpy(dwp, acro, cp * sizeof(wchar_t));
337+
lines[nlines].literal[cp] = L'\t';
338+
dwp[cp] = L'\t';
339+
++cp;
340+
memcpy((twp = dwp + cp), cwp, bp * sizeof(wchar_t));
341+
if (tp) {
342+
dwp[cp + bp] = L' ';
343+
memcpy(dwp + cp + bp + 1, tags,
344+
tp * sizeof(wchar_t));
345+
memcpy(lines[nlines].literal + cp, tags,
346+
tp * sizeof(wchar_t));
347+
cp += tp;
348+
lines[nlines].literal[cp++] = L' ';
349+
}
350+
memcpy(lines[nlines].literal + cp, cwp, bp * sizeof(wchar_t));
351+
352+
while ((cw = *twp))
353+
*twp++ = towupper(cw);
354+
lines[nlines++].compare = awcstombs(dwp);
355+
free(dwp);
356+
}
357+
358+
if (mergesort(lines, nlines, sizeof(struct line), line_compar))
359+
err(1, "mergesort lines");
360+
361+
for (nlines = 0; nlines < nilines; ++nlines)
362+
printf("%ls\n", lines[nlines].literal);
363+
364+
return (rv);
365+
}

0 commit comments

Comments
 (0)