Skip to content

Commit 0b9655d

Browse files
committed
Implemented the -e option. When it is specified, the pattern is parsed as a regular expression.
1 parent d10036a commit 0b9655d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

72 files changed

+71934
-76
lines changed

Makefile

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ GCC = gcc
77
.PHONY: clean
88

99
BUILD_DIR = tmp/build/
10-
INCLUDE = -Iinclude
11-
LIB = -liconv
10+
INCLUDE = -Iinclude -Ivendor/onigmo
11+
LIB = -L/usr/local/lib -liconv
1212
SOURCES = \
1313
highway.c \
1414
file.c \
@@ -19,7 +19,53 @@ SOURCES = \
1919
log.c \
2020
option.c \
2121
util.c \
22-
help.c
22+
help.c \
23+
vendor/onigmo/regcomp.c \
24+
vendor/onigmo/regenc.c \
25+
vendor/onigmo/regerror.c \
26+
vendor/onigmo/regexec.c \
27+
vendor/onigmo/regext.c \
28+
vendor/onigmo/reggnu.c \
29+
vendor/onigmo/regparse.c \
30+
vendor/onigmo/regposerr.c \
31+
vendor/onigmo/regposix.c \
32+
vendor/onigmo/regsyntax.c \
33+
vendor/onigmo/regtrav.c \
34+
vendor/onigmo/regversion.c \
35+
vendor/onigmo/st.c \
36+
vendor/onigmo/enc/ascii.c \
37+
vendor/onigmo/enc/big5.c \
38+
vendor/onigmo/enc/cp1251.c \
39+
vendor/onigmo/enc/cp932.c \
40+
vendor/onigmo/enc/euc_jp.c \
41+
vendor/onigmo/enc/euc_kr.c \
42+
vendor/onigmo/enc/euc_tw.c \
43+
vendor/onigmo/enc/gb18030.c \
44+
vendor/onigmo/enc/iso8859_1.c \
45+
vendor/onigmo/enc/iso8859_10.c \
46+
vendor/onigmo/enc/iso8859_11.c \
47+
vendor/onigmo/enc/iso8859_13.c \
48+
vendor/onigmo/enc/iso8859_14.c \
49+
vendor/onigmo/enc/iso8859_15.c \
50+
vendor/onigmo/enc/iso8859_16.c \
51+
vendor/onigmo/enc/iso8859_2.c \
52+
vendor/onigmo/enc/iso8859_3.c \
53+
vendor/onigmo/enc/iso8859_4.c \
54+
vendor/onigmo/enc/iso8859_5.c \
55+
vendor/onigmo/enc/iso8859_6.c \
56+
vendor/onigmo/enc/iso8859_7.c \
57+
vendor/onigmo/enc/iso8859_8.c \
58+
vendor/onigmo/enc/iso8859_9.c \
59+
vendor/onigmo/enc/koi8.c \
60+
vendor/onigmo/enc/koi8_r.c \
61+
vendor/onigmo/enc/mktable.c \
62+
vendor/onigmo/enc/sjis.c \
63+
vendor/onigmo/enc/unicode.c \
64+
vendor/onigmo/enc/utf16_be.c \
65+
vendor/onigmo/enc/utf16_le.c \
66+
vendor/onigmo/enc/utf32_be.c \
67+
vendor/onigmo/enc/utf32_le.c \
68+
vendor/onigmo/enc/utf8.c
2369
OBJECTS = $(addprefix $(BUILD_DIR),$(SOURCES:%.c=%.o))
2470
DEPENDS = $(OBJECTS:%.o=%.d)
2571

@@ -32,7 +78,8 @@ $(TARGET): $(OBJECTS)
3278

3379
$(BUILD_DIR)%.d: %.c
3480
mkdir -p $(BUILD_DIR)
35-
$(GCC) -MM $(INCLUDE) $< | sed 's,\($*\)\.o[ :]*,$(BUILD_DIR)\1.o: ,g' > $@
81+
mkdir -p $(BUILD_DIR)vendor/onigmo/enc
82+
$(GCC) -MM $(INCLUDE) $< | sed 's,\($(@F:.d=)\)\.o[ :]*,$(@D)/\1.o: ,g' > $@
3683
echo "\t$(GCC) -O2 $(INCLUDE) -c -o $(subst .d,.o,$@) $$<" >> $@
3784

3885
-include $(DEPENDS)

include/option.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ typedef struct _hw_option {
1111
int pattern_len;
1212
int worker;
1313
bool file_with_matches;
14+
bool use_regex;
1415
} hw_option;
1516

1617
void init_option(int argc, char **argv, hw_option *op);

src/file.c

Lines changed: 28 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,7 @@ enum file_type is_binary(int fd)
3737
continue;
3838
}
3939

40-
// half-byte character detection. This is Shift-JIS encoding.
41-
if (0xA1 <= c1 && c1 <= 0xDF) {
42-
sjis++;
43-
continue;
44-
}
45-
46-
// 2-byte character detection. Shift-JIS or EUC-JP or UTF-8.
40+
// 2-byte character detection for UTF-8.
4741
unsigned char c2;
4842
if (i + 1 < read_bytes) {
4943
i++;
@@ -53,24 +47,6 @@ enum file_type is_binary(int fd)
5347
utf8++;
5448
continue;
5549
}
56-
57-
if ((c1 == 0x8E) &&
58-
(0xA1 <= c2 && c2 <= 0xDF)) {
59-
euc++;
60-
continue;
61-
}
62-
63-
if ((0xA1 <= c1 && c1 <= 0xFE) &&
64-
(0xA1 <= c2 && c2 <= 0xFE)) {
65-
euc++;
66-
continue;
67-
}
68-
69-
if (((0x81 <= c1 && c1 <= 0x9F) || (0xE0 <= c1 && c1 <= 0xEF)) &&
70-
((0x40 <= c2 && c2 <= 0x7E) || (0x80 <= c2 && c2 <= 0xFC))) {
71-
sjis++;
72-
continue;
73-
}
7450
}
7551

7652
// 3-byte character detection. Only UTF-8.
@@ -100,6 +76,33 @@ enum file_type is_binary(int fd)
10076
}
10177
}
10278

79+
// 2-byte character detection for EUC-JP or SHIFT_JIS.
80+
if (i + 1 < read_bytes) {
81+
if ((c1 == 0x8E) &&
82+
(0xA1 <= c2 && c2 <= 0xDF)) {
83+
euc++;
84+
continue;
85+
}
86+
87+
if ((0xA1 <= c1 && c1 <= 0xFE) &&
88+
(0xA1 <= c2 && c2 <= 0xFE)) {
89+
euc++;
90+
continue;
91+
}
92+
93+
if (((0x81 <= c1 && c1 <= 0x9F) || (0xE0 <= c1 && c1 <= 0xEF)) &&
94+
((0x40 <= c2 && c2 <= 0x7E) || (0x80 <= c2 && c2 <= 0xFC))) {
95+
sjis++;
96+
continue;
97+
}
98+
}
99+
100+
// half-byte character detection. This is Shift-JIS encoding.
101+
if (0xA1 <= c1 && c1 <= 0xDF) {
102+
sjis++;
103+
continue;
104+
}
105+
103106
// Unknown character.
104107
unknown++;
105108
}

src/highway.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "ignore.h"
1414
#include "util.h"
1515
#include "color.h"
16+
#include "oniguruma.h"
1617

1718
static bool complete_finding_file = false;
1819

@@ -91,6 +92,10 @@ int main(int argc, char **argv)
9192
init_option(argc, argv, &op);
9293
init_iconv();
9394

95+
if (op.use_regex) {
96+
onig_init();
97+
}
98+
9499
file_queue *queue = create_file_queue();
95100
worker_params params = { queue, &op };
96101
pthread_t th[op.worker], pth;
@@ -122,5 +127,9 @@ int main(int argc, char **argv)
122127
free_file_queue(queue);
123128
close_iconv();
124129

130+
if (op.use_regex) {
131+
onig_end();
132+
}
133+
125134
return return_code;
126135
}

src/option.c

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -25,9 +25,10 @@ void init_option(int argc, char **argv, hw_option *op)
2525
op->root_paths[0] = ".";
2626
op->paths_count = 1;
2727
op->file_with_matches = false;
28+
op->use_regex = false;
2829

2930
int ch;
30-
while ((ch = getopt_long(argc, argv, "hl", longopts, NULL)) != -1) {
31+
while ((ch = getopt_long(argc, argv, "ehl", longopts, NULL)) != -1) {
3132
switch (ch) {
3233
case 0:
3334
switch (flag) {
@@ -40,12 +41,16 @@ void init_option(int argc, char **argv, hw_option *op)
4041
}
4142
break;
4243

43-
case 'h':
44+
case 'e': /* Use regular expression */
45+
op->use_regex = true;
46+
break;
47+
48+
case 'h': /* Show help */
4449
usage();
4550
exit(0);
4651
break;
4752

48-
case 'l':
53+
case 'l': /* Show only filenames */
4954
op->file_with_matches = true;
5055
break;
5156

src/search.c

Lines changed: 101 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#include <stdio.h>
22
#include <string.h>
33
#include <unistd.h>
4+
#include "oniguruma.h"
45
#include "search.h"
56
#include "file.h"
67
#include "color.h"
@@ -79,6 +80,9 @@ int ssabs(const unsigned char *buf, int buf_len, int line_no_offset, const char
7980
return match_count;
8081
}
8182

83+
/**
84+
* Format search results. This function formats the results line by line and colorizes them.
85+
*/
8286
int format(const char *buf, const match *matches, int match_count, int read_len, const char *pattern, const hw_option *op, matched_line_queue *match_lines)
8387
{
8488
int match_line_count = 0;
@@ -154,6 +158,77 @@ int format(const char *buf, const match *matches, int match_count, int read_len,
154158
return match_line_count;
155159
}
156160

161+
/**
162+
* Search the pattern as a regular expression.
163+
*/
164+
int regex(const unsigned char *buf, int read_len, const char *pattern, enum file_type t, match *matches, int max_match, int *last_line_start)
165+
{
166+
regex_t *reg;
167+
OnigErrorInfo einfo;
168+
OnigEncodingType *enc;
169+
UChar *p = (UChar *)pattern;
170+
171+
switch (t) {
172+
case FILE_TYPE_EUC_JP:
173+
enc = ONIG_ENCODING_EUC_JP;
174+
break;
175+
case FILE_TYPE_SHIFT_JIS:
176+
enc = ONIG_ENCODING_SJIS;
177+
break;
178+
default:
179+
enc = ONIG_ENCODING_UTF8;
180+
break;
181+
}
182+
int r = onig_new(&reg, p, p + strlen((char *)p), ONIG_OPTION_DEFAULT, enc, ONIG_SYNTAX_DEFAULT, &einfo);
183+
if (r != ONIG_NORMAL) {
184+
return 0;
185+
}
186+
187+
OnigRegion *region = onig_region_new();
188+
189+
int match_count = 0;
190+
int i = 0, line_start = 0, line_no = 1;
191+
while (i < read_len) {
192+
if (buf[i] == 0x0A || buf[i] == 0x0D) {
193+
// Skip if current line has no contents.
194+
if (i != line_start) {
195+
int pos = 0;
196+
while (1) {
197+
const unsigned char *start = buf + line_start + pos,
198+
*end = buf + i,
199+
*range = end;
200+
r = onig_search(reg, buf, end, start, range, region, ONIG_OPTION_NONE);
201+
if (r >= 0) {
202+
matches[match_count].start = region->beg[0];
203+
matches[match_count].line_no = line_no;
204+
matches[match_count].line_start = line_start;
205+
match_count++;
206+
207+
if (i <= region->end[0]) {
208+
break;
209+
}
210+
pos += region->end[0] - line_start;
211+
} else {
212+
break;
213+
}
214+
}
215+
}
216+
217+
line_start = i + 1;
218+
line_no++;
219+
}
220+
221+
i++;
222+
}
223+
224+
onig_region_free(region, 1);
225+
onig_free(reg);
226+
227+
*last_line_start = line_start;
228+
229+
return match_count;
230+
}
231+
157232
/**
158233
* Search the pattern from the file descriptor and add formatted matched lines to the queue if the
159234
* pattern was matched on the read buffer.
@@ -177,19 +252,33 @@ int search(int fd, const char *pattern, const hw_option *op, enum file_type t, m
177252
// Check if pointer was reached to the end of the file.
178253
bool eof = read_len < n;
179254

180-
// Using SSABS pattern matching algorithm.
255+
int match_count;
181256
match matches[MAX_MATCH_COUNT];
182-
int match_count = ssabs(
183-
(unsigned char *)buf,
184-
read_len,
185-
line_no_offset,
186-
pattern,
187-
matches,
188-
MAX_MATCH_COUNT,
189-
t,
190-
&last_line_start,
191-
&line_no_offset
192-
);
257+
if (op->use_regex) {
258+
// Using a regular expression.
259+
match_count = regex(
260+
(unsigned char *)buf,
261+
read_len,
262+
pattern,
263+
t,
264+
matches,
265+
MAX_MATCH_COUNT,
266+
&last_line_start
267+
);
268+
} else {
269+
// Using SSABS pattern matching algorithm.
270+
match_count = ssabs(
271+
(unsigned char *)buf,
272+
read_len,
273+
line_no_offset,
274+
pattern,
275+
matches,
276+
MAX_MATCH_COUNT,
277+
t,
278+
&last_line_start,
279+
&line_no_offset
280+
);
281+
}
193282

194283
if (!eof) {
195284
// If there is too long line over 65536 bytes, reallocate the twice memory for the

0 commit comments

Comments
 (0)