Skip to content

Commit bbfc3f8

Browse files
committed
unix-ffi: re: convert to PCRE2
PCRE is marked as EOL and won't receive any new security updates. Convert the re module to the PCRE2 API to keep it secure. An additional dependency on uctypes is now needed because PCRE2 returns the match data through a pointer, which requires special handling. The converted module was tested with test_re.py with no regressions. Signed-off-by: Christian Marangi <[email protected]>
1 parent e6b89ea commit bbfc3f8

File tree

1 file changed

+45
-25
lines changed

1 file changed

+45
-25
lines changed

unix-ffi/re/re.py

Lines changed: 45 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,36 +1,49 @@
11
import sys
2-
import ffilib
32
import array
3+
import ffilib
4+
import uctypes
45

6+
pcre2 = ffilib.open("libpcre2-8")
57

6-
pcre = ffilib.open("libpcre")
8+
# pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
9+
# uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
10+
# pcre2_compile_context *ccontext);
11+
pcre2_compile = pcre2.func("p", "pcre2_compile_8", "siippp")
712

8-
# pcre *pcre_compile(const char *pattern, int options,
9-
# const char **errptr, int *erroffset,
10-
# const unsigned char *tableptr);
11-
pcre_compile = pcre.func("p", "pcre_compile", "sipps")
13+
# int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
14+
# PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options,
15+
# pcre2_match_data *match_data, pcre2_match_context *mcontext);
16+
pcre2_match = pcre2.func("i", "pcre2_match_8", "Psiiipp")
1217

13-
# int pcre_exec(const pcre *code, const pcre_extra *extra,
14-
# const char *subject, int length, int startoffset,
15-
# int options, int *ovector, int ovecsize);
16-
pcre_exec = pcre.func("i", "pcre_exec", "PPsiiipi")
18+
# int pcre2_pattern_info(const pcre2_code *code, uint32_t what,
19+
# void *where);
20+
pcre2_pattern_info = pcre2.func("i", "pcre2_pattern_info_8", "Pip")
1721

18-
# int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
19-
# int what, void *where);
20-
pcre_fullinfo = pcre.func("i", "pcre_fullinfo", "PPip")
22+
# PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
23+
pcre2_get_ovector_pointer = pcre2.func("p", "pcre2_get_ovector_pointer_8", "p")
2124

25+
# pcre2_match_data *pcre2_match_data_create_from_pattern(const pcre2_code *code,
26+
# pcre2_general_context *gcontext);
27+
pcre2_match_data_create_from_pattern = pcre2.func(
28+
"p", "pcre2_match_data_create_from_pattern_8", "Pp"
29+
)
2230

23-
IGNORECASE = I = 1
24-
MULTILINE = M = 2
25-
DOTALL = S = 4
31+
IGNORECASE = I = 0x8
32+
MULTILINE = M = 0x400
33+
DOTALL = S = 0x20
2634
VERBOSE = X = 8
27-
PCRE_ANCHORED = 0x10
35+
PCRE2_ANCHORED = 0x80000000
36+
37+
# Real value in pcre2.h is 0xFFFFFFFF for 32bit and
38+
# 0xFFFFFFFFFFFFFFFF for 64bit that is equivalent
39+
# to -1
40+
PCRE2_ZERO_TERMINATED = -1
2841

2942
# TODO. Note that Python3 has unicode by default
3043
ASCII = A = 0
3144
UNICODE = U = 0
3245

33-
PCRE_INFO_CAPTURECOUNT = 2
46+
PCRE2_INFO_CAPTURECOUNT = 4
3447

3548

3649
class PCREMatch:
@@ -67,19 +80,26 @@ def __init__(self, compiled_ptn):
6780
def search(self, s, pos=0, endpos=-1, _flags=0):
6881
assert endpos == -1, "pos: %d, endpos: %d" % (pos, endpos)
6982
buf = array.array("i", [0])
70-
pcre_fullinfo(self.obj, None, PCRE_INFO_CAPTURECOUNT, buf)
83+
pcre2_pattern_info(self.obj, PCRE2_INFO_CAPTURECOUNT, buf)
7184
cap_count = buf[0]
72-
ov = array.array("i", [0, 0, 0] * (cap_count + 1))
73-
num = pcre_exec(self.obj, None, s, len(s), pos, _flags, ov, len(ov))
85+
match_data = pcre2_match_data_create_from_pattern(self.obj, None)
86+
num = pcre2_match(self.obj, s, len(s), pos, _flags, match_data, None)
7487
if num == -1:
7588
# No match
7689
return None
90+
ov_ptr = pcre2_get_ovector_pointer(match_data)
91+
# pcre2_get_ovector_pointer returns a PCRE2_SIZE pointer; PCRE2_SIZE is of type
92+
# size_t. Use ULONG as type to support both 32bit and 64bit.
93+
ov_buf = uctypes.bytearray_at(
94+
ov_ptr, uctypes.sizeof({"field": 0 | uctypes.ULONG}) * (cap_count + 1) * 2
95+
)
96+
ov = array.array("L", ov_buf)
7797
# We don't care how many matching subexpressions we got, we
7898
# care only about total # of capturing ones (including empty)
7999
return PCREMatch(s, cap_count + 1, ov)
80100

81101
def match(self, s, pos=0, endpos=-1):
82-
return self.search(s, pos, endpos, PCRE_ANCHORED)
102+
return self.search(s, pos, endpos, PCRE2_ANCHORED)
83103

84104
def sub(self, repl, s, count=0):
85105
if not callable(repl):
@@ -141,9 +161,9 @@ def findall(self, s):
141161

142162

143163
def compile(pattern, flags=0):
144-
errptr = bytes(4)
164+
errcode = bytes(4)
145165
erroffset = bytes(4)
146-
regex = pcre_compile(pattern, flags, errptr, erroffset, None)
166+
regex = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, flags, errcode, erroffset, None)
147167
assert regex
148168
return PCREPattern(regex)
149169

@@ -154,7 +174,7 @@ def search(pattern, string, flags=0):
154174

155175

156176
def match(pattern, string, flags=0):
157-
r = compile(pattern, flags | PCRE_ANCHORED)
177+
r = compile(pattern, flags | PCRE2_ANCHORED)
158178
return r.search(string)
159179

160180

0 commit comments

Comments
 (0)