Skip to content

Commit

Permalink
unix-ffi/re: Convert to PCRE2.
Browse files Browse the repository at this point in the history
PCRE is marked as EOL and won't receive any new security update.

Convert the re module to PCRE2 API to enforce security.  Additional
dependency is now needed with uctypes due to changes in how PCRE2 return
the match_data in a pointer and require special handling.

The converted module is tested with the test_re.py with no regression.

Signed-off-by: Christian Marangi <[email protected]>
  • Loading branch information
Ansuel authored and dpgeorge committed Oct 31, 2023
1 parent 0620d02 commit d8e163b
Showing 1 changed file with 48 additions and 25 deletions.
73 changes: 48 additions & 25 deletions unix-ffi/re/re.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,55 @@
import sys
import ffilib
import array
import uctypes

pcre2 = ffilib.open("libpcre2-8")

pcre = ffilib.open("libpcre")
# pcre2_code *pcre2_compile(PCRE2_SPTR pattern, PCRE2_SIZE length,
# uint32_t options, int *errorcode, PCRE2_SIZE *erroroffset,
# pcre2_compile_context *ccontext);
pcre2_compile = pcre2.func("p", "pcre2_compile_8", "siippp")

# pcre *pcre_compile(const char *pattern, int options,
# const char **errptr, int *erroffset,
# const unsigned char *tableptr);
pcre_compile = pcre.func("p", "pcre_compile", "sipps")
# int pcre2_match(const pcre2_code *code, PCRE2_SPTR subject,
# PCRE2_SIZE length, PCRE2_SIZE startoffset, uint32_t options,
# pcre2_match_data *match_data, pcre2_match_context *mcontext);
pcre2_match = pcre2.func("i", "pcre2_match_8", "Psiiipp")

# int pcre_exec(const pcre *code, const pcre_extra *extra,
# const char *subject, int length, int startoffset,
# int options, int *ovector, int ovecsize);
pcre_exec = pcre.func("i", "pcre_exec", "PPsiiipi")
# int pcre2_pattern_info(const pcre2_code *code, uint32_t what,
# void *where);
pcre2_pattern_info = pcre2.func("i", "pcre2_pattern_info_8", "Pip")

# int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
# int what, void *where);
pcre_fullinfo = pcre.func("i", "pcre_fullinfo", "PPip")
# PCRE2_SIZE *pcre2_get_ovector_pointer(pcre2_match_data *match_data);
pcre2_get_ovector_pointer = pcre2.func("p", "pcre2_get_ovector_pointer_8", "p")

# pcre2_match_data *pcre2_match_data_create_from_pattern(const pcre2_code *code,
# pcre2_general_context *gcontext);
pcre2_match_data_create_from_pattern = pcre2.func(
"p", "pcre2_match_data_create_from_pattern_8", "Pp"
)

IGNORECASE = I = 1
MULTILINE = M = 2
DOTALL = S = 4
VERBOSE = X = 8
PCRE_ANCHORED = 0x10
# PCRE2_SIZE that is of type size_t.
# Use ULONG as type to support both 32bit and 64bit.
PCRE2_SIZE_SIZE = uctypes.sizeof({"field": 0 | uctypes.ULONG})
PCRE2_SIZE_TYPE = "L"

# Real value in pcre2.h is 0xFFFFFFFF for 32bit and
# 0x0xFFFFFFFFFFFFFFFF for 64bit that is equivalent
# to -1
PCRE2_ZERO_TERMINATED = -1


IGNORECASE = I = 0x8
MULTILINE = M = 0x400
DOTALL = S = 0x20
VERBOSE = X = 0x80
PCRE2_ANCHORED = 0x80000000

# TODO. Note that Python3 has unicode by default
ASCII = A = 0
UNICODE = U = 0

PCRE_INFO_CAPTURECOUNT = 2
PCRE2_INFO_CAPTURECOUNT = 0x4


class PCREMatch:
Expand Down Expand Up @@ -67,19 +86,23 @@ def __init__(self, compiled_ptn):
def search(self, s, pos=0, endpos=-1, _flags=0):
assert endpos == -1, "pos: %d, endpos: %d" % (pos, endpos)
buf = array.array("i", [0])
pcre_fullinfo(self.obj, None, PCRE_INFO_CAPTURECOUNT, buf)
pcre2_pattern_info(self.obj, PCRE2_INFO_CAPTURECOUNT, buf)
cap_count = buf[0]
ov = array.array("i", [0, 0, 0] * (cap_count + 1))
num = pcre_exec(self.obj, None, s, len(s), pos, _flags, ov, len(ov))
match_data = pcre2_match_data_create_from_pattern(self.obj, None)
num = pcre2_match(self.obj, s, len(s), pos, _flags, match_data, None)
if num == -1:
# No match
return None
ov_ptr = pcre2_get_ovector_pointer(match_data)
# pcre2_get_ovector_pointer return PCRE2_SIZE
ov_buf = uctypes.bytearray_at(ov_ptr, PCRE2_SIZE_SIZE * (cap_count + 1) * 2)
ov = array.array(PCRE2_SIZE_TYPE, ov_buf)
# We don't care how many matching subexpressions we got, we
# care only about total # of capturing ones (including empty)
return PCREMatch(s, cap_count + 1, ov)

def match(self, s, pos=0, endpos=-1):
return self.search(s, pos, endpos, PCRE_ANCHORED)
return self.search(s, pos, endpos, PCRE2_ANCHORED)

def sub(self, repl, s, count=0):
if not callable(repl):
Expand Down Expand Up @@ -141,9 +164,9 @@ def findall(self, s):


def compile(pattern, flags=0):
errptr = bytes(4)
errcode = bytes(4)
erroffset = bytes(4)
regex = pcre_compile(pattern, flags, errptr, erroffset, None)
regex = pcre2_compile(pattern, PCRE2_ZERO_TERMINATED, flags, errcode, erroffset, None)
assert regex
return PCREPattern(regex)

Expand All @@ -154,7 +177,7 @@ def search(pattern, string, flags=0):


def match(pattern, string, flags=0):
r = compile(pattern, flags | PCRE_ANCHORED)
r = compile(pattern, flags | PCRE2_ANCHORED)
return r.search(string)


Expand Down

0 comments on commit d8e163b

Please sign in to comment.