Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

Commit

Permalink
Clean up date regex, don't use lookbehind/lookahead
Browse files Browse the repository at this point in the history
  • Loading branch information
heinrich5991 committed Aug 19, 2019
1 parent e46ad8e commit 1984972
Showing 1 changed file with 9 additions and 7 deletions.
16 changes: 9 additions & 7 deletions src/documents/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 digits and ZZZZ being 4 digits
DATE_REGEX = re.compile(
r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' + # NOQA: E501
r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
r'(?:\b|[_-])(' +
r'[0-9]{1,2}[\.\/-][0-9]{1,2}[\.\/-](?:[0-9]{4}|[0-9]{2})|' +
r'(?:[0-9]{4}|[0-9]{2})[\.\/-][0-9]{1,2}[\.\/-][0-9]{1,2}|' +
r'[0-9]{1,2}[\. ]+[^ ]{3,9} (?:[0-9]{4}|[0-9]{2})|' +
r'[^\W\d_]{3,9} [0-9]{1,2}, [0-9]{4}|' +
r'[^\W\d_]{3,9} [0-9]{4}' +
r')(?:\b|[_-])'
)


Expand Down Expand Up @@ -104,7 +106,7 @@ def __parser(ds, date_order):
if self.FILENAME_DATE_ORDER:
self.log("info", "Checking document title for date")
for m in re.finditer(DATE_REGEX, title):
date_string = m.group(0)
date_string = m.group(1)

try:
date = __parser(date_string, self.FILENAME_DATE_ORDER)
Expand All @@ -130,7 +132,7 @@ def __parser(ds, date_order):

# Iterate through all regex matches in text and try to parse the date
for m in re.finditer(DATE_REGEX, text):
date_string = m.group(0)
date_string = m.group(1)

try:
date = __parser(date_string, self.DATE_ORDER)
Expand Down

0 comments on commit 1984972

Please sign in to comment.