Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

Don't parse dates with more than 4 digits for the year #556

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions src/documents/parsers.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,11 @@
# - MONTH ZZZZ, with ZZZZ being 4 digits
# - MONTH XX, ZZZZ with XX being 1 or 2 and ZZZZ being 4 digits
DATE_REGEX = re.compile(
    # Each alternative below is wrapped in the same pair of guards:
    #   leading  (\b|(?<=([_-])))  a word boundary, or directly after "_"/"-"
    #   trailing (\b|(?=([_-])))   a word boundary, or directly before "_"/"-"
    # The leading guard must be a lookbehind, "(?<=...)". The earlier
    # spelling "(?!=...)" was a negative lookahead for a literal "=", which
    # succeeds almost everywhere and therefore let a date start in the middle
    # of a longer digit run (e.g. "12020-04-23" matched as "2020-04-23").
    #
    # Alternatives, in order:
    #   1. 1-2 digits, separator, 1-2 digits, separator, 4 or 2 digits
    #   2. 4 or 2 digits, separator, 1-2 digits, separator, 1-2 digits
    #   3. 1-2 digits, dots/spaces, a 3-9 character word, space, 4 or 2 digits
    #   4. a 3-9 letter word, space, 1-2 digits, comma, space, 4 digits
    #   5. a 3-9 letter word, space, 4 digits
    # where a separator is one of ".", "/" or "-".
    r'(\b|(?<=([_-])))([0-9]{1,2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{4}|[0-9]{2})(\b|(?=([_-])))|' +  # NOQA: E501
    r'(\b|(?<=([_-])))([0-9]{4}|[0-9]{2})[\.\/-]([0-9]{1,2})[\.\/-]([0-9]{1,2})(\b|(?=([_-])))|' +  # NOQA: E501
    r'(\b|(?<=([_-])))([0-9]{1,2}[\. ]+[^ ]{3,9} ([0-9]{4}|[0-9]{2}))(\b|(?=([_-])))|' +  # NOQA: E501
    r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{1,2}, ([0-9]{4}))(\b|(?=([_-])))|' +
    r'(\b|(?<=([_-])))([^\W\d_]{3,9} [0-9]{4})(\b|(?=([_-])))'
)


Expand Down
24 changes: 24 additions & 0 deletions src/paperless_tesseract/tests/test_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,3 +182,27 @@ def test_crazy_date_past(self, *args):
document = RasterisedDocumentParser("/dev/null")
document.get_text()
self.assertIsNone(document.get_date())

# Table of raw text snippets mapped to the result expected from get_date():
# either a "YYYY MM DD" string (split into integers and turned into a
# datetime by test_date_format_bulk) or None when no date should be found.
EXTRA = {
    "123/04/2020/3423": None,
    "-23/04/2020-foo": "2020 04 23",
    "-23-04-2020-blurb": "2020 04 23",
    # Disabled case: according to the original author this input is read as
    # month 23, day 04, which is not a valid date.
    # "-2020-04-23-bar": "2020 04 23",
    "12020-04-23-": None,
    "-2020-04-234": None,
}

@mock.patch(MOCK_SCRATCH, SCRATCH)
def test_date_format_bulk(self):
    """Check get_date() against every entry of the EXTRA table.

    Each key of EXTRA is assigned to the parser's ``_text`` attribute
    instead of being read from a file. The value is either None (no date
    expected) or a whitespace-separated string of integers that is
    expanded into a datetime in the configured time zone.
    """
    timezone = tz.gettz(settings.TIME_ZONE)
    # Loop-invariant: every case is constructed with the same path.
    input_file = os.path.join(self.SAMPLE_FILES, "")

    for text, expected in self.EXTRA.items():
        # subTest reports each case separately and keeps iterating, so a
        # single failing entry no longer hides the remaining ones.
        with self.subTest(text=text):
            if expected is not None:
                parts = [int(part) for part in expected.split()]
                expected = datetime.datetime(*parts, tzinfo=timezone)

            document = RasterisedDocumentParser(input_file)
            document._text = text
            message = "Test case {!r}".format(text)
            self.assertEqual(document.get_date(), expected, msg=message)