From 8010d72f189a7a037040eef956e21ca9023bea94 Mon Sep 17 00:00:00 2001 From: Daniel Quinn Date: Mon, 1 Oct 2018 20:03:27 +0100 Subject: [PATCH] Tweak the date guesser to reject dates in or before the year 1900 (#414) --- src/paperless_tesseract/parsers.py | 25 ++++++++++++++++------ src/paperless_tesseract/tests/test_date.py | 13 +++++++++++ 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/paperless_tesseract/parsers.py b/src/paperless_tesseract/parsers.py index f54461161..5305ff053 100644 --- a/src/paperless_tesseract/parsers.py +++ b/src/paperless_tesseract/parsers.py @@ -203,6 +203,7 @@ def _assemble_ocr_sections(self, imgs, middle, text): return text def get_date(self): + date = None datestring = None @@ -217,20 +218,30 @@ def get_date(self): try: date = dateparser.parse( - datestring, - settings={'DATE_ORDER': self.DATE_ORDER, - 'PREFER_DAY_OF_MONTH': 'first', - 'RETURN_AS_TIMEZONE_AWARE': True}) + datestring, + settings={ + "DATE_ORDER": self.DATE_ORDER, + "PREFER_DAY_OF_MONTH": "first", + "RETURN_AS_TIMEZONE_AWARE": True + } + ) except TypeError: # Skip all matches that do not parse to a proper date continue - if date is not None: + if date is not None and date.year > 1900: break + else: + date = None if date is not None: - self.log("info", "Detected document date " + date.isoformat() + - " based on string " + datestring) + self.log( + "info", + "Detected document date {} based on string {}".format( + date.isoformat(), + datestring + ) + ) else: self.log("info", "Unable to detect date for document") diff --git a/src/paperless_tesseract/tests/test_date.py b/src/paperless_tesseract/tests/test_date.py index 645cb70ff..e75042ce1 100644 --- a/src/paperless_tesseract/tests/test_date.py +++ b/src/paperless_tesseract/tests/test_date.py @@ -384,3 +384,16 @@ def test_get_text_9_pdf(self): document.get_date(), datetime.datetime(2017, 12, 31, 0, 0, tzinfo=tz.tzutc()) ) + + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.get_text", 
return_value="01-07-0590 00:00:00" + ) + @mock.patch( + "paperless_tesseract.parsers.RasterisedDocumentParser.SCRATCH", + SCRATCH + ) + def test_crazy_date(self, *args): + document = RasterisedDocumentParser("/dev/null") + document.get_text() + self.assertIsNone(document.get_date())