# Matches a complete date range such as "dd/mm/yyyy - dd/mm/yyyy" or "dd.mm.yyyy-dd.mm.yyyy".
# Compiled once at import time so the loop below does not repeat the pattern lookup.
_DATE_RANGE_PATTERN = re.compile(
    r"^\s*\d{1,2}\s*[/\.]\s*\d{1,2}\s*[/\.]\s*\d{4}\s*-\s*\d{1,2}\s*[/\.]\s*\d{1,2}\s*[/\.]\s*\d{4}\s*$"
)


def split_date_boxes(
    bboxes: list[BBox], confidences: Optional[list] = None
) -> list[BBox] | tuple[list[BBox], list]:
    """
    Splits boxes that contain a date range into three separate boxes.

    A box whose text is a date range of the format "dd/mm/yyyy - dd/mm/yyyy" or
    "dd.mm.yyyy - dd.mm.yyyy" (whitespace around the separators is optional) is replaced by
    three boxes carrying the first date, the "-" and the second date. All other boxes,
    including boxes whose text is None, are passed through unchanged and in their original order.

    Args:
        bboxes (list[BBox]): The bboxes to process.
        confidences (Optional[list]): One confidence per bbox. When a bbox is split, its
            confidence is repeated for each of the three resulting boxes.

    Returns:
        list[BBox]: The processed bboxes, if no confidences were given.
        tuple[list[BBox], list]: The processed bboxes and the matching confidences,
            if confidences were given.

    Raises:
        ValueError: If confidences are given and their number differs from the number of bboxes.
    """
    if confidences is not None and len(bboxes) != len(confidences):
        raise ValueError("The length of the bboxes and confidences lists must be equal.")

    # Dummy confidences keep the loop below free of special cases; they are never returned.
    working_confidences = [0] * len(bboxes) if confidences is None else confidences

    split_bboxes = []
    split_confidences = []
    for bbox, confidence in zip(bboxes, working_confidences):
        text = bbox.text
        if text is None or not _DATE_RANGE_PATTERN.match(text):
            split_bboxes.append(bbox)
            split_confidences.append(confidence)
            continue

        # The pattern guarantees exactly one "-" in the text, so this yields both dates.
        first_date, _, second_date = text.partition("-")
        # Info: The split points have been determined empirically
        first_box, remainder = split_bbox(bbox, 0.49)
        # Split the second bbox again to get a box for the "-"
        dash_box, second_box = split_bbox(remainder, 0.07)
        first_box.text = first_date.strip()
        dash_box.text = "-"
        second_box.text = second_date.strip()
        split_bboxes.extend((first_box, dash_box, second_box))
        # Confidences are just repeated three times for the three new boxes
        split_confidences.extend((confidence, confidence, confidence))

    if confidences is None:
        return split_bboxes
    return split_bboxes, split_confidences
class GoogleAzureOCR: @@ -112,7 +115,7 @@ def ocr(self, img: Image.Image, return_extra: bool = False) -> Union[list[BBox], azure_bboxes = [bbox.rotate(google_rotation_angle) for bbox in azure_bboxes] if self.add_checkboxes: checkbox_bboxes = [bbox.rotate(google_rotation_angle) for bbox in checkbox_bboxes] - azure_bboxes = _split_azure_date_boxes(azure_bboxes) + azure_bboxes = split_date_boxes(azure_bboxes) img = rotate_image(img, google_rotation_angle) # Remove unwanted bboxes from Google OCR result @@ -406,34 +409,3 @@ def _filter_unwanted_google_bboxes(bboxes: list[BBox], width_height_ratio: float filtered_bboxes.append(bbox) filtered_bboxes = _filter_date_boxes(filtered_bboxes) return filtered_bboxes - - -def _split_azure_date_boxes(bboxes: list[BBox]) -> list[BBox]: - """ - Splits date boxes that contain a date range of the format "dd/mm/yyyy - dd/mm/yyyy" into three separate boxes. - - Args: - bboxes (list[BBox]): The bboxes to filter. - - Returns: - list[BBox]: The filtered bboxes. 
- """ - date_range_pattern = r"^\s*\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\s*-\s*\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\s*$" - filtered_bboxes = [] - for bbox in bboxes: - text = bbox.text - if text is not None and re.match(date_range_pattern, text): - date1, date2 = text.split("-") - # Info: The split points have been determined empirically - bbox1, bbox2 = split_bbox(bbox, 0.49) - bbox1_2, bbox2_2 = split_bbox(bbox2, 0.07) # Split the second bbox again to get a box for the "-" - bbox1.text = date1 - bbox1_2.text = "-" - bbox2_2.text = date2 - filtered_bboxes.append(bbox1) - filtered_bboxes.append(bbox1_2) - filtered_bboxes.append(bbox2_2) - else: - filtered_bboxes.append(bbox) - - return filtered_bboxes diff --git a/ocr_wrapper/ocr_wrapper.py b/ocr_wrapper/ocr_wrapper.py index aaf5ffd..61da773 100644 --- a/ocr_wrapper/ocr_wrapper.py +++ b/ocr_wrapper/ocr_wrapper.py @@ -17,6 +17,7 @@ from .bbox import BBox from .compat import bboxs2dicts, dicts2bboxs from .tilt_correction import correct_tilt +from .data_clean_utils import split_date_boxes def rotate_image(image: Image.Image, angle: int) -> Image.Image: @@ -113,6 +114,10 @@ def ocr(self, img: Image.Image, return_extra: bool = False) -> Union[list[BBox], # Rotate boxes. 
@pytest.mark.parametrize(
    "inpt, expected_texts",
    [
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021 - 01/01/2022"),
            ["01/01/2021", "-", "01/01/2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021 - 01.01.2022"),
            ["01.01.2021", "-", "01.01.2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021-01/01/2022"),
            ["01/01/2021", "-", "01/01/2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021-01.01.2022"),
            ["01.01.2021", "-", "01.01.2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021 -01/01/2022"),
            ["01/01/2021", "-", "01/01/2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021 -01.01.2022"),
            ["01.01.2021", "-", "01.01.2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021- 01/01/2022"),
            ["01/01/2021", "-", "01/01/2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021- 01.01.2022"),
            ["01.01.2021", "-", "01.01.2022"],
        ),
    ],
)
def test_split_date_boxes(inpt, expected_texts):
    """A single date-range box is split into first date, "-" and second date, in that order."""
    results = split_date_boxes([inpt])
    # Compare the whole list rather than zipping: zip() stops at the shorter sequence, so a
    # result with missing boxes (or no boxes at all) would otherwise pass unnoticed.
    assert [res.text for res in results] == expected_texts