Merge pull request #15 from deepopinion/feature/date-range-heuristic

Add a heuristic for splitting merged date ranges in OCR results (GoogleOCR and GoogleAzureOCR)
deepopinion · Mar 26, 2024 · 2e304f6 · 2e304f6
2 parents 1b8391e + a5dd409
commit 2e304f6
Show file tree

Hide file tree

Showing 5 changed files with 116 additions and 35 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -4,7 +4,7 @@ The version numbers are according to [Semantic Versioning](http://semver.org/).
 
 ## Release v0.0.x ()
 ### Added
-
+- Added a heuristic that splits date ranges (dd.mm.yyyy-dd.mm.yyyy or dd/mm/yyyy-dd/mm/yyyy) which GoogleOCR (and thus GoogleAzureOCR as well) merged into a single bounding box. Each such box is split into three distinct bounding boxes (first date, "-", second date).
 ### Fixed
 
 ### Changed

diff --git a/ocr_wrapper/data_clean_utils.py b/ocr_wrapper/data_clean_utils.py
@@ -0,0 +1,57 @@
+from __future__ import annotations
+
+from .bbox import BBox
+import re
+from .bbox_utils import split_bbox
+from typing import Optional
+
+
+def split_date_boxes(bboxes: list[BBox], confidences: Optional[list] = None) -> list[BBox]:
+    """
+    Splits date boxes that contain a date range of the format "dd/mm/yyyy - dd/mm/yyyy" into three separate boxes.
+
+    Args:
+        bboxes (list[BBox]): The bboxes to filter.
+
+    Returns:
+        list[BBox]: The filtered bboxes.
+    """
+    if confidences is not None and len(bboxes) != len(confidences):
+        raise ValueError("The length of the bboxes and confidences lists must be equal.")
+
+    # Create dummy confidences if none are given. Makes the rest of the code more consistent
+    if confidences is None:
+        working_confidences = [0 for i in range(len(bboxes))]
+    else:
+        working_confidences = confidences
+
+    date_range_pattern = (
+        r"^\s*\d{1,2}\s*[/\.]\s*\d{1,2}\s*[/\.]\s*\d{4}\s*-\s*\d{1,2}\s*[/\.]\s*\d{1,2}\s*[/\.]\s*\d{4}\s*$"
+    )
+
+    filtered_bboxes = []
+    new_confidences = []
+    for bbox, confidence in zip(bboxes, working_confidences):
+        text = bbox.text
+        if text is not None and re.match(date_range_pattern, text):
+            date1, date2 = text.split("-")
+            date1, date2 = date1.strip(), date2.strip()
+            # Info: The split points have been determined empirically
+            bbox1, bbox2 = split_bbox(bbox, 0.49)
+            bbox1_2, bbox2_2 = split_bbox(bbox2, 0.07)  # Split the second bbox again to get a box for the "-"
+            bbox1.text = date1
+            bbox1_2.text = "-"
+            bbox2_2.text = date2
+            filtered_bboxes.append(bbox1)
+            filtered_bboxes.append(bbox1_2)
+            filtered_bboxes.append(bbox2_2)
+            # Confidences are just repeated three times for the three new boxes
+            new_confidences.extend([confidence, confidence, confidence])
+        else:
+            filtered_bboxes.append(bbox)
+            new_confidences.append(confidence)
+
+    if confidences is None:
+        return filtered_bboxes
+    else:
+        return filtered_bboxes, new_confidences
diff --git a/ocr_wrapper/google_azure_ocr.py b/ocr_wrapper/google_azure_ocr.py
@@ -18,13 +18,16 @@
 from PIL import Image
 
 from ocr_wrapper import AzureOCR, BBox, GoogleOCR
-from ocr_wrapper.google_document_ocr_checkbox_detector import GoogleDocumentOcrCheckboxDetector
+from ocr_wrapper.google_document_ocr_checkbox_detector import (
+    GoogleDocumentOcrCheckboxDetector,
+)
 from ocr_wrapper.ocr_wrapper import rotate_image
 from ocr_wrapper.tilt_correction import correct_tilt
 
 from .bbox_order import get_ordered_bboxes_idxs
-from .bbox_utils import split_bbox
+
 from .utils import get_img_hash
+from .data_clean_utils import split_date_boxes
 
 
 class GoogleAzureOCR:
@@ -112,7 +115,7 @@ def ocr(self, img: Image.Image, return_extra: bool = False) -> Union[list[BBox],
         azure_bboxes = [bbox.rotate(google_rotation_angle) for bbox in azure_bboxes]
         if self.add_checkboxes:
             checkbox_bboxes = [bbox.rotate(google_rotation_angle) for bbox in checkbox_bboxes]
-        azure_bboxes = _split_azure_date_boxes(azure_bboxes)
+        azure_bboxes = split_date_boxes(azure_bboxes)
         img = rotate_image(img, google_rotation_angle)
 
         # Remove unwanted bboxes from Google OCR result
@@ -406,34 +409,3 @@ def _filter_unwanted_google_bboxes(bboxes: list[BBox], width_height_ratio: float
             filtered_bboxes.append(bbox)
     filtered_bboxes = _filter_date_boxes(filtered_bboxes)
     return filtered_bboxes
-
-
-def _split_azure_date_boxes(bboxes: list[BBox]) -> list[BBox]:
-    """
-    Splits date boxes that contain a date range of the format "dd/mm/yyyy - dd/mm/yyyy" into three separate boxes.
-
-    Args:
-        bboxes (list[BBox]): The bboxes to filter.
-
-    Returns:
-        list[BBox]: The filtered bboxes.
-    """
-    date_range_pattern = r"^\s*\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\s*-\s*\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\s*$"
-    filtered_bboxes = []
-    for bbox in bboxes:
-        text = bbox.text
-        if text is not None and re.match(date_range_pattern, text):
-            date1, date2 = text.split("-")
-            # Info: The split points have been determined empirically
-            bbox1, bbox2 = split_bbox(bbox, 0.49)
-            bbox1_2, bbox2_2 = split_bbox(bbox2, 0.07)  # Split the second bbox again to get a box for the "-"
-            bbox1.text = date1
-            bbox1_2.text = "-"
-            bbox2_2.text = date2
-            filtered_bboxes.append(bbox1)
-            filtered_bboxes.append(bbox1_2)
-            filtered_bboxes.append(bbox2_2)
-        else:
-            filtered_bboxes.append(bbox)
-
-    return filtered_bboxes
diff --git a/ocr_wrapper/ocr_wrapper.py b/ocr_wrapper/ocr_wrapper.py
@@ -17,6 +17,7 @@
 from .bbox import BBox
 from .compat import bboxs2dicts, dicts2bboxs
 from .tilt_correction import correct_tilt
+from .data_clean_utils import split_date_boxes
 
 
 def rotate_image(image: Image.Image, angle: int) -> Image.Image:
@@ -113,6 +114,10 @@ def ocr(self, img: Image.Image, return_extra: bool = False) -> Union[list[BBox],
             # Rotate boxes. The given rotation will be done counter-clockwise
             bboxes = [bbox.rotate(angle) for bbox in bboxes]
 
+        # Split date-range boxes
+        bboxes, confidences = split_date_boxes(bboxes, extra["confidences"][0])
+        extra["confidences"] = [confidences]
+
         if return_extra:
             return bboxes, extra
         return bboxes

diff --git a/tests/test_data_clean_utils.py b/tests/test_data_clean_utils.py
@@ -0,0 +1,47 @@
+import pytest
+
+from ocr_wrapper.data_clean_utils import split_date_boxes
+from ocr_wrapper import BBox
+
+
+@pytest.mark.parametrize(
+    "inpt, expected_texts",
+    [
+        (
+            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021 - 01/01/2022"),
+            ["01/01/2021", "-", "01/01/2022"],
+        ),
+        (
+            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021 - 01.01.2022"),
+            ["01.01.2021", "-", "01.01.2022"],
+        ),
+        (
+            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021-01/01/2022"),
+            ["01/01/2021", "-", "01/01/2022"],
+        ),
+        (
+            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021-01.01.2022"),
+            ["01.01.2021", "-", "01.01.2022"],
+        ),
+        (
+            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021 -01/01/2022"),
+            ["01/01/2021", "-", "01/01/2022"],
+        ),
+        (
+            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021 -01.01.2022"),
+            ["01.01.2021", "-", "01.01.2022"],
+        ),
+        (
+            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021- 01/01/2022"),
+            ["01/01/2021", "-", "01/01/2022"],
+        ),
+        (
+            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021- 01.01.2022"),
+            ["01.01.2021", "-", "01.01.2022"],
+        ),
+    ],
+)
+def test_split_date_boxes(inpt, expected_texts):
+    results = split_date_boxes([inpt])
+    for res, expected_text in zip(results, expected_texts):
+        assert res.text == expected_text