Skip to content

Commit

Permalink
Merge pull request #15 from deepopinion/feature/date-range-heuristic
Browse files Browse the repository at this point in the history
Add a heuristic for splitting merged date ranges in OCR results (GoogleOCR and GoogleAzureOCR)
  • Loading branch information
Paethon authored Mar 26, 2024
2 parents 1b8391e + a5dd409 commit 2e304f6
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 35 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ The version numbers are according to [Semantic Versioning](http://semver.org/).

## Release v0.0.x ()
### Added

- Added a heuristic that splits date ranges (`dd.mm.yyyy-dd.mm.yyyy` or `dd/mm/yyyy-dd/mm/yyyy`) which GoogleOCR (and therefore also GoogleAzureOCR) merged into a single bounding box into three distinct bounding boxes: the first date, the hyphen, and the second date.
### Fixed

### Changed
Expand Down
57 changes: 57 additions & 0 deletions ocr_wrapper/data_clean_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

from .bbox import BBox
import re
from .bbox_utils import split_bbox
from typing import Optional


def split_date_boxes(
    bboxes: list[BBox], confidences: Optional[list] = None
) -> list[BBox] | tuple[list[BBox], list]:
    """
    Splits boxes whose text is a date range into three separate boxes (first date, "-", second date).

    A box is split only when its complete text is a date range such as "dd/mm/yyyy - dd/mm/yyyy" or
    "dd.mm.yyyy-dd.mm.yyyy" (one or two digits for day and month, four digits for the year, optional
    whitespace around every separator). All other boxes, including boxes without text, are passed
    through unchanged and keep their position in the list.

    Args:
        bboxes (list[BBox]): The bboxes to process.
        confidences (Optional[list]): One confidence value per bbox. When given, the confidence of a
            split box is repeated for each of the three boxes it is replaced with.
    Returns:
        list[BBox]: The processed bboxes, if no confidences were given.
        tuple[list[BBox], list]: The processed bboxes and their matching confidences, if confidences
            were given.
    Raises:
        ValueError: If confidences are given but their number differs from the number of bboxes.
    """
    if confidences is not None and len(bboxes) != len(confidences):
        raise ValueError("The length of the bboxes and confidences lists must be equal.")

    # Use dummy confidences if none are given, so the loop below needs no special casing
    working_confidences = [0] * len(bboxes) if confidences is None else confidences

    # Compiled once, outside the loop. The anchors make sure the *whole* text is a date range.
    date_range_re = re.compile(
        r"^\s*\d{1,2}\s*[/\.]\s*\d{1,2}\s*[/\.]\s*\d{4}\s*-\s*\d{1,2}\s*[/\.]\s*\d{1,2}\s*[/\.]\s*\d{4}\s*$"
    )

    filtered_bboxes = []
    new_confidences = []
    for bbox, confidence in zip(bboxes, working_confidences):
        text = bbox.text
        if text is None or not date_range_re.match(text):
            filtered_bboxes.append(bbox)
            new_confidences.append(confidence)
            continue

        # The pattern allows exactly one "-", so the split always yields two parts
        date1, date2 = (part.strip() for part in text.split("-"))
        # Info: The split points have been determined empirically
        # NOTE(review): presumably the second argument of split_bbox is the relative position of the
        # cut along the box - confirm in bbox_utils
        bbox1, bbox2 = split_bbox(bbox, 0.49)
        bbox1_2, bbox2_2 = split_bbox(bbox2, 0.07)  # Split the second bbox again to get a box for the "-"
        bbox1.text = date1
        bbox1_2.text = "-"
        bbox2_2.text = date2
        filtered_bboxes.extend([bbox1, bbox1_2, bbox2_2])
        # Confidences are just repeated three times for the three new boxes
        new_confidences.extend([confidence, confidence, confidence])

    # The return shape mirrors the call: plain list without confidences, tuple with them
    if confidences is None:
        return filtered_bboxes
    return filtered_bboxes, new_confidences
40 changes: 6 additions & 34 deletions ocr_wrapper/google_azure_ocr.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,16 @@
from PIL import Image

from ocr_wrapper import AzureOCR, BBox, GoogleOCR
from ocr_wrapper.google_document_ocr_checkbox_detector import GoogleDocumentOcrCheckboxDetector
from ocr_wrapper.google_document_ocr_checkbox_detector import (
GoogleDocumentOcrCheckboxDetector,
)
from ocr_wrapper.ocr_wrapper import rotate_image
from ocr_wrapper.tilt_correction import correct_tilt

from .bbox_order import get_ordered_bboxes_idxs
from .bbox_utils import split_bbox

from .utils import get_img_hash
from .data_clean_utils import split_date_boxes


class GoogleAzureOCR:
Expand Down Expand Up @@ -112,7 +115,7 @@ def ocr(self, img: Image.Image, return_extra: bool = False) -> Union[list[BBox],
azure_bboxes = [bbox.rotate(google_rotation_angle) for bbox in azure_bboxes]
if self.add_checkboxes:
checkbox_bboxes = [bbox.rotate(google_rotation_angle) for bbox in checkbox_bboxes]
azure_bboxes = _split_azure_date_boxes(azure_bboxes)
azure_bboxes = split_date_boxes(azure_bboxes)
img = rotate_image(img, google_rotation_angle)

# Remove unwanted bboxes from Google OCR result
Expand Down Expand Up @@ -406,34 +409,3 @@ def _filter_unwanted_google_bboxes(bboxes: list[BBox], width_height_ratio: float
filtered_bboxes.append(bbox)
filtered_bboxes = _filter_date_boxes(filtered_bboxes)
return filtered_bboxes


def _split_azure_date_boxes(bboxes: list[BBox]) -> list[BBox]:
"""
Splits date boxes that contain a date range of the format "dd/mm/yyyy - dd/mm/yyyy" into three separate boxes.
Args:
bboxes (list[BBox]): The bboxes to filter.
Returns:
list[BBox]: The filtered bboxes.
"""
date_range_pattern = r"^\s*\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\s*-\s*\d{1,2}\s*/\s*\d{1,2}\s*/\s*\d{4}\s*$"
filtered_bboxes = []
for bbox in bboxes:
text = bbox.text
if text is not None and re.match(date_range_pattern, text):
date1, date2 = text.split("-")
# Info: The split points have been determined empirically
bbox1, bbox2 = split_bbox(bbox, 0.49)
bbox1_2, bbox2_2 = split_bbox(bbox2, 0.07) # Split the second bbox again to get a box for the "-"
bbox1.text = date1
bbox1_2.text = "-"
bbox2_2.text = date2
filtered_bboxes.append(bbox1)
filtered_bboxes.append(bbox1_2)
filtered_bboxes.append(bbox2_2)
else:
filtered_bboxes.append(bbox)

return filtered_bboxes
5 changes: 5 additions & 0 deletions ocr_wrapper/ocr_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .bbox import BBox
from .compat import bboxs2dicts, dicts2bboxs
from .tilt_correction import correct_tilt
from .data_clean_utils import split_date_boxes


def rotate_image(image: Image.Image, angle: int) -> Image.Image:
Expand Down Expand Up @@ -113,6 +114,10 @@ def ocr(self, img: Image.Image, return_extra: bool = False) -> Union[list[BBox],
# Rotate boxes. The given rotation will be done counter-clockwise
bboxes = [bbox.rotate(angle) for bbox in bboxes]

# Split date-range boxes
bboxes, confidences = split_date_boxes(bboxes, extra["confidences"][0])
extra["confidences"] = [confidences]

if return_extra:
return bboxes, extra
return bboxes
Expand Down
47 changes: 47 additions & 0 deletions tests/test_data_clean_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import pytest

from ocr_wrapper.data_clean_utils import split_date_boxes
from ocr_wrapper import BBox


@pytest.mark.parametrize(
    "inpt, expected_texts",
    [
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021 - 01/01/2022"),
            ["01/01/2021", "-", "01/01/2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021 - 01.01.2022"),
            ["01.01.2021", "-", "01.01.2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021-01/01/2022"),
            ["01/01/2021", "-", "01/01/2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021-01.01.2022"),
            ["01.01.2021", "-", "01.01.2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021 -01/01/2022"),
            ["01/01/2021", "-", "01/01/2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021 -01.01.2022"),
            ["01.01.2021", "-", "01.01.2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01/01/2021- 01/01/2022"),
            ["01/01/2021", "-", "01/01/2022"],
        ),
        (
            BBox(0, 0, 1, 0, 1, 1, 0, 1, text="01.01.2021- 01.01.2022"),
            ["01.01.2021", "-", "01.01.2022"],
        ),
    ],
)
def test_split_date_boxes(inpt, expected_texts):
    """A single date-range box is replaced by exactly the expected boxes, in order."""
    results = split_date_boxes([inpt])
    # Compare the complete list of texts: zipping results with the expectation would stop at the
    # shorter of the two and therefore pass on an empty or too short result.
    assert [res.text for res in results] == expected_texts

0 comments on commit 2e304f6

Please sign in to comment.