From 2dc2a842edac6728cccb37385eeef60a5d2bec1e Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Thu, 5 Mar 2020 15:11:39 +0100 Subject: [PATCH 1/2] segment-line: validate intersection with parent --- ocrd_tesserocr/segment_line.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ocrd_tesserocr/segment_line.py b/ocrd_tesserocr/segment_line.py index 1b82d4a..08be82b 100644 --- a/ocrd_tesserocr/segment_line.py +++ b/ocrd_tesserocr/segment_line.py @@ -111,7 +111,19 @@ def process(self): line_poly = Polygon(line_polygon) if not line_poly.within(region_poly): # this could happen due to rotation - line_poly = line_poly.intersection(region_poly).convex_hull + interline = line_poly.intersection(region_poly) + if interline.is_empty: + continue # ignore this line + if hasattr(interline, 'geoms'): + # is (heterogeneous) GeometryCollection + area = 0 + for geom in interline.geoms: + if geom.area > area: + area = geom.area + interline = geom + if not area: + continue + line_poly = interline.convex_hull line_polygon = line_poly.exterior.coords line_polygon = coordinates_for_segment(line_polygon, region_image, region_coords) line_points = points_from_polygon(line_polygon) From a6659f0712582c3461c029f3527ad5f2e34ae007 Mon Sep 17 00:00:00 2001 From: Robert Sachunsky Date: Sun, 5 Apr 2020 02:53:44 +0200 Subject: [PATCH 2/2] segment-table: use sparse text mode --- ocrd_tesserocr/segment_table.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ocrd_tesserocr/segment_table.py b/ocrd_tesserocr/segment_table.py index 3bb8331..6148507 100644 --- a/ocrd_tesserocr/segment_table.py +++ b/ocrd_tesserocr/segment_table.py @@ -141,7 +141,8 @@ def process(self): LOG.info("Detecting table cells in region '%s'", region.id) # # detect the region segments: - tessapi.SetPageSegMode(PSM.AUTO) # treat table like page + tessapi.SetPageSegMode(PSM.SPARSE_TEXT) # retrieve "cells" + # TODO: we should XY-cut the sparse cells and regroup them into 
consistent cells layout = tessapi.AnalyseLayout() roelem = reading_order.get(region.id) if not roelem: