Fix wrong ordering of grouping textboxes introduced by #315. The first grouping of textboxes should be skipped if there are intermediate textboxes. (#335) Fixes #334
pdfminer · Nov 10, 2019 · 2bee7d8 · 2bee7d8
1 parent 5c6fa8f
commit 2bee7d8
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 14 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).
 
 ## [Unreleased]
 
-Nothing yet
+### Fixed
+- Wrong order of text box grouping introduced by PR #315 ([#335](https://github.com/pdfminer/pdfminer.six/pull/335))
 
 ## [20191107] - 2019-11-07
 

diff --git a/docs/source/tutorials/highlevel.rst b/docs/source/tutorials/highlevel.rst
@@ -17,23 +17,23 @@ The most simple way to extract text from a PDF is to use
 
     >>> text = extract_text('samples/simple1.pdf')
     >>> print(repr(text))
-    'Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o  \n\nH e l l o  \n\nW o r l d\n\nW o r l d\n\n\x0c'
+    'Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o  \n\nW o r l d\n\nH e l l o  \n\nW o r l d\n\n\x0c'
     >>> print(text)
     ... # doctest: +NORMALIZE_WHITESPACE
     Hello
     <BLANKLINE>
     World
     <BLANKLINE>
-    World
-    <BLANKLINE>
     Hello
     <BLANKLINE>
-    H e l l o
+    World
     <BLANKLINE>
     H e l l o
     <BLANKLINE>
     W o r l d
     <BLANKLINE>
+    H e l l o
+    <BLANKLINE>
     W o r l d
     <BLANKLINE>
 

diff --git a/pdfminer/layout.py b/pdfminer/layout.py
@@ -1,6 +1,7 @@
 import heapq
+import logging
 
-from .utils import INF
+from .utils import INF, shorten_str
 from .utils import Plane
 from .utils import apply_matrix_pt
 from .utils import bbox2str
@@ -9,6 +10,8 @@
 from .utils import matrix2str
 from .utils import uniq
 
+logger = logging.getLogger(__name__)
+
 
 class IndexAssigner(object):
 
@@ -45,7 +48,7 @@ class LAParams(object):
         considered to be part of the same paragraph. The margin is
         specified relative to the height of a line.
     :param boxes_flow: Specifies how much a horizontal and vertical position
-        of a text matters when determining the order of lines. The value
+        of a text matters when determining the order of text boxes. The value
         should be within the range of -1.0 (only horizontal position
         matters) to +1.0 (only vertical position matters).
     :param detect_vertical: If vertical text should be considered during
@@ -505,7 +508,7 @@ def analyze(self, laparams):
         # reorder the objects from top-right to bottom-left.
         self._objs.sort(key=lambda obj:
                            -(1+laparams.boxes_flow)*(obj.x0+obj.x1)
-                           - (1-laparams.boxes_flow)*(obj.y1))
+                           -(1-laparams.boxes_flow)*(obj.y1))
         return
 
 
@@ -562,6 +565,7 @@ def group_objects(self, laparams, objs):
 
                 if ((halign and isinstance(line, LTTextLineHorizontal)) or
                     (valign and isinstance(line, LTTextLineVertical))):
+
                     line.add(obj1)
                 elif line is not None:
                     yield line
@@ -667,18 +671,19 @@ def isany(obj1, obj2):
             obj1 = boxes[i]
             for j in range(i+1, len(boxes)):
                 obj2 = boxes[j]
-                dists.append((True, dist(obj1, obj2), id(obj1), id(obj2), obj1, obj2))
+                dists.append((False, dist(obj1, obj2), id(obj1), id(obj2),
+                              obj1, obj2))
         heapq.heapify(dists)
 
         plane = Plane(self.bbox)
         plane.extend(boxes)
         done = set()
         while len(dists) > 0:
-            (is_first, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
+            (skip_isany, d, id1, id2, obj1, obj2) = heapq.heappop(dists)
             # Skip objects that are already merged
             if (id1 not in done) and (id2 not in done):
-                if is_first and isany(obj1, obj2):
-                    heapq.heappush(dists, (False, d, id1, id2, obj1, obj2))
+                if not skip_isany and isany(obj1, obj2):  # defer only once
+                    heapq.heappush(dists, (True, d, id1, id2, obj1, obj2))
                     continue
                 if isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or \
                         isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL)):

diff --git a/pdfminer/utils.py b/pdfminer/utils.py
@@ -32,6 +32,16 @@ def make_compat_str(in_str):
     return in_str
 
 
+def shorten_str(s, size):  # shorten s to fit size by eliding its middle
+    if size < 7:  # no room for ' ... ' (5 chars) plus one char per side
+        return s[:size]
+    if len(s) > size:
+        length = (size - 5) // 2  # chars kept at each end around ' ... '
+        return '{} ... {}'.format(s[:length], s[-length:])
+    else:
+        return s  # already fits: returned unchanged
+
+
 def compatible_encode_method(bytesorstring, encoding='utf-8', erraction='ignore'):
     """When Py2 str.encode is called, it often means bytes.encode in Py3. This does either."""
     if six.PY2:

diff --git a/tests/test_highlevel_extracttext.py b/tests/test_highlevel_extracttext.py
@@ -11,7 +11,7 @@ def run(sample_path):
 
 
 test_strings = {
-    "simple1.pdf": "Hello \n\nWorld\n\nWorld\n\nHello \n\nH e l l o  \n\nH e l l o  \n\nW o r l d\n\nW o r l d\n\n\f",
+    "simple1.pdf": "Hello \n\nWorld\n\nHello \n\nWorld\n\nH e l l o  \n\nW o r l d\n\nH e l l o  \n\nW o r l d\n\n\f",
     "simple2.pdf": "\f",
     "simple3.pdf": "HelloHello\n\nWorld\n\nWorld\n\n\f",
 }

diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -1,7 +1,7 @@
 from nose.tools import assert_equal
 
 from pdfminer.layout import LTComponent
-from pdfminer.utils import Plane
+from pdfminer.utils import Plane, shorten_str
 
 
 class TestPlane(object):
@@ -38,3 +38,16 @@ def given_plane_with_one_object(object_size=50, gridsize=50):
         obj = LTComponent((0, 0, object_size, object_size))
         plane.add(obj)
         return plane, obj
+
+
+class TestFunctions(object):  # tests for the shorten_str helper
+    def test_shorten_str(self):
+        s = shorten_str('Hello there World', 15)  # input longer than size
+        assert_equal(s, 'Hello ... World')
+
+    def test_shorten_short_str_is_same(self):
+        s = 'Hello World'
+        assert_equal(s, shorten_str(s, 50))  # fits within size: unchanged
+
+    def test_shorten_to_really_short(self):
+        assert_equal('Hello', shorten_str('Hello World', 5))  # size < 7: cut