From 8a0c12652cfc7e4328be7023b0da60c97308b68d Mon Sep 17 00:00:00 2001
From: Jeremy Singer-Vine
Date: Tue, 22 Nov 2022 12:24:10 -0500
Subject: [PATCH] Restore "text" attr to .textboxhorizontal/etc.

Fix regression introduced in 9587cc7 / v0.6.2.
---
 pdfplumber/page.py     | 5 ++++-
 tests/test_laparams.py | 4 ++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/pdfplumber/page.py b/pdfplumber/page.py
index b36a3814..b2f3d42a 100644
--- a/pdfplumber/page.py
+++ b/pdfplumber/page.py
@@ -23,6 +23,7 @@
     LTLine,
     LTPage,
     LTRect,
+    LTTextContainer,
 )
 from pdfminer.pdfinterp import PDFPageInterpreter
 from pdfminer.pdfpage import PDFPage
@@ -211,8 +212,10 @@ def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]:
         attr["object_type"] = kind
         attr["page_number"] = self.page_number
 
-        if isinstance(obj, LTChar):
+        if isinstance(obj, (LTChar, LTTextContainer)):
             attr["text"] = obj.get_text()
+
+        if isinstance(obj, LTChar):
             gs = obj.graphicstate
             attr["stroking_color"] = gs.scolor
             attr["non_stroking_color"] = gs.ncolor
diff --git a/tests/test_laparams.py b/tests/test_laparams.py
index 700ed451..1f11a31e 100644
--- a/tests/test_laparams.py
+++ b/tests/test_laparams.py
@@ -26,6 +26,8 @@ def test_with_laparams(self):
             page = pdf.pages[0]
             assert len(page.textboxhorizontals) == 27
             assert len(page.textlinehorizontals) == 79
+            assert "text" in page.textboxhorizontals[0]
+            assert "text" in page.textlinehorizontals[0]
             assert len(page.chars) == 4408
             assert "anno" not in page.objects.keys()
 
@@ -38,6 +40,8 @@ def test_vertical_texts(self):
             assert len(page.textboxhorizontals) == 74
             assert len(page.textlineverticals) == 11
             assert len(page.textboxverticals) == 6
+            assert "text" in page.textboxverticals[0]
+            assert "text" in page.textlineverticals[0]
 
     def test_issue_383(self):
         with pdfplumber.open(self.path, laparams={}) as pdf: