From b1d4ea1fb4364336f84f1f3add19163aab2084a6 Mon Sep 17 00:00:00 2001
From: Martin Thoma
Date: Sun, 17 Jul 2022 09:58:31 +0200
Subject: [PATCH] TST: Add xfail test for IndexError when extracting text (#1124)

See #1091
---
 tests/test_page.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/tests/test_page.py b/tests/test_page.py
index fc7c2a71a..656c5111f 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -322,3 +322,14 @@ def test_get_fonts(pdf_path, password, embedded, unembedded):
         a = a.union(a_tmp)
         b = b.union(b_tmp)
     assert (a, b) == (embedded, unembedded)
+
+
+@pytest.mark.xfail(reason="#1091")
+def test_text_extraction_issue_1091():
+    url = "https://corpora.tika.apache.org/base/docs/govdocs1/966/966635.pdf"
+    name = "tika-966635.pdf"
+    stream = BytesIO(get_pdf_from_url(url, name=name))
+    with pytest.warns(PdfReadWarning):
+        reader = PdfReader(stream)
+    for page in reader.pages:
+        page.extract_text()