BUG: Text extraction not working with one glyph to char sequence (#1620)

Fixes #1619
py-pdf · Feb 10, 2023 · f5ac79b · f5ac79b
1 parent cfcba1a
commit f5ac79b
Show file tree

Hide file tree

Showing 2 changed files with 14 additions and 8 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -1826,10 +1826,12 @@ def process_operation(operator: bytes, operands: List) -> None:
  ]
  )
  # "\u0590 - \u08FF \uFB50 - \uFDFF"
- for x in "".join(
- [cmap[1][x] if x in cmap[1] else x for x in t]
- ):
- xx = ord(x)
+ for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
+ # x can be a sequence of bytes; e.g. habibi.pdf
+ if len(x) == 1:
+ xx = ord(x)
+ else:
+ xx = 1
  # fmt: off
  if (
  # cases where the current inserting order is kept

diff --git a/tests/test_page.py b/tests/test_page.py
@@ -877,10 +877,14 @@ def test_empyt_password_1088():
  len(reader.pages)
 
 
-@pytest.mark.xfail(reason="#1088 / #1126")
-def test_arab_text_extraction():
+@pytest.mark.external
+def test_old_habibi():
+ # this habibi has multiple characters associated with the h
  reader = PdfReader(SAMPLE_ROOT / "015-arabic/habibi.pdf")
- assert reader.pages[0].extract_text() == "habibi حَبيبي"
+ txt = reader.pages[0].extract_text() # very odd file
+ assert (
+ "habibi" in txt and "حَبيبي" in txt
+ ) # extract from acrobat reader "حَبيبي habibi􀀃􀏲􀎒􀏴􀎒􀎣􀋴
 
 
 @pytest.mark.samples
@@ -1016,7 +1020,7 @@ def test_merge_resources(apage1, apage2, expected_result, expected_renames):
 
  # Assert
  assert result == expected_result
-  assert renames == expected_renames
+ assert renames == expected_renames
 
 
 def test_merge_page_resources_smoke_test():