Skip to content

Commit

Permalink
BUG: Text extraction not working with one glyph to char sequence (#1620)
Browse files Browse the repository at this point in the history
Fixes #1619
  • Loading branch information
pubpub-zz authored Feb 10, 2023
1 parent cfcba1a commit f5ac79b
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 8 deletions.
10 changes: 6 additions & 4 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1826,10 +1826,12 @@ def process_operation(operator: bytes, operands: List) -> None:
]
)
# "\u0590 - \u08FF \uFB50 - \uFDFF"
for x in "".join(
[cmap[1][x] if x in cmap[1] else x for x in t]
):
xx = ord(x)
for x in [cmap[1][x] if x in cmap[1] else x for x in t]:
# x can be a sequence of several characters (one glyph mapped to a char sequence); ex: habibi.pdf
if len(x) == 1:
xx = ord(x)
else:
xx = 1
# fmt: off
if (
# cases where the current inserting order is kept
Expand Down
12 changes: 8 additions & 4 deletions tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -877,10 +877,14 @@ def test_empyt_password_1088():
len(reader.pages)


@pytest.mark.xfail(reason="#1088 / #1126")
def test_arab_text_extraction():
@pytest.mark.external
def test_old_habibi():
# this habibi has some multiple characters associated with the h
reader = PdfReader(SAMPLE_ROOT / "015-arabic/habibi.pdf")
assert reader.pages[0].extract_text() == "habibi حَبيبي"
txt = reader.pages[0].extract_text() # very odd file
assert (
"habibi" in txt and "حَبيبي" in txt
) # extract from acrobat reader "حَبيبي habibi􀀃􀏲􀎒􀏴􀎒􀎣􀋴


@pytest.mark.samples
Expand Down Expand Up @@ -1016,7 +1020,7 @@ def test_merge_resources(apage1, apage2, expected_result, expected_renames):

# Assert
assert result == expected_result
assert renames == expected_renames
assert renames == expected_renames


def test_merge_page_resources_smoke_test():
Expand Down

0 comments on commit f5ac79b

Please sign in to comment.