Skip to content

Commit

Permalink
ENH : add UniGB-UTF16 encodings
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Aug 27, 2024
1 parent f55d332 commit 5e9fc28
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
2 changes: 2 additions & 0 deletions pypdf/_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,8 @@ def build_char_map_from_dict(
"/ETenms-B5-V": "cp950",
"/UniCNS-UTF16-H": "utf-16-be",
"/UniCNS-UTF16-V": "utf-16-be",
"/UniGB-UTF16-H": "gb18030",
"/UniGB-UTF16-V": "gb18030",
# UCS2 in code
}

Expand Down
14 changes: 12 additions & 2 deletions tests/test_cmap.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,6 @@ def test_eten_b5():
reader.pages[0].extract_text().startswith("1/7 \n富邦新終身壽險")


@pytest.mark.enable_socket()
def test_missing_entries_in_cmap():
"""
Issue #2702: this issue is observed on damaged pdfs
Expand All @@ -231,10 +230,21 @@ def test_missing_entries_in_cmap():


def test_null_missing_width():
    """
    Coverage for #2792: run text extraction on a page whose font has an
    empty /Widths array and a null /MissingWidth in its font descriptor.
    """
    writer = PdfWriter(RESOURCE_ROOT / "crazyones.pdf")
    first_page = writer.pages[0]
    font = first_page["/Resources"]["/Font"]["/F1"]
    # Replace the width table with an empty array.
    font[NameObject("/Widths")] = ArrayObject()
    descriptor = font["/FontDescriptor"]
    descriptor[NameObject("/MissingWidth")] = NullObject()
    # No assertion: the test passes as long as extraction does not raise.
    first_page.extract_text()


@pytest.mark.enable_socket()
def test_unigb_utf16():
    """
    Cf #2812: extract the text of the second page of a downloaded PDF and
    check that an expected Chinese phrase is present.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/16767536/W020240105322424121296.pdf",
        name="iss2812.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))
    # pages[1] is the second page of the document.
    extracted = reader.pages[1].extract_text()
    assert "《中国能源展望 2060(2024 年版)》编写委员会" in extracted

0 comments on commit 5e9fc28

Please sign in to comment.