Skip to content

Commit

Permalink
BUG: td matrix (#1373)
Browse files Browse the repository at this point in the history
  • Loading branch information
srogmann authored Oct 9, 2022
1 parent 7faa9b3 commit 50c1b52
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 3 deletions.
9 changes: 7 additions & 2 deletions PyPDF2/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -1490,8 +1490,13 @@ def process_operation(operator: bytes, operands: List) -> None:
# Table 5.5 page 406
elif operator == b"Td":
check_crlf_space = True
tm_matrix[4] += float(operands[0])
tm_matrix[5] += float(operands[1])
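# Td moves the text position by (tx, ty), so the offset has to be
# transformed by the linear part of tm (tm[0..3]) before it is added.
# Special case: for a translation-only matrix,
# tm[0..5] = 1 0 0 1 e f,
# this reduces to tm[4] += tx, tm[5] += ty.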
tx = float(operands[0])
ty = float(operands[1])
tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
elif operator == b"Tm":
check_crlf_space = True
tm_matrix = [
Expand Down
Binary file added resources/Sample_Td-matrix.pdf
Binary file not shown.
28 changes: 27 additions & 1 deletion tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,14 @@ def test_iss_1142():
name = "st2019.pdf"
reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
txt = reader.pages[3].extract_text()
assert txt.find("有限公司郑州分公司") > 0
# The following text spans two different table cells, so it is not
# expected to appear as one contiguous string:
# assert txt.find("有限公司郑州分公司") > 0
# 有限公司 = limited company
# 郑州分公司 = branch office in Zhengzhou
# First cell (see page 4/254):
assert txt.find("郑州药素电子商务有限公司") > 0
# Next cell (first cell in next line):
assert txt.find("郑州分公司") > 0


@pytest.mark.parametrize(
Expand Down Expand Up @@ -604,6 +611,25 @@ def filter_first_table(r):
assert texts.font_dict["/Encoding"] == "/WinAnsiEncoding"
assert text_dat_of_date.font_size == 9.96

# Test 3: Read a table in a document that uses a scaling,
# non-translating Tm operator
reader = PdfReader(RESOURCE_ROOT / "Sample_Td-matrix.pdf")
page_td_model = reader.pages[0]
# Collect the translation part (tm[4], tm[5]) of the text matrix
# reported after each Td operator.
list_Td = []

def visitor_td(op, args, cm, tm):
    """Record the translation part of ``tm`` for every ``Td`` operator.

    Used as the ``visitor_operand_after`` callback of ``extract_text``.
    ``args`` and ``cm`` are accepted to match the callback signature but
    are not used here. Each recorded entry is the pair ``(tm[4], tm[5])``
    appended to ``list_Td``.
    """
    # Ignore every operator except Td.
    if op != b"Td":
        return
    translation = (tm[4], tm[5])
    list_Td.append(translation)

page_td_model.extract_text(visitor_operand_after=visitor_td)
assert len(list_Td) == 4
# Check the translations of the four Td-executions.
assert list_Td[0] == (210.0, 110.0)
assert list_Td[1] == (410.0, 110.0)
assert list_Td[2] == (210.0, 210.0)
assert list_Td[3] == (410.0, 210.0)


@pytest.mark.parametrize(
("pdf_path", "password", "embedded", "unembedded"),
Expand Down

0 comments on commit 50c1b52

Please sign in to comment.