Skip to content

Commit

Permalink
Fix performance issues with large embedded base64 images
Browse files Browse the repository at this point in the history
Certain PDF libraries embed images as base64 strings. This causes performance issues
in `read_string_from_stream`, which builds the string by incremental concatenation, one byte at a time.

The PDF library in our case is:
```
<xmp:CreatorTool>Canon iR-ADV C256  PDF</xmp:CreatorTool>
<pdf:Producer>PDF Annotator 8.0.0.826 [Adobe PSL 1.3e for Canon</pdf:Producer>

```
  • Loading branch information
mergezalot committed Sep 16, 2022
1 parent 7c96d13 commit 01c1956
Showing 1 changed file with 8 additions and 4 deletions.
12 changes: 8 additions & 4 deletions PyPDF2/generic/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def read_string_from_stream(
) -> Union["TextStringObject", "ByteStringObject"]:
tok = stream.read(1)
parens = 1
txt = b""
txt = []
while True:
tok = stream.read(1)
if not tok:
Expand Down Expand Up @@ -97,17 +97,21 @@ def read_string_from_stream(
# This case is hit when a backslash followed by a line
# break occurs. If it's a multi-char EOL, consume the
# second character:
prev=tok
tok = stream.read(1)
if tok not in b"\n\r":
stream.seek(-1, 1)

# Then don't add anything to the actual string, since this
# line break was escaped:
tok = b""
else:
msg = rf"Unexpected escaped string: {tok.decode('utf8')}"
logger_warning(msg, __name__)
txt += tok
return create_string_object(txt, forced_encoding)
txt.append(tok)
if stream.tell() % 10000 == 1:
print(stream.tell(), parens, b''.join(txt[:50]))
return create_string_object(b''.join(txt), forced_encoding)


def create_string_object(
Expand Down Expand Up @@ -164,7 +168,7 @@ def decode_pdfdocencoding(byte_array: bytes) -> str:
raise UnicodeDecodeError(
"pdfdocencoding",
bytearray(b),
-1,
-1,
-1,
"does not exist in translation table",
)
Expand Down

0 comments on commit 01c1956

Please sign in to comment.