From 01c1956f6edabb3d75c08363c8ed8a0ebf52c6ae Mon Sep 17 00:00:00 2001 From: Michael Karlen Date: Fri, 16 Sep 2022 13:31:54 +0200 Subject: [PATCH] Fix performance issues with large embedded base64 images Certain PDF libraries embed images as base64 strings. This causes performance issues in `read_string_from_stream` due to incremental string concatenation, byte by byte. The PDF library in our case is: ``` Canon iR-ADV C256 PDF PDF Annotator 8.0.0.826 [Adobe PSL 1.3e for Canon ``` --- PyPDF2/generic/_utils.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/PyPDF2/generic/_utils.py b/PyPDF2/generic/_utils.py index c5b6129f0..7edcca90c 100644 --- a/PyPDF2/generic/_utils.py +++ b/PyPDF2/generic/_utils.py @@ -41,7 +41,7 @@ def read_string_from_stream( ) -> Union["TextStringObject", "ByteStringObject"]: tok = stream.read(1) parens = 1 - txt = b"" + txt = [] while True: tok = stream.read(1) if not tok: @@ -97,17 +97,21 @@ def read_string_from_stream( # This case is hit when a backslash followed by a line # break occurs. If it's a multi-char EOL, consume the # second character: + prev=tok tok = stream.read(1) if tok not in b"\n\r": stream.seek(-1, 1) + # Then don't add anything to the actual string, since this # line break was escaped: tok = b"" else: msg = rf"Unexpected escaped string: {tok.decode('utf8')}" logger_warning(msg, __name__) - txt += tok - return create_string_object(txt, forced_encoding) + txt.append(tok) + if stream.tell() % 10000 == 1: + print(stream.tell(), parens, b''.join(txt[:50])) + return create_string_object(b''.join(txt), forced_encoding) def create_string_object( @@ -164,7 +168,7 @@ def decode_pdfdocencoding(byte_array: bytes) -> str: raise UnicodeDecodeError( "pdfdocencoding", bytearray(b), - -1, + -1, -1, "does not exist in translation table", )