Fix performance issues with large embedded base64 images

Certain PDF libraries embed images as base64 strings. This causes performance issues in `read_string_from_stream`, because the string is built by incremental concatenation, one byte at a time. The PDF library in our case is: ``` <xmp:CreatorTool>Canon iR-ADV C256 PDF</xmp:CreatorTool> <pdf:Producer>PDF Annotator 8.0.0.826 [Adobe PSL 1.3e for Canon</pdf:Producer> ```
py-pdf · Sep 16, 2022 · 01c1956 · 01c1956
1 parent 7c96d13
commit 01c1956
Showing 1 changed file with 8 additions and 4 deletions.
diff --git a/PyPDF2/generic/_utils.py b/PyPDF2/generic/_utils.py
@@ -41,7 +41,7 @@ def read_string_from_stream(
 ) -> Union["TextStringObject", "ByteStringObject"]:
  tok = stream.read(1)
  parens = 1
- txt = b""
+ txt = []
  while True:
  tok = stream.read(1)
  if not tok:
@@ -97,17 +97,21 @@ def read_string_from_stream(
  # This case is hit when a backslash followed by a line
  # break occurs. If it's a multi-char EOL, consume the
  # second character:
+ prev=tok
  tok = stream.read(1)
  if tok not in b"\n\r":
  stream.seek(-1, 1)
+
  # Then don't add anything to the actual string, since this
  # line break was escaped:
  tok = b""
  else:
  msg = rf"Unexpected escaped string: {tok.decode('utf8')}"
  logger_warning(msg, __name__)
- txt += tok
- return create_string_object(txt, forced_encoding)
+ txt.append(tok)
+ if stream.tell() % 10000 == 1:
+ print(stream.tell(), parens, b''.join(txt[:50]))
+ return create_string_object(b''.join(txt), forced_encoding)
 
 
 def create_string_object(
@@ -164,7 +168,7 @@ def decode_pdfdocencoding(byte_array: bytes) -> str:
  raise UnicodeDecodeError(
  "pdfdocencoding",
  bytearray(b),
- -1,
+  -1,
  -1,
  "does not exist in translation table",
  )