From 01c1956f6edabb3d75c08363c8ed8a0ebf52c6ae Mon Sep 17 00:00:00 2001
From: Michael Karlen <michael.karlen@gmail.com>
Date: Fri, 16 Sep 2022 13:31:54 +0200
Subject: [PATCH] Fix performance issues with large embedded base64 images

Certain PDF libraries embed images as base64 strings. This causes performance issues
in `read_string_from_stream`, which built the result by incremental bytes concatenation,
one byte at a time. Collect the pieces in a list and join them once at the end instead.

The PDF library that produced the affected file in our case is:
```
<xmp:CreatorTool>Canon iR-ADV C256  PDF</xmp:CreatorTool>
<pdf:Producer>PDF Annotator 8.0.0.826 [Adobe PSL 1.3e for Canon</pdf:Producer>

```
---
 PyPDF2/generic/_utils.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/PyPDF2/generic/_utils.py b/PyPDF2/generic/_utils.py
index c5b6129f0..7edcca90c 100644
--- a/PyPDF2/generic/_utils.py
+++ b/PyPDF2/generic/_utils.py
@@ -41,7 +41,7 @@ def read_string_from_stream(
 ) -> Union["TextStringObject", "ByteStringObject"]:
     tok = stream.read(1)
     parens = 1
-    txt = b""
+    txt = []
     while True:
         tok = stream.read(1)
         if not tok:
@@ -97,17 +97,21 @@ def read_string_from_stream(
                     # This case is  hit when a backslash followed by a line
                     # break occurs.  If it's a multi-char EOL, consume the
                     # second character:
+                    prev=tok
                     tok = stream.read(1)
                     if tok not in b"\n\r":
                         stream.seek(-1, 1)
+
                     # Then don't add anything to the actual string, since this
                     # line break was escaped:
                     tok = b""
                 else:
                     msg = rf"Unexpected escaped string: {tok.decode('utf8')}"
                     logger_warning(msg, __name__)
-        txt += tok
-    return create_string_object(txt, forced_encoding)
+        txt.append(tok)
+        if stream.tell() % 10000 == 1:
+            print(stream.tell(), parens, b''.join(txt[:50]))
+    return create_string_object(b''.join(txt), forced_encoding)
 
 
 def create_string_object(
@@ -164,7 +168,7 @@ def decode_pdfdocencoding(byte_array: bytes) -> str:
             raise UnicodeDecodeError(
                 "pdfdocencoding",
                 bytearray(b),
-                -1,
+                    -1,
                 -1,
                 "does not exist in translation table",
             )