py-pdf · stefan6419846 · Aug 12, 2024 · Aug 8, 2024 · Aug 8, 2024 · Aug 8, 2024
diff --git a/pypdf/_cmap.py b/pypdf/_cmap.py
@@ -3,11 +3,10 @@
 from typing import Any, Dict, List, Tuple, Union, cast
 
 from ._codecs import adobe_glyphs, charset_encoding
-from ._utils import b_, logger_error, logger_warning
+from ._utils import logger_error, logger_warning
 from .generic import (
  DecodedStreamObject,
  DictionaryObject,
- IndirectObject,
  NullObject,
  StreamObject,
 )
@@ -258,7 +257,7 @@ def prepare_cm(ft: DictionaryObject) -> bytes:
  tu = ft["/ToUnicode"]
  cm: bytes
  if isinstance(tu, StreamObject):
- cm = b_(cast(DecodedStreamObject, ft["/ToUnicode"]).get_data())
+ cm = cast(DecodedStreamObject, ft["/ToUnicode"]).get_data()
  elif isinstance(tu, str) and tu.startswith("/Identity"):
  # the full range 0000-FFFF will be processed
  cm = b"beginbfrange\n<0000> <0001> <0000>\nendbfrange"
@@ -448,34 +447,27 @@ def compute_space_width(
  en: int = cast(int, ft["/LastChar"])
  if st > space_code or en < space_code:
  raise Exception("Not in range")
- if w[space_code - st] == 0:
+ if w[space_code - st].get_object() == 0:
  raise Exception("null width")
- sp_width = w[space_code - st]
+ sp_width = w[space_code - st].get_object()
  except Exception:
  if "/FontDescriptor" in ft and "/MissingWidth" in cast(
  DictionaryObject, ft["/FontDescriptor"]
  ):
- sp_width = ft["/FontDescriptor"]["/MissingWidth"] # type: ignore
+ sp_width = ft["/FontDescriptor"]["/MissingWidth"].get_object() # type: ignore
  else:
  # will consider width of char as avg(width)/2
  m = 0
  cpt = 0
- for x in w:
- if x > 0:
- m += x
+ for xx in w:
+ xx = xx.get_object()
+ if xx > 0:
+ m += xx
  cpt += 1
  sp_width = m / max(1, cpt) / 2
 
- if isinstance(sp_width, IndirectObject):
- # According to
- # 'Table 122 - Entries common to all font descriptors (continued)'
- # the MissingWidth should be a number, but according to #2286 it can
- # be an indirect object
- obj = sp_width.get_object()
- if obj is None or isinstance(obj, NullObject):
- return 0.0
- return obj # type: ignore
-
+ if sp_width is None or isinstance(sp_width, NullObject):
+ sp_width = 0.0
  return sp_width
 
 

diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -49,7 +49,6 @@
 from ._page import PageObject, _VirtualList
 from ._page_labels import index2label as page_index2page_label
 from ._utils import (
- b_,
  deprecate_with_replacement,
  logger_warning,
  parse_iso8824_date,
@@ -1258,7 +1257,7 @@ def xfa(self) -> Optional[Dict[str, Any]]:
  if isinstance(f, IndirectObject):
  field = cast(Optional[EncodedStreamObject], f.get_object())
  if field:
- es = zlib.decompress(b_(field._data))
+ es = zlib.decompress(field._data)
  retval[tag] = es
  return retval
 

diff --git a/pypdf/_encryption.py b/pypdf/_encryption.py
@@ -43,7 +43,7 @@
  rc4_encrypt,
 )
 
-from ._utils import b_, logger_warning
+from ._utils import logger_warning
 from .generic import (
  ArrayObject,
  ByteStringObject,
@@ -78,7 +78,7 @@ def encrypt_object(self, obj: PdfObject) -> PdfObject:
  elif isinstance(obj, StreamObject):
  obj2 = StreamObject()
  obj2.update(obj)
- obj2.set_data(self.stm_crypt.encrypt(b_(obj._data)))
+ obj2.set_data(self.stm_crypt.encrypt(obj._data))
  for key, value in obj.items(): # Dont forget the Stream dict.
  obj2[key] = self.encrypt_object(value)
  obj = obj2
@@ -96,7 +96,7 @@ def decrypt_object(self, obj: PdfObject) -> PdfObject:
  data = self.str_crypt.decrypt(obj.original_bytes)
  obj = create_string_object(data)
  elif isinstance(obj, StreamObject):
- obj._data = self.stm_crypt.decrypt(b_(obj._data))
+ obj._data = self.stm_crypt.decrypt(obj._data)
  for key, value in obj.items(): # Dont forget the Stream dict.
  obj[key] = self.decrypt_object(value)
  elif isinstance(obj, DictionaryObject):

diff --git a/pypdf/_merger.py b/pypdf/_merger.py
@@ -46,7 +46,6 @@
 from ._utils import (
  StrByteType,
  deprecate_with_replacement,
- str_,
 )
 from ._writer import PdfWriter
 from .constants import GoToActionArguments, TypArguments, TypFitArguments
@@ -82,6 +81,15 @@ def __init__(self, pagedata: PageObject, src: PdfReader, id: int) -> None:
  self.id = id
 
 
+# transferred from _utils, as this function is only required here
+# and the merger will soon be deprecated
+def str_(b: Any) -> str: # pragma: no cover
+ if isinstance(b, bytes):
+ return b.decode("latin-1")
+ else:
+ return str(b) # will return b.__str__() if defined
+
+
 class PdfMerger:
  """
  Use :class:`PdfWriter` instead.

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -852,7 +852,7 @@ def _add_transformation_matrix(
  FloatObject(e),
  FloatObject(f),
  ],
- " cm",
+ b"cm",
  ],
  )
  return contents
@@ -870,7 +870,7 @@ def _get_contents_as_bytes(self) -> Optional[bytes]:
  if isinstance(obj, list):
  return b"".join(x.get_object().get_data() for x in obj)
  else:
- return cast(bytes, cast(EncodedStreamObject, obj).get_data())
+ return cast(EncodedStreamObject, obj).get_data()
  else:
  return None
 
@@ -1063,11 +1063,11 @@ def _merge_page(
  rect.height,
  ],
  ),
- "re",
+ b"re",
  ),
  )
- page2content.operations.insert(1, ([], "W"))
- page2content.operations.insert(2, ([], "n"))
+ page2content.operations.insert(1, ([], b"W"))
+ page2content.operations.insert(2, ([], b"n"))
  if page2transformation is not None:
  page2content = page2transformation(page2content)
  page2content = PageObject._content_stream_rename(
@@ -1201,11 +1201,11 @@ def _merge_page_writer(
  rect.height,
  ],
  ),
- "re",
+ b"re",
  ),
  )
- page2content.operations.insert(1, ([], "W"))
- page2content.operations.insert(2, ([], "n"))
+ page2content.operations.insert(1, ([], b"W"))
+ page2content.operations.insert(2, ([], b"n"))
  if page2transformation is not None:
  page2content = page2transformation(page2content)
  page2content = PageObject._content_stream_rename(

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -51,7 +51,6 @@
 from ._utils import (
  StrByteType,
  StreamType,
- b_,
  logger_warning,
  read_non_whitespace,
  read_previous_line,
@@ -328,7 +327,7 @@ def _get_object_from_stream(
  assert cast(str, obj_stm["/Type"]) == "/ObjStm"
  # /N is the number of indirect objects in the stream
  assert idx < obj_stm["/N"]
- stream_data = BytesIO(b_(obj_stm.get_data()))
+ stream_data = BytesIO(obj_stm.get_data())
  for i in range(obj_stm["/N"]): # type: ignore
  read_non_whitespace(stream_data)
  stream_data.seek(-1, 1)
@@ -932,7 +931,7 @@ def _read_pdf15_xref_stream(
  xrefstream = cast(ContentStream, read_object(stream, self))
  assert cast(str, xrefstream["/Type"]) == "/XRef"
  self.cache_indirect_object(generation, idnum, xrefstream)
- stream_data = BytesIO(b_(xrefstream.get_data()))
+ stream_data = BytesIO(xrefstream.get_data())
  # Index pairs specify the subsections in the dictionary. If
  # none create one subsection that spans everything.
  idx_pairs = xrefstream.get("/Index", [0, xrefstream.get("/Size")])

diff --git a/pypdf/_utils.py b/pypdf/_utils.py
@@ -336,31 +336,6 @@ def mark_location(stream: StreamType) -> None:
  stream.seek(-radius, 1)
 
 
-B_CACHE: Dict[str, bytes] = {}
-
-
-def b_(s: Union[str, bytes]) -> bytes:
- if isinstance(s, bytes):
- return s
- bc = B_CACHE
- if s in bc:
- return bc[s]
- try:
- r = s.encode("latin-1")
- except UnicodeEncodeError:
- r = s.encode("utf-8")
- if len(s) < 2:
- bc[s] = r
- return r
-
-
-def str_(b: Any) -> str:
- if isinstance(b, bytes):
- return b.decode("latin-1")
- else:
- return str(b) # will return b.__str__() if defined
-
-
 @overload
 def ord_(b: str) -> int:
  ...
@@ -397,12 +372,17 @@ def deprecation(msg: str) -> None:
 
 def deprecate_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
  """Raise an exception that a feature will be removed, but has a replacement."""
- deprecate(f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.", 4)
+ deprecate(
+ f"{old_name} is deprecated and will be removed in pypdf {removed_in}. Use {new_name} instead.",
+ 4,
+ )
 
 
 def deprecation_with_replacement(old_name: str, new_name: str, removed_in: str) -> None:
  """Raise an exception that a feature was already removed, but has a replacement."""
- deprecation(f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead.")
+ deprecation(
+ f"{old_name} is deprecated and was removed in pypdf {removed_in}. Use {new_name} instead."
+ )
 
 
 def deprecate_no_replacement(name: str, removed_in: str) -> None:

diff --git a/pypdf/_writer.py b/pypdf/_writer.py
@@ -62,7 +62,6 @@
  StrByteType,
  StreamType,
  _get_max_pdf_version_header,
- b_,
  deprecate_with_replacement,
  logger_warning,
 )
@@ -678,9 +677,10 @@ def add_attachment(self, filename: str, data: Union[str, bytes]) -> None:
  # Hello world!
  # endstream
  # endobj
-
+ if isinstance(data, str):
+ data = data.encode("latin-1")
  file_entry = DecodedStreamObject()
- file_entry.set_data(b_(data))
+ file_entry.set_data(data)
  file_entry.update({NameObject(PA.TYPE): NameObject("/EmbeddedFile")})
 
  # The Filespec entry