py-pdf · stefan6419846 · Sep 11, 2024 · Aug 23, 2024 · Aug 24, 2024 · Aug 25, 2024
diff --git a/pypdf/_doc_common.py b/pypdf/_doc_common.py
@@ -65,9 +65,7 @@
 from .constants import FieldDictionaryAttributes as FA
 from .constants import PageAttributes as PG
 from .constants import PagesAttributes as PA
-from .errors import (
- PdfReadError,
-)
+from .errors import PdfReadError, PyPdfError
 from .generic import (
  ArrayObject,
  BooleanObject,
@@ -254,6 +252,8 @@ class PdfDocCommon:
 
  _encryption: Optional[Encryption] = None
 
+ _readonly: bool = False
+
  @property
  @abstractmethod
  def root_object(self) -> DictionaryObject:
@@ -349,7 +349,7 @@ def get_num_pages(self) -> int:
  return self.root_object["/Pages"]["/Count"] # type: ignore
  else:
  if self.flattened_pages is None:
- self._flatten()
+ self._flatten(self._readonly)
  assert self.flattened_pages is not None
  return len(self.flattened_pages)
 
@@ -366,10 +366,49 @@ def get_page(self, page_number: int) -> PageObject:
  A :class:`PageObject<pypdf._page.PageObject>` instance.
  """
  if self.flattened_pages is None:
- self._flatten()
+ self._flatten(self._readonly)
  assert self.flattened_pages is not None, "hint for mypy"
  return self.flattened_pages[page_number]
 
+ def _get_page_in_node(
+ self,
+ page_number: int,
+ ) -> Tuple[DictionaryObject, int]:
+ """
+ Retrieve the node and position within the /Kids containing the page.
+ If page_number is greater than or equal to the number of pages, it returns the top node and -1.
+ """
+ top = cast(DictionaryObject, self.root_object["/Pages"])
+
+ def recursive_call(
+ node: DictionaryObject, mi: int
+ ) -> Tuple[Optional[PdfObject], int]:
+ ma = cast(int, node.get("/Count", 1)) # default 1 for /Page types
+ if node["/Type"] == "/Page":
+ if page_number == mi:
+ return node, -1
+ # else
+ return None, mi + 1
+ if (page_number - mi) >= ma: # not in nodes below
+ if node == top:
+ return top, -1
+ # else
+ return None, mi + ma
+ for idx, kid in enumerate(cast(ArrayObject, node["/Kids"])):
+ kid = cast(DictionaryObject, kid.get_object())
+ n, i = recursive_call(kid, mi)
+ if n is not None: # page has just been found ...
+ if i < 0: # ... just below!
+ return node, idx
+ # else: # ... at lower levels
+ return n, i
+ mi = i
+ raise PyPdfError("Unexpectedly cannot find the node.")
+
+ node, idx = recursive_call(top, 0)
+ assert isinstance(node, DictionaryObject), "mypy"
+ return node, idx
+
  @property
  def named_destinations(self) -> Dict[str, Any]:
  """
@@ -1082,10 +1121,20 @@ def page_mode(self) -> Optional[PagemodeType]:
 
  def _flatten(
  self,
+ list_only: bool = False,
  pages: Union[None, DictionaryObject, PageObject] = None,
  inherit: Optional[Dict[str, Any]] = None,
  indirect_reference: Optional[IndirectObject] = None,
  ) -> None:
+ """
+ Prepare the document pages to ease searching.
+
+ Args:
+ list_only: If True, only list the pages in flattened_pages without copying the page dictionary content into each PageObject.
+ pages:
+ inherit:
+ indirect_reference: Used recursively to flatten the /Pages object.
+ """
  inheritable_page_attributes = (
  NameObject(PG.RESOURCES),
  NameObject(PG.MEDIABOX),
@@ -1122,7 +1171,7 @@ def _flatten(
  if obj:
  # damaged file may have invalid child in /Pages
  try:
- self._flatten(obj, inherit, **addt)
+ self._flatten(list_only, obj, inherit, **addt)
  except RecursionError:
  raise PdfReadError(
  "Maximum recursion depth reached during page flattening."
@@ -1134,7 +1183,8 @@ def _flatten(
  if attr_in not in pages:
  pages[attr_in] = value
  page_obj = PageObject(self, indirect_reference)
- page_obj.update(pages)
+ if not list_only:
+ page_obj.update(pages)
 
  # TODO: Could flattened_pages be None at this point?
  self.flattened_pages.append(page_obj) # type: ignore
@@ -1158,7 +1208,7 @@ def remove_page(
  or destinations to reference a detached page.
  """
  if self.flattened_pages is None:
- self._flatten()
+ self._flatten(self._readonly)
  assert self.flattened_pages is not None
  if isinstance(page, IndirectObject):
  p = page.get_object()

diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -492,6 +492,22 @@ def __init__(
  self.inline_images: Optional[Dict[str, ImageFile]] = None
  # below Union for mypy but actually Optional[List[str]]
  self.indirect_reference = indirect_reference
+ if indirect_reference is not None:
+ self.update(cast(DictionaryObject, indirect_reference.get_object()))
+
+ def hash_bin(self) -> int:
+ """
+ Used to detect modified objects.
+
+ Note: this function is overridden to return the same results
+ as a DictionaryObject.
+
+ Returns:
+ Hash considering type and value.
+ """
+ return hash(
+ (DictionaryObject, tuple(((k, v.hash_bin()) for k, v in self.items())))
+ )
 
  def hash_value_data(self) -> bytes:
  data = super().hash_value_data()
@@ -2399,27 +2415,33 @@ def __delitem__(self, index: Union[int, slice]) -> None:
  raise IndexError("index out of range")
  ind = self[index].indirect_reference
  assert ind is not None
- parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
+ parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
+ "/Parent", None
+ )
+ first = True
  while parent is not None:
  parent = cast(DictionaryObject, parent.get_object())
  try:
- i = parent["/Kids"].index(ind)
- del parent["/Kids"][i]
+ i = cast(ArrayObject, parent["/Kids"]).index(ind)
+ del cast(ArrayObject, parent["/Kids"])[i]
+ first = False
  try:
  assert ind is not None
  del ind.pdf.flattened_pages[index] # case of page in a Reader
  except Exception: # pragma: no cover
  pass
  if "/Count" in parent:
- parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
- if len(parent["/Kids"]) == 0:
+ parent[NameObject("/Count")] = NumberObject(
+ cast(int, parent["/Count"]) - 1
+ )
+ if len(cast(ArrayObject, parent["/Kids"])) == 0:
  # No more objects in this part of this sub tree
  ind = parent.indirect_reference
- parent = cast(DictionaryObject, parent.get("/Parent", None))
- else:
- parent = None
+ parent = parent.get("/Parent", None)
  except ValueError: # from index
- raise PdfReadError(f"Page Not Found in Page Tree {ind}")
+ if first:
+ raise PdfReadError(f"Page not found in page tree: {ind}")
+ break
 
  def __iter__(self) -> Iterator[PageObject]:
  for i in range(len(self)):

diff --git a/pypdf/_protocols.py b/pypdf/_protocols.py
@@ -74,6 +74,9 @@ class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
  _objects: List[Any]
  _id_translated: Dict[int, Dict[int, int]]
 
+ incremental: bool
+ _reader: Any # PdfReader
+
  @abstractmethod
  def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
  ... # pragma: no cover

diff --git a/pypdf/_reader.py b/pypdf/_reader.py
@@ -136,6 +136,7 @@ def __init__(
  with open(stream, "rb") as fh:
  stream = BytesIO(fh.read())
  self._stream_opened = True
+ self._startxref: int = 0
  self.read(stream)
  self.stream = stream
 
@@ -563,6 +564,7 @@ def read(self, stream: StreamType) -> None:
  self._basic_validation(stream)
  self._find_eof_marker(stream)
  startxref = self._find_startxref_pos(stream)
+ self._startxref = startxref
 
  # check and eventually correct the startxref only in not strict
  xref_issue_nr = self._get_xref_issues(stream, startxref)