Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add incremental capability to PdfWriter #2811

Merged
merged 45 commits into from
Sep 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
fba73a4
ENH: add incremental capability to PdfWriter
pubpub-zz Aug 23, 2024
0543709
fix test
pubpub-zz Aug 24, 2024
29030d4
fixes + first test
pubpub-zz Aug 25, 2024
1067b74
coverage
pubpub-zz Aug 25, 2024
f1d3fbe
coverage
pubpub-zz Aug 25, 2024
ae97bc7
cope with multiple level pages
pubpub-zz Aug 26, 2024
d9a99d9
test + doc
pubpub-zz Aug 26, 2024
3c4cfdc
coverage
pubpub-zz Aug 26, 2024
38d4b35
coverage
pubpub-zz Aug 26, 2024
79eca73
coverage
pubpub-zz Aug 26, 2024
290c5a6
coverage
pubpub-zz Aug 26, 2024
173578d
coverage
pubpub-zz Aug 26, 2024
b2b0c9e
Merge branch 'main' into incremental
pubpub-zz Aug 26, 2024
1a6eda5
simplification
pubpub-zz Aug 26, 2024
d43d25b
coverage
pubpub-zz Aug 27, 2024
7e2e74d
Merge branch 'main' into incremental
pubpub-zz Aug 27, 2024
708e449
Merge branch 'main' into incremental
pubpub-zz Aug 28, 2024
ff76e02
Merge remote-tracking branch 'py-pdf/main' into incremental
pubpub-zz Sep 1, 2024
14a93f1
move to X-reference stream for increment
pubpub-zz Sep 1, 2024
53e141f
coverage
pubpub-zz Sep 1, 2024
b4b7c1b
coverage
pubpub-zz Sep 1, 2024
7bc3abd
coverage
pubpub-zz Sep 1, 2024
ffa2f0c
fix
pubpub-zz Sep 1, 2024
b072952
mypy
pubpub-zz Sep 1, 2024
a3b6246
Merge branch 'main' into incremental
pubpub-zz Sep 5, 2024
494e00a
Update pypdf/_doc_common.py
pubpub-zz Sep 8, 2024
eba1c9f
Update pypdf/_doc_common.py
pubpub-zz Sep 8, 2024
d68db51
Update pypdf/_doc_common.py
pubpub-zz Sep 8, 2024
8b3182d
Update pypdf/_doc_common.py
pubpub-zz Sep 8, 2024
fe6aac7
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
0be4bb4
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
4c585c0
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
fbe54d0
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
e3c1e2c
Update pypdf/_writer.py
pubpub-zz Sep 8, 2024
6e65943
clarify assert mypy
pubpub-zz Sep 8, 2024
4121672
doc hash_bin
pubpub-zz Sep 8, 2024
bcc5c1d
doc hash_bin
pubpub-zz Sep 8, 2024
02ac507
Merge branch 'main' into incremental
pubpub-zz Sep 8, 2024
bc6caba
Update pypdf/_page.py
stefan6419846 Sep 8, 2024
8659de2
Update pypdf/_writer.py
stefan6419846 Sep 8, 2024
99e6dfc
Apply suggestions from code review
stefan6419846 Sep 8, 2024
3b81ee5
fix in accordance with comments
pubpub-zz Sep 10, 2024
efd948b
fix doc
pubpub-zz Sep 10, 2024
5b030dc
fix typos
stefan6419846 Sep 11, 2024
64cf1f3
Update pypdf/_writer.py
stefan6419846 Sep 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 58 additions & 8 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,7 @@
from .constants import FieldDictionaryAttributes as FA
from .constants import PageAttributes as PG
from .constants import PagesAttributes as PA
from .errors import (
PdfReadError,
)
from .errors import PdfReadError, PyPdfError
from .generic import (
ArrayObject,
BooleanObject,
Expand Down Expand Up @@ -254,6 +252,8 @@ class PdfDocCommon:

_encryption: Optional[Encryption] = None

_readonly: bool = False

@property
@abstractmethod
def root_object(self) -> DictionaryObject:
Expand Down Expand Up @@ -349,7 +349,7 @@ def get_num_pages(self) -> int:
return self.root_object["/Pages"]["/Count"] # type: ignore
else:
if self.flattened_pages is None:
self._flatten()
self._flatten(self._readonly)
assert self.flattened_pages is not None
return len(self.flattened_pages)

Expand All @@ -366,10 +366,49 @@ def get_page(self, page_number: int) -> PageObject:
A :class:`PageObject<pypdf._page.PageObject>` instance.
"""
if self.flattened_pages is None:
self._flatten()
self._flatten(self._readonly)
assert self.flattened_pages is not None, "hint for mypy"
return self.flattened_pages[page_number]

def _get_page_in_node(
    self,
    page_number: int,
) -> Tuple[DictionaryObject, int]:
    """
    Find the page tree node whose /Kids array directly contains a page.

    Args:
        page_number: Zero-based index of the page to locate.

    Returns:
        A tuple ``(node, position)`` where ``position`` is the index of the
        page inside ``node["/Kids"]``. If ``page_number`` is equal to or
        beyond the /Count of the top /Pages node, the top node and -1 are
        returned.

    Raises:
        PyPdfError: If the /Kids of a node have all been visited without
            locating the page.
    """
    root_pages = cast(DictionaryObject, self.root_object["/Pages"])

    def walk(
        current: DictionaryObject, offset: int
    ) -> Tuple[Optional[PdfObject], int]:
        # ``offset`` is the index of the first page covered by ``current``.
        # The second returned value is either a position in /Kids (or -1)
        # when a node is returned, or the next page offset when it is None.
        span = cast(int, current.get("/Count", 1))  # default 1 for /Page types
        if current["/Type"] == "/Page":
            return (current, -1) if page_number == offset else (None, offset + 1)
        if (page_number - offset) >= span:  # not in nodes below
            beyond_tree = current == root_pages
            return (root_pages, -1) if beyond_tree else (None, offset + span)
        children = cast(ArrayObject, current["/Kids"])
        for position, child_ref in enumerate(children):
            child = cast(DictionaryObject, child_ref.get_object())
            found, marker = walk(child, offset)
            if found is None:
                # Nothing in this kid: carry on from the updated offset.
                offset = marker
                continue
            # A negative marker means the kid itself is the page, so the
            # current node is its direct parent; otherwise the parent was
            # already determined at a lower level.
            return (current, position) if marker < 0 else (found, marker)
        raise PyPdfError("Unexpectedly cannot find the node.")

    node, idx = walk(root_pages, 0)
    assert isinstance(node, DictionaryObject), "mypy"
    return node, idx

@property
def named_destinations(self) -> Dict[str, Any]:
"""
Expand Down Expand Up @@ -1082,10 +1121,20 @@ def page_mode(self) -> Optional[PagemodeType]:

def _flatten(
self,
list_only: bool = False,
pages: Union[None, DictionaryObject, PageObject] = None,
inherit: Optional[Dict[str, Any]] = None,
indirect_reference: Optional[IndirectObject] = None,
) -> None:
"""
Prepare the document pages to ease searching.

Args:
list_only: Will only list the pages within flattened_pages.
pages:
inherit:
indirect_reference: Used recursively to flatten the /Pages object.
"""
inheritable_page_attributes = (
NameObject(PG.RESOURCES),
NameObject(PG.MEDIABOX),
Expand Down Expand Up @@ -1122,7 +1171,7 @@ def _flatten(
if obj:
# damaged file may have invalid child in /Pages
try:
self._flatten(obj, inherit, **addt)
self._flatten(list_only, obj, inherit, **addt)
except RecursionError:
raise PdfReadError(
"Maximum recursion depth reached during page flattening."
Expand All @@ -1134,7 +1183,8 @@ def _flatten(
if attr_in not in pages:
pages[attr_in] = value
page_obj = PageObject(self, indirect_reference)
page_obj.update(pages)
if not list_only:
page_obj.update(pages)

# TODO: Could flattened_pages be None at this point?
self.flattened_pages.append(page_obj) # type: ignore
Expand All @@ -1158,7 +1208,7 @@ def remove_page(
or destinations to reference a detached page.
"""
if self.flattened_pages is None:
self._flatten()
self._flatten(self._readonly)
assert self.flattened_pages is not None
if isinstance(page, IndirectObject):
p = page.get_object()
Expand Down
40 changes: 31 additions & 9 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,22 @@ def __init__(
self.inline_images: Optional[Dict[str, ImageFile]] = None
# below Union for mypy but actually Optional[List[str]]
self.indirect_reference = indirect_reference
if indirect_reference is not None:
self.update(cast(DictionaryObject, indirect_reference.get_object()))

def hash_bin(self) -> int:
    """
    Used to detect modified object.

    Note: DictionaryObject (not this class) is used as the type marker
    in the hashed tuple, so that the result is the same as the one of
    a DictionaryObject.

    Returns:
        Hash considering type and value.
    """
    hashed_entries = tuple((key, value.hash_bin()) for key, value in self.items())
    return hash((DictionaryObject, hashed_entries))

def hash_value_data(self) -> bytes:
data = super().hash_value_data()
Expand Down Expand Up @@ -2399,27 +2415,33 @@ def __delitem__(self, index: Union[int, slice]) -> None:
raise IndexError("index out of range")
ind = self[index].indirect_reference
assert ind is not None
parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
parent: Optional[PdfObject] = cast(DictionaryObject, ind.get_object()).get(
"/Parent", None
)
first = True
while parent is not None:
parent = cast(DictionaryObject, parent.get_object())
try:
i = parent["/Kids"].index(ind)
del parent["/Kids"][i]
i = cast(ArrayObject, parent["/Kids"]).index(ind)
del cast(ArrayObject, parent["/Kids"])[i]
first = False
try:
assert ind is not None
del ind.pdf.flattened_pages[index] # case of page in a Reader
except Exception: # pragma: no cover
pass
if "/Count" in parent:
parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
if len(parent["/Kids"]) == 0:
parent[NameObject("/Count")] = NumberObject(
cast(int, parent["/Count"]) - 1
)
if len(cast(ArrayObject, parent["/Kids"])) == 0:
# No more objects in this part of this sub tree
ind = parent.indirect_reference
parent = cast(DictionaryObject, parent.get("/Parent", None))
else:
parent = None
parent = parent.get("/Parent", None)
except ValueError: # from index
raise PdfReadError(f"Page Not Found in Page Tree {ind}")
if first:
raise PdfReadError(f"Page not found in page tree: {ind}")
break

def __iter__(self) -> Iterator[PageObject]:
for i in range(len(self)):
Expand Down
3 changes: 3 additions & 0 deletions pypdf/_protocols.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ class PdfWriterProtocol(PdfCommonDocProtocol, Protocol):
_objects: List[Any]
_id_translated: Dict[int, Dict[int, int]]

incremental: bool
_reader: Any # PdfReader

@abstractmethod
def write(self, stream: Union[Path, StrByteType]) -> Tuple[bool, IO[Any]]:
... # pragma: no cover
Expand Down
2 changes: 2 additions & 0 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def __init__(
with open(stream, "rb") as fh:
stream = BytesIO(fh.read())
self._stream_opened = True
self._startxref: int = 0
self.read(stream)
self.stream = stream

Expand Down Expand Up @@ -563,6 +564,7 @@ def read(self, stream: StreamType) -> None:
self._basic_validation(stream)
self._find_eof_marker(stream)
startxref = self._find_startxref_pos(stream)
self._startxref = startxref

# check and, if necessary, correct the startxref (only when not strict)
xref_issue_nr = self._get_xref_issues(stream, startxref)
Expand Down
Loading
Loading