Skip to content

Commit

Permalink
ENH: Add page deletion feature to PdfWriter (#1843)
Browse files Browse the repository at this point in the history
Note: This does not delete the objects from the PDF, just the page entry
  • Loading branch information
pubpub-zz authored Jun 30, 2023
1 parent 21ce645 commit 115fbfc
Show file tree
Hide file tree
Showing 3 changed files with 94 additions and 2 deletions.
41 changes: 41 additions & 0 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -2451,6 +2451,47 @@ def __getitem__(
raise IndexError("sequence index out of range")
return self.get_function(index)

def __delitem__(self, index: Union[int, slice]) -> None:
    """
    Remove one page, or a slice of pages, from the page tree.

    Only the entry in the page tree is removed; the page object itself is
    not deleted from the document. Intermediate tree nodes that become
    empty are removed from their own parent as well, and the ``/Count`` of
    every ancestor of the removed page is decremented.

    Args:
        index: Position of the page to remove (negative values count from
            the end) or a slice selecting several pages.

    Raises:
        TypeError: If ``index`` is neither an int nor a slice.
        IndexError: If ``index`` is outside the current page range.
        PdfReadError: If a node is not listed in the ``/Kids`` array of
            its ``/Parent``.
    """
    if isinstance(index, slice):
        # Delete from the highest index down so the positions that are
        # still to be processed are not shifted by earlier deletions.
        for position in sorted(range(*index.indices(len(self))), reverse=True):
            del self[position]
        return
    if not isinstance(index, int):
        raise TypeError("index must be integers")
    len_self = len(self)
    if index < 0:
        # support negative indexes
        index = len_self + index
    if index < 0 or index >= len_self:
        raise IndexError("index out of range")
    ind = self[index].indirect_reference
    assert ind is not None
    parent = cast(DictionaryObject, ind.get_object()).get("/Parent", None)
    pruning = True  # still removing entries from /Kids while walking up
    cache_updated = False  # the flattened page cache is touched only once
    while parent is not None:
        parent = cast(DictionaryObject, parent.get_object())
        if pruning:
            try:
                i = parent["/Kids"].index(ind)
            except ValueError as exc:  # from index
                raise PdfReadError(f"Page Not Found in Page Tree {ind}") from exc
            del parent["/Kids"][i]
            if not cache_updated:
                # Only the page itself has an entry in the flattened list;
                # emptied intermediate nodes removed on later iterations
                # must not delete a second (unrelated) entry.
                cache_updated = True
                try:
                    del ind.pdf.flattened_pages[index]  # case of page in a Reader
                except AttributeError:
                    pass
            if len(parent["/Kids"]) == 0:
                # No more objects in this part of this sub tree:
                # the emptied node is removed from its own parent next.
                ind = parent.indirect_reference
            else:
                pruning = False
        # Every ancestor loses exactly one descendant page, whether or not
        # an entry was removed from its own /Kids.
        if "/Count" in parent:
            parent[NameObject("/Count")] = NumberObject(parent["/Count"] - 1)
        parent = parent.get("/Parent", None)

def __iter__(self) -> Iterator[PageObject]:
    """Yield each item in order, from index 0 up to the length read at start."""
    total = len(self)  # evaluated once, before the first item is produced
    position = 0
    while position < total:
        yield self[position]
        position += 1
Expand Down
13 changes: 12 additions & 1 deletion pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -487,7 +487,18 @@ def getNumPages(self) -> int: # deprecated

@property
def pages(self) -> List[PageObject]:
    """
    Property that emulates a list of :class:`PageObject<pypdf._page.PageObject>`.

    It allows getting a single page or a range of pages, and also removing
    a page or a range of pages from the list with the ``del`` operator.

    Note: only the page entry is removed, because the objects beneath it
    may be used somewhere else. To remove them completely (if they are not
    used elsewhere), write the document to a buffer or temporary file and
    load it into a new PdfWriter object.
    """
    count_getter = self._get_num_pages
    page_getter = self.get_page
    return _VirtualList(count_getter, page_getter)  # type: ignore

def add_blank_page(
Expand Down
42 changes: 41 additions & 1 deletion tests/test_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from pypdf import PdfReader, PdfWriter, Transformation
from pypdf._page import PageObject
from pypdf.constants import PageAttributes as PG
from pypdf.errors import DeprecationError, PdfReadWarning
from pypdf.errors import DeprecationError, PdfReadError, PdfReadWarning
from pypdf.generic import (
ArrayObject,
ContentStream,
Expand Down Expand Up @@ -1143,6 +1143,46 @@ def test_pages_printing():
reader.pages[0].images["~1~"]


@pytest.mark.enable_socket()
def test_del_pages():
    """Exercise page deletion by index and by slice, on a writer and a reader."""
    url = "https://corpora.tika.apache.org/base/docs/govdocs1/941/941536.pdf"
    name = "tika-941536.pdf"
    writer = PdfWriter(clone_from=BytesIO(get_pdf_from_url(url, name=name)))

    # Delete a single page by positive index.
    count_before = len(writer.pages)
    removed_ref = writer.pages[1].indirect_reference
    del writer.pages[1]
    assert len(writer.pages) == count_before - 1
    pages = writer._pages.get_object()
    assert pages["/Count"] == count_before - 1
    assert len(pages["/Kids"]) == count_before - 1
    assert removed_ref not in pages["/Kids"]

    # Negative indexes are accepted.
    del writer.pages[-2]

    # Invalid index type / out-of-range index.
    with pytest.raises(TypeError):
        del writer.pages["aa"]
    with pytest.raises(IndexError):
        del writer.pages[9999]

    # Delete a slice of pages.
    removed_refs = tuple(page.indirect_reference for page in writer.pages[3:5])
    count_before = len(writer.pages)
    del writer.pages[3:5]
    assert len(writer.pages) == count_before - 2
    for ref in removed_refs:
        assert ref not in pages["/Kids"]

    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    # error case: detach the page from its parent's /Kids first, so that
    # the deletion cannot find it in the page tree
    orphan = reader.pages[2]
    kid_pos = orphan["/Parent"].get_object()["/Kids"].index(orphan.indirect_reference)
    del orphan["/Parent"].get_object()["/Kids"][kid_pos]
    with pytest.raises(PdfReadError):
        del reader.pages[2]

    # reader is corrupted we have to reload it
    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
    # del whole arborescence
    del reader.pages[:]
    assert len(reader.pages) == 0
    assert len(reader.trailer["/Root"]["/Pages"]["/Kids"]) == 0
    assert len(reader.flattened_pages) == 0


def test_pdf_pages_missing_type():
pdf_path = RESOURCE_ROOT / "crazyones.pdf"
reader = PdfReader(pdf_path)
Expand Down

0 comments on commit 115fbfc

Please sign in to comment.