diff --git a/pypdf/_page.py b/pypdf/_page.py
index 55521e95b..a9885f84b 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -2451,6 +2451,71 @@ def __getitem__(
             raise IndexError("sequence index out of range")
         return self.get_function(index)
 
+    def __delitem__(self, index: Union[int, slice]) -> None:
+        """
+        Remove a page, or a slice of pages, from the page tree.
+
+        Only the entry in the page tree is removed; the page object itself
+        is left untouched.
+
+        Args:
+            index: Position of the page to remove (negative values count
+                from the end), or a slice selecting several pages.
+
+        Raises:
+            TypeError: If ``index`` is neither an ``int`` nor a ``slice``.
+            IndexError: If ``index`` is out of range.
+            PdfReadError: If the page has no indirect reference, or if it
+                (or an emptied ancestor) is missing from its parent's
+                ``/Kids`` array.
+        """
+        if isinstance(index, slice):
+            # Delete from the highest index down so that the indices still
+            # to be processed are not shifted by earlier deletions.
+            for p in sorted(range(*index.indices(len(self))), reverse=True):
+                del self[p]
+            return
+        if not isinstance(index, int):
+            raise TypeError("index must be integers")
+        len_self = len(self)
+        if index < 0:
+            # support negative indexes
+            index += len_self
+        if not 0 <= index < len_self:
+            raise IndexError("index out of range")
+        page_ref = self[index].indirect_reference
+        if page_ref is None:
+            # Explicit check: an ``assert`` would be stripped under ``-O``.
+            raise PdfReadError("Page has no indirect reference")
+        child = page_ref
+        parent = cast(DictionaryObject, page_ref.get_object()).get("/Parent", None)
+        unlink = True  # ``child`` still has to be removed from ``parent``
+        while parent is not None:
+            node = cast(DictionaryObject, parent.get_object())
+            if unlink:
+                try:
+                    node["/Kids"].remove(child)
+                except ValueError as exc:
+                    raise PdfReadError(
+                        f"Page Not Found in Page Tree {child}"
+                    ) from exc
+                if child is page_ref:
+                    # Keep the owner's flattened page cache in sync. Do it
+                    # once per deleted page, not once per pruned tree level,
+                    # otherwise unrelated pages would be dropped from it.
+                    try:
+                        del page_ref.pdf.flattened_pages[index]
+                    except AttributeError:
+                        pass  # e.g. an owner that keeps no flattened page list
+                # A node left without kids is unlinked from its own parent.
+                child = node.indirect_reference
+                unlink = len(node["/Kids"]) == 0
+            # /Count is the number of leaf pages below a node, so every
+            # ancestor loses one page, not only the direct parent.
+            if "/Count" in node:
+                node[NameObject("/Count")] = NumberObject(node["/Count"] - 1)
+            parent = node.get("/Parent", None)
+
     def __iter__(self) -> Iterator[PageObject]:
         for i in range(len(self)):
             yield self[i]
diff --git a/pypdf/_writer.py b/pypdf/_writer.py
index 598ba4305..a57fd8b91 100644
--- a/pypdf/_writer.py
+++ b/pypdf/_writer.py
@@ -487,7 +487,19 @@ def getNumPages(self) -> int:  # deprecated
 
     @property
     def pages(self) -> List[PageObject]:
-        """Property that emulates a list of :class:`PageObject`."""
+        """
+        Property that emulates a list of :class:`PageObject`.
+
+        This property allows getting a page or a range of pages.
+
+        It also provides the capability to remove a page or a range of pages
+        from the list (through the ``del`` operator).
+
+        Note: only the page entry is removed, as the objects beneath it may
+        be used somewhere else. A solution to completely remove them - if
+        they are not used anywhere else - is to write to a buffer/temporary
+        file and then load it into a new PdfWriter object.
+        """
         return _VirtualList(self._get_num_pages, self.get_page)  # type: ignore
 
     def add_blank_page(
diff --git a/tests/test_page.py b/tests/test_page.py
index 1bcf46b79..2f7d2d7b7 100644
--- a/tests/test_page.py
+++ b/tests/test_page.py
@@ -11,7 +11,7 @@
 from pypdf import PdfReader, PdfWriter, Transformation
 from pypdf._page import PageObject
 from pypdf.constants import PageAttributes as PG
-from pypdf.errors import DeprecationError, PdfReadWarning
+from pypdf.errors import DeprecationError, PdfReadError, PdfReadWarning
 from pypdf.generic import (
     ArrayObject,
     ContentStream,
@@ -1143,6 +1143,46 @@ def test_pages_printing():
         reader.pages[0].images["~1~"]
 
 
+@pytest.mark.enable_socket()
+def test_del_pages():
+    url = "https://corpora.tika.apache.org/base/docs/govdocs1/941/941536.pdf"
+    name = "tika-941536.pdf"
+    writer = PdfWriter(clone_from=BytesIO(get_pdf_from_url(url, name=name)))
+    ll = len(writer.pages)
+    pp = writer.pages[1].indirect_reference
+    del writer.pages[1]
+    assert len(writer.pages) == ll - 1
+    pages = writer._pages.get_object()
+    assert pages["/Count"] == ll - 1
+    assert len(pages["/Kids"]) == ll - 1
+    assert pp not in pages["/Kids"]
+    del writer.pages[-2]
+    with pytest.raises(TypeError):
+        del writer.pages["aa"]
+    with pytest.raises(IndexError):
+        del writer.pages[9999]
+    pp = tuple(p.indirect_reference for p in writer.pages[3:5])
+    ll = len(writer.pages)
+    del writer.pages[3:5]
+    assert len(writer.pages) == ll - 2
+    for p in pp:
+        assert p not in pages["/Kids"]
+    # del whole arborescence
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    # error case
+    pp = reader.pages[2]
+    i = pp["/Parent"].get_object()["/Kids"].index(pp.indirect_reference)
+    del pp["/Parent"].get_object()["/Kids"][i]
+    with pytest.raises(PdfReadError):
+        del reader.pages[2]
+    # reader is corrupted we have to reload it
+    reader = PdfReader(BytesIO(get_pdf_from_url(url, name=name)))
+    del reader.pages[:]
+    assert len(reader.pages) == 0
+    assert len(reader.trailer["/Root"]["/Pages"]["/Kids"]) == 0
+    assert len(reader.flattened_pages) == 0
+
+
 def test_pdf_pages_missing_type():
     pdf_path = RESOURCE_ROOT / "crazyones.pdf"
     reader = PdfReader(pdf_path)