From 2e89609d8bcc2c5d74dd42ddea26eca374bef818 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Mon, 25 Sep 2023 22:35:43 +0200 Subject: [PATCH 1/4] ENH: Add parameter to select images to be removed closes #2208 --- pypdf/__init__.py | 2 + pypdf/_writer.py | 107 ++++++++++++++++++++++++++++--------------- pypdf/constants.py | 11 ++++- tests/test_writer.py | 31 +++++++++++++ 4 files changed, 114 insertions(+), 37 deletions(-) diff --git a/pypdf/__init__.py b/pypdf/__init__.py index 250c05564..64ebf5b9a 100644 --- a/pypdf/__init__.py +++ b/pypdf/__init__.py @@ -14,6 +14,7 @@ from ._reader import DocumentInformation, PdfFileReader, PdfReader from ._version import __version__ from ._writer import ObjectDeletionFlag, PdfFileWriter, PdfWriter +from .constants import ImageType from .pagerange import PageRange, parse_filename_page_ranges from .papersizes import PaperSize @@ -31,6 +32,7 @@ __all__ = [ "__version__", "_debug_versions", + "ImageType", "PageRange", "PaperSize", "DocumentInformation", diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 11a0b77e6..2be7583f0 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -77,6 +77,7 @@ FieldFlag, FileSpecificationDictionaryEntries, GoToActionArguments, + ImageType, InteractiveFormDictEntries, PageLabelStyle, TypFitArguments, @@ -132,12 +133,16 @@ class ObjectDeletionFlag(enum.IntFlag): + NONE = 0 TEXT = enum.auto() - IMAGES = enum.auto() LINKS = enum.auto() ATTACHMENTS = enum.auto() OBJECTS_3D = enum.auto() ALL_ANNOTATIONS = enum.auto() + XOBJECT_IMAGES = enum.auto() + INLINE_IMAGES = enum.auto() + DRAWING_IMAGES = enum.auto() + IMAGES = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES def _rolling_checksum(stream: BytesIO, blocksize: int = 65536) -> str: @@ -2185,7 +2190,8 @@ def remove_objects_from_page( if to_delete & ObjectDeletionFlag.ALL_ANNOTATIONS: return self._remove_annots_from_page(page, None) - if to_delete & ObjectDeletionFlag.IMAGES: + jump_operators = [] + if to_delete & ObjectDeletionFlag.DRAWING_IMAGES: jump_operators = ( [b"w", b"J", b"j", b"M", b"d", b"i"] + [b"W", b"W*"] @@ -2193,25 +2199,33 @@ def remove_objects_from_page( + [b"m", b"l", b"c", b"v", b"y", b"h", b"re"] + [b"sh"] ) - else: # del text + if to_delete & ObjectDeletionFlag.TEXT: jump_operators = [b"Tj", b"TJ", b"'", b'"'] def clean(content: ContentStream, images: List[str], forms: List[str]) -> None: - nonlocal to_delete + nonlocal jump_operators, to_delete i = 0 while i < len(content.operations): operands, operator = content.operations[i] - if operator in jump_operators: + if ( + ( + operator == b"INLINE IMAGE" + and ( + cast(ObjectDeletionFlag, to_delete) + & ObjectDeletionFlag.INLINE_IMAGES + ) + ) + or (operator in jump_operators) + or ( + operator == b"Do" + and ( + cast(ObjectDeletionFlag, to_delete) + & ObjectDeletionFlag.XOBJECT_IMAGES + ) + and (operands[0] in images) + ) + ): del content.operations[i] - elif operator == b"Do": - if ( - cast(ObjectDeletionFlag, to_delete) & ObjectDeletionFlag.IMAGES - and operands[0] in images - or cast(ObjectDeletionFlag, to_delete) & ObjectDeletionFlag.TEXT - and operands[0] in forms - ): - del content.operations[i] - i += 1 else: i += 1 content.get_data() # this ensures ._data is rebuilt from the .operations @@ -2234,10 +2248,11 @@ def clean_forms( try: content: Any = None if ( - cast(ObjectDeletionFlag, to_delete) & ObjectDeletionFlag.IMAGES + cast(ObjectDeletionFlag, to_delete) + & ObjectDeletionFlag.XOBJECT_IMAGES and o["/Subtype"] == "/Image" ): - content = NullObject() + content = NullObject() # to delete the image keeping the entry images.append(k) if o["/Subtype"] == "/Form": forms.append(k) @@ -2245,12 +2260,13 @@ def clean_forms( content = o else: content = ContentStream(o, self) - content.update(o.items()) - for k1 in ["/Length", "/Filter", "/DecodeParms"]: - try: - del content[k1] - except KeyError: - pass + content.update( + { + k1: v1 + for k1, v1 in o.items() + if k1 not in ["/Length", "/Filter", "/DecodeParms"] + } + ) clean_forms(content, stack + [elt]) # clean sub forms if content is not None: if isinstance(v, IndirectObject): @@ -2261,6 +2277,8 @@ def clean_forms( d[k] = self._add_object(content) # pragma: no cover except (TypeError, KeyError): pass + for im in images: + del d[im] # for clean-up if isinstance(elt, StreamObject): # for /Form if not isinstance(elt, ContentStream): # pragma: no cover e = ContentStream(elt, self) @@ -2269,40 +2287,57 @@ def clean_forms( clean(elt, images, forms) # clean the content return images, forms + if not isinstance(page, PageObject): + page = PageObject(self, page.indirect_reference) if "/Contents" in page: - content = page["/Contents"].get_object() + content = cast(ContentStream, page.get_contents()) if not isinstance(content, ContentStream): content = ContentStream(content, page) images, forms = clean_forms(page, []) clean(cast(ContentStream, content), images, forms) - if isinstance(page["/Contents"], ArrayObject): - for o in cast(ArrayObject, page["/Contents"]): - self._objects[o.idnum - 1] = NullObject() - try: - self._objects[ - cast(IndirectObject, page["/Contents"].indirect_reference).idnum - 1 - ] = NullObject() - except AttributeError: - pass - page[NameObject("/Contents")] = self._add_object(content) + page.replace_contents(content) - def remove_images(self, ignore_byte_string_object: Optional[bool] = None) -> None: + def remove_images( + self, + to_delete: ImageType = ImageType.ALL, + ignore_byte_string_object: Optional[bool] = None, + ) -> None: """ Remove images from this output. Args: ignore_byte_string_object: deprecated """ + if isinstance(to_delete, bool): + ignore_byte_string_object = to_delete + to_delete = ImageType.ALL if ignore_byte_string_object is not None: warnings.warn( "The 'ignore_byte_string_object' argument of remove_images is " "deprecated and will be removed in pypdf 4.0.0.", category=DeprecationWarning, ) + i = ( + ( + ObjectDeletionFlag.XOBJECT_IMAGES + if to_delete & ImageType.XOBJECT_IMAGES + else ObjectDeletionFlag.NONE + ) + | ( + ObjectDeletionFlag.INLINE_IMAGES + if to_delete & ImageType.INLINE_IMAGES + else ObjectDeletionFlag.NONE + ) + | ( + ObjectDeletionFlag.DRAWING_IMAGES + if to_delete & ImageType.DRAWING_IMAGES + else ObjectDeletionFlag.NONE + ) + ) for page in self.pages: - self.remove_objects_from_page(page, ObjectDeletionFlag.IMAGES) + self.remove_objects_from_page(page, i) def removeImages(self, ignoreByteStringObject: bool = False) -> None: # deprecated """ @@ -2311,7 +2346,7 @@ def removeImages(self, ignoreByteStringObject: bool = False) -> None: # depreca .. deprecated:: 1.28.0 """ deprecation_with_replacement("removeImages", "remove_images", "3.0.0") - return self.remove_images(ignoreByteStringObject) + return self.remove_images() def remove_text(self, ignore_byte_string_object: Optional[bool] = None) -> None: """ diff --git a/pypdf/constants.py b/pypdf/constants.py index bde9ff22d..cbb7633ae 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -8,7 +8,7 @@ PDF Reference, sixth edition, Version 1.7, 2006. """ -from enum import IntFlag +from enum import IntFlag, auto from typing import Dict, Tuple @@ -585,3 +585,12 @@ class AnnotationFlag(IntFlag): TypArguments, TypFitArguments, ) + + +class ImageType(IntFlag): + NONE = 0 + XOBJECT_IMAGES = auto() + INLINE_IMAGES = auto() + DRAWING_IMAGES = auto() + ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES + IMAGES = ALL # for consistancy with ObjectDeletionFlag diff --git a/tests/test_writer.py b/tests/test_writer.py index 358fdb1cf..7bf5fb4ae 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -8,6 +8,7 @@ import pytest from pypdf import ( + ImageType, ObjectDeletionFlag, PageObject, PdfMerger, @@ -1859,3 +1860,33 @@ def test_object_contains_indirect_reference_to_self(): outpage = writer.add_blank_page(width, height) outpage.merge_page(reader.pages[6]) writer.append(reader) + + +def test_remove_image_per_type(): + writer = PdfWriter(clone_from=RESOURCE_ROOT / "reportlab-inline-image.pdf") + writer.remove_images(ImageType.INLINE_IMAGES) + + assert all( + x not in writer.pages[0].get_contents().get_data() + for x in (b"BI", b"ID", b"EI") + ) + + with pytest.raises(DeprecationWarning): + writer.remove_images(True) + + writer = PdfWriter(clone_from=RESOURCE_ROOT / "GeoBase_NHNC1_Data_Model_UML_EN.pdf") + writer.remove_images(ImageType.DRAWING_IMAGES) + assert all( + x not in writer.pages[1].get_contents().get_data() + for x in (b" re\n", b"W*", b"f*") + ) + assert all( + x in writer.pages[1].get_contents().get_data() for x in (b" TJ\n", b"rg", b"Tm") + ) + assert all( + x not in writer.pages[9]["/Resources"]["/XObject"]["/Meta84"].get_data() + for x in (b" re\n", b"W*", b"f*") + ) + writer.remove_images(ImageType.XOBJECT_IMAGES) + assert b"Do\n" not in writer.pages[0].get_contents().get_data() + assert len(writer.pages[0]["/Resources"]["/XObject"]) == 0 From 637cdbaaa2175320ad10a331511c891926afe998 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Tue, 26 Sep 2023 22:21:26 +0200 Subject: [PATCH 2/4] coverage --- pypdf/_writer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index 2be7583f0..53cfba019 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -2288,12 +2288,10 @@ def clean_forms( return images, forms if not isinstance(page, PageObject): - page = PageObject(self, page.indirect_reference) + page = PageObject(self, page.indirect_reference) # pragma: no cover if "/Contents" in page: content = cast(ContentStream, page.get_contents()) - if not isinstance(content, ContentStream): - content = ContentStream(content, page) images, forms = clean_forms(page, []) clean(cast(ContentStream, content), images, forms) From 0896d0c5172d29ff74d9d93087a5c52cdce95cbf Mon Sep 17 00:00:00 2001 From: Martin Thoma Date: Sun, 8 Oct 2023 12:01:24 +0200 Subject: [PATCH 3/4] typo --- pypdf/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pypdf/constants.py b/pypdf/constants.py index cbb7633ae..56a24b183 100644 --- a/pypdf/constants.py +++ b/pypdf/constants.py @@ -593,4 +593,4 @@ class ImageType(IntFlag): INLINE_IMAGES = auto() DRAWING_IMAGES = auto() ALL = XOBJECT_IMAGES | INLINE_IMAGES | DRAWING_IMAGES - IMAGES = ALL # for consistancy with ObjectDeletionFlag + IMAGES = ALL # for consistency with ObjectDeletionFlag From e067514e69fc44e127afdcf4c26cb27ff83e2e6a Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Thu, 12 Oct 2023 19:07:02 +0200 Subject: [PATCH 4/4] Update _writer.py from review --- pypdf/_writer.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pypdf/_writer.py b/pypdf/_writer.py index e5d64044d..f7eb20c9f 100644 --- a/pypdf/_writer.py +++ b/pypdf/_writer.py @@ -2306,6 +2306,8 @@ def remove_images( Remove images from this output. Args: + to_delete : The type of images to be deleted + (default = all images types) ignore_byte_string_object: deprecated """ if isinstance(to_delete, bool):