From 643533dfd02291518f6e6318d64176a43d826673 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Sun, 30 Jul 2023 18:01:52 +0200
Subject: [PATCH 1/2] ENH: Add `level` parameter to compress_content_streams

File    File name
size    The suffix is the compression level
-------------------
5321132 GeoTopo.pdf
9959402 out-0.pdf
5976025 out-1.pdf
5914204 out-2.pdf
5885818 out-3.pdf
5816263 out-4.pdf
5762359 out-5.pdf
5738259 out-6.pdf
5731877 out-7.pdf
5726121 out-8.pdf
5725267 out-9.pdf

Level 1 gives a very good improvement, but already level 2
might not be worth the CPU cycles

See #1910
---
 docs/user/file-size.md            | 6 +++++-
 pypdf/_page.py                    | 4 ++--
 pypdf/filters.py                  | 5 +++--
 pypdf/generic/_data_structures.py | 4 ++--
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
index 3b0653d43..a014ae598 100644
--- a/docs/user/file-size.md
+++ b/docs/user/file-size.md
@@ -1,4 +1,4 @@
-# Reduce PDF Size
+# Reduce PDF File Size
 
 There are multiple ways to reduce the size of a given PDF file. The easiest
 one is to remove content (e.g. images) or pages.
@@ -96,6 +96,10 @@ with open("out.pdf", "wb") as f:
     writer.write(f)
 ```
 
+`page.compress_content_streams` uses [`zlib.compress`](https://docs.python.org/3/library/zlib.html#zlib.compress) and support the
+`level` paramter: `level=0` is no compression, `level=9` is the
+highest compression.
+
 Using this method, we have seen a reduction by 70% (from 11.8 MB to 3.5 MB)
 with a real PDF.
 
diff --git a/pypdf/_page.py b/pypdf/_page.py
index 2d31afafe..d70b1b019 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -1763,7 +1763,7 @@ def scaleTo(self, width: float, height: float) -> None:  # deprecated
         deprecation_with_replacement("scaleTo", "scale_to", "3.0.0")
         self.scale_to(width, height)
 
-    def compress_content_streams(self) -> None:
+    def compress_content_streams(self, level: int = -1) -> None:
         """
         Compress the size of this page by joining all content streams and
         applying a FlateDecode filter.
@@ -1773,7 +1773,7 @@ def compress_content_streams(self) -> None:
         """
         content = self.get_contents()
         if content is not None:
-            content_obj = content.flate_encode()
+            content_obj = content.flate_encode(level)
             try:
                 content.indirect_reference.pdf._objects[  # type: ignore
                     content.indirect_reference.idnum - 1  # type: ignore
diff --git a/pypdf/filters.py b/pypdf/filters.py
index f34e98200..82d2e0c9b 100644
--- a/pypdf/filters.py
+++ b/pypdf/filters.py
@@ -225,17 +225,18 @@ def _decode_png_prediction(data: str, columns: int, rowlength: int) -> bytes:
         return output.getvalue()
 
     @staticmethod
-    def encode(data: bytes) -> bytes:
+    def encode(data: bytes, level: int = -1) -> bytes:
         """
         Compress the input data using zlib.
 
         Args:
             data: The data to be compressed.
+            level: See https://docs.python.org/3/library/zlib.html#zlib.compress
 
         Returns:
             The compressed data.
         """
-        return zlib.compress(data)
+        return zlib.compress(data, level)
 
 
 class ASCIIHexDecode:
diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
index 8577b7be2..03cd848f5 100644
--- a/pypdf/generic/_data_structures.py
+++ b/pypdf/generic/_data_structures.py
@@ -880,7 +880,7 @@ def flateEncode(self) -> "EncodedStreamObject":  # deprecated
         deprecation_with_replacement("flateEncode", "flate_encode", "3.0.0")
         return self.flate_encode()
 
-    def flate_encode(self) -> "EncodedStreamObject":
+    def flate_encode(self, level: int = -1) -> "EncodedStreamObject":
         from ..filters import FlateDecode
 
         if SA.FILTER in self:
@@ -909,7 +909,7 @@ def flate_encode(self) -> "EncodedStreamObject":
         retval[NameObject(SA.FILTER)] = f
         if parms is not None:
             retval[NameObject(SA.DECODE_PARMS)] = parms
-        retval._data = FlateDecode.encode(self._data)
+        retval._data = FlateDecode.encode(self._data, level)
         return retval
 
 

From b2476002608ba76bdd428272e47b11a8bfeb1154 Mon Sep 17 00:00:00 2001
From: Martin Thoma <info@martin-thoma.de>
Date: Wed, 2 Aug 2023 08:48:05 +0200
Subject: [PATCH 2/2] Rephrasing

---
 docs/user/file-size.md | 27 +++++++++++++++++++--------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/docs/user/file-size.md b/docs/user/file-size.md
index a014ae598..af9e40182 100644
--- a/docs/user/file-size.md
+++ b/docs/user/file-size.md
@@ -77,7 +77,8 @@ pypdf supports the FlateDecode filter which uses the zlib/deflate compression
 method. It is a lossless compression, meaning the resulting PDF looks exactly
 the same.
 
-Deflate compression can be applied to a page via [`page.compress_content_streams`](https://pypdf.readthedocs.io/en/latest/modules/PageObject.html#pypdf._page.PageObject.compress_content_streams):
+Deflate compression can be applied to a page via
+[`page.compress_content_streams`](https://pypdf.readthedocs.io/en/latest/modules/PageObject.html#pypdf._page.PageObject.compress_content_streams):
 
 ```python
 from pypdf import PdfReader, PdfWriter
@@ -96,19 +97,29 @@ with open("out.pdf", "wb") as f:
     writer.write(f)
 ```
 
-`page.compress_content_streams` uses [`zlib.compress`](https://docs.python.org/3/library/zlib.html#zlib.compress) and support the
-`level` paramter: `level=0` is no compression, `level=9` is the
-highest compression.
+`page.compress_content_streams` uses [`zlib.compress`](https://docs.python.org/3/library/zlib.html#zlib.compress)
+and supports the `level` parameter: `level=0` means no compression,
+`level=9` refers to the highest compression.
 
 Using this method, we have seen a reduction by 70% (from 11.8 MB to 3.5 MB)
 with a real PDF.
 
 ## Removing Sources
 
-When a page is removed from the page list, its content will still be present in the PDF file. This means that the data may still be used elsewhere.
+When a page is removed from the page list, its content will still be present in
+the PDF file. This means that the data may still be used elsewhere.
 
-Simply removing a page from the page list will reduce the page count but not the file size. In order to exclude the content completely, the pages should not be added to the PDF using the PdfWriter.append() function. Instead, only the desired pages should be selected for inclusion (note: [PR #1843](https://github.com/py-pdf/pypdf/pull/1843) will add a page deletion feature).
+Simply removing a page from the page list will reduce the page count but not the
+file size. In order to exclude the content completely, the pages should not be
+added to the PDF using the PdfWriter.append() function. Instead, only the
+desired pages should be selected for inclusion
+(note: [PR #1843](https://github.com/py-pdf/pypdf/pull/1843) will add a page
+deletion feature).
 
-There can be issues with poor PDF formatting, such as when all pages are linked to the same resource. In such cases, dropping references to specific pages becomes useless because there is only one source for all pages.
+There can be issues with poor PDF formatting, such as when all pages are linked
+to the same resource. In such cases, dropping references to specific pages
+becomes useless because there is only one source for all pages.
 
-Cropping is an ineffective method for reducing the file size because it only adjusts the viewboxes and not the external parts of the source image. Therefore, the content that is no longer visible will still be present in the PDF.
+Cropping is an ineffective method for reducing the file size because it only
+adjusts the viewboxes and not the external parts of the source image. Therefore,
+the content that is no longer visible will still be present in the PDF.