Pleasing VeraPDF / fixing ICC profiles insertion / warn on CMYK not supported - close #697
py-pdf · Feb 28, 2023 · 667a295 · 667a295
1 parent e9dfb18
commit 667a295
Show file tree

Hide file tree

Showing 10 changed files with 55 additions and 55 deletions.
diff --git a/docs/index.md b/docs/index.md
@@ -108,7 +108,7 @@ or [open a discussion](https://github.com/PyFPDF/fpdf2/discussions).
 ### Related ###
 
 * Looking for alternative libraries? Check out [this detailed list of PDF-related Python libs by Patrick Maupin (`pdfrw` author)](https://github.com/pmaupin/pdfrw#other-libraries).
-  There is also [borb](https://github.com/jorisschellekens/borb), [PyPDF2](https://github.com/py-pdf/PyPDF2), [pikepdf](https://github.com/pikepdf/pikepdf), [WeasyPrint](https://github.com/Kozea/WeasyPrint), [pydyf](https://pypi.org/project/pydyf/) and [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/index.html): [features comparison](https://pymupdf.readthedocs.io/en/latest/about.html), [examples](https://github.com/pymupdf/PyMuPDF-Utilities/tree/master/examples#examples), [Jupyter notebooks](https://github.com/pymupdf/PyMuPDF-Utilities/tree/master/jupyter-notebooks)
+  There is also [borb](https://github.com/jorisschellekens/borb), [PyPDF2](https://github.com/py-pdf/PyPDF2), [pikepdf](https://github.com/pikepdf/pikepdf), [WeasyPrint](https://github.com/Kozea/WeasyPrint), [pydyf](https://pypi.org/project/pydyf/) and [PyMuPDF](https://pymupdf.readthedocs.io/en/latest/index.html): [features comparison](https://pymupdf.readthedocs.io/en/latest/about.html), [examples](https://github.com/pymupdf/PyMuPDF-Utilities/tree/master/examples#examples), [Jupyter notebooks](https://github.com/pymupdf/PyMuPDF-Utilities/tree/master/jupyter-notebooks).
   We have some documentations about combining `fpdf2` with [`borb`](CombineWithBorb.md), [`pdfrw`](CombineWithPdfrw.md), & [`PyPDF2`](CombineWithPyPDF2.md).
 * [Create PDFs with Python](https://www.youtube.com/playlist?list=PLjNQtX45f0dR9K2sMJ5ad9wVjqslNBIC0) : a series of tutorial videos by bvalgard
 * [digidigital/Extensions-and-Scripts-for-pyFPDF-fpdf2](https://github.com/digidigital/Extensions-and-Scripts-for-pyFPDF-fpdf2) : scripts ported from PHP to add transparency to elements of the page or part of an image, allow to write circular text,

diff --git a/fpdf/image_parsing.py b/fpdf/image_parsing.py
@@ -314,17 +314,22 @@ def _decode_base64_image(base64Image):
 ]
 
 
-def iccp_is_valid(iccp):
-    """
-    checks the validity of an iccp profile
-    """
+def is_iccp_valid(iccp, filename):
+    "Checks the validity of an ICC profile"
     try:
-        iccp_io = BytesIO(iccp)
-        profile = ImageCms.getOpenProfile(iccp_io)
-        ImageCms.getProfileInfo(profile)
-        return True
+        profile = ImageCms.getOpenProfile(BytesIO(iccp))
     except ImageCms.PyCMSError:
+        LOGGER.warning("Invalid ICC Profile in file %s", filename)
+        return False
+    color_space = profile.profile.xcolor_space.strip()
+    if color_space not in ("GRAY", "RGB"):
+        LOGGER.warning(
+            "Unsupported color space %s in ICC Profile of file %s - cf. issue #711",
+            color_space,
+            filename,
+        )
         return False
+    return True
 
 
 def get_img_info(filename, img=None, image_filter="AUTO", dims=None):
@@ -371,10 +376,8 @@ def get_img_info(filename, img=None, image_filter="AUTO", dims=None):
 
     iccp = None
     if "icc_profile" in img.info:
-        iccp = img.info.get("icc_profile")
-        if not iccp_is_valid(iccp):
-            LOGGER.error("ICCP for %s is invalid", filename)
-            iccp = None
+        if is_iccp_valid(img.info["icc_profile"], filename):
+            iccp = img.info["icc_profile"]
 
     if img_raw_data is not None and not img_altered:
         # if we can use the original image bytes directly we do (JPEG and group4 TIFF only):

diff --git a/fpdf/output.py b/fpdf/output.py
@@ -754,6 +754,7 @@ def _ensure_iccp(self, img_info):
                 iccp_content = iccp_c
                 break
         assert iccp_content is not None
+        # Note: n should be 4 if the profile ColorSpace is CMYK
         iccp_obj = PDFICCPObject(
             contents=iccp_content, n=img_info["dpn"], alternate=img_info["cs"]
         )
@@ -770,11 +771,13 @@ def _add_image(self, info):
                 ["/Indexed", "/DeviceRGB", f"{len(info['pal']) // 3 - 1}"]
             )
         elif iccp_i is not None:
-            # indexed images are not supposed to have ICC profiles
             iccp_pdf_i = self._ensure_iccp(info)
             color_space = PDFArray(["/ICCBased", str(iccp_pdf_i), str("0"), "R"])
         elif color_space == "DeviceCMYK":
             decode = "[1 0 1 0 1 0 1 0]"
+            raise NotImplementedError(
+                "fpdf2 does not support DeviceCMYK ColorSpace yet - cf. issue #711"
+            )
 
         decode_parms = f"<<{info['dp']} /BitsPerComponent {info['bpc']}>>"
         img_obj = PDFXObject(

diff --git a/scripts/checker_commons.py b/scripts/checker_commons.py
@@ -8,7 +8,7 @@ def aggregate(pdf_filepath, report, aggregated_report_filepath):
         "errors": defaultdict(list),
     }
     try:
-        with open(aggregated_report_filepath) as agg_file:
+        with open(aggregated_report_filepath, encoding="utf8") as agg_file:
             prev_agg_report = json.load(agg_file)
         agg_report["failures"].update(prev_agg_report["failures"])
         agg_report["errors"].update(prev_agg_report["errors"])
@@ -22,33 +22,34 @@ def aggregate(pdf_filepath, report, aggregated_report_filepath):
     else:
         for error in report.get("errors", []):
             agg_report["errors"][error].append(pdf_filepath)
-    with open(aggregated_report_filepath, "w") as agg_file:
+    with open(aggregated_report_filepath, "w", encoding="utf8") as agg_file:
         json.dump(agg_report, agg_file)
 
 
 def print_aggregated_report(
     aggregated_report_filepath, checks_details_url, ignore_whitelist_filepath
 ):
-    with open(aggregated_report_filepath) as agg_file:
+    with open(aggregated_report_filepath, encoding="utf8") as agg_file:
         agg_report = json.load(agg_file)
     if "version" in agg_report:
         print(agg_report["version"])
     print("Documentation on the checks:", checks_details_url)
     print("# AGGREGATED REPORT #")
     if agg_report["failures"]:
         print("Failures:")
-        for failure, pdf_filepaths in agg_report["failures"].items():
+        for failure, pdf_filepaths in sorted(agg_report["failures"].items()):
             print(f"- {failure} ({len(pdf_filepaths)}): {', '.join(pdf_filepaths)}")
     print("Errors:")
-    sort_key = lambda error: -len(error[1])
-    for error, pdf_filepaths in sorted(agg_report["errors"].items(), key=sort_key):
+    for error, pdf_filepaths in sorted(
+        sorted(agg_report["errors"].items(), key=lambda error: -len(error[1]))
+    ):
         print(f"- {error} ({len(pdf_filepaths)}): {', '.join(pdf_filepaths)}")
     fail_on_unexpected_check_failure(agg_report, ignore_whitelist_filepath)
 
 
 def fail_on_unexpected_check_failure(agg_report, ignore_whitelist_filepath):
     "exit(1) if there is any non-passing & non-whitelisted error remaining"
-    with open(ignore_whitelist_filepath) as ignore_file:
+    with open(ignore_whitelist_filepath, encoding="utf8") as ignore_file:
         ignore = json.load(ignore_file)
     errors = set(agg_report["errors"].keys()) - set(ignore["errors"].keys())
     if agg_report["failures"] or errors:

diff --git a/scripts/verapdf-ignore.json b/scripts/verapdf-ignore.json
@@ -3,8 +3,8 @@
         "6.1.2-2": "REASON: this required byte value greater than 127 at the beginning of a file seemingly only appears in the ISO 19005-1:2005 spec and no other public one",
         "6.1.3-1": "REASON: fpdf2 does not currently define a file /ID in the trailer section",
         "6.1.3-2": "REASON: fpdf2 allows to create encrypted files",
-        "6.2.3-2": "REASON: fpdf2 does not currently support defining ICC profiles",
-        "6.2.3-4": "REASON: fpdf2 does not currently support defining ICC profiles",
+        "6.2.3-1": "REASON: false positive - fpdf2 does not currently produce any ICCInputProfile",
+        "6.2.3-2": "REASON: fpdf2 does not currently support PDF/A",
         "6.3.4-1": "REASON: fpdf2 still allows using the PostScript standard 14 fonts. Quoting PDF 1.7 spec from 2006: Beginning with PDF 1.5, the special treatment given to the standard 14 fonts is deprecated. All fonts used in a PDF document should be represented using a com- plete font descriptor. For backwards capability, viewer applications must still provide the special treatment identified for the standard 14 fonts.",
         "6.3.5-3": "FIXME: corresponding GitHub issue -> https://github.com/PyFPDF/fpdf2/issues/88",
         "6.4-1": "REASON: enabled by default, can be disabled by setting pdf.allow_images_transparency = False",

diff --git a/scripts/verapdf.py b/scripts/verapdf.py
@@ -15,21 +15,20 @@
 
 AGGREGATED_REPORT_FILEPATH = "verapdf-aggregated.json"
 IGNORE_WHITELIST_FILEPATH = "scripts/verapdf-ignore.json"
-CHECKS_DETAILS_URL = "https://docs.verapdf.org/validation/pdfa-part1/ & https://docs.verapdf.org/validation/pdfa-parts-2-and-3/"
+CHECKS_DETAILS_URL = "https://docs.verapdf.org/validation/"
 BAT_EXT = ".bat" if sys.platform in ("cygwin", "win32") else ""
 
 
 def analyze_pdf_file(pdf_filepath):
-    output = run(
-        [
-            "verapdf/verapdf" + BAT_EXT,
-            "--format",
-            "text",
-            "-v",
-            pdf_filepath,
-        ],
-        stdout=PIPE,
-    ).stdout.decode()
+    command = [
+        "verapdf/verapdf" + BAT_EXT,
+        "--format",
+        "text",
+        "-v",
+        pdf_filepath,
+    ]
+    # print(" ".join(command))
+    output = run(command, check=False, stdout=PIPE).stdout.decode()
     report = parse_output(output)
     aggregate(pdf_filepath, report, AGGREGATED_REPORT_FILEPATH)
 

diff --git a/test/image/image_types/test_insert_images.py b/test/image/image_types/test_insert_images.py
@@ -132,11 +132,12 @@ def test_insert_jpg_icc(tmp_path):
     assert_pdf_equal(pdf, HERE / "image_types_insert_jpg_icc.pdf", tmp_path)
 
 
-def test_insert_jpg_invalid_icc(tmp_path):
+def test_insert_jpg_invalid_icc(caplog, tmp_path):
     pdf = fpdf.FPDF()
     pdf.add_page(format=(448, 498))
     pdf.set_margin(0)
     pdf.image(HERE / "insert_images_insert_jpg_icc_invalid.jpg", x=0, y=0, h=498)
+    assert "Invalid ICC Profile in file" in caplog.text
     assert_pdf_equal(pdf, HERE / "image_types_insert_jpg_icc_invalid.pdf", tmp_path)
 
 

diff --git a/test/image/png_images/image_png_insert_png_files.pdf b/test/image/png_images/image_png_insert_png_files.pdf
diff --git a/test/image/png_images/test_png_file.py b/test/image/png_images/test_png_file.py
@@ -1,28 +1,21 @@
 from pathlib import Path
 
-import fpdf
+from fpdf import FPDF
 from test.conftest import assert_pdf_equal
 
 
 HERE = Path(__file__).resolve().parent
 
 
-def test_insert_png_files(tmp_path):
-    pdf = fpdf.FPDF(unit="pt")
-    pdf.compress = False
-
-    not_supported = {
-        "e59ec0cfb8ab64558099543dc19f8378.png",  # Interlacing not supported:
-        "6c853ed9dacd5716bc54eb59cec30889.png",  # 16-bit depth not supported:
-        "ac6343a98f8edabfcc6e536dd75aacb0.png",  # Interlacing not supported:
-        "93e6127b9c4e7a99459c558b81d31bc5.png",  # Interlacing not supported:
-        "18f9baf3834980f4b80a3e82ad45be48.png",  # Interlacing not supported:
-        "51a4d21670dc8dfa8ffc9e54afd62f5f.png",  # Interlacing not supported:
-    }
-
+def test_insert_png_files(caplog, tmp_path):
+    pdf = FPDF()
     for path in sorted(HERE.glob("*.png")):
-        if path.name not in not_supported:
-            pdf.add_page()
-            pdf.image(str(path), x=0, y=0, w=0, h=0)
-
+        pdf.add_page()
+        pdf.image(str(path), x=0, y=0, w=0, h=0)
+    # Note: 7 of those images have an ICC profile, and there are only 5 distinct ICC profiles among them
     assert_pdf_equal(pdf, HERE / "image_png_insert_png_files.pdf", tmp_path)
+
+    assert "Unsupported color space CMYK in ICC Profile of file" in caplog.text
+    # Note: the warning above comes from the following files, for which ImageMagick also raises warnings:
+    #   identify-im6.q16: iCCP: profile 'icc': 'CMYK': invalid ICC profile color space `test/image/png_images/0839d93f8e77e21acd0ac40a80b14b7b.png'
+    #   identify-im6.q16: iCCP: profile 'icc': 'CMYK': invalid ICC profile color space `test/image/png_images/1ebd73c1d3fbc89782f29507364128fc.png'
diff --git a/test/image/test_url_images.py b/test/image/test_url_images.py
@@ -4,11 +4,11 @@
 from test.conftest import assert_pdf_equal
 
 HERE = Path(__file__).resolve().parent
+PNG_IMG_URL = "https://upload.wikimedia.org/wikipedia/commons/7/70/Example.png"
 
 
 def test_png_url(tmp_path):
     pdf = fpdf.FPDF()
     pdf.add_page()
-    png = "https://upload.wikimedia.org/wikipedia/commons/7/70/Example.png"
-    pdf.image(png, x=15, y=15, w=30, h=25)
+    pdf.image(PNG_IMG_URL, x=15, y=15, w=30, h=25)
     assert_pdf_equal(pdf, HERE / "image_png_url.pdf", tmp_path)