From d28aa5a25b151d661b317717a4f74c1bd87d1374 Mon Sep 17 00:00:00 2001 From: deeplow Date: Fri, 21 Oct 2022 15:55:46 +0100 Subject: [PATCH 1/4] Remove PDFtk dependency (replace w/ pdftoppm) PDFtk actually isn't needed. It was being used for breaking a PDF into pages but this is something that can be replaced by the already present 'pdftoppm'. Furthermore, by removing this dependency we contribute to reproducible builds and overall supply chain security because it was obtained from gitlab with no signature verification or version pinning. The replacement 'pdftoppm' enabled us to do a shortcut: - before: PDF -> PDF pages -> PNG images -> RGB images - after: PDF -> PPM images -> RGB images And this last conversion step is trivial since the RGB format we were using is just a PPM file without the metadata in its header. --- container/Dockerfile | 10 --- container/dangerzone.py | 190 ++++++++++++++++++++++++++-------------- 2 files changed, 124 insertions(+), 76 deletions(-) diff --git a/container/Dockerfile b/container/Dockerfile index 70fdbadf9..cb07cab2a 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -10,7 +10,6 @@ RUN apk -U upgrade && \ poppler-utils \ python3 \ py3-magic \ - py3-pillow \ sudo \ tesseract-ocr \ tesseract-ocr-data-afr \ @@ -78,15 +77,6 @@ RUN apk -U upgrade && \ tesseract-ocr-data-ukr \ tesseract-ocr-data-vie -# Install pdftk -RUN \ - wget https://gitlab.com/pdftk-java/pdftk/-/jobs/924565145/artifacts/raw/build/libs/pdftk-all.jar && \ - mv pdftk-all.jar /usr/local/bin && \ - chmod +x /usr/local/bin/pdftk-all.jar && \ - echo '#!/bin/sh' > /usr/local/bin/pdftk && \ - echo '/usr/bin/java -jar "/usr/local/bin/pdftk-all.jar" "$@"' >> /usr/local/bin/pdftk && \ - chmod +x /usr/local/bin/pdftk - COPY dangerzone.py /usr/local/bin/ RUN chmod +x /usr/local/bin/dangerzone.py diff --git a/container/dangerzone.py b/container/dangerzone.py index 97e926fde..2d30ffdef 100644 --- a/container/dangerzone.py +++ b/container/dangerzone.py @@ -15,13 +15,14 @@ 
import glob import json import os +import re import shutil import subprocess import sys -from typing import Dict, List, Optional +import time +from typing import Callable, Dict, List, Optional import magic -from PIL import Image # timeout in seconds for any single subprocess DEFAULT_TIMEOUT: float = 120 @@ -36,25 +37,63 @@ def run_command( error_message: str, timeout_message: str, timeout: float = DEFAULT_TIMEOUT, -) -> subprocess.CompletedProcess: + stdout_callback: Callable = None, + stderr_callback: Callable = None, +) -> None: """ Runs a command and returns the result. :raises RuntimeError: if the process returns a non-zero exit status :raises TimeoutError: if the process times out """ - try: - return subprocess.run( + if stdout_callback is None and stderr_callback is None: + try: + subprocess.run(args, timeout=timeout, check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError(error_message) from e + except subprocess.TimeoutExpired as e: + raise TimeoutError(timeout_message) from e + + else: + p = subprocess.Popen( args, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - timeout=timeout, - check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, ) - except subprocess.CalledProcessError as e: - raise RuntimeError(error_message) from e - except subprocess.TimeoutExpired as e: - raise TimeoutError(timeout_message) from e + + # Progress callback requires a manually implemented timeout + start_time = time.time() + + # Make reading from stdout or stderr non-blocking + if p.stdout: + os.set_blocking(p.stdout.fileno(), False) + if p.stderr: + os.set_blocking(p.stderr.fileno(), False) + + while True: + # Processes hasn't finished + if p.poll() is not None: + if p.returncode != 0: + raise RuntimeError(error_message) + break + + # Check if timeout hasn't expired + if time.time() - start_time > timeout: + p.kill() + raise TimeoutError(timeout_message) + + if p.stdout and stdout_callback is not None: + line = 
p.stdout.readline() + if len(line) > 0: + line = line.rstrip() # strip trailing "\n" + stdout_callback(line) + + if p.stderr and stderr_callback is not None: + line = p.stderr.readline() + if len(line) > 0: + line = line.rstrip() # strip trailing "\n" + stderr_callback(line) class DangerzoneConverter: @@ -181,65 +220,84 @@ def document_to_pixels(self) -> None: ) self.percentage += 3 - # Separate PDF into pages - self.update_progress("Separating document into pages") - args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"] - run_command( - args, - error_message="Separating document into pages failed", - timeout_message=f"Error separating document into pages, pdftk timed out after {DEFAULT_TIMEOUT} seconds", - ) + self.update_progress("Obtaining PDF metadata") - page_filenames = glob.glob("/tmp/page-*.pdf") + def pdftoppm_progress_callback(line: str) -> None: + """Function called for every line the 'pdftoppm'command outputs - self.percentage += 2 + Sample pdftoppm output: - # Convert to RGB pixel data - percentage_per_page = 45.0 / len(page_filenames) - for page in range(1, len(page_filenames) + 1): - pdf_filename = f"/tmp/page-{page}.pdf" - png_filename = f"/tmp/page-{page}.png" - rgb_filename = f"/tmp/page-{page}.rgb" - width_filename = f"/tmp/page-{page}.width" - height_filename = f"/tmp/page-{page}.height" - filename_base = f"/tmp/page-{page}" - - self.update_progress( - f"Converting page {page}/{len(page_filenames)} to pixels" - ) - - # Convert to png - run_command( - ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base], - error_message="Conversion from PDF to PNG failed", - timeout_message=f"Error converting from PDF to PNG, pdftocairo timed out after {DEFAULT_TIMEOUT} seconds", - ) - - # Save the width and height - with Image.open(png_filename, "r") as im: - width, height = im.size - with open(width_filename, "w") as f: - f.write(str(width)) - with open(height_filename, "w") as f: - f.write(str(height)) + $ pdftoppm sample.pdf 
/tmp/safe -progress + 1 4 /tmp/safe-1.ppm + 2 4 /tmp/safe-2.ppm + 3 4 /tmp/safe-3.ppm + 4 4 /tmp/safe-4.ppm - # Convert to RGB pixels - run_command( - [ - "gm", - "convert", - png_filename, - "-depth", - "8", - f"rgb:{rgb_filename}", - ], - error_message="Conversion from PNG to RGB failed", - timeout_message=f"Error converting from PNG to pixels, convert timed out after {DEFAULT_TIMEOUT} seconds", - ) + Each successful line is in the format "{page} {page_num} {ppm_filename}" + """ + try: + (page_str, num_pages_str, _) = line.split() + num_pages = int(num_pages_str) + page = int(page_str) + except ValueError as e: + raise RuntimeError("Conversion from PDF to PPM failed") from e - # Delete the png - os.remove(png_filename) + percentage_per_page = 45.0 / num_pages self.percentage += percentage_per_page + self.update_progress(f"Converting page {page}/{num_pages} to pixels") + + zero_padding = "0" * (len(num_pages_str) - len(page_str)) + ppm_filename = f"{page_base}-{zero_padding}{page}.ppm" + rgb_filename = f"{page_base}-{page}.rgb" + width_filename = f"{page_base}-{page}.width" + height_filename = f"{page_base}-{page}.height" + filename_base = f"{page_base}-{page}" + + with open(ppm_filename, "rb") as f: + # NOTE: PPM files have multiple ways of writing headers. 
+ # For our specific case we parse it expecting the header format that ppmtopdf produces + # More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html + + # Read the header + header = f.readline().decode().strip() + if header != "P6": + raise ValueError("Invalid PPM header") + + # Save the width and height + dims = f.readline().decode().strip() + width, height = dims.split() + with open(width_filename, "w") as width_file: + width_file.write(width) + with open(height_filename, "w") as height_file: + height_file.write(height) + + maxval = int(f.readline().decode().strip()) + # Check that the depth is 8 + if maxval != 255: + raise ValueError("Invalid PPM depth") + + data = f.read() + + # Save pixel data + with open(rgb_filename, "wb") as f: + f.write(data) + + # Delete the ppm file + os.remove(ppm_filename) + + page_base = "/tmp/page" + # Convert to PPM, which is essentially an RGB format + run_command( + [ + "pdftoppm", + pdf_filename, + page_base, + "-progress", + ], + error_message="Conversion from PDF to PPM failed", + timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {DEFAULT_TIMEOUT} seconds", + stderr_callback=pdftoppm_progress_callback, + ) self.update_progress("Converted document to pixels") From 272d25aee07f2f27e0ecce25adf737bf47a25f3b Mon Sep 17 00:00:00 2001 From: deeplow Date: Mon, 23 Jan 2023 13:20:15 +0000 Subject: [PATCH 2/4] Make pdf to ppm conversion dependent on num pages --- container/dangerzone.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/container/dangerzone.py b/container/dangerzone.py index 2d30ffdef..f888ee55e 100644 --- a/container/dangerzone.py +++ b/container/dangerzone.py @@ -20,7 +20,7 @@ import subprocess import sys import time -from typing import Callable, Dict, List, Optional +from typing import Callable, Dict, List, Optional, Union import magic @@ -220,7 +220,24 @@ def document_to_pixels(self) -> None: ) 
self.percentage += 3 - self.update_progress("Obtaining PDF metadata") + # Obtain number of pages + self.update_progress("Calculating number of pages") + self.num_pages: Union[None, int] = None + + def get_num_pages(line: str) -> None: + search = re.search(r"^Pages: (\d+)", line) + if search is not None: + self.num_pages = int(search.group(1)) + + run_command( + ["pdfinfo", pdf_filename], + error_message="PDF file is corrupted", + timeout_message=f"Extracting metadata from PDF timed out after 1 second", + timeout=1, + stdout_callback=get_num_pages, + ) + if self.num_pages == None: + raise ValueError("Number of pages could not be extraced from PDF") def pdftoppm_progress_callback(line: str) -> None: """Function called for every line the 'pdftoppm'command outputs @@ -286,7 +303,9 @@ def pdftoppm_progress_callback(line: str) -> None: os.remove(ppm_filename) page_base = "/tmp/page" + # Convert to PPM, which is essentially an RGB format + pdftoppm_timeout = 1.0 * self.num_pages # type: ignore [operator] run_command( [ "pdftoppm", @@ -295,8 +314,9 @@ def pdftoppm_progress_callback(line: str) -> None: "-progress", ], error_message="Conversion from PDF to PPM failed", - timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {DEFAULT_TIMEOUT} seconds", + timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {pdftoppm_timeout} seconds", stderr_callback=pdftoppm_progress_callback, + timeout=pdftoppm_timeout, ) self.update_progress("Converted document to pixels") From d7be28ec2a1d4b1002fa0843e72b6b34d083a46f Mon Sep 17 00:00:00 2001 From: deeplow Date: Mon, 9 Jan 2023 17:37:23 +0000 Subject: [PATCH 3/4] Remove openjdk-8 as a dependency. default-jre and java dependencies had been added initially [1] because of libreoffice-java-common, which is no longer present. Then, when the image was changed from ubuntu to alpine [2], default-jre was replaced with openjdk-8. 
If java is still a dependency for libreoffice, then it should be pulled automatically. [1] https://github.com/firstlookmedia/dangerzone-converter/commit/9ecdb9e9952dc52b2ad973694aaeef4ba9e8f8ea [2] https://github.com/firstlookmedia/dangerzone-converter/commit/650ae6eee13da6dc3b8d2f1ac8a70c9c52ab7e20 --- container/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/container/Dockerfile b/container/Dockerfile index cb07cab2a..3172580d1 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -6,7 +6,6 @@ RUN apk -U upgrade && \ ghostscript \ graphicsmagick \ libreoffice \ - openjdk8 \ poppler-utils \ python3 \ py3-magic \ From 2da973232b31f379646c2003007ef834d1db7f7f Mon Sep 17 00:00:00 2001 From: deeplow Date: Mon, 9 Jan 2023 19:16:18 +0000 Subject: [PATCH 4/4] Remove sudo: no longer needed Fixes #232 --- container/Dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/container/Dockerfile b/container/Dockerfile index 3172580d1..7adc73b1b 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -9,7 +9,6 @@ RUN apk -U upgrade && \ poppler-utils \ python3 \ py3-magic \ - sudo \ tesseract-ocr \ tesseract-ocr-data-afr \ tesseract-ocr-data-ara \