diff --git a/container/Dockerfile b/container/Dockerfile index 70fdbadf9..7adc73b1b 100644 --- a/container/Dockerfile +++ b/container/Dockerfile @@ -6,12 +6,9 @@ RUN apk -U upgrade && \ ghostscript \ graphicsmagick \ libreoffice \ - openjdk8 \ poppler-utils \ python3 \ py3-magic \ - py3-pillow \ - sudo \ tesseract-ocr \ tesseract-ocr-data-afr \ tesseract-ocr-data-ara \ @@ -78,15 +75,6 @@ RUN apk -U upgrade && \ tesseract-ocr-data-ukr \ tesseract-ocr-data-vie -# Install pdftk -RUN \ - wget https://gitlab.com/pdftk-java/pdftk/-/jobs/924565145/artifacts/raw/build/libs/pdftk-all.jar && \ - mv pdftk-all.jar /usr/local/bin && \ - chmod +x /usr/local/bin/pdftk-all.jar && \ - echo '#!/bin/sh' > /usr/local/bin/pdftk && \ - echo '/usr/bin/java -jar "/usr/local/bin/pdftk-all.jar" "$@"' >> /usr/local/bin/pdftk && \ - chmod +x /usr/local/bin/pdftk - COPY dangerzone.py /usr/local/bin/ RUN chmod +x /usr/local/bin/dangerzone.py diff --git a/container/dangerzone.py b/container/dangerzone.py index 97e926fde..f888ee55e 100644 --- a/container/dangerzone.py +++ b/container/dangerzone.py @@ -15,13 +15,14 @@ import glob import json import os +import re import shutil import subprocess import sys -from typing import Dict, List, Optional +import time +from typing import Callable, Dict, List, Optional, Union import magic -from PIL import Image # timeout in seconds for any single subprocess DEFAULT_TIMEOUT: float = 120 @@ -36,25 +37,63 @@ def run_command( error_message: str, timeout_message: str, timeout: float = DEFAULT_TIMEOUT, -) -> subprocess.CompletedProcess: + stdout_callback: Callable = None, + stderr_callback: Callable = None, +) -> None: """ Runs a command and returns the result. 
:raises RuntimeError: if the process returns a non-zero exit status :raises TimeoutError: if the process times out """ - try: - return subprocess.run( + if stdout_callback is None and stderr_callback is None: + try: + subprocess.run(args, timeout=timeout, check=True) + except subprocess.CalledProcessError as e: + raise RuntimeError(error_message) from e + except subprocess.TimeoutExpired as e: + raise TimeoutError(timeout_message) from e + + else: + p = subprocess.Popen( args, - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - timeout=timeout, - check=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + universal_newlines=True, ) - except subprocess.CalledProcessError as e: - raise RuntimeError(error_message) from e - except subprocess.TimeoutExpired as e: - raise TimeoutError(timeout_message) from e + + # Progress callback requires a manually implemented timeout + start_time = time.time() + + # Make reading from stdout or stderr non-blocking + if p.stdout: + os.set_blocking(p.stdout.fileno(), False) + if p.stderr: + os.set_blocking(p.stderr.fileno(), False) + + while True: + # Process has finished + if p.poll() is not None: + if p.returncode != 0: + raise RuntimeError(error_message) + break + + # Check whether the timeout has expired + if time.time() - start_time > timeout: + p.kill() + raise TimeoutError(timeout_message) + + if p.stdout and stdout_callback is not None: + line = p.stdout.readline() + if len(line) > 0: + line = line.rstrip() # strip trailing "\n" + stdout_callback(line) + + if p.stderr and stderr_callback is not None: + line = p.stderr.readline() + if len(line) > 0: + line = line.rstrip() # strip trailing "\n" + stderr_callback(line) class DangerzoneConverter: @@ -181,65 +220,104 @@ def document_to_pixels(self) -> None: ) self.percentage += 3 - # Separate PDF into pages - self.update_progress("Separating document into pages") - args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"] - run_command( - args, -
error_message="Separating document into pages failed", - timeout_message=f"Error separating document into pages, pdftk timed out after {DEFAULT_TIMEOUT} seconds", - ) - - page_filenames = glob.glob("/tmp/page-*.pdf") - - self.percentage += 2 - - # Convert to RGB pixel data - percentage_per_page = 45.0 / len(page_filenames) - for page in range(1, len(page_filenames) + 1): - pdf_filename = f"/tmp/page-{page}.pdf" - png_filename = f"/tmp/page-{page}.png" - rgb_filename = f"/tmp/page-{page}.rgb" - width_filename = f"/tmp/page-{page}.width" - height_filename = f"/tmp/page-{page}.height" - filename_base = f"/tmp/page-{page}" - - self.update_progress( - f"Converting page {page}/{len(page_filenames)} to pixels" - ) - - # Convert to png - run_command( - ["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base], - error_message="Conversion from PDF to PNG failed", - timeout_message=f"Error converting from PDF to PNG, pdftocairo timed out after {DEFAULT_TIMEOUT} seconds", - ) + # Obtain number of pages + self.update_progress("Calculating number of pages") + self.num_pages: Union[None, int] = None - # Save the width and height - with Image.open(png_filename, "r") as im: - width, height = im.size - with open(width_filename, "w") as f: - f.write(str(width)) - with open(height_filename, "w") as f: - f.write(str(height)) + def get_num_pages(line: str) -> None: + search = re.search(r"^Pages: (\d+)", line) + if search is not None: + self.num_pages = int(search.group(1)) - # Convert to RGB pixels - run_command( - [ - "gm", - "convert", - png_filename, - "-depth", - "8", - f"rgb:{rgb_filename}", - ], - error_message="Conversion from PNG to RGB failed", - timeout_message=f"Error converting from PNG to pixels, convert timed out after {DEFAULT_TIMEOUT} seconds", - ) - - # Delete the png - os.remove(png_filename) + run_command( + ["pdfinfo", pdf_filename], + error_message="PDF file is corrupted", + timeout_message=f"Extracting metadata from PDF timed out after 1 second", + 
+ timeout=1, + stdout_callback=get_num_pages, + ) + if self.num_pages == None: + raise ValueError("Number of pages could not be extraced from PDF") + + def pdftoppm_progress_callback(line: str) -> None: + """Function called for every line the 'pdftoppm' command outputs + + Sample pdftoppm output: + + $ pdftoppm sample.pdf /tmp/safe -progress + 1 4 /tmp/safe-1.ppm + 2 4 /tmp/safe-2.ppm + 3 4 /tmp/safe-3.ppm + 4 4 /tmp/safe-4.ppm + + Each successful line is in the format "{page} {num_pages} {ppm_filename}" + """ + try: + (page_str, num_pages_str, _) = line.split() + num_pages = int(num_pages_str) + page = int(page_str) + except ValueError as e: + raise RuntimeError("Conversion from PDF to PPM failed") from e + + percentage_per_page = 45.0 / num_pages self.percentage += percentage_per_page + self.update_progress(f"Converting page {page}/{num_pages} to pixels") + + zero_padding = "0" * (len(num_pages_str) - len(page_str)) + ppm_filename = f"{page_base}-{zero_padding}{page}.ppm" + rgb_filename = f"{page_base}-{page}.rgb" + width_filename = f"{page_base}-{page}.width" + height_filename = f"{page_base}-{page}.height" + filename_base = f"{page_base}-{page}" + + with open(ppm_filename, "rb") as f: + # NOTE: PPM files have multiple ways of writing headers.
+ # For our specific case we parse it expecting the header format that pdftoppm produces + # More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html + + # Read the header + header = f.readline().decode().strip() + if header != "P6": + raise ValueError("Invalid PPM header") + + # Save the width and height + dims = f.readline().decode().strip() + width, height = dims.split() + with open(width_filename, "w") as width_file: + width_file.write(width) + with open(height_filename, "w") as height_file: + height_file.write(height) + + maxval = int(f.readline().decode().strip()) + # Check that the depth is 8 + if maxval != 255: + raise ValueError("Invalid PPM depth") + + data = f.read() + + # Save pixel data + with open(rgb_filename, "wb") as f: + f.write(data) + + # Delete the ppm file + os.remove(ppm_filename) + + page_base = "/tmp/page" + + # Convert to PPM, which is essentially an RGB format + pdftoppm_timeout = 1.0 * self.num_pages # type: ignore [operator] + run_command( + [ + "pdftoppm", + pdf_filename, + page_base, + "-progress", + ], + error_message="Conversion from PDF to PPM failed", + timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {pdftoppm_timeout} seconds", + stderr_callback=pdftoppm_progress_callback, + timeout=pdftoppm_timeout, + ) self.update_progress("Converted document to pixels")