Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce Container Dependencies #305

Merged
merged 4 commits into from
Jan 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,9 @@ RUN apk -U upgrade && \
ghostscript \
graphicsmagick \
libreoffice \
openjdk8 \
poppler-utils \
python3 \
py3-magic \
py3-pillow \
sudo \
tesseract-ocr \
tesseract-ocr-data-afr \
tesseract-ocr-data-ara \
Expand Down Expand Up @@ -78,15 +75,6 @@ RUN apk -U upgrade && \
tesseract-ocr-data-ukr \
tesseract-ocr-data-vie

# Install pdftk
RUN \
wget https://gitlab.com/pdftk-java/pdftk/-/jobs/924565145/artifacts/raw/build/libs/pdftk-all.jar && \
mv pdftk-all.jar /usr/local/bin && \
chmod +x /usr/local/bin/pdftk-all.jar && \
echo '#!/bin/sh' > /usr/local/bin/pdftk && \
echo '/usr/bin/java -jar "/usr/local/bin/pdftk-all.jar" "$@"' >> /usr/local/bin/pdftk && \
chmod +x /usr/local/bin/pdftk

COPY dangerzone.py /usr/local/bin/
RUN chmod +x /usr/local/bin/dangerzone.py

Expand Down
216 changes: 147 additions & 69 deletions container/dangerzone.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@
import glob
import json
import os
import re
import shutil
import subprocess
import sys
from typing import Dict, List, Optional
import time
from typing import Callable, Dict, List, Optional, Union

import magic
from PIL import Image

# timeout in seconds for any single subprocess
DEFAULT_TIMEOUT: float = 120
Expand All @@ -36,25 +37,63 @@ def run_command(
error_message: str,
timeout_message: str,
timeout: float = DEFAULT_TIMEOUT,
) -> subprocess.CompletedProcess:
stdout_callback: Callable = None,
stderr_callback: Callable = None,
) -> None:
"""
Runs a command, optionally passing each line of its stdout/stderr to a callback.

:raises RuntimeError: if the process returns a non-zero exit status
:raises TimeoutError: if the process times out
"""
try:
return subprocess.run(
if stdout_callback is None and stderr_callback is None:
try:
subprocess.run(args, timeout=timeout, check=True)
except subprocess.CalledProcessError as e:
raise RuntimeError(error_message) from e
except subprocess.TimeoutExpired as e:
raise TimeoutError(timeout_message) from e

else:
p = subprocess.Popen(
args,
stdout=subprocess.DEVNULL,
stderr=subprocess.DEVNULL,
timeout=timeout,
check=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
apyrgio marked this conversation as resolved.
Show resolved Hide resolved
)
except subprocess.CalledProcessError as e:
raise RuntimeError(error_message) from e
except subprocess.TimeoutExpired as e:
raise TimeoutError(timeout_message) from e

# Progress callback requires a manually implemented timeout
start_time = time.time()

# Make reading from stdout or stderr non-blocking
if p.stdout:
os.set_blocking(p.stdout.fileno(), False)
if p.stderr:
os.set_blocking(p.stderr.fileno(), False)

while True:
# Process has finished
if p.poll() is not None:
if p.returncode != 0:
raise RuntimeError(error_message)
break

# Check whether the timeout has expired
if time.time() - start_time > timeout:
p.kill()
raise TimeoutError(timeout_message)

if p.stdout and stdout_callback is not None:
line = p.stdout.readline()
if len(line) > 0:
line = line.rstrip() # strip trailing "\n"
stdout_callback(line)

if p.stderr and stderr_callback is not None:
line = p.stderr.readline()
if len(line) > 0:
line = line.rstrip() # strip trailing "\n"
stderr_callback(line)


class DangerzoneConverter:
Expand Down Expand Up @@ -181,65 +220,104 @@ def document_to_pixels(self) -> None:
)
self.percentage += 3

# Separate PDF into pages
self.update_progress("Separating document into pages")
args = ["pdftk", pdf_filename, "burst", "output", "/tmp/page-%d.pdf"]
run_command(
args,
error_message="Separating document into pages failed",
timeout_message=f"Error separating document into pages, pdftk timed out after {DEFAULT_TIMEOUT} seconds",
)

page_filenames = glob.glob("/tmp/page-*.pdf")

self.percentage += 2

# Convert to RGB pixel data
percentage_per_page = 45.0 / len(page_filenames)
for page in range(1, len(page_filenames) + 1):
pdf_filename = f"/tmp/page-{page}.pdf"
png_filename = f"/tmp/page-{page}.png"
rgb_filename = f"/tmp/page-{page}.rgb"
width_filename = f"/tmp/page-{page}.width"
height_filename = f"/tmp/page-{page}.height"
filename_base = f"/tmp/page-{page}"

self.update_progress(
f"Converting page {page}/{len(page_filenames)} to pixels"
)

# Convert to png
run_command(
["pdftocairo", pdf_filename, "-png", "-singlefile", filename_base],
error_message="Conversion from PDF to PNG failed",
timeout_message=f"Error converting from PDF to PNG, pdftocairo timed out after {DEFAULT_TIMEOUT} seconds",
)
# Obtain number of pages
self.update_progress("Calculating number of pages")
self.num_pages: Union[None, int] = None

# Save the width and height
with Image.open(png_filename, "r") as im:
width, height = im.size
with open(width_filename, "w") as f:
f.write(str(width))
with open(height_filename, "w") as f:
f.write(str(height))
def get_num_pages(line: str) -> None:
    """Record the page count if *line* is the "Pages:" line of pdfinfo output.

    Used as the stdout callback for the ``pdfinfo`` invocation, so it is
    handed one output line at a time. Lines that do not start with
    "Pages:" are ignored; on a match the count is stored in
    ``self.num_pages`` of the enclosing converter.
    """
    # pdfinfo column-aligns its values, so the label can be followed by a
    # run of spaces ("Pages:          4") rather than exactly one.
    search = re.search(r"^Pages:\s*(\d+)", line)
    if search is not None:
        self.num_pages = int(search.group(1))

# Convert to RGB pixels
run_command(
[
"gm",
"convert",
png_filename,
"-depth",
"8",
f"rgb:{rgb_filename}",
],
error_message="Conversion from PNG to RGB failed",
timeout_message=f"Error converting from PNG to pixels, convert timed out after {DEFAULT_TIMEOUT} seconds",
)

# Delete the png
os.remove(png_filename)
# Ask pdfinfo for the document metadata; get_num_pages is called for each
# stdout line and stores the page count in self.num_pages when it sees it.
run_command(
    ["pdfinfo", pdf_filename],
    error_message="PDF file is corrupted",
    timeout_message="Extracting metadata from PDF timed out after 1 second",
    timeout=1,
    stdout_callback=get_num_pages,
)
# pdfinfo exited successfully but never printed a parsable "Pages:" line
if self.num_pages is None:
    raise ValueError("Number of pages could not be extracted from PDF")

def pdftoppm_progress_callback(line: str) -> None:
    """Handle one progress line printed by ``pdftoppm -progress``.

    Sample pdftoppm output:

        $ pdftoppm sample.pdf /tmp/safe -progress
        1 4 /tmp/safe-1.ppm
        2 4 /tmp/safe-2.ppm
        3 4 /tmp/safe-3.ppm
        4 4 /tmp/safe-4.ppm

    Each successful line is in the format "{page} {num_pages} {ppm_filename}".
    For every such line the progress is advanced, the page's PPM file is
    split into ``.width``, ``.height`` and raw ``.rgb`` files, and the PPM
    file is deleted.

    :raises RuntimeError: if the line is not in the expected format
    :raises ValueError: if the PPM header is not the one pdftoppm produces
    """
    try:
        (page_str, num_pages_str, _) = line.split()
        num_pages = int(num_pages_str)
        page = int(page_str)
    except ValueError as e:
        raise RuntimeError("Conversion from PDF to PPM failed") from e

    percentage_per_page = 45.0 / num_pages
    self.percentage += percentage_per_page
    self.update_progress(f"Converting page {page}/{num_pages} to pixels")

    # The PPM filename carries the page number left-padded with zeros to the
    # width of the total page count; the files we write use the bare number.
    zero_padding = "0" * (len(num_pages_str) - len(page_str))
    ppm_filename = f"{page_base}-{zero_padding}{page}.ppm"
    rgb_filename = f"{page_base}-{page}.rgb"
    width_filename = f"{page_base}-{page}.width"
    height_filename = f"{page_base}-{page}.height"

    with open(ppm_filename, "rb") as ppm_file:
        # NOTE: PPM files have multiple ways of writing headers.
        # For our specific case we parse it expecting the header format that pdftoppm produces
        # More info on PPM headers: https://people.uncw.edu/tompkinsj/112/texnh/assignments/imageFormat.html

        # Read the header
        magic_number = ppm_file.readline().decode().strip()
        if magic_number != "P6":
            raise ValueError("Invalid PPM header")

        # Dimensions must be exactly two integers; int() raises ValueError otherwise
        dims = ppm_file.readline().decode().strip()
        width, height = (int(value) for value in dims.split())

        # Check that the depth is 8
        maxval = int(ppm_file.readline().decode().strip())
        if maxval != 255:
            raise ValueError("Invalid PPM depth")

        # Everything after the three header lines is the raw RGB pixel data
        data = ppm_file.read()

    # Only write output files once the whole header has been validated, so a
    # rejected page does not leave partial .width/.height files behind.
    with open(width_filename, "w") as width_file:
        width_file.write(str(width))
    with open(height_filename, "w") as height_file:
        height_file.write(str(height))

    # Save pixel data
    with open(rgb_filename, "wb") as rgb_file:
        rgb_file.write(data)

    # Delete the ppm file
    os.remove(ppm_filename)

page_base = "/tmp/page"

# Convert to PPM, which is essentially an RGB format
pdftoppm_timeout = 1.0 * self.num_pages # type: ignore [operator]
run_command(
[
"pdftoppm",
pdf_filename,
page_base,
"-progress",
],
error_message="Conversion from PDF to PPM failed",
timeout_message=f"Error converting from PDF to PPM, pdftoppm timed out after {pdftoppm_timeout} seconds",
stderr_callback=pdftoppm_progress_callback,
timeout=pdftoppm_timeout,
)

self.update_progress("Converted document to pixels")

Expand Down