Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix conversion issues regarding OpenJDK and pdfinfo #328

Merged
merged 8 commits into from
Feb 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 36 additions & 3 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,10 +151,14 @@ jobs:
- run:
name: Install poetry dependencies
command: |
# Remove this pin once the upstream Poetry issue is fixed:
sudo pip3 install poetry
# This flag is important, due to an open upstream Poetry issue:
# https://github.com/python-poetry/poetry/issues/7184
sudo pip3 install poetry==1.2.2
poetry install
poetry install --no-ansi
- run:
name: Install test dependencies
command: |
sudo apt-get install -y libqt5gui5 --no-install-recommends
- run:
name: Prepare cache directory
command: |
Expand Down Expand Up @@ -219,6 +223,32 @@ jobs:
./dev_scripts/env.py --distro ubuntu --version 22.04 run --dev \
bash -c 'cd dangerzone; poetry run make test'

ci-fedora-37:
machine:
image: ubuntu-2004:202111-01
steps:
- checkout
- run: *install-podman

- run:
name: Prepare cache directory
command: |
sudo mkdir -p /caches
sudo chown -R $USER:$USER /caches
- restore_cache: *restore-cache
- run: *copy-image

- run:
name: Prepare Dangerzone environment
command: |
./dev_scripts/env.py --distro fedora --version 37 build-dev

- run:
name: Run CI tests
command: |
./dev_scripts/env.py --distro fedora --version 37 run --dev \
bash -c 'cd dangerzone; poetry run make test'

ci-fedora-36:
machine:
image: ubuntu-2004:202111-01
Expand Down Expand Up @@ -462,6 +492,9 @@ workflows:
- ci-debian-bookworm:
requires:
- build-container-image
- ci-fedora-37:
requires:
- build-container-image
- ci-fedora-36:
requires:
- build-container-image
Expand Down
1 change: 1 addition & 0 deletions container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ RUN apk -U upgrade && \
ghostscript \
graphicsmagick \
libreoffice \
openjdk8 \
poppler-utils \
python3 \
py3-magic \
Expand Down
166 changes: 85 additions & 81 deletions container/dangerzone.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
- 95%-100%: Compress the final PDF
"""

import asyncio
import glob
import json
import os
Expand All @@ -20,7 +21,7 @@
import subprocess
import sys
import time
from typing import Callable, Dict, List, Optional, Union
from typing import Callable, Dict, List, Optional, Tuple, Union

import magic

Expand All @@ -31,76 +32,83 @@
COMPRESSION_TIMEOUT: float = 10


def run_command(
async def read_stream(
    sr: asyncio.StreamReader, callback: Optional[Callable] = None
) -> bytes:
    """Consume a byte stream line-by-line and return everything that was read.

    Read all lines in a stream until EOF. If a user has passed a callback, call it for
    each line, in the order the lines were received.

    Note that the lines are in bytes, since we can't assume that all command output will
    be UTF-8 encoded. Higher level commands are advised to decode the output to Unicode,
    if they know its encoding.

    :param sr: the stream to drain
    :param callback: optional callable invoked with each line (as bytes, including its
        trailing newline if the stream provided one)
    :returns: the concatenation of every line read from the stream
    """
    lines = []
    while True:
        line = await sr.readline()
        # readline() returns an empty bytes object only once EOF has been reached and
        # the internal buffer is drained. Checking at_eof() here instead would discard
        # the last line whenever the writer has already closed its end of the pipe
        # (e.g. a short-lived command), or when the output lacks a trailing newline.
        if not line:
            break
        if callback is not None:
            callback(line)
        # TODO: This would be a good place to log the received line, mostly for debug
        # logging.
        lines.append(line)
    return b"".join(lines)


async def run_command(
args: List[str],
*,
error_message: str,
timeout_message: str,
timeout: float = DEFAULT_TIMEOUT,
stdout_callback: Callable = None,
stderr_callback: Callable = None,
) -> None:
"""
Runs a command and returns the result.
) -> Tuple[bytes, bytes]:
"""Run a command and get its output.

Run a command using asyncio.subprocess, consume its standard streams, and return its
output in bytes.

:raises RuntimeError: if the process returns a non-zero exit status
:raises TimeoutError: if the process times out
"""
if stdout_callback is None and stderr_callback is None:
try:
subprocess.run(args, timeout=timeout, check=True)
except subprocess.CalledProcessError as e:
raise RuntimeError(error_message) from e
except subprocess.TimeoutExpired as e:
raise TimeoutError(timeout_message) from e

else:
p = subprocess.Popen(
args,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
universal_newlines=True,
)

# Progress callback requires a manually implemented timeout
start_time = time.time()

# Make reading from stdout or stderr non-blocking
if p.stdout:
os.set_blocking(p.stdout.fileno(), False)
if p.stderr:
os.set_blocking(p.stderr.fileno(), False)

while True:
# Process has finished
if p.poll() is not None:
if p.returncode != 0:
raise RuntimeError(error_message)
break

# Check whether the timeout has expired
if time.time() - start_time > timeout:
p.kill()
raise TimeoutError(timeout_message)

if p.stdout and stdout_callback is not None:
line = p.stdout.readline()
if len(line) > 0:
line = line.rstrip() # strip trailing "\n"
stdout_callback(line)
# Start the provided command, and return a handle. The command will run in the
# background.
proc = await asyncio.subprocess.create_subprocess_exec(
*args,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.PIPE,
)

assert proc.stdout is not None
assert proc.stderr is not None

# Create asynchronous tasks that will consume the standard streams of the command,
# and call callbacks if necessary.
stdout_task = asyncio.create_task(read_stream(proc.stdout, stdout_callback))
stderr_task = asyncio.create_task(read_stream(proc.stderr, stderr_callback))

# Wait until the command has finished, for a specific timeout. Then, verify that the
# command has completed successfully. In any other case, raise an exception.
try:
ret = await asyncio.wait_for(proc.wait(), timeout=timeout)
except asyncio.exceptions.TimeoutError:
raise TimeoutError(timeout_message)
if ret != 0:
raise RuntimeError(error_message)

if p.stderr and stderr_callback is not None:
line = p.stderr.readline()
if len(line) > 0:
line = line.rstrip() # strip trailing "\n"
stderr_callback(line)
# Wait until the tasks that consume the command's standard streams have exited as
# well, and return their output.
stdout = await stdout_task
stderr = await stderr_task
return (stdout, stderr)


class DangerzoneConverter:
def __init__(self) -> None:
self.percentage: float = 0.0

def document_to_pixels(self) -> None:
async def document_to_pixels(self) -> None:

conversions: Dict[str, Dict[str, Optional[str]]] = {
# .pdf
Expand Down Expand Up @@ -194,7 +202,7 @@ def document_to_pixels(self) -> None:
"/tmp",
"/tmp/input_file",
]
run_command(
await run_command(
args,
error_message="Conversion to PDF with LibreOffice failed",
timeout_message=f"Error converting document to PDF, LibreOffice timed out after {DEFAULT_TIMEOUT} seconds",
Expand All @@ -208,7 +216,7 @@ def document_to_pixels(self) -> None:
"/tmp/input_file",
"/tmp/input_file.pdf",
]
run_command(
await run_command(
args,
error_message="Conversion to PDF with GraphicsMagick failed",
timeout_message=f"Error converting document to PDF, GraphicsMagick timed out after {DEFAULT_TIMEOUT} seconds",
Expand All @@ -222,25 +230,21 @@ def document_to_pixels(self) -> None:

# Obtain number of pages
self.update_progress("Calculating number of pages")
self.num_pages: Union[None, int] = None

def get_num_pages(line: str) -> None:
search = re.search(r"^Pages: (\d+)", line)
if search is not None:
self.num_pages = int(search.group(1))

run_command(
stdout, _ = await run_command(
["pdfinfo", pdf_filename],
error_message="PDF file is corrupted",
timeout_message=f"Extracting metadata from PDF timed out after 1 second",
timeout=1,
stdout_callback=get_num_pages,
)
if self.num_pages == None:
raise ValueError("Number of pages could not be extraced from PDF")

def pdftoppm_progress_callback(line: str) -> None:
"""Function called for every line the 'pdftoppm'command outputs
search = re.search(r"Pages:\s*(\d+)\s*\n", stdout.decode())
if search is not None:
self.num_pages: int = int(search.group(1))
else:
raise ValueError("Number of pages could not be extracted from PDF")

def pdftoppm_progress_callback(line: bytes) -> None:
"""Function called for every line the 'pdftoppm' command outputs

Sample pdftoppm output:

Expand All @@ -253,7 +257,7 @@ def pdftoppm_progress_callback(line: str) -> None:
Each successful line is in the format "{page} {page_num} {ppm_filename}"
"""
try:
(page_str, num_pages_str, _) = line.split()
(page_str, num_pages_str, _) = line.decode().split()
num_pages = int(num_pages_str)
page = int(page_str)
except ValueError as e:
Expand Down Expand Up @@ -305,8 +309,8 @@ def pdftoppm_progress_callback(line: str) -> None:
page_base = "/tmp/page"

# Convert to PPM, which is essentially an RGB format
pdftoppm_timeout = 1.0 * self.num_pages # type: ignore [operator]
run_command(
pdftoppm_timeout = 1.0 * self.num_pages
await run_command(
[
"pdftoppm",
pdf_filename,
Expand All @@ -329,7 +333,7 @@ def pdftoppm_progress_callback(line: str) -> None:
):
shutil.move(filename, "/dangerzone")

def pixels_to_pdf(self) -> None:
async def pixels_to_pdf(self) -> None:
self.percentage = 50.0

num_pages = len(glob.glob("/dangerzone/page-*.rgb"))
Expand All @@ -354,7 +358,7 @@ def pixels_to_pdf(self) -> None:
self.update_progress(
f"Converting page {page}/{num_pages} from pixels to searchable PDF"
)
run_command(
await run_command(
[
"gm",
"convert",
Expand All @@ -368,7 +372,7 @@ def pixels_to_pdf(self) -> None:
error_message=f"Page {page}/{num_pages} conversion to PNG failed",
timeout_message=f"Error converting pixels to PNG, convert timed out after {DEFAULT_TIMEOUT} seconds",
)
run_command(
await run_command(
[
"tesseract",
png_filename,
Expand All @@ -387,7 +391,7 @@ def pixels_to_pdf(self) -> None:
self.update_progress(
f"Converting page {page}/{num_pages} from pixels to PDF"
)
run_command(
await run_command(
[
"gm",
"convert",
Expand All @@ -410,7 +414,7 @@ def pixels_to_pdf(self) -> None:
for page in range(1, num_pages + 1):
args.append(f"/tmp/page-{page}.pdf")
args.append(f"/tmp/safe-output.pdf")
run_command(
await run_command(
args,
error_message="Merging pages into a single PDF failed",
timeout_message=f"Error merging pages into a single PDF, pdfunite timed out after {DEFAULT_TIMEOUT} seconds",
Expand All @@ -421,7 +425,7 @@ def pixels_to_pdf(self) -> None:
# Compress
self.update_progress("Compressing PDF")
compress_timeout = num_pages * COMPRESSION_TIMEOUT
run_command(
await run_command(
["ps2pdf", "/tmp/safe-output.pdf", "/tmp/safe-output-compressed.pdf"],
timeout_message=f"Error compressing PDF, ps2pdf timed out after {compress_timeout} seconds",
error_message="Compressing PDF failed",
Expand All @@ -444,7 +448,7 @@ def update_progress(self, text: str, *, error: bool = False) -> None:
sys.stdout.flush()


def main() -> int:
async def main() -> int:
if len(sys.argv) != 2:
print(f"Usage: {sys.argv[0]} [document-to-pixels]|[pixels-to-pdf]")
return -1
Expand All @@ -453,9 +457,9 @@ def main() -> int:

try:
if sys.argv[1] == "document-to-pixels":
converter.document_to_pixels()
await converter.document_to_pixels()
elif sys.argv[1] == "pixels-to-pdf":
converter.pixels_to_pdf()
await converter.pixels_to_pdf()
except (RuntimeError, TimeoutError, ValueError) as e:
converter.update_progress(str(e), error=True)
return 1
Expand All @@ -464,4 +468,4 @@ def main() -> int:


if __name__ == "__main__":
sys.exit(main())
sys.exit(asyncio.run(main()))
4 changes: 2 additions & 2 deletions dev_scripts/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@

# FIXME: Drop this fix after it's resolved upstream.
# See https://github.com/freedomofpress/dangerzone/issues/286#issuecomment-1347149783
RUN dnf reinstall -y shadow-utils && dnf clean all
RUN rpm --restore shadow-utils

RUN dnf install -y mupdf && dnf clean all
"""
Expand Down Expand Up @@ -120,7 +120,7 @@

# FIXME: Drop this fix after it's resolved upstream.
# See https://github.com/freedomofpress/dangerzone/issues/286#issuecomment-1347149783
RUN dnf reinstall -y shadow-utils && dnf clean all
RUN rpm --restore shadow-utils
"""

# The Dockerfile for building an environment with Dangerzone installed in it. Parts of
Expand Down
Loading