Skip to content

Commit

Permalink
Ensure files are pdfs
Browse files Browse the repository at this point in the history
Uploading invalid pdfs to our cdn leads to errors, so we need to make
sure they are valid. Rather than reinventing the wheel ourselves we can
make use of pypdf's reader.
  • Loading branch information
olaughter committed Apr 29, 2024
1 parent 16a1a9e commit 6888be4
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 5 deletions.
48 changes: 45 additions & 3 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ requests = "^2.28.1"
tenacity = "^8.1.0"
json-logging = "^1.3.0"
cpr-data-access = {git = "https://github.com/climatepolicyradar/data-access.git", tag = "0.4.6"}
pypdf = "^4.2.0"

[tool.poetry.group.dev-dependencies.dependencies]
black = "^22.1.0"
Expand Down
18 changes: 16 additions & 2 deletions src/navigator_data_ingest/base/api_client.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
"""A simple API client for creating documents & associations."""
import hashlib
import io
import json
import logging
from typing import cast

import requests
from cloudpathlib import CloudPath, S3Path
from cpr_data_access.parser_models import ParserInput
from pypdf import PdfReader
from pypdf.errors import PyPdfError
from tenacity import retry
from tenacity.stop import stop_after_attempt
from tenacity.wait import wait_random_exponential

from navigator_data_ingest.base.utils import determine_content_type
from navigator_data_ingest.base.types import (
CONTENT_TYPE_PDF,
MULTI_FILE_CONTENT_TYPES,
SUPPORTED_CONTENT_TYPES,
FILE_EXTENSION_MAPPING,
Expand Down Expand Up @@ -55,6 +59,7 @@ def upload_document(
try:
download_response = _download_from_source(session, source_url)
content_type = determine_content_type(download_response, source_url)
file_content = download_response.content

# Update the result object with the detected content type
upload_result.content_type = content_type
Expand All @@ -65,9 +70,13 @@ def upload_document(

if content_type not in SUPPORTED_CONTENT_TYPES:
raise UnsupportedContentTypeError(content_type)


# Ensure valid file types can be read accordingly
if content_type == CONTENT_TYPE_PDF:
# Invalid pdf should raise a PyPdfError error
PdfReader(io.BytesIO(file_content))

# Calculate the md5sum & update the result object with the calculated value
file_content = download_response.content
file_hash = hashlib.md5(file_content).hexdigest()
upload_result.md5_sum = file_hash

Expand Down Expand Up @@ -107,6 +116,11 @@ def upload_document(
f"Uploads for document {import_id} at '{source_url}' could not be completed because "
f"the content type '{e.content_type}' is not currently supported."
)
except PyPdfError as e:
_LOGGER.warn(
f"Uploads for document {import_id} at '{source_url}' could not be completed because "
f"the pdf document is invalid: {e.with_traceback(e.__traceback__)}"
)
except Exception:
_LOGGER.exception(f"Downloading source document {import_id} failed")
finally:
Expand Down

0 comments on commit 6888be4

Please sign in to comment.