Ensure files are pdfs

Uploading invalid PDFs to our CDN leads to errors, so we need to make sure they are valid. Rather than reinventing the wheel ourselves, we can make use of pypdf's reader.
climatepolicyradar · Apr 29, 2024 · 6888be4 · 6888be4
1 parent 16a1a9e
commit 6888be4
Show file tree

Hide file tree

Showing 3 changed files with 62 additions and 5 deletions.
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,6 +24,7 @@ requests = "^2.28.1"
 tenacity = "^8.1.0"
 json-logging = "^1.3.0"
 cpr-data-access = {git = "https://github.com/climatepolicyradar/data-access.git", tag = "0.4.6"}
+pypdf = "^4.2.0"
 
 [tool.poetry.group.dev-dependencies.dependencies]
 black = "^22.1.0"

diff --git a/src/navigator_data_ingest/base/api_client.py b/src/navigator_data_ingest/base/api_client.py
@@ -1,18 +1,22 @@
 """A simple API client for creating documents & associations."""
 import hashlib
+import io
 import json
 import logging
 from typing import cast
 
 import requests
 from cloudpathlib import CloudPath, S3Path
 from cpr_data_access.parser_models import ParserInput
+from pypdf import PdfReader
+from pypdf.errors import PyPdfError
 from tenacity import retry
 from tenacity.stop import stop_after_attempt
 from tenacity.wait import wait_random_exponential
 
 from navigator_data_ingest.base.utils import determine_content_type
 from navigator_data_ingest.base.types import (
+    CONTENT_TYPE_PDF,
     MULTI_FILE_CONTENT_TYPES,
     SUPPORTED_CONTENT_TYPES,
     FILE_EXTENSION_MAPPING,
@@ -55,6 +59,7 @@ def upload_document(
     try:
         download_response = _download_from_source(session, source_url)
         content_type = determine_content_type(download_response, source_url)
+        file_content = download_response.content
 
         # Update the result object with the detected content type
         upload_result.content_type = content_type
@@ -65,9 +70,13 @@ def upload_document(
 
         if content_type not in SUPPORTED_CONTENT_TYPES:
             raise UnsupportedContentTypeError(content_type)
-
+
+        # Ensure valid file types can be read accordingly
+        if content_type == CONTENT_TYPE_PDF:
+            # Invalid pdf should raise a PyPdfError error
+            PdfReader(io.BytesIO(file_content))
+
         # Calculate the m5sum & update the result object with the calculated value
-        file_content = download_response.content
         file_hash = hashlib.md5(file_content).hexdigest()
         upload_result.md5_sum = file_hash
 
@@ -107,6 +116,11 @@ def upload_document(
             f"Uploads for document {import_id} at '{source_url}' could not be completed because "
             f"the content type '{e.content_type}' is not currently supported."
         )
+    except PyPdfError as e:
+        _LOGGER.warn(
+            f"Uploads for document {import_id} at '{source_url}' could not be completed because "
+            f"the pdf document is invalid: {e.with_traceback(e.__traceback__)}"
+        )
     except Exception:
         _LOGGER.exception(f"Downloading source document {import_id} failed")
     finally: