Render entire PDFs instead of single pages #840

Merged: 32 commits, Oct 27, 2023
Changes from 22 commits

Commits (32)
b7d9678
Adding anchors
pamelafox Sep 15, 2023
99ea466
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 15, 2023
a3c0023
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 16, 2023
cd2706c
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 20, 2023
6910e05
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 20, 2023
f63398a
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 22, 2023
eaef837
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 25, 2023
3c4fe71
Show whole file
pamelafox Sep 28, 2023
60e07cc
Show whole file
pamelafox Sep 28, 2023
cf46813
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 30, 2023
5e745a9
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Oct 2, 2023
1bec0b3
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Oct 2, 2023
f8f300f
Page number support
pamelafox Oct 2, 2023
820e4b6
Merge branch 'main' into wholefile
pamelafox Oct 2, 2023
4395e39
More experiments with whole file
pamelafox Oct 13, 2023
e0b036b
Merge branch 'main' into wholefile
pamelafox Oct 19, 2023
9d138a3
Revert unintentional changes
pamelafox Oct 19, 2023
47f9a17
Add tests
pamelafox Oct 20, 2023
786e5eb
Remove random num
pamelafox Oct 23, 2023
4d62561
Add retry_total=0 to avoid unnecessary network requests
pamelafox Oct 24, 2023
0de7037
Add comment to explain retry_total
pamelafox Oct 24, 2023
ee51b9e
Merge branch 'main' into wholefile
pamelafox Oct 24, 2023
ae1d95f
Bring back deleted file
pamelafox Oct 25, 2023
78448e9
Merge branch 'wholefile' of https://github.com/pamelafox/azure-search…
pamelafox Oct 25, 2023
f1cd17a
Merge branch 'main' into wholefile
pamelafox Oct 26, 2023
d4d9db0
Blob manager refactor after merge
pamelafox Oct 26, 2023
23e6aba
Update coverage amount
pamelafox Oct 26, 2023
3acccb8
Make mypy happy with explicit check of path
pamelafox Oct 26, 2023
17fd596
Add debug for 3.9
pamelafox Oct 27, 2023
5f11ce6
Skip in 3.9 since its silly
pamelafox Oct 27, 2023
b8202fd
Reduce fail under percentage due to 3.9
pamelafox Oct 27, 2023
de97cff
Merge branch 'main' into wholefile
pamelafox Oct 27, 2023
Files changed
7 changes: 6 additions & 1 deletion .vscode/settings.json
@@ -18,5 +18,10 @@
"search.exclude": {
"**/node_modules": true,
"static": true
}
},
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
14 changes: 12 additions & 2 deletions app/backend/app.py
@@ -9,6 +9,7 @@

import aiohttp
import openai
from azure.core.exceptions import ResourceNotFoundError
from azure.identity.aio import DefaultAzureCredential
from azure.monitor.opentelemetry import configure_azure_monitor
from azure.search.documents.aio import SearchClient
@@ -69,9 +70,18 @@ async def assets(path):
# *** NOTE *** this assumes that the content files are public, or at least that all users of the app
# can access all the files. This is also slow and memory hungry.
@bp.route("/content/<path>")
async def content_file(path):
async def content_file(path: str):
# Remove page number from path, filename.pdf#page=1 -> filename.pdf
if path.find("#page=") > 0:

Collaborator commented: so this will still work with folks who didn't re-run prepdocs after this change?

Collaborator (author) replied: Yep! I just did another manual test to make sure; it's working with an old env with individual pages.

path_parts = path.rsplit("#page=", 1)
path = path_parts[0]
logging.info("Opening file %s", path)
blob_container_client = current_app.config[CONFIG_BLOB_CONTAINER_CLIENT]
blob = await blob_container_client.get_blob_client(path).download_blob()
try:
blob = await blob_container_client.get_blob_client(path).download_blob()
except ResourceNotFoundError:
logging.exception("Path not found: %s", path)
abort(404)
if not blob.properties or not blob.properties.has_key("content_settings"):
abort(404)
mime_type = blob.properties["content_settings"]["content_type"]
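To illustrate the backward compatibility discussed in the review thread above, here is a minimal sketch (not part of the PR; the helper name and sample citation paths are illustrative) of why the route serves both old per-page blobs and new whole-file citations:

def blob_path_from_citation(path: str) -> str:
    # New-style citation from this PR: "role_library.pdf#page=4" -> "role_library.pdf"
    if path.find("#page=") > 0:
        path = path.rsplit("#page=", 1)[0]
    # Old-style citation: "role_library-3.pdf" contains no "#page=" marker,
    # so it passes through unchanged and the existing per-page blob is served.
    return path

assert blob_path_from_citation("role_library.pdf#page=4") == "role_library.pdf"
assert blob_path_from_citation("role_library-3.pdf") == "role_library-3.pdf"
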
Binary file removed data/employee_handbook.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -9,7 +9,7 @@ line-length = 120

[tool.pytest.ini_options]
addopts = "-ra"
pythonpath = ["app/backend"]
pythonpath = ["app/backend", "scripts"]

[tool.coverage.paths]
source = ["scripts", "app"]
30 changes: 8 additions & 22 deletions scripts/prepdocs.py
@@ -3,7 +3,6 @@
import glob
import hashlib
import html
import io
import os
import re
import tempfile
@@ -35,7 +34,7 @@
from azure.storage.filedatalake import (
DataLakeServiceClient,
)
from pypdf import PdfReader, PdfWriter
from pypdf import PdfReader
from tenacity import (
retry,
retry_if_exception_type,
@@ -78,7 +77,7 @@ def calculate_tokens_emb_aoai(input: str):

def blob_name_from_file_page(filename, page=0):
if os.path.splitext(filename)[1].lower() == ".pdf":
return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf"
return f"{os.path.basename(filename)}#page={page+1}"
else:
return os.path.basename(filename)
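
Since pyproject.toml above now adds "scripts" to pythonpath, test modules can import prepdocs directly. A hypothetical check of the new page-anchor naming (not one of the tests added in this PR; the sample file paths are made up) might look like:

from prepdocs import blob_name_from_file_page

def test_blob_name_uses_page_anchor():
    # PDFs now map to a single blob name plus a 1-based page fragment
    assert blob_name_from_file_page("data/role_library.pdf", page=0) == "role_library.pdf#page=1"
    # Non-PDF files keep using the bare file name
    assert blob_name_from_file_page("data/benefits.md") == "benefits.md"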

@@ -91,24 +90,11 @@ def upload_blobs(filename):
if not blob_container.exists():
blob_container.create_container()

# if file is PDF split into pages and upload each page as a separate blob
if os.path.splitext(filename)[1].lower() == ".pdf":
reader = PdfReader(filename)
pages = reader.pages
for i in range(len(pages)):
blob_name = blob_name_from_file_page(filename, i)
if args.verbose:
print(f"\tUploading blob for page {i} -> {blob_name}")
f = io.BytesIO()
writer = PdfWriter()
writer.add_page(pages[i])
writer.write(f)
f.seek(0)
blob_container.upload_blob(blob_name, f, overwrite=True)
else:
blob_name = blob_name_from_file_page(filename)
with open(filename, "rb") as data:
blob_container.upload_blob(blob_name, data, overwrite=True)
# Upload the original file
blob_name = os.path.basename(filename)
print(f"\tUploading blob for whole file -> {blob_name}")
with open(filename, "rb") as data:
blob_container.upload_blob(blob_name, data, overwrite=True)


def remove_blobs(filename):
@@ -124,7 +110,7 @@ def remove_blobs(filename):
else:
prefix = os.path.splitext(os.path.basename(filename))[0]
blobs = filter(
lambda b: re.match(f"{prefix}-\d+\.pdf", b),
lambda b: re.match(f"{prefix}-\d+\.pdf", b) or b == os.path.basename(filename),
blob_container.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0]),
)
for b in blobs:
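A small sketch (illustrative only; the candidate blob names are assumptions) of which blobs the updated remove_blobs filter now matches, so cleanup covers both old per-page uploads and the new whole-file upload:

import os
import re

filename = "data/role_library.pdf"
prefix = os.path.splitext(os.path.basename(filename))[0]  # "role_library"

candidates = ["role_library-0.pdf", "role_library-12.pdf", "role_library.pdf", "role_library_old.pdf"]
matched = [
    b for b in candidates
    if re.match(rf"{prefix}-\d+\.pdf", b) or b == os.path.basename(filename)
]
# Old per-page blobs and the new whole-file blob match; unrelated names do not.
assert matched == ["role_library-0.pdf", "role_library-12.pdf", "role_library.pdf"]
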
99 changes: 99 additions & 0 deletions tests/test_content_file.py
@@ -0,0 +1,99 @@
import os
from collections import namedtuple

import aiohttp
import pytest
from azure.core.exceptions import ResourceNotFoundError
from azure.core.pipeline.transport import (
AioHttpTransportResponse,
AsyncHttpTransport,
HttpRequest,
)
from azure.storage.blob.aio import BlobServiceClient

import app

MockToken = namedtuple("MockToken", ["token", "expires_on"])


class MockAzureCredential:
async def get_token(self, uri):
return MockToken("mock_token", 9999999999)


@pytest.mark.asyncio
async def test_content_file(monkeypatch, mock_env):
class MockAiohttpClientResponse404(aiohttp.ClientResponse):
def __init__(self, url, body_bytes, headers=None):
self._body = body_bytes
self._headers = headers
self._cache = {}
self.status = 404
self.reason = "Not Found"
self._url = url

class MockAiohttpClientResponse(aiohttp.ClientResponse):
def __init__(self, url, body_bytes, headers=None):
self._body = body_bytes
self._headers = headers
self._cache = {}
self.status = 200
self.reason = "OK"
self._url = url

class MockTransport(AsyncHttpTransport):
async def send(self, request: HttpRequest, **kwargs) -> AioHttpTransportResponse:
if request.url.endswith("notfound.pdf"):
raise ResourceNotFoundError(MockAiohttpClientResponse404(request.url, b""))
else:
return AioHttpTransportResponse(
request,
MockAiohttpClientResponse(
request.url,
b"test content",
{
"Content-Type": "application/octet-stream",
"Content-Range": "bytes 0-27/28",
"Content-Length": "28",
},
),
)

async def __aenter__(self):
return self

async def __aexit__(self, *args):
pass

async def open(self):
pass

async def close(self):
pass

# Plug the mocked transport into the SDK client via kwargs:
blob_client = BlobServiceClient(
f"https://{os.environ['AZURE_STORAGE_ACCOUNT']}.blob.core.windows.net",
credential=MockAzureCredential(),
transport=MockTransport(),
retry_total=0, # Necessary to avoid unnecessary network requests during tests
)
blob_container_client = blob_client.get_container_client(os.environ["AZURE_STORAGE_CONTAINER"])

quart_app = app.create_app()
async with quart_app.test_app() as test_app:
quart_app.config.update({"blob_container_client": blob_container_client})

client = test_app.test_client()
response = await client.get("/content/notfound.pdf")
assert response.status_code == 404

response = await client.get("/content/role_library.pdf")
assert response.status_code == 200
assert response.headers["Content-Type"] == "application/pdf"
assert await response.get_data() == b"test content"

response = await client.get("/content/role_library.pdf#page=10")
assert response.status_code == 200
assert response.headers["Content-Type"] == "application/pdf"
assert await response.get_data() == b"test content"