Azure-Samples · pamelafox · Oct 27, 2023 · Sep 15, 2023 · Sep 15, 2023 · Sep 16, 2023
diff --git a/.github/workflows/python-test.yaml b/.github/workflows/python-test.yaml
@@ -49,7 +49,7 @@ jobs:
           run: black . --check --verbose
         - name: Run Python tests
           if: runner.os != 'Windows'
-          run: python3 -m pytest -s -vv --cov --cov-fail-under=78
+          run: python3 -m pytest -s -vv --cov --cov-fail-under=87
         - name: Run E2E tests with Playwright
           id: e2e
           if: runner.os != 'Windows'

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -18,5 +18,10 @@
     "search.exclude": {
         "**/node_modules": true,
         "static": true
-    }
+    },
+    "python.testing.pytestArgs": [
+        "tests"
+    ],
+    "python.testing.unittestEnabled": false,
+    "python.testing.pytestEnabled": true
 }
diff --git a/app/backend/app.py b/app/backend/app.py
@@ -9,6 +9,7 @@
 
 import aiohttp
 import openai
+from azure.core.exceptions import ResourceNotFoundError
 from azure.identity.aio import DefaultAzureCredential
 from azure.monitor.opentelemetry import configure_azure_monitor
 from azure.search.documents.aio import SearchClient
@@ -69,9 +70,18 @@ async def assets(path):
 # *** NOTE *** this assumes that the content files are public, or at least that all users of the app
 # can access all the files. This is also slow and memory hungry.
 @bp.route("/content/<path>")
-async def content_file(path):
+async def content_file(path: str):
+    # Remove page number from path, filename-1.txt -> filename.txt
+    if path.find("#page=") > 0:
+        path_parts = path.rsplit("#page=", 1)
+        path = path_parts[0]
+    logging.info("Opening file %s at page %s", path)
     blob_container_client = current_app.config[CONFIG_BLOB_CONTAINER_CLIENT]
-    blob = await blob_container_client.get_blob_client(path).download_blob()
+    try:
+        blob = await blob_container_client.get_blob_client(path).download_blob()
+    except ResourceNotFoundError:
+        logging.exception("Path not found: %s", path)
+        abort(404)
     if not blob.properties or not blob.properties.has_key("content_settings"):
         abort(404)
     mime_type = blob.properties["content_settings"]["content_type"]

diff --git a/pyproject.toml b/pyproject.toml
@@ -2,14 +2,17 @@
 target-version = "py38"
 select = ["E", "F", "I", "UP"]
 ignore = ["E501", "E701"] # line too long, multiple statements on one line
-src = ["app/backend"]
+src = ["app/backend", "scripts"]
+
+[tool.ruff.isort]
+known-local-folder = ["scripts"]
 
 [tool.black]
 line-length = 120
 
 [tool.pytest.ini_options]
 addopts = "-ra"
-pythonpath = ["app/backend"]
+pythonpath = ["app/backend", "scripts"]
 
 [tool.coverage.paths]
 source = ["scripts", "app"]

diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py
@@ -5,6 +5,7 @@
 from azure.core.credentials import AzureKeyCredential
 from azure.core.credentials_async import AsyncTokenCredential
 from azure.identity.aio import AzureDeveloperCliCredential
+
 from prepdocslib.blobmanager import BlobManager
 from prepdocslib.embeddings import (
     AzureOpenAIEmbeddingService,

diff --git a/scripts/prepdocslib/blobmanager.py b/scripts/prepdocslib/blobmanager.py
@@ -1,11 +1,9 @@
-import io
 import os
 import re
 from typing import Optional, Union
 
 from azure.core.credentials_async import AsyncTokenCredential
 from azure.storage.blob.aio import BlobServiceClient
-from pypdf import PdfReader, PdfWriter
 
 from .listfilestrategy import File
 
@@ -34,24 +32,11 @@ async def upload_blob(self, file: File):
             if not await container_client.exists():
                 await container_client.create_container()
 
-            # if file is PDF split into pages and upload each page as a separate blob
-            if os.path.splitext(file.content.name)[1].lower() == ".pdf":
-                with open(file.content.name, "rb") as reopened_file:
-                    reader = PdfReader(reopened_file)
-                    pages = reader.pages
-                    for i in range(len(pages)):
-                        blob_name = BlobManager.blob_name_from_file_page(file.content.name, i)
-                        if self.verbose:
-                            print(f"\tUploading blob for page {i} -> {blob_name}")
-                        f = io.BytesIO()
-                        writer = PdfWriter()
-                        writer.add_page(pages[i])
-                        writer.write(f)
-                        f.seek(0)
-                        await container_client.upload_blob(blob_name, f, overwrite=True)
-            else:
-                blob_name = BlobManager.blob_name_from_file_page(file.content.name, page=0)
-                await container_client.upload_blob(blob_name, file.content, overwrite=True)
+            # Re-open and upload the original file
+            with open(file.content.name, "rb") as reopened_file:
+                blob_name = BlobManager.blob_name_from_file_name(file.content.name)
+                print(f"\tUploading blob for whole file -> {blob_name}")
+                await container_client.upload_blob(blob_name, reopened_file, overwrite=True)
 
     async def remove_blob(self, path: Optional[str] = None):
         async with BlobServiceClient(
@@ -65,16 +50,23 @@ async def remove_blob(self, path: Optional[str] = None):
             else:
                 prefix = os.path.splitext(os.path.basename(path))[0]
                 blobs = container_client.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0])
-            async for b in blobs:
-                if prefix is not None and not re.match(f"{prefix}-\d+\.pdf", b):
+            async for blob_path in blobs:
+                # This still supports PDFs split into individual pages, but we could remove in future to simplify code
+                if (prefix is not None and not re.match(rf"{prefix}-\d+\.pdf", blob_path)) or (
+                    path is not None and blob_path == os.path.basename(path)
+                ):
                     continue
                 if self.verbose:
-                    print(f"\tRemoving blob {b}")
-                await container_client.delete_blob(b)
+                    print(f"\tRemoving blob {blob_path}")
+                await container_client.delete_blob(blob_path)
 
     @classmethod
-    def blob_name_from_file_page(cls, filename, page=0) -> str:
+    def sourcepage_from_file_page(cls, filename, page=0) -> str:
         if os.path.splitext(filename)[1].lower() == ".pdf":
-            return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf"
+            return f"{os.path.basename(filename)}#page={page+1}"
         else:
             return os.path.basename(filename)
+
+    @classmethod
+    def blob_name_from_file_name(cls, filename) -> str:
+        return os.path.basename(filename)
diff --git a/scripts/prepdocslib/searchmanager.py b/scripts/prepdocslib/searchmanager.py
@@ -128,7 +128,7 @@ async def update_content(self, sections: List[Section]):
                         "id": f"{section.content.filename_to_id()}-page-{i}",
                         "content": section.split_page.text,
                         "category": section.category,
-                        "sourcepage": BlobManager.blob_name_from_file_page(
+                        "sourcepage": BlobManager.sourcepage_from_file_page(
                             filename=section.content.filename(), page=section.split_page.page_num
                         ),
                         "sourcefile": section.content.filename(),

diff --git a/tests/test_adlsgen2setup.py b/tests/test_adlsgen2setup.py
@@ -5,6 +5,7 @@
 import azure.storage.filedatalake.aio
 import pytest
 from conftest import MockAzureCredential, MockResponse
+
 from scripts.adlsgen2setup import AdlsGen2Setup
 
 valid_data_access_control_format = {

diff --git a/tests/test_blob_manager.py b/tests/test_blob_manager.py
@@ -0,0 +1,173 @@
+import os
+import sys
+from tempfile import NamedTemporaryFile
+
+import pytest
+from conftest import MockAzureCredential
+
+from scripts.prepdocslib.blobmanager import BlobManager
+from scripts.prepdocslib.listfilestrategy import File
+
+
+@pytest.fixture
+def blob_manager(monkeypatch):
+    return BlobManager(
+        endpoint=f"https://{os.environ['AZURE_STORAGE_ACCOUNT']}.blob.core.windows.net",
+        credential=MockAzureCredential(),
+        container=os.environ["AZURE_STORAGE_CONTAINER"],
+        verbose=True,
+    )
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
+async def test_upload_and_remove(monkeypatch, mock_env, blob_manager):
+    with NamedTemporaryFile(suffix=".pdf") as temp_file:
+        f = File(temp_file.file)
+        filename = f.content.name.split("/tmp/")[1]
+
+        # Set up mocks used by upload_blob
+        async def mock_exists(*args, **kwargs):
+            return True
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)
+
+        async def mock_upload_blob(self, name, *args, **kwargs):
+            assert name == filename
+            return True
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)
+
+        await blob_manager.upload_blob(f)
+
+        # Set up mocks used by remove_blob
+        def mock_list_blob_names(*args, **kwargs):
+            assert kwargs.get("name_starts_with") == filename.split(".pdf")[0]
+
+            class AsyncBlobItemsIterator:
+                def __init__(self, file):
+                    self.files = [file, "dontdelete.pdf"]
+
+                def __aiter__(self):
+                    return self
+
+                async def __anext__(self):
+                    if self.files:
+                        return self.files.pop()
+                    raise StopAsyncIteration
+
+            return AsyncBlobItemsIterator(filename)
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.list_blob_names", mock_list_blob_names)
+
+        async def mock_delete_blob(self, name, *args, **kwargs):
+            assert name == filename
+            return True
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob)
+
+        await blob_manager.remove_blob(f.content.name)
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
+async def test_upload_and_remove_all(monkeypatch, mock_env, blob_manager):
+    with NamedTemporaryFile(suffix=".pdf") as temp_file:
+        f = File(temp_file.file)
+        print(f.content.name)
+        filename = f.content.name.split("/tmp/")[1]
+
+        # Set up mocks used by upload_blob
+        async def mock_exists(*args, **kwargs):
+            return True
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)
+
+        async def mock_upload_blob(self, name, *args, **kwargs):
+            assert name == filename
+            return True
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)
+
+        await blob_manager.upload_blob(f)
+
+        # Set up mocks used by remove_blob
+        def mock_list_blob_names(*args, **kwargs):
+            assert kwargs.get("name_starts_with") is None
+
+            class AsyncBlobItemsIterator:
+                def __init__(self, file):
+                    self.files = [file]
+
+                def __aiter__(self):
+                    return self
+
+                async def __anext__(self):
+                    if self.files:
+                        return self.files.pop()
+                    raise StopAsyncIteration
+
+            return AsyncBlobItemsIterator(filename)
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.list_blob_names", mock_list_blob_names)
+
+        async def mock_delete_blob(self, name, *args, **kwargs):
+            assert name == filename
+            return True
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob)
+
+        await blob_manager.remove_blob()
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
+async def test_create_container_upon_upload(monkeypatch, mock_env, blob_manager):
+    with NamedTemporaryFile(suffix=".pdf") as temp_file:
+        f = File(temp_file.file)
+        filename = f.content.name.split("/tmp/")[1]
+
+        # Set up mocks used by upload_blob
+        async def mock_exists(*args, **kwargs):
+            return False
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)
+
+        async def mock_create_container(*args, **kwargs):
+            return
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.create_container", mock_create_container)
+
+        async def mock_upload_blob(self, name, *args, **kwargs):
+            assert name == filename
+            return True
+
+        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)
+
+        await blob_manager.upload_blob(f)
+
+
+@pytest.mark.asyncio
+@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
+async def test_dont_remove_if_no_container(monkeypatch, mock_env, blob_manager):
+    async def mock_exists(*args, **kwargs):
+        return False
+
+    monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)
+
+    async def mock_delete_blob(*args, **kwargs):
+        assert False, "delete_blob() shouldn't have been called"
+
+    monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob)
+
+    await blob_manager.remove_blob()
+
+
+def test_sourcepage_from_file_page():
+    assert BlobManager.sourcepage_from_file_page("test.pdf", 0) == "test.pdf#page=1"
+    assert BlobManager.sourcepage_from_file_page("test.html", 0) == "test.html"
+
+
+def test_blob_name_from_file_name():
+    assert BlobManager.blob_name_from_file_name("tmp/test.pdf") == "test.pdf"
+    assert BlobManager.blob_name_from_file_name("tmp/test.html") == "test.html"