diff --git a/.github/workflows/python-test.yaml b/.github/workflows/python-test.yaml index bbc4b53a50..304fdb301c 100644 --- a/.github/workflows/python-test.yaml +++ b/.github/workflows/python-test.yaml @@ -49,7 +49,7 @@ jobs: run: black . --check --verbose - name: Run Python tests if: runner.os != 'Windows' - run: python3 -m pytest -s -vv --cov --cov-fail-under=78 + run: python3 -m pytest -s -vv --cov --cov-fail-under=87 - name: Run E2E tests with Playwright id: e2e if: runner.os != 'Windows' diff --git a/.vscode/settings.json b/.vscode/settings.json index 7181b35e8a..836f1157c5 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -18,5 +18,10 @@ "search.exclude": { "**/node_modules": true, "static": true - } + }, + "python.testing.pytestArgs": [ + "tests" + ], + "python.testing.unittestEnabled": false, + "python.testing.pytestEnabled": true } diff --git a/app/backend/app.py b/app/backend/app.py index 990c86be2d..a44ccea186 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -9,6 +9,7 @@ import aiohttp import openai +from azure.core.exceptions import ResourceNotFoundError from azure.identity.aio import DefaultAzureCredential from azure.monitor.opentelemetry import configure_azure_monitor from azure.search.documents.aio import SearchClient @@ -69,9 +70,18 @@ async def assets(path): # *** NOTE *** this assumes that the content files are public, or at least that all users of the app # can access all the files. This is also slow and memory hungry. @bp.route("/content/") -async def content_file(path): +async def content_file(path: str): + # Remove page number from path, filename-1.txt -> filename.txt + if path.find("#page=") > 0: + path_parts = path.rsplit("#page=", 1) + path = path_parts[0] + logging.info("Opening file %s at page %s", path) blob_container_client = current_app.config[CONFIG_BLOB_CONTAINER_CLIENT] - blob = await blob_container_client.get_blob_client(path).download_blob() + try: + blob = await blob_container_client.get_blob_client(path).download_blob() + except ResourceNotFoundError: + logging.exception("Path not found: %s", path) + abort(404) if not blob.properties or not blob.properties.has_key("content_settings"): abort(404) mime_type = blob.properties["content_settings"]["content_type"] diff --git a/pyproject.toml b/pyproject.toml index def1afefd7..910e6c74fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,14 +2,17 @@ target-version = "py38" select = ["E", "F", "I", "UP"] ignore = ["E501", "E701"] # line too long, multiple statements on one line -src = ["app/backend"] +src = ["app/backend", "scripts"] + +[tool.ruff.isort] +known-local-folder = ["scripts"] [tool.black] line-length = 120 [tool.pytest.ini_options] addopts = "-ra" -pythonpath = ["app/backend"] +pythonpath = ["app/backend", "scripts"] [tool.coverage.paths] source = ["scripts", "app"] diff --git a/scripts/prepdocs.py b/scripts/prepdocs.py index b6d05834db..d647f8d085 100644 --- a/scripts/prepdocs.py +++ b/scripts/prepdocs.py @@ -5,6 +5,7 @@ from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential + from prepdocslib.blobmanager import BlobManager from prepdocslib.embeddings import ( AzureOpenAIEmbeddingService, diff --git a/scripts/prepdocslib/blobmanager.py b/scripts/prepdocslib/blobmanager.py index ff98ef5133..e2867fba83 100644 --- a/scripts/prepdocslib/blobmanager.py +++ b/scripts/prepdocslib/blobmanager.py @@ -1,11 +1,9 @@ -import io import os import re from typing import Optional, Union from azure.core.credentials_async import AsyncTokenCredential from azure.storage.blob.aio import BlobServiceClient -from pypdf import PdfReader, PdfWriter from .listfilestrategy import File @@ -34,24 +32,11 @@ async def upload_blob(self, file: File): if not await container_client.exists(): await container_client.create_container() - # if file is PDF split into pages and upload each page as a separate blob - if os.path.splitext(file.content.name)[1].lower() == ".pdf": - with open(file.content.name, "rb") as reopened_file: - reader = PdfReader(reopened_file) - pages = reader.pages - for i in range(len(pages)): - blob_name = BlobManager.blob_name_from_file_page(file.content.name, i) - if self.verbose: - print(f"\tUploading blob for page {i} -> {blob_name}") - f = io.BytesIO() - writer = PdfWriter() - writer.add_page(pages[i]) - writer.write(f) - f.seek(0) - await container_client.upload_blob(blob_name, f, overwrite=True) - else: - blob_name = BlobManager.blob_name_from_file_page(file.content.name, page=0) - await container_client.upload_blob(blob_name, file.content, overwrite=True) + # Re-open and upload the original file + with open(file.content.name, "rb") as reopened_file: + blob_name = BlobManager.blob_name_from_file_name(file.content.name) + print(f"\tUploading blob for whole file -> {blob_name}") + await container_client.upload_blob(blob_name, reopened_file, overwrite=True) async def remove_blob(self, path: Optional[str] = None): async with BlobServiceClient( @@ -65,16 +50,23 @@ async def remove_blob(self, path: Optional[str] = None): else: prefix = os.path.splitext(os.path.basename(path))[0] blobs = container_client.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0]) - async for b in blobs: - if prefix is not None and not re.match(f"{prefix}-\d+\.pdf", b): + async for blob_path in blobs: + # This still supports PDFs split into individual pages, but we could remove in future to simplify code + if (prefix is not None and not re.match(rf"{prefix}-\d+\.pdf", blob_path)) or ( + path is not None and blob_path == os.path.basename(path) + ): continue if self.verbose: - print(f"\tRemoving blob {b}") - await container_client.delete_blob(b) + print(f"\tRemoving blob {blob_path}") + await container_client.delete_blob(blob_path) @classmethod - def blob_name_from_file_page(cls, filename, page=0) -> str: + def sourcepage_from_file_page(cls, filename, page=0) -> str: if os.path.splitext(filename)[1].lower() == ".pdf": - return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf" + return f"{os.path.basename(filename)}#page={page+1}" else: return os.path.basename(filename) + + @classmethod + def blob_name_from_file_name(cls, filename) -> str: + return os.path.basename(filename) diff --git a/scripts/prepdocslib/searchmanager.py b/scripts/prepdocslib/searchmanager.py index fc708f36b2..6fb8ba8a66 100644 --- a/scripts/prepdocslib/searchmanager.py +++ b/scripts/prepdocslib/searchmanager.py @@ -128,7 +128,7 @@ async def update_content(self, sections: List[Section]): "id": f"{section.content.filename_to_id()}-page-{i}", "content": section.split_page.text, "category": section.category, - "sourcepage": BlobManager.blob_name_from_file_page( + "sourcepage": BlobManager.sourcepage_from_file_page( filename=section.content.filename(), page=section.split_page.page_num ), "sourcefile": section.content.filename(), diff --git a/tests/test_adlsgen2setup.py b/tests/test_adlsgen2setup.py index 632ba313ba..d52c746ce3 100644 --- a/tests/test_adlsgen2setup.py +++ b/tests/test_adlsgen2setup.py @@ -5,6 +5,7 @@ import azure.storage.filedatalake.aio import pytest from conftest import MockAzureCredential, MockResponse + from scripts.adlsgen2setup import AdlsGen2Setup valid_data_access_control_format = { diff --git a/tests/test_blob_manager.py b/tests/test_blob_manager.py new file mode 100644 index 0000000000..cdd92c1b64 --- /dev/null +++ b/tests/test_blob_manager.py @@ -0,0 +1,173 @@ +import os +import sys +from tempfile import NamedTemporaryFile + +import pytest +from conftest import MockAzureCredential + +from scripts.prepdocslib.blobmanager import BlobManager +from scripts.prepdocslib.listfilestrategy import File + + +@pytest.fixture +def blob_manager(monkeypatch): + return BlobManager( + endpoint=f"https://{os.environ['AZURE_STORAGE_ACCOUNT']}.blob.core.windows.net", + credential=MockAzureCredential(), + container=os.environ["AZURE_STORAGE_CONTAINER"], + verbose=True, + ) + + +@pytest.mark.asyncio +@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher") +async def test_upload_and_remove(monkeypatch, mock_env, blob_manager): + with NamedTemporaryFile(suffix=".pdf") as temp_file: + f = File(temp_file.file) + filename = f.content.name.split("/tmp/")[1] + + # Set up mocks used by upload_blob + async def mock_exists(*args, **kwargs): + return True + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists) + + async def mock_upload_blob(self, name, *args, **kwargs): + assert name == filename + return True + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob) + + await blob_manager.upload_blob(f) + + # Set up mocks used by remove_blob + def mock_list_blob_names(*args, **kwargs): + assert kwargs.get("name_starts_with") == filename.split(".pdf")[0] + + class AsyncBlobItemsIterator: + def __init__(self, file): + self.files = [file, "dontdelete.pdf"] + + def __aiter__(self): + return self + + async def __anext__(self): + if self.files: + return self.files.pop() + raise StopAsyncIteration + + return AsyncBlobItemsIterator(filename) + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.list_blob_names", mock_list_blob_names) + + async def mock_delete_blob(self, name, *args, **kwargs): + assert name == filename + return True + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob) + + await blob_manager.remove_blob(f.content.name) + + +@pytest.mark.asyncio +@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher") +async def test_upload_and_remove_all(monkeypatch, mock_env, blob_manager): + with NamedTemporaryFile(suffix=".pdf") as temp_file: + f = File(temp_file.file) + print(f.content.name) + filename = f.content.name.split("/tmp/")[1] + + # Set up mocks used by upload_blob + async def mock_exists(*args, **kwargs): + return True + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists) + + async def mock_upload_blob(self, name, *args, **kwargs): + assert name == filename + return True + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob) + + await blob_manager.upload_blob(f) + + # Set up mocks used by remove_blob + def mock_list_blob_names(*args, **kwargs): + assert kwargs.get("name_starts_with") is None + + class AsyncBlobItemsIterator: + def __init__(self, file): + self.files = [file] + + def __aiter__(self): + return self + + async def __anext__(self): + if self.files: + return self.files.pop() + raise StopAsyncIteration + + return AsyncBlobItemsIterator(filename) + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.list_blob_names", mock_list_blob_names) + + async def mock_delete_blob(self, name, *args, **kwargs): + assert name == filename + return True + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob) + + await blob_manager.remove_blob() + + +@pytest.mark.asyncio +@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher") +async def test_create_container_upon_upload(monkeypatch, mock_env, blob_manager): + with NamedTemporaryFile(suffix=".pdf") as temp_file: + f = File(temp_file.file) + filename = f.content.name.split("/tmp/")[1] + + # Set up mocks used by upload_blob + async def mock_exists(*args, **kwargs): + return False + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists) + + async def mock_create_container(*args, **kwargs): + return + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.create_container", mock_create_container) + + async def mock_upload_blob(self, name, *args, **kwargs): + assert name == filename + return True + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob) + + await blob_manager.upload_blob(f) + + +@pytest.mark.asyncio +@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher") +async def test_dont_remove_if_no_container(monkeypatch, mock_env, blob_manager): + async def mock_exists(*args, **kwargs): + return False + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists) + + async def mock_delete_blob(*args, **kwargs): + assert False, "delete_blob() shouldn't have been called" + + monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob) + + await blob_manager.remove_blob() + + +def test_sourcepage_from_file_page(): + assert BlobManager.sourcepage_from_file_page("test.pdf", 0) == "test.pdf#page=1" + assert BlobManager.sourcepage_from_file_page("test.html", 0) == "test.html" + + +def test_blob_name_from_file_name(): + assert BlobManager.blob_name_from_file_name("tmp/test.pdf") == "test.pdf" + assert BlobManager.blob_name_from_file_name("tmp/test.html") == "test.html" diff --git a/tests/test_content_file.py b/tests/test_content_file.py new file mode 100644 index 0000000000..9abb21675c --- /dev/null +++ b/tests/test_content_file.py @@ -0,0 +1,99 @@ +import os +from collections import namedtuple + +import aiohttp +import pytest +from azure.core.exceptions import ResourceNotFoundError +from azure.core.pipeline.transport import ( + AioHttpTransportResponse, + AsyncHttpTransport, + HttpRequest, +) +from azure.storage.blob.aio import BlobServiceClient + +import app + +MockToken = namedtuple("MockToken", ["token", "expires_on"]) + + +class MockAzureCredential: + async def get_token(self, uri): + return MockToken("mock_token", 9999999999) + + +@pytest.mark.asyncio +async def test_content_file(monkeypatch, mock_env): + class MockAiohttpClientResponse404(aiohttp.ClientResponse): + def __init__(self, url, body_bytes, headers=None): + self._body = body_bytes + self._headers = headers + self._cache = {} + self.status = 404 + self.reason = "Not Found" + self._url = url + + class MockAiohttpClientResponse(aiohttp.ClientResponse): + def __init__(self, url, body_bytes, headers=None): + self._body = body_bytes + self._headers = headers + self._cache = {} + self.status = 200 + self.reason = "OK" + self._url = url + + class MockTransport(AsyncHttpTransport): + async def send(self, request: HttpRequest, **kwargs) -> AioHttpTransportResponse: + if request.url.endswith("notfound.pdf"): + raise ResourceNotFoundError(MockAiohttpClientResponse404(request.url, b"")) + else: + return AioHttpTransportResponse( + request, + MockAiohttpClientResponse( + request.url, + b"test content", + { + "Content-Type": "application/octet-stream", + "Content-Range": "bytes 0-27/28", + "Content-Length": "28", + }, + ), + ) + + async def __aenter__(self): + return self + + async def __aexit__(self, *args): + pass + + async def open(self): + pass + + async def close(self): + pass + + # Then we can plug this into any SDK via kwargs: + blob_client = BlobServiceClient( + f"https://{os.environ['AZURE_STORAGE_ACCOUNT']}.blob.core.windows.net", + credential=MockAzureCredential(), + transport=MockTransport(), + retry_total=0, # Necessary to avoid unnecessary network requests during tests + ) + blob_container_client = blob_client.get_container_client(os.environ["AZURE_STORAGE_CONTAINER"]) + + quart_app = app.create_app() + async with quart_app.test_app() as test_app: + quart_app.config.update({"blob_container_client": blob_container_client}) + + client = test_app.test_client() + response = await client.get("/content/notfound.pdf") + assert response.status_code == 404 + + response = await client.get("/content/role_library.pdf") + assert response.status_code == 200 + assert response.headers["Content-Type"] == "application/pdf" + assert await response.get_data() == b"test content" + + response = await client.get("/content/role_library.pdf#page=10") + assert response.status_code == 200 + assert response.headers["Content-Type"] == "application/pdf" + assert await response.get_data() == b"test content" diff --git a/tests/test_manageacl.py b/tests/test_manageacl.py index 5cc2534a40..351ab4c56d 100644 --- a/tests/test_manageacl.py +++ b/tests/test_manageacl.py @@ -7,6 +7,7 @@ SimpleField, ) from conftest import MockAzureCredential + from scripts.manageacl import ManageAcl diff --git a/tests/test_prepdocs.py b/tests/test_prepdocs.py index c22e43c9b7..59d46373ca 100644 --- a/tests/test_prepdocs.py +++ b/tests/test_prepdocs.py @@ -4,6 +4,7 @@ import pytest import tenacity from conftest import MockAzureCredential + from scripts.prepdocslib.blobmanager import File from scripts.prepdocslib.embeddings import ( AzureOpenAIEmbeddingService,