Skip to content

Commit

Permalink
Render entire PDFs instead of single pages (Azure-Samples#840)
Browse files Browse the repository at this point in the history
* Adding anchors

* Show whole file

* Show whole file

* Page number support

* More experiments with whole file

* Revert unintentional changes

* Add tests

* Remove random num

* Add retry_total=0 to avoid unnecessary network requests

* Add comment to explain retry_total

* Bring back deleted file

* Blob manager refactor after merge

* Update coverage amount

* Make mypy happy with explicit check of path

* Add debug for 3.9

* Skip in 3.9 since its silly

* Reduce fail under percentage due to 3.9
  • Loading branch information
pamelafox authored Oct 27, 2023
1 parent 9b61024 commit 7ac6699
Show file tree
Hide file tree
Showing 12 changed files with 319 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
run: black . --check --verbose
- name: Run Python tests
if: runner.os != 'Windows'
run: python3 -m pytest -s -vv --cov --cov-fail-under=78
run: python3 -m pytest -s -vv --cov --cov-fail-under=87
- name: Run E2E tests with Playwright
id: e2e
if: runner.os != 'Windows'
Expand Down
7 changes: 6 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,10 @@
"search.exclude": {
"**/node_modules": true,
"static": true
}
},
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
14 changes: 12 additions & 2 deletions app/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import aiohttp
import openai
from azure.core.exceptions import ResourceNotFoundError
from azure.identity.aio import DefaultAzureCredential
from azure.monitor.opentelemetry import configure_azure_monitor
from azure.search.documents.aio import SearchClient
Expand Down Expand Up @@ -69,9 +70,18 @@ async def assets(path):
# *** NOTE *** this assumes that the content files are public, or at least that all users of the app
# can access all the files. This is also slow and memory hungry.
@bp.route("/content/<path>")
async def content_file(path):
async def content_file(path: str):
# Remove page number from path, filename-1.txt -> filename.txt
if path.find("#page=") > 0:
path_parts = path.rsplit("#page=", 1)
path = path_parts[0]
logging.info("Opening file %s at page %s", path)
blob_container_client = current_app.config[CONFIG_BLOB_CONTAINER_CLIENT]
blob = await blob_container_client.get_blob_client(path).download_blob()
try:
blob = await blob_container_client.get_blob_client(path).download_blob()
except ResourceNotFoundError:
logging.exception("Path not found: %s", path)
abort(404)
if not blob.properties or not blob.properties.has_key("content_settings"):
abort(404)
mime_type = blob.properties["content_settings"]["content_type"]
Expand Down
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
target-version = "py38"
select = ["E", "F", "I", "UP"]
ignore = ["E501", "E701"] # line too long, multiple statements on one line
src = ["app/backend"]
src = ["app/backend", "scripts"]

[tool.ruff.isort]
known-local-folder = ["scripts"]

[tool.black]
line-length = 120

[tool.pytest.ini_options]
addopts = "-ra"
pythonpath = ["app/backend"]
pythonpath = ["app/backend", "scripts"]

[tool.coverage.paths]
source = ["scripts", "app"]
Expand Down
1 change: 1 addition & 0 deletions scripts/prepdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureDeveloperCliCredential

from prepdocslib.blobmanager import BlobManager
from prepdocslib.embeddings import (
AzureOpenAIEmbeddingService,
Expand Down
44 changes: 18 additions & 26 deletions scripts/prepdocslib/blobmanager.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import io
import os
import re
from typing import Optional, Union

from azure.core.credentials_async import AsyncTokenCredential
from azure.storage.blob.aio import BlobServiceClient
from pypdf import PdfReader, PdfWriter

from .listfilestrategy import File

Expand Down Expand Up @@ -34,24 +32,11 @@ async def upload_blob(self, file: File):
if not await container_client.exists():
await container_client.create_container()

# if file is PDF split into pages and upload each page as a separate blob
if os.path.splitext(file.content.name)[1].lower() == ".pdf":
with open(file.content.name, "rb") as reopened_file:
reader = PdfReader(reopened_file)
pages = reader.pages
for i in range(len(pages)):
blob_name = BlobManager.blob_name_from_file_page(file.content.name, i)
if self.verbose:
print(f"\tUploading blob for page {i} -> {blob_name}")
f = io.BytesIO()
writer = PdfWriter()
writer.add_page(pages[i])
writer.write(f)
f.seek(0)
await container_client.upload_blob(blob_name, f, overwrite=True)
else:
blob_name = BlobManager.blob_name_from_file_page(file.content.name, page=0)
await container_client.upload_blob(blob_name, file.content, overwrite=True)
# Re-open and upload the original file
with open(file.content.name, "rb") as reopened_file:
blob_name = BlobManager.blob_name_from_file_name(file.content.name)
print(f"\tUploading blob for whole file -> {blob_name}")
await container_client.upload_blob(blob_name, reopened_file, overwrite=True)

async def remove_blob(self, path: Optional[str] = None):
async with BlobServiceClient(
Expand All @@ -65,16 +50,23 @@ async def remove_blob(self, path: Optional[str] = None):
else:
prefix = os.path.splitext(os.path.basename(path))[0]
blobs = container_client.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0])
async for b in blobs:
if prefix is not None and not re.match(f"{prefix}-\d+\.pdf", b):
async for blob_path in blobs:
# This still supports PDFs split into individual pages, but we could remove in future to simplify code
if (prefix is not None and not re.match(rf"{prefix}-\d+\.pdf", blob_path)) or (
path is not None and blob_path == os.path.basename(path)
):
continue
if self.verbose:
print(f"\tRemoving blob {b}")
await container_client.delete_blob(b)
print(f"\tRemoving blob {blob_path}")
await container_client.delete_blob(blob_path)

@classmethod
def blob_name_from_file_page(cls, filename, page=0) -> str:
def sourcepage_from_file_page(cls, filename, page=0) -> str:
if os.path.splitext(filename)[1].lower() == ".pdf":
return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf"
return f"{os.path.basename(filename)}#page={page+1}"
else:
return os.path.basename(filename)

@classmethod
def blob_name_from_file_name(cls, filename) -> str:
return os.path.basename(filename)
2 changes: 1 addition & 1 deletion scripts/prepdocslib/searchmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ async def update_content(self, sections: List[Section]):
"id": f"{section.content.filename_to_id()}-page-{i}",
"content": section.split_page.text,
"category": section.category,
"sourcepage": BlobManager.blob_name_from_file_page(
"sourcepage": BlobManager.sourcepage_from_file_page(
filename=section.content.filename(), page=section.split_page.page_num
),
"sourcefile": section.content.filename(),
Expand Down
1 change: 1 addition & 0 deletions tests/test_adlsgen2setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import azure.storage.filedatalake.aio
import pytest
from conftest import MockAzureCredential, MockResponse

from scripts.adlsgen2setup import AdlsGen2Setup

valid_data_access_control_format = {
Expand Down
173 changes: 173 additions & 0 deletions tests/test_blob_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import os
import sys
from tempfile import NamedTemporaryFile

import pytest
from conftest import MockAzureCredential

from scripts.prepdocslib.blobmanager import BlobManager
from scripts.prepdocslib.listfilestrategy import File


@pytest.fixture
def blob_manager(monkeypatch):
return BlobManager(
endpoint=f"https://{os.environ['AZURE_STORAGE_ACCOUNT']}.blob.core.windows.net",
credential=MockAzureCredential(),
container=os.environ["AZURE_STORAGE_CONTAINER"],
verbose=True,
)


@pytest.mark.asyncio
@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
async def test_upload_and_remove(monkeypatch, mock_env, blob_manager):
with NamedTemporaryFile(suffix=".pdf") as temp_file:
f = File(temp_file.file)
filename = f.content.name.split("/tmp/")[1]

# Set up mocks used by upload_blob
async def mock_exists(*args, **kwargs):
return True

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)

async def mock_upload_blob(self, name, *args, **kwargs):
assert name == filename
return True

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)

await blob_manager.upload_blob(f)

# Set up mocks used by remove_blob
def mock_list_blob_names(*args, **kwargs):
assert kwargs.get("name_starts_with") == filename.split(".pdf")[0]

class AsyncBlobItemsIterator:
def __init__(self, file):
self.files = [file, "dontdelete.pdf"]

def __aiter__(self):
return self

async def __anext__(self):
if self.files:
return self.files.pop()
raise StopAsyncIteration

return AsyncBlobItemsIterator(filename)

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.list_blob_names", mock_list_blob_names)

async def mock_delete_blob(self, name, *args, **kwargs):
assert name == filename
return True

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob)

await blob_manager.remove_blob(f.content.name)


@pytest.mark.asyncio
@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
async def test_upload_and_remove_all(monkeypatch, mock_env, blob_manager):
with NamedTemporaryFile(suffix=".pdf") as temp_file:
f = File(temp_file.file)
print(f.content.name)
filename = f.content.name.split("/tmp/")[1]

# Set up mocks used by upload_blob
async def mock_exists(*args, **kwargs):
return True

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)

async def mock_upload_blob(self, name, *args, **kwargs):
assert name == filename
return True

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)

await blob_manager.upload_blob(f)

# Set up mocks used by remove_blob
def mock_list_blob_names(*args, **kwargs):
assert kwargs.get("name_starts_with") is None

class AsyncBlobItemsIterator:
def __init__(self, file):
self.files = [file]

def __aiter__(self):
return self

async def __anext__(self):
if self.files:
return self.files.pop()
raise StopAsyncIteration

return AsyncBlobItemsIterator(filename)

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.list_blob_names", mock_list_blob_names)

async def mock_delete_blob(self, name, *args, **kwargs):
assert name == filename
return True

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob)

await blob_manager.remove_blob()


@pytest.mark.asyncio
@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
async def test_create_container_upon_upload(monkeypatch, mock_env, blob_manager):
with NamedTemporaryFile(suffix=".pdf") as temp_file:
f = File(temp_file.file)
filename = f.content.name.split("/tmp/")[1]

# Set up mocks used by upload_blob
async def mock_exists(*args, **kwargs):
return False

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)

async def mock_create_container(*args, **kwargs):
return

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.create_container", mock_create_container)

async def mock_upload_blob(self, name, *args, **kwargs):
assert name == filename
return True

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)

await blob_manager.upload_blob(f)


@pytest.mark.asyncio
@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
async def test_dont_remove_if_no_container(monkeypatch, mock_env, blob_manager):
async def mock_exists(*args, **kwargs):
return False

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)

async def mock_delete_blob(*args, **kwargs):
assert False, "delete_blob() shouldn't have been called"

monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob)

await blob_manager.remove_blob()


def test_sourcepage_from_file_page():
assert BlobManager.sourcepage_from_file_page("test.pdf", 0) == "test.pdf#page=1"
assert BlobManager.sourcepage_from_file_page("test.html", 0) == "test.html"


def test_blob_name_from_file_name():
assert BlobManager.blob_name_from_file_name("tmp/test.pdf") == "test.pdf"
assert BlobManager.blob_name_from_file_name("tmp/test.html") == "test.html"
Loading

0 comments on commit 7ac6699

Please sign in to comment.