Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Render entire PDFs instead of single pages #840

Merged
merged 32 commits into main from wholefile
Oct 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
b7d9678
Adding anchors
pamelafox Sep 15, 2023
99ea466
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 15, 2023
a3c0023
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 16, 2023
cd2706c
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 20, 2023
6910e05
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 20, 2023
f63398a
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 22, 2023
eaef837
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 25, 2023
3c4fe71
Show whole file
pamelafox Sep 28, 2023
60e07cc
Show whole file
pamelafox Sep 28, 2023
cf46813
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Sep 30, 2023
5e745a9
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Oct 2, 2023
1bec0b3
Merge branch 'main' of https://github.com/pamelafox/azure-search-open…
pamelafox Oct 2, 2023
f8f300f
Page number support
pamelafox Oct 2, 2023
820e4b6
Merge branch 'main' into wholefile
pamelafox Oct 2, 2023
4395e39
More experiments with whole file
pamelafox Oct 13, 2023
e0b036b
Merge branch 'main' into wholefile
pamelafox Oct 19, 2023
9d138a3
Revert unintentional changes
pamelafox Oct 19, 2023
47f9a17
Add tests
pamelafox Oct 20, 2023
786e5eb
Remove random num
pamelafox Oct 23, 2023
4d62561
Add retry_total=0 to avoid unnecessary network requests
pamelafox Oct 24, 2023
0de7037
Add comment to explain retry_total
pamelafox Oct 24, 2023
ee51b9e
Merge branch 'main' into wholefile
pamelafox Oct 24, 2023
ae1d95f
Bring back deleted file
pamelafox Oct 25, 2023
78448e9
Merge branch 'wholefile' of https://github.com/pamelafox/azure-search…
pamelafox Oct 25, 2023
f1cd17a
Merge branch 'main' into wholefile
pamelafox Oct 26, 2023
d4d9db0
Blob manager refactor after merge
pamelafox Oct 26, 2023
23e6aba
Update coverage amount
pamelafox Oct 26, 2023
3acccb8
Make mypy happy with explicit check of path
pamelafox Oct 26, 2023
17fd596
Add debug for 3.9
pamelafox Oct 27, 2023
5f11ce6
Skip in 3.9 since its silly
pamelafox Oct 27, 2023
b8202fd
Reduce fail under percentage due to 3.9
pamelafox Oct 27, 2023
de97cff
Merge branch 'main' into wholefile
pamelafox Oct 27, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/python-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ jobs:
run: black . --check --verbose
- name: Run Python tests
if: runner.os != 'Windows'
run: python3 -m pytest -s -vv --cov --cov-fail-under=78
run: python3 -m pytest -s -vv --cov --cov-fail-under=87
- name: Run E2E tests with Playwright
id: e2e
if: runner.os != 'Windows'
Expand Down
7 changes: 6 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,10 @@
"search.exclude": {
"**/node_modules": true,
"static": true
}
},
"python.testing.pytestArgs": [
"tests"
],
"python.testing.unittestEnabled": false,
"python.testing.pytestEnabled": true
}
14 changes: 12 additions & 2 deletions app/backend/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import aiohttp
import openai
from azure.core.exceptions import ResourceNotFoundError
from azure.identity.aio import DefaultAzureCredential
from azure.monitor.opentelemetry import configure_azure_monitor
from azure.search.documents.aio import SearchClient
Expand Down Expand Up @@ -69,9 +70,18 @@ async def assets(path):
# *** NOTE *** this assumes that the content files are public, or at least that all users of the app
# can access all the files. This is also slow and memory hungry.
@bp.route("/content/<path>")
async def content_file(path):
async def content_file(path: str):
# Remove page number from path, filename-1.txt -> filename.txt
if path.find("#page=") > 0:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

so this will still work with folks who didn't re-run prepdocs after this change?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep! I just did another manual test to make sure, its working with an old env with individual pages.

path_parts = path.rsplit("#page=", 1)
path = path_parts[0]
logging.info("Opening file %s at page %s", path)
blob_container_client = current_app.config[CONFIG_BLOB_CONTAINER_CLIENT]
blob = await blob_container_client.get_blob_client(path).download_blob()
try:
blob = await blob_container_client.get_blob_client(path).download_blob()
except ResourceNotFoundError:
logging.exception("Path not found: %s", path)
abort(404)
if not blob.properties or not blob.properties.has_key("content_settings"):
abort(404)
mime_type = blob.properties["content_settings"]["content_type"]
Expand Down
7 changes: 5 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,17 @@
target-version = "py38"
select = ["E", "F", "I", "UP"]
ignore = ["E501", "E701"] # line too long, multiple statements on one line
src = ["app/backend"]
src = ["app/backend", "scripts"]

[tool.ruff.isort]
known-local-folder = ["scripts"]
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fyi @mattmsft This known-local-folder addition fixed the auto-import sorting that was lumping scripts together with third-party on the preplib files.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!


[tool.black]
line-length = 120

[tool.pytest.ini_options]
addopts = "-ra"
pythonpath = ["app/backend"]
pythonpath = ["app/backend", "scripts"]

[tool.coverage.paths]
source = ["scripts", "app"]
Expand Down
1 change: 1 addition & 0 deletions scripts/prepdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from azure.core.credentials import AzureKeyCredential
from azure.core.credentials_async import AsyncTokenCredential
from azure.identity.aio import AzureDeveloperCliCredential

from prepdocslib.blobmanager import BlobManager
from prepdocslib.embeddings import (
AzureOpenAIEmbeddingService,
Expand Down
44 changes: 18 additions & 26 deletions scripts/prepdocslib/blobmanager.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import io
import os
import re
from typing import Optional, Union

from azure.core.credentials_async import AsyncTokenCredential
from azure.storage.blob.aio import BlobServiceClient
from pypdf import PdfReader, PdfWriter

from .listfilestrategy import File

Expand Down Expand Up @@ -34,24 +32,11 @@ async def upload_blob(self, file: File):
if not await container_client.exists():
await container_client.create_container()

# if file is PDF split into pages and upload each page as a separate blob
if os.path.splitext(file.content.name)[1].lower() == ".pdf":
with open(file.content.name, "rb") as reopened_file:
reader = PdfReader(reopened_file)
pages = reader.pages
for i in range(len(pages)):
blob_name = BlobManager.blob_name_from_file_page(file.content.name, i)
if self.verbose:
print(f"\tUploading blob for page {i} -> {blob_name}")
f = io.BytesIO()
writer = PdfWriter()
writer.add_page(pages[i])
writer.write(f)
f.seek(0)
await container_client.upload_blob(blob_name, f, overwrite=True)
else:
blob_name = BlobManager.blob_name_from_file_page(file.content.name, page=0)
await container_client.upload_blob(blob_name, file.content, overwrite=True)
# Re-open and upload the original file
with open(file.content.name, "rb") as reopened_file:
blob_name = BlobManager.blob_name_from_file_name(file.content.name)
print(f"\tUploading blob for whole file -> {blob_name}")
await container_client.upload_blob(blob_name, reopened_file, overwrite=True)

async def remove_blob(self, path: Optional[str] = None):
async with BlobServiceClient(
Expand All @@ -65,16 +50,23 @@ async def remove_blob(self, path: Optional[str] = None):
else:
prefix = os.path.splitext(os.path.basename(path))[0]
blobs = container_client.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0])
async for b in blobs:
if prefix is not None and not re.match(f"{prefix}-\d+\.pdf", b):
async for blob_path in blobs:
# This still supports PDFs split into individual pages, but we could remove in future to simplify code
if (prefix is not None and not re.match(rf"{prefix}-\d+\.pdf", blob_path)) or (
path is not None and blob_path == os.path.basename(path)
):
continue
if self.verbose:
print(f"\tRemoving blob {b}")
await container_client.delete_blob(b)
print(f"\tRemoving blob {blob_path}")
await container_client.delete_blob(blob_path)

@classmethod
def sourcepage_from_file_page(cls, filename, page=0) -> str:
    """Return the citation "sourcepage" string for one page of a file.

    For PDFs this is the blob name plus a 1-based ``#page=N`` anchor so the
    client viewer can jump straight to the cited page; for every other file
    type it is just the blob name (the whole file is the citation target).

    :param filename: Local path of the source file.
    :param page: Zero-based page index within the file (PDFs only).
    """
    if os.path.splitext(filename)[1].lower() == ".pdf":
        # page is zero-based internally; the viewer anchor is one-based.
        return f"{os.path.basename(filename)}#page={page+1}"
    else:
        return os.path.basename(filename)

@classmethod
def blob_name_from_file_name(cls, filename) -> str:
    """Return the blob name for a whole-file upload: the file's basename."""
    return os.path.basename(filename)
2 changes: 1 addition & 1 deletion scripts/prepdocslib/searchmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ async def update_content(self, sections: List[Section]):
"id": f"{section.content.filename_to_id()}-page-{i}",
"content": section.split_page.text,
"category": section.category,
"sourcepage": BlobManager.blob_name_from_file_page(
"sourcepage": BlobManager.sourcepage_from_file_page(
filename=section.content.filename(), page=section.split_page.page_num
),
"sourcefile": section.content.filename(),
Expand Down
1 change: 1 addition & 0 deletions tests/test_adlsgen2setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import azure.storage.filedatalake.aio
import pytest
from conftest import MockAzureCredential, MockResponse

from scripts.adlsgen2setup import AdlsGen2Setup

valid_data_access_control_format = {
Expand Down
173 changes: 173 additions & 0 deletions tests/test_blob_manager.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
import os
import sys
from tempfile import NamedTemporaryFile

import pytest
from conftest import MockAzureCredential

from scripts.prepdocslib.blobmanager import BlobManager
from scripts.prepdocslib.listfilestrategy import File


@pytest.fixture
def blob_manager(monkeypatch):
    """Build a BlobManager pointed at env-configured storage with a mock credential."""
    account = os.environ["AZURE_STORAGE_ACCOUNT"]
    container = os.environ["AZURE_STORAGE_CONTAINER"]
    return BlobManager(
        endpoint=f"https://{account}.blob.core.windows.net",
        credential=MockAzureCredential(),
        container=container,
        verbose=True,
    )


@pytest.mark.asyncio
@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
async def test_upload_and_remove(monkeypatch, mock_env, blob_manager):
    """Upload a temp PDF then remove it by path, verifying that only the
    matching blob is deleted (the unrelated "dontdelete.pdf" must survive)."""
    with NamedTemporaryFile(suffix=".pdf") as temp_file:
        f = File(temp_file.file)
        # os.path.basename is platform-independent; splitting on "/tmp/" broke
        # on systems whose temp directory is elsewhere (macOS, Windows).
        filename = os.path.basename(f.content.name)

        # Set up mocks used by upload_blob
        async def mock_exists(*args, **kwargs):
            return True

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)

        async def mock_upload_blob(self, name, *args, **kwargs):
            # The whole file is uploaded under its basename.
            assert name == filename
            return True

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)

        await blob_manager.upload_blob(f)

        # Set up mocks used by remove_blob
        def mock_list_blob_names(*args, **kwargs):
            # remove_blob(path) should list only blobs sharing the file's stem.
            assert kwargs.get("name_starts_with") == filename.split(".pdf")[0]

            class AsyncBlobItemsIterator:
                def __init__(self, file):
                    # Include an unrelated blob to prove it is NOT deleted.
                    self.files = [file, "dontdelete.pdf"]

                def __aiter__(self):
                    return self

                async def __anext__(self):
                    if self.files:
                        return self.files.pop()
                    raise StopAsyncIteration

            return AsyncBlobItemsIterator(filename)

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.list_blob_names", mock_list_blob_names)

        async def mock_delete_blob(self, name, *args, **kwargs):
            # Only the uploaded file may be deleted.
            assert name == filename
            return True

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob)

        await blob_manager.remove_blob(f.content.name)


@pytest.mark.asyncio
@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
async def test_upload_and_remove_all(monkeypatch, mock_env, blob_manager):
    """Upload a temp PDF then call remove_blob() with no path, which should
    list every blob (no name prefix) and delete each one."""
    with NamedTemporaryFile(suffix=".pdf") as temp_file:
        f = File(temp_file.file)
        # os.path.basename is platform-independent; splitting on "/tmp/" broke
        # on systems whose temp directory is elsewhere. (Also dropped a stray
        # debug print of the temp file name.)
        filename = os.path.basename(f.content.name)

        # Set up mocks used by upload_blob
        async def mock_exists(*args, **kwargs):
            return True

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)

        async def mock_upload_blob(self, name, *args, **kwargs):
            assert name == filename
            return True

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)

        await blob_manager.upload_blob(f)

        # Set up mocks used by remove_blob
        def mock_list_blob_names(*args, **kwargs):
            # With no path given, remove_blob must list without a prefix.
            assert kwargs.get("name_starts_with") is None

            class AsyncBlobItemsIterator:
                def __init__(self, file):
                    self.files = [file]

                def __aiter__(self):
                    return self

                async def __anext__(self):
                    if self.files:
                        return self.files.pop()
                    raise StopAsyncIteration

            return AsyncBlobItemsIterator(filename)

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.list_blob_names", mock_list_blob_names)

        async def mock_delete_blob(self, name, *args, **kwargs):
            assert name == filename
            return True

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", mock_delete_blob)

        await blob_manager.remove_blob()


@pytest.mark.asyncio
@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
async def test_create_container_upon_upload(monkeypatch, mock_env, blob_manager):
    """When the container does not exist yet, upload_blob must create it
    before uploading."""
    with NamedTemporaryFile(suffix=".pdf") as temp_file:
        f = File(temp_file.file)
        # os.path.basename is platform-independent; splitting on "/tmp/" broke
        # on systems whose temp directory is elsewhere.
        filename = os.path.basename(f.content.name)

        # Set up mocks used by upload_blob: container is missing, so
        # create_container must be invoked on the way to upload_blob.
        async def mock_exists(*args, **kwargs):
            return False

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", mock_exists)

        async def mock_create_container(*args, **kwargs):
            return

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.create_container", mock_create_container)

        async def mock_upload_blob(self, name, *args, **kwargs):
            assert name == filename
            return True

        monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.upload_blob", mock_upload_blob)

        await blob_manager.upload_blob(f)


@pytest.mark.asyncio
@pytest.mark.skipif(sys.version_info.minor < 10, reason="requires Python 3.10 or higher")
async def test_dont_remove_if_no_container(monkeypatch, mock_env, blob_manager):
    """remove_blob must be a no-op when the storage container does not exist."""

    async def fake_exists(*args, **kwargs):
        return False

    async def fail_on_delete(*args, **kwargs):
        assert False, "delete_blob() shouldn't have been called"

    monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.exists", fake_exists)
    monkeypatch.setattr("azure.storage.blob.aio.ContainerClient.delete_blob", fail_on_delete)

    await blob_manager.remove_blob()


def test_sourcepage_from_file_page():
    """PDFs gain a 1-based #page anchor; other types map to the bare filename."""
    cases = [
        ("test.pdf", 0, "test.pdf#page=1"),
        ("test.html", 0, "test.html"),
    ]
    for filename, page, expected in cases:
        assert BlobManager.sourcepage_from_file_page(filename, page) == expected


def test_blob_name_from_file_name():
    """Blob names are the basename of the source path, whatever the extension."""
    for path, expected in (("tmp/test.pdf", "test.pdf"), ("tmp/test.html", "test.html")):
        assert BlobManager.blob_name_from_file_name(path) == expected
Loading