Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: Strategy error with SDK #124

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@ venv
*/cdp/*
*.pkl


!megaparse/tests/output_tests/MegaFake_report.md
*.DS_Store
.tool-versions
.tool-versions
megaparse/sdk/examples/only_pdfs/
3 changes: 1 addition & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ RUN apt-get clean && apt-get update && apt-get install -y \
COPY requirements.lock pyproject.toml README.md ./
COPY megaparse/sdk/pyproject.toml megaparse/sdk/README.md megaparse/sdk/


RUN PYTHONDONTWRITEBYTECODE=1 pip install --no-cache-dir -r requirements.lock

RUN playwright install --with-deps && \
Expand All @@ -46,4 +45,4 @@ ENV PYTHONPATH="/app:/app/megaparse/sdk"
COPY . .
EXPOSE 8000

CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
53 changes: 53 additions & 0 deletions benchmark/process_time.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import asyncio
import os
import time

import numpy as np

from megaparse.sdk import MegaParseSDK


async def process_file(megaparse: MegaParseSDK, file_path) -> float | None:
    """Upload one file through the SDK and time the round trip.

    Args:
        megaparse: SDK instance whose ``file.upload`` coroutine is awaited.
        file_path: Path of the file to upload.

    Returns:
        Elapsed seconds measured with ``time.perf_counter``, or ``None`` when
        the upload raised an exception (the error is printed, not re-raised).
    """
    started_at = time.perf_counter()
    try:
        # Only the latency matters for this benchmark, so the response is
        # deliberately not kept.
        await megaparse.file.upload(
            file_path=file_path,
            method="unstructured",  # type: ignore  # unstructured, llama_parser, megaparse_vision
            strategy="auto",
        )
    except Exception as e:
        # Best-effort benchmark: report the failure and let the caller count
        # it as an error instead of aborting the whole run.
        print(f"Exception occured: {e}")
        return None
    return time.perf_counter() - started_at


async def test_process_folder(folder_path, api_key):
    """Benchmark concurrent uploads of every entry found in ``folder_path``.

    One ``process_file`` coroutine is created per directory entry and all of
    them are awaited together with ``asyncio.gather``. The number of failed
    uploads is printed, followed by mean / median / standard deviation /
    max / min of the successful upload times.

    Args:
        folder_path: Directory whose entries are uploaded.
        api_key: Key passed to ``MegaParseSDK``.
    """
    entries = os.listdir(folder_path)

    megaparse = MegaParseSDK(api_key)
    uploads = [
        process_file(megaparse, os.path.join(folder_path, entry))
        for entry in entries
    ]
    outcomes = await asyncio.gather(*uploads)

    # process_file returns None for a failed upload.
    durations = [t for t in outcomes if t is not None]
    n_errors = len(outcomes) - len(durations)
    print(f"All errors : {n_errors}")

    if not durations:
        # np.max / np.min raise ValueError on an empty sequence and the mean
        # would be NaN, so there is nothing meaningful to report.
        print("No file was processed successfully; no timing statistics to report.")
        return

    timings = np.array(durations)
    print(f"Average time taken: {timings.mean()}")
    print(f"Median time taken: {np.median(timings)}")
    print(f"Standard deviation of time taken: {np.std(timings)}")
    print(f"Max time taken: {np.max(timings)}")
    print(f"Min time taken: {np.min(timings)}")


if __name__ == "__main__":
    import sys

    api_key = os.getenv("MEGAPARSE_API_KEY")
    # folder_path = "megaparse/sdk/examples/only_pdfs"
    # An optional first command-line argument overrides the developer-specific
    # default below, so the benchmark can run on other machines without
    # editing this file.
    if len(sys.argv) > 1:
        folder_path = sys.argv[1]
    else:
        folder_path = "/Users/amine/data/quivr/only_pdfs/"
    asyncio.run(test_process_folder(folder_path, api_key))
59 changes: 38 additions & 21 deletions megaparse/api/app.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import os
import tempfile
from typing import Optional

import httpx
import psutil
import uvicorn
from fastapi import Depends, FastAPI, File, HTTPException, UploadFile
from langchain_anthropic import ChatAnthropic
from langchain_community.document_loaders import PlaywrightURLLoader
Expand All @@ -12,13 +14,15 @@
from megaparse.api.utils.type import HTTPModelNotSupported
from megaparse.core.megaparse import MegaParse
from megaparse.core.parser.builder import ParserBuilder
from megaparse.core.parser.type import ParserConfig, ParserType
from megaparse.core.parser.type import ParserConfig, ParserConfigInput
from megaparse.core.parser.unstructured_parser import StrategyEnum, UnstructuredParser

# FastAPI application object; the @app.post routes in this module register on it.
app = FastAPI()

# Module-level Playwright URL loader, created with no URLs and with the
# "header" and "footer" selectors listed for removal.
playwright_loader = PlaywrightURLLoader(urls=[], remove_selectors=["header", "footer"])

# Intended cache of MegaParse instances keyed by hash(ParserConfig).
# NOTE(review): the only visible lookup/store code is commented out, so this
# dict appears to stay empty for now — confirm before relying on it.
_megaparse_instances_cache = {}


def parser_builder_dep():
    """Return a freshly constructed ``ParserBuilder``.

    Meant to be passed to FastAPI's ``Depends`` so that each request handler
    receives its own builder.
    """
    builder = ParserBuilder()
    return builder
Expand All @@ -44,27 +48,29 @@ def _check_free_memory() -> bool:


@app.post("/v1/file")
@app.post(
"/v1/file",
)
async def parse_file(
file: UploadFile = File(...),
method: ParserType = ParserType.UNSTRUCTURED,
strategy: StrategyEnum = StrategyEnum.AUTO,
check_table=False,
language: Language = Language.ENGLISH,
parsing_instruction: str | None = None,
model_name: str | None = None,
file: UploadFile,
parser_config: str = File(...),
parser_builder=Depends(parser_builder_dep),
) -> dict[str, str]:
in_parser_config = ParserConfigInput.model_validate_json(parser_config)

if not _check_free_memory():
raise HTTPException(
status_code=503, detail="Service unavailable due to low memory"
)
model = None
if model_name:
if model_name.startswith("gpt"):
model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY")) # type: ignore
elif model_name.startswith("claude"):
if in_parser_config.model_name:
if in_parser_config.model_name.startswith("gpt"):
model = ChatOpenAI(
model=in_parser_config.model_name, api_key=os.getenv("OPENAI_API_KEY")
) # type: ignore
elif in_parser_config.model_name.startswith("claude"):
model = ChatAnthropic(
model_name=model_name,
model_name=in_parser_config.model_name,
api_key=os.getenv("ANTHROPIC_API_KEY"), # type: ignore
timeout=60,
stop=None,
Expand All @@ -73,20 +79,27 @@ async def parse_file(
else:
raise HTTPModelNotSupported()

parser_config = ParserConfig(
method=method,
strategy=strategy,
model=model if model and check_table else None,
language=language,
parsing_instruction=parsing_instruction,
out_parser_config = ParserConfig(
method=in_parser_config.method,
strategy=in_parser_config.strategy,
model=model if model and in_parser_config.check_table else None,
language=in_parser_config.language,
parsing_instruction=in_parser_config.parsing_instruction,
)

# TODO: move to function or metaclass in Megaparse
# if hash(out_parser_config) in _megaparse_instances_cache:
# megaparse = _megaparse_instances_cache[hash(out_parser_config)]
# else:
parser = parser_builder.build(out_parser_config)
megaparse = MegaParse(parser=parser)
# _megaparse_instances_cache[hash(out_parser_config)] = megaparse

try:
parser = parser_builder.build(parser_config)
with tempfile.NamedTemporaryFile(
delete=False, suffix=f".{str(file.filename).split('.')[-1]}"
) as temp_file:
temp_file.write(file.file.read())
megaparse = MegaParse(parser=parser)
result = await megaparse.aload(file_path=temp_file.name)
return {"message": "File parsed successfully", "result": result}
except Exception as e:
Expand Down Expand Up @@ -132,3 +145,7 @@ async def upload_url(
"message": "Website content parsed successfully",
"result": extracted_content,
}


if __name__ == "__main__":
    # Running this module directly starts the API with uvicorn, listening on
    # every interface at port 8000.
    bind_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **bind_options)
15 changes: 14 additions & 1 deletion megaparse/core/parser/type.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from enum import Enum
from llama_parse.utils import Language
from langchain_core.language_models.chat_models import BaseChatModel
from pydantic import BaseModel
from pydantic import BaseModel, ConfigDict


class ParserType(str, Enum):
Expand All @@ -23,8 +23,21 @@ class StrategyEnum(str, Enum):
class ParserConfig(BaseModel):
    """Immutable settings describing which parser to build and how to run it.

    Every field has a default, so ``ParserConfig()`` describes an unstructured
    parser with the ``AUTO`` strategy, no chat model, English language and no
    extra parsing instruction.
    """

    # Frozen: pydantic rejects attribute assignment after construction.
    model_config = ConfigDict(frozen=True)

    # Parser backend to use.
    method: ParserType = ParserType.UNSTRUCTURED
    # Strategy handed to the parser.
    strategy: StrategyEnum = StrategyEnum.AUTO
    # Optional chat model; ``None`` means no model is attached.
    model: BaseChatModel | None = None
    # Language of the document.
    language: Language = Language.ENGLISH
    # Optional free-form instruction for the parser.
    parsing_instruction: str | None = None


class ParserConfigInput(BaseModel):
    """Client-supplied parser options, as received in a request payload.

    Unlike ``ParserConfig``, this model carries only JSON-friendly values: the
    chat model is referenced by name (``model_name``) rather than by instance,
    and ``check_table`` states whether that model should be used at all.
    """

    # Parser backend to use.
    method: ParserType = ParserType.UNSTRUCTURED
    # Strategy handed to the parser.
    strategy: StrategyEnum = StrategyEnum.FAST
    # Whether the chat model named below should be attached to the parser.
    check_table: bool = False
    # Optional free-form instruction for the parser.
    parsing_instruction: str | None = None
    # Name of the chat model to instantiate. ``None`` (JSON ``null``) is
    # accepted and means "no model", instead of failing validation.
    model_name: str | None = "gpt-4o"
    # Language of the document.
    language: Language = Language.ENGLISH
6 changes: 1 addition & 5 deletions megaparse/core/parser/unstructured_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,11 +101,7 @@ def get_markdown_line(self, el: dict):

async def convert(self, file_path, **kwargs) -> str:
    """Partition the file at ``file_path`` and render its elements as markdown.

    Args:
        file_path: Path of the document; converted to ``str`` before being
            handed to ``partition``.
        **kwargs: Accepted for interface compatibility; not used here.

    Returns:
        The markdown produced by ``convert_to_markdown`` from the partitioned
        elements.
    """
    # Partition the document once with the strategy configured on this parser
    # (a second, immediately overwritten partition call was redundant work).
    elements = partition(filename=str(file_path), strategy=self.strategy)
    elements_dict = [el.to_dict() for el in elements]
    markdown_content = self.convert_to_markdown(elements_dict)
    return markdown_content
4 changes: 2 additions & 2 deletions megaparse/sdk/examples/usage_example.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import asyncio
import os

from megaparse_sdk import MegaParseSDK
from megaparse.sdk import MegaParseSDK


async def main():
Expand All @@ -20,7 +20,7 @@ async def main():
response = await megaparse.file.upload(
file_path=file_path,
method="unstructured", # type: ignore # unstructured, llama_parser, megaparse_vision
strategy="auto",
strategy="fast",
)
print(f"\n----- File Response : {file_path} -----\n")
print(response)
Expand Down
3 changes: 2 additions & 1 deletion megaparse/sdk/megaparse_sdk/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
class MegaParseClient:
def __init__(self, api_key: str | None = None):
self.base_url = os.getenv(
"MEGAPARSE_URL", "https://megaparse.tooling.quivr.app"
"MEGAPARSE_URL",
"http://localhost:8000", # https://megaparse.tooling.quivr.app"
)

self.api_key = api_key
Expand Down
26 changes: 17 additions & 9 deletions megaparse/sdk/megaparse_sdk/endpoints/file_upload.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from httpx import Response
from megaparse_sdk.client import MegaParseClient
from megaparse_sdk.utils.type import Language, ParserType, StrategyEnum
import json


class FileUpload:
Expand All @@ -19,14 +20,21 @@ async def upload(
parsing_instruction: Optional[str] = None,
model_name: str = "gpt-4o",
) -> Response:
mc_data = {
"method": method,
"strategy": strategy,
"check_table": check_table,
"language": language.value,
"parsing_instruction": parsing_instruction,
"model_name": model_name,
}
with open(file_path, "rb") as file:
files = {"file": (file_path, file)}
data = {
"method": method,
"strategy": strategy,
"check_table": check_table,
"language": language.value,
"parsing_instruction": parsing_instruction,
"model_name": model_name,
multipart_data = {
"parser_config": (None, json.dumps(mc_data), "application/json"),
"file": (file_path, file),
}
return await self.client.request("POST", "/v1/file", files=files, data=data)
return await self.client.request(
"POST",
"/v1/file",
files=multipart_data,
)