diff --git a/.gitignore b/.gitignore index d9d3029..379ea70 100644 --- a/.gitignore +++ b/.gitignore @@ -12,6 +12,8 @@ venv */cdp/* *.pkl + !megaparse/tests/output_tests/MegaFake_report.md *.DS_Store -.tool-versions \ No newline at end of file +.tool-versions +megaparse/sdk/examples/only_pdfs/ diff --git a/Dockerfile b/Dockerfile index 95f3ce5..f94d347 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,7 +32,6 @@ RUN apt-get clean && apt-get update && apt-get install -y \ COPY requirements.lock pyproject.toml README.md ./ COPY megaparse/sdk/pyproject.toml megaparse/sdk/README.md megaparse/sdk/ - RUN PYTHONDONTWRITEBYTECODE=1 pip install --no-cache-dir -r requirements.lock RUN playwright install --with-deps && \ @@ -46,4 +45,4 @@ ENV PYTHONPATH="/app:/app/megaparse/sdk" COPY . . EXPOSE 8000 -CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file +CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"] diff --git a/benchmark/process_time.py b/benchmark/process_time.py new file mode 100644 index 0000000..0cba25a --- /dev/null +++ b/benchmark/process_time.py @@ -0,0 +1,53 @@ +import asyncio +import os +import time + +import numpy as np + +from megaparse.sdk import MegaParseSDK + + +async def process_file(megaparse: MegaParseSDK, file_path): + try: + t0 = time.perf_counter() + response = await megaparse.file.upload( + file_path=file_path, + method="unstructured", # type: ignore # unstructured, llama_parser, megaparse_vision + strategy="auto", + ) + total = time.perf_counter() - t0 + return total + except Exception as e: + print(f"Exception occurred: {e}") + return None + + +async def test_process_folder(folder_path, api_key): + import os + + list_process_time = [] + files = os.listdir(folder_path) + task = [] + + megaparse = MegaParseSDK(api_key) + for file in files: + task.append(process_file(megaparse, os.path.join(folder_path, file))) + list_process_time = await
asyncio.gather(*task) + + n_errors = sum([t is None for t in list_process_time]) + list_process_time = [t for t in list_process_time if t is not None] + + np_list_process_time = np.array(list_process_time) + print(f"All errors : {n_errors}") + print(f"Average time taken: {np_list_process_time.mean()}") + print(f"Median time taken: {np.median(list_process_time)}") + print(f"Standard deviation of time taken: {np.std(list_process_time)}") + print(f"Max time taken: {np.max(list_process_time)}") + print(f"Min time taken: {np.min(list_process_time)}") + + +if __name__ == "__main__": + api_key = os.getenv("MEGAPARSE_API_KEY") + # folder_path = "megaparse/sdk/examples/only_pdfs" + folder_path = "/Users/amine/data/quivr/only_pdfs/" + asyncio.run(test_process_folder(folder_path, api_key)) diff --git a/megaparse/api/app.py b/megaparse/api/app.py index 1cda49c..2928106 100644 --- a/megaparse/api/app.py +++ b/megaparse/api/app.py @@ -1,8 +1,10 @@ import os import tempfile +from typing import Optional import httpx import psutil +import uvicorn from fastapi import Depends, FastAPI, File, HTTPException, UploadFile from langchain_anthropic import ChatAnthropic from langchain_community.document_loaders import PlaywrightURLLoader @@ -12,13 +14,15 @@ from megaparse.api.utils.type import HTTPModelNotSupported from megaparse.core.megaparse import MegaParse from megaparse.core.parser.builder import ParserBuilder -from megaparse.core.parser.type import ParserConfig, ParserType +from megaparse.core.parser.type import ParserConfig, ParserConfigInput from megaparse.core.parser.unstructured_parser import StrategyEnum, UnstructuredParser app = FastAPI() playwright_loader = PlaywrightURLLoader(urls=[], remove_selectors=["header", "footer"]) +_megaparse_instances_cache = {} + def parser_builder_dep(): return ParserBuilder() @@ -44,27 +48,29 @@ def _check_free_memory() -> bool: @app.post("/v1/file") +@app.post( + "/v1/file", +) async def parse_file( - file: UploadFile = File(...), - method: 
ParserType = ParserType.UNSTRUCTURED, - strategy: StrategyEnum = StrategyEnum.AUTO, - check_table=False, - language: Language = Language.ENGLISH, - parsing_instruction: str | None = None, - model_name: str | None = None, + file: UploadFile, + parser_config: str = File(...), parser_builder=Depends(parser_builder_dep), ) -> dict[str, str]: + in_parser_config = ParserConfigInput.model_validate_json(parser_config) + if not _check_free_memory(): raise HTTPException( status_code=503, detail="Service unavailable due to low memory" ) model = None - if model_name: - if model_name.startswith("gpt"): - model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY")) # type: ignore - elif model_name.startswith("claude"): + if in_parser_config.model_name: + if in_parser_config.model_name.startswith("gpt"): + model = ChatOpenAI( + model=in_parser_config.model_name, api_key=os.getenv("OPENAI_API_KEY") + ) # type: ignore + elif in_parser_config.model_name.startswith("claude"): model = ChatAnthropic( - model_name=model_name, + model_name=in_parser_config.model_name, api_key=os.getenv("ANTHROPIC_API_KEY"), # type: ignore timeout=60, stop=None, @@ -73,20 +79,27 @@ async def parse_file( else: raise HTTPModelNotSupported() - parser_config = ParserConfig( - method=method, - strategy=strategy, - model=model if model and check_table else None, - language=language, - parsing_instruction=parsing_instruction, + out_parser_config = ParserConfig( + method=in_parser_config.method, + strategy=in_parser_config.strategy, + model=model if model and in_parser_config.check_table else None, + language=in_parser_config.language, + parsing_instruction=in_parser_config.parsing_instruction, ) + + # TODO: move to function or metaclass in Megaparse + # if hash(out_parser_config) in _megaparse_instances_cache: + # megaparse = _megaparse_instances_cache[hash(out_parser_config)] + # else: + parser = parser_builder.build(out_parser_config) + megaparse = MegaParse(parser=parser) + # 
_megaparse_instances_cache[hash(out_parser_config)] = megaparse + try: - parser = parser_builder.build(parser_config) with tempfile.NamedTemporaryFile( delete=False, suffix=f".{str(file.filename).split('.')[-1]}" ) as temp_file: temp_file.write(file.file.read()) - megaparse = MegaParse(parser=parser) result = await megaparse.aload(file_path=temp_file.name) return {"message": "File parsed successfully", "result": result} except Exception as e: @@ -132,3 +145,7 @@ async def upload_url( "message": "Website content parsed successfully", "result": extracted_content, } + + +if __name__ == "__main__": + uvicorn.run(app, host="0.0.0.0", port=8000) diff --git a/megaparse/core/parser/type.py b/megaparse/core/parser/type.py index 6de9d07..a9c216f 100644 --- a/megaparse/core/parser/type.py +++ b/megaparse/core/parser/type.py @@ -1,7 +1,7 @@ from enum import Enum from llama_parse.utils import Language from langchain_core.language_models.chat_models import BaseChatModel -from pydantic import BaseModel +from pydantic import BaseModel, ConfigDict class ParserType(str, Enum): @@ -23,8 +23,21 @@ class StrategyEnum(str, Enum): class ParserConfig(BaseModel): """Parser configuration model.""" + model_config = ConfigDict(frozen=True) + method: ParserType = ParserType.UNSTRUCTURED strategy: StrategyEnum = StrategyEnum.AUTO model: BaseChatModel | None = None language: Language = Language.ENGLISH parsing_instruction: str | None = None + + +class ParserConfigInput(BaseModel): + """Parser configuration model.""" + + method: ParserType = ParserType.UNSTRUCTURED + strategy: StrategyEnum = StrategyEnum.FAST + check_table: bool = False + parsing_instruction: str | None = None + model_name: str = "gpt-4o" + language: Language = Language.ENGLISH diff --git a/megaparse/core/parser/unstructured_parser.py b/megaparse/core/parser/unstructured_parser.py index 84e20c4..771ed57 100644 --- a/megaparse/core/parser/unstructured_parser.py +++ b/megaparse/core/parser/unstructured_parser.py @@ -101,11 +101,7 
@@ def get_markdown_line(self, el: dict): async def convert(self, file_path, **kwargs) -> str: # Partition the PDF - elements = partition( - filename=str(file_path), - strategy=self.strategy, - skip_infer_table_types=[], - ) + elements = partition(filename=str(file_path), strategy=self.strategy) elements_dict = [el.to_dict() for el in elements] markdown_content = self.convert_to_markdown(elements_dict) return markdown_content diff --git a/megaparse/sdk/examples/usage_example.py b/megaparse/sdk/examples/usage_example.py index eec216b..8442960 100644 --- a/megaparse/sdk/examples/usage_example.py +++ b/megaparse/sdk/examples/usage_example.py @@ -1,7 +1,7 @@ import asyncio import os -from megaparse_sdk import MegaParseSDK +from megaparse.sdk import MegaParseSDK async def main(): @@ -20,7 +20,7 @@ async def main(): response = await megaparse.file.upload( file_path=file_path, method="unstructured", # type: ignore # unstructured, llama_parser, megaparse_vision - strategy="auto", + strategy="fast", ) print(f"\n----- File Response : {file_path} -----\n") print(response) diff --git a/megaparse/sdk/megaparse_sdk/client.py b/megaparse/sdk/megaparse_sdk/client.py index 9356df1..94f2e5c 100644 --- a/megaparse/sdk/megaparse_sdk/client.py +++ b/megaparse/sdk/megaparse_sdk/client.py @@ -7,7 +7,8 @@ class MegaParseClient: def __init__(self, api_key: str | None = None): self.base_url = os.getenv( - "MEGAPARSE_URL", "https://megaparse.tooling.quivr.app" + "MEGAPARSE_URL", + "http://localhost:8000", # https://megaparse.tooling.quivr.app" ) self.api_key = api_key diff --git a/megaparse/sdk/megaparse_sdk/endpoints/file_upload.py b/megaparse/sdk/megaparse_sdk/endpoints/file_upload.py index fdd3887..3c6e177 100644 --- a/megaparse/sdk/megaparse_sdk/endpoints/file_upload.py +++ b/megaparse/sdk/megaparse_sdk/endpoints/file_upload.py @@ -3,6 +3,7 @@ from httpx import Response from megaparse_sdk.client import MegaParseClient from megaparse_sdk.utils.type import Language, ParserType, 
StrategyEnum +import json class FileUpload: @@ -19,14 +20,21 @@ async def upload( parsing_instruction: Optional[str] = None, model_name: str = "gpt-4o", ) -> Response: + mc_data = { + "method": method, + "strategy": strategy, + "check_table": check_table, + "language": language.value, + "parsing_instruction": parsing_instruction, + "model_name": model_name, + } with open(file_path, "rb") as file: - files = {"file": (file_path, file)} - data = { - "method": method, - "strategy": strategy, - "check_table": check_table, - "language": language.value, - "parsing_instruction": parsing_instruction, - "model_name": model_name, + multipart_data = { + "parser_config": (None, json.dumps(mc_data), "application/json"), + "file": (file_path, file), } - return await self.client.request("POST", "/v1/file", files=files, data=data) + return await self.client.request( + "POST", + "/v1/file", + files=multipart_data, + )