QuivrHQ · chloedia · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024 · Nov 13, 2024
diff --git a/.gitignore b/.gitignore
@@ -12,6 +12,8 @@ venv
 */cdp/*
 *.pkl
 
+
 !megaparse/tests/output_tests/MegaFake_report.md
 *.DS_Store
-.tool-versions
+.tool-versions
+megaparse/sdk/examples/only_pdfs/
diff --git a/Dockerfile b/Dockerfile
@@ -32,7 +32,6 @@ RUN apt-get clean && apt-get update && apt-get install -y \
 COPY requirements.lock  pyproject.toml README.md ./
 COPY megaparse/sdk/pyproject.toml megaparse/sdk/README.md megaparse/sdk/
 
-
 RUN PYTHONDONTWRITEBYTECODE=1 pip install --no-cache-dir -r requirements.lock
 
 RUN playwright install --with-deps && \
@@ -46,4 +45,4 @@ ENV PYTHONPATH="/app:/app/megaparse/sdk"
 COPY . .
 EXPOSE 8000
 
-CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
+CMD ["uvicorn", "megaparse.api.app:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "4"]
diff --git a/benchmark/process_time.py b/benchmark/process_time.py
@@ -0,0 +1,53 @@
+import asyncio
+import os
+import time
+
+import numpy as np
+
+from megaparse.sdk import MegaParseSDK
+
+
+async def process_file(megaparse: MegaParseSDK, file_path) -> float | None:
+    """Upload a single file and measure how long the request takes.
+
+    Args:
+        megaparse: SDK client whose ``file.upload`` coroutine performs the upload.
+        file_path: Path of the file to send.
+
+    Returns:
+        Elapsed seconds (``time.perf_counter`` delta) for the upload, or ``None``
+        when the upload raised; the error is printed so the run can carry on.
+    """
+    t0 = time.perf_counter()
+    try:
+        # The response body is irrelevant to the benchmark, so it is not kept.
+        await megaparse.file.upload(
+            file_path=file_path,
+            method="unstructured",  # type: ignore  # unstructured, llama_parser, megaparse_vision
+            strategy="auto",
+        )
+    except Exception as e:
+        # Deliberately broad: one failing file must not abort the whole run.
+        print(f"Exception occured: {e}")
+        return None
+    return time.perf_counter() - t0
+
+
+async def test_process_folder(folder_path, api_key):
+    """Upload every entry of ``folder_path`` concurrently and print timing stats.
+
+    Args:
+        folder_path: Directory whose entries are each passed to ``process_file``.
+        api_key: Key handed to ``MegaParseSDK``.
+
+    Prints the number of failed uploads, then mean / median / standard
+    deviation / max / min of the successful upload times. When no upload
+    succeeded, only the error count and a short notice are printed.
+    """
+    files = os.listdir(folder_path)
+    megaparse = MegaParseSDK(api_key)
+
+    # One upload coroutine per directory entry; process_file returns None on failure.
+    tasks = [process_file(megaparse, os.path.join(folder_path, f)) for f in files]
+    results = await asyncio.gather(*tasks)
+
+    timings = np.array([t for t in results if t is not None])
+    print(f"All errors : {len(results) - timings.size}")
+
+    if timings.size == 0:
+        # np.max / np.min raise ValueError on an empty array and the mean is NaN,
+        # so there is nothing meaningful to summarise.
+        print("No file was processed successfully; no timing statistics to report.")
+        return
+
+    print(f"Average time taken: {timings.mean()}")
+    print(f"Median time taken: {np.median(timings)}")
+    print(f"Standard deviation of time taken: {np.std(timings)}")
+    print(f"Max time taken: {np.max(timings)}")
+    print(f"Min time taken: {np.min(timings)}")
+
+
+if __name__ == "__main__":
+    import sys  # needed only by the script entry point
+
+    api_key = os.getenv("MEGAPARSE_API_KEY")
+    # The folder to benchmark may be given as the first command-line argument.
+    # The previous hard-coded location stays as the fallback so existing runs
+    # behave exactly as before.
+    # folder_path = "megaparse/sdk/examples/only_pdfs"
+    folder_path = (
+        sys.argv[1] if len(sys.argv) > 1 else "/Users/amine/data/quivr/only_pdfs/"
+    )
+    asyncio.run(test_process_folder(folder_path, api_key))
diff --git a/megaparse/api/app.py b/megaparse/api/app.py
@@ -1,8 +1,10 @@
 import os
 import tempfile
+from typing import Optional
 
 import httpx
 import psutil
+import uvicorn
 from fastapi import Depends, FastAPI, File, HTTPException, UploadFile
 from langchain_anthropic import ChatAnthropic
 from langchain_community.document_loaders import PlaywrightURLLoader
@@ -12,13 +14,15 @@
 from megaparse.api.utils.type import HTTPModelNotSupported
 from megaparse.core.megaparse import MegaParse
 from megaparse.core.parser.builder import ParserBuilder
-from megaparse.core.parser.type import ParserConfig, ParserType
+from megaparse.core.parser.type import ParserConfig, ParserConfigInput
 from megaparse.core.parser.unstructured_parser import StrategyEnum, UnstructuredParser
 
 app = FastAPI()
 
 playwright_loader = PlaywrightURLLoader(urls=[], remove_selectors=["header", "footer"])
 
+# Intended to hold MegaParse instances keyed by hash(ParserConfig). The lookup in
+# parse_file is still commented out, so no code visible in this change reads or
+# writes this dict yet.
+_megaparse_instances_cache = {}
+
 
 def parser_builder_dep():
+    """Return a new ParserBuilder; injected into parse_file through Depends."""
     return ParserBuilder()
@@ -44,27 +48,29 @@ def _check_free_memory() -> bool:
 
 
 @app.post("/v1/file")
+@app.post(
+    "/v1/file",
+)
 async def parse_file(
-    file: UploadFile = File(...),
-    method: ParserType = ParserType.UNSTRUCTURED,
-    strategy: StrategyEnum = StrategyEnum.AUTO,
-    check_table=False,
-    language: Language = Language.ENGLISH,
-    parsing_instruction: str | None = None,
-    model_name: str | None = None,
+    file: UploadFile,
+    parser_config: str = File(...),
     parser_builder=Depends(parser_builder_dep),
 ) -> dict[str, str]:
+    in_parser_config = ParserConfigInput.model_validate_json(parser_config)
+
     if not _check_free_memory():
         raise HTTPException(
             status_code=503, detail="Service unavailable due to low memory"
         )
     model = None
-    if model_name:
-        if model_name.startswith("gpt"):
-            model = ChatOpenAI(model=model_name, api_key=os.getenv("OPENAI_API_KEY"))  # type: ignore
-        elif model_name.startswith("claude"):
+    if in_parser_config.model_name:
+        if in_parser_config.model_name.startswith("gpt"):
+            model = ChatOpenAI(
+                model=in_parser_config.model_name, api_key=os.getenv("OPENAI_API_KEY")
+            )  # type: ignore
+        elif in_parser_config.model_name.startswith("claude"):
             model = ChatAnthropic(
-                model_name=model_name,
+                model_name=in_parser_config.model_name,
                 api_key=os.getenv("ANTHROPIC_API_KEY"),  # type: ignore
                 timeout=60,
                 stop=None,
@@ -73,20 +79,27 @@ async def parse_file(
         else:
             raise HTTPModelNotSupported()
 
-    parser_config = ParserConfig(
-        method=method,
-        strategy=strategy,
-        model=model if model and check_table else None,
-        language=language,
-        parsing_instruction=parsing_instruction,
+    out_parser_config = ParserConfig(
+        method=in_parser_config.method,
+        strategy=in_parser_config.strategy,
+        model=model if model and in_parser_config.check_table else None,
+        language=in_parser_config.language,
+        parsing_instruction=in_parser_config.parsing_instruction,
     )
+
+    # TODO: move to function or metaclass in Megaparse
+    # if hash(out_parser_config) in _megaparse_instances_cache:
+    #     megaparse = _megaparse_instances_cache[hash(out_parser_config)]
+    # else:
+    parser = parser_builder.build(out_parser_config)
+    megaparse = MegaParse(parser=parser)
+    # _megaparse_instances_cache[hash(out_parser_config)] = megaparse
+
     try:
-        parser = parser_builder.build(parser_config)
         with tempfile.NamedTemporaryFile(
             delete=False, suffix=f".{str(file.filename).split('.')[-1]}"
         ) as temp_file:
             temp_file.write(file.file.read())
-            megaparse = MegaParse(parser=parser)
             result = await megaparse.aload(file_path=temp_file.name)
             return {"message": "File parsed successfully", "result": result}
     except Exception as e:
@@ -132,3 +145,7 @@ async def upload_url(
             "message": "Website content parsed successfully",
             "result": extracted_content,
         }
+
+
+if __name__ == "__main__":
+    # Direct-run entry point: serve the app on all interfaces, port 8000. No
+    # workers argument is passed here, whereas the Dockerfile CMD starts uvicorn
+    # with --workers 4.
+    uvicorn.run(app, host="0.0.0.0", port=8000)
diff --git a/megaparse/core/parser/type.py b/megaparse/core/parser/type.py
@@ -1,7 +1,7 @@
 from enum import Enum
 from llama_parse.utils import Language
 from langchain_core.language_models.chat_models import BaseChatModel
-from pydantic import BaseModel
+from pydantic import BaseModel, ConfigDict
 
 
 class ParserType(str, Enum):
@@ -23,8 +23,21 @@ class StrategyEnum(str, Enum):
 class ParserConfig(BaseModel):
     """Parser configuration model."""
 
+    # frozen=True makes instances immutable and lets pydantic generate __hash__,
+    # which the (currently commented-out) MegaParse instance cache in the API
+    # would use as its key.
+    # NOTE(review): hashing only works when every field value is hashable —
+    # confirm that BaseChatModel instances are before enabling that cache.
+    model_config = ConfigDict(frozen=True)
+
     method: ParserType = ParserType.UNSTRUCTURED
     strategy: StrategyEnum = StrategyEnum.AUTO
     model: BaseChatModel | None = None
     language: Language = Language.ENGLISH
     parsing_instruction: str | None = None
+
+
+class ParserConfigInput(BaseModel):
+    """Parser options as supplied by API clients.
+
+    Unlike ``ParserConfig`` this carries no chat-model instance: the model is
+    named by ``model_name``, and ``check_table`` states whether a model should
+    be attached to the resulting ``ParserConfig`` at all.
+    """
+
+    # A field called ``model_name`` falls inside pydantic's protected "model_"
+    # namespace, which (depending on the pydantic version) emits a warning at
+    # class creation. No pydantic attribute is shadowed, so lift the protection.
+    model_config = ConfigDict(protected_namespaces=())
+
+    method: ParserType = ParserType.UNSTRUCTURED
+    strategy: StrategyEnum = StrategyEnum.FAST
+    check_table: bool = False
+    parsing_instruction: str | None = None
+    # None is accepted so a client can explicitly ask for "no model"; the API
+    # handler only builds a chat model when this value is truthy.
+    model_name: str | None = "gpt-4o"
+    language: Language = Language.ENGLISH
diff --git a/megaparse/core/parser/unstructured_parser.py b/megaparse/core/parser/unstructured_parser.py
@@ -101,11 +101,7 @@ def get_markdown_line(self, el: dict):
 
     async def convert(self, file_path, **kwargs) -> str:
+        """Partition the file at ``file_path`` using ``self.strategy`` and return Markdown.
+
+        ``kwargs`` is accepted but not used in this method.
+        """
         # Partition the PDF
-        elements = partition(
-            filename=str(file_path),
-            strategy=self.strategy,
-            skip_infer_table_types=[],
-        )
+        # NOTE(review): partition() is called without await inside this coroutine,
+        # so presumably the event loop is blocked while it runs — confirm.
+        elements = partition(filename=str(file_path), strategy=self.strategy)
         elements_dict = [el.to_dict() for el in elements]
         markdown_content = self.convert_to_markdown(elements_dict)
         return markdown_content
diff --git a/megaparse/sdk/examples/usage_example.py b/megaparse/sdk/examples/usage_example.py
@@ -1,7 +1,7 @@
 import asyncio
 import os
 
-from megaparse_sdk import MegaParseSDK
+from megaparse.sdk import MegaParseSDK
 
 
 async def main():
@@ -20,7 +20,7 @@ async def main():
     response = await megaparse.file.upload(
         file_path=file_path,
         method="unstructured",  # type: ignore  # unstructured, llama_parser, megaparse_vision
-        strategy="auto",
+        strategy="fast",
     )
     print(f"\n----- File Response : {file_path} -----\n")
     print(response)

diff --git a/megaparse/sdk/megaparse_sdk/client.py b/megaparse/sdk/megaparse_sdk/client.py
@@ -7,7 +7,8 @@
 class MegaParseClient:
     def __init__(self, api_key: str | None = None):
         self.base_url = os.getenv(
-            "MEGAPARSE_URL", "https://megaparse.tooling.quivr.app"
+            "MEGAPARSE_URL",
+            "http://localhost:8000",  # https://megaparse.tooling.quivr.app"
         )
 
         self.api_key = api_key

diff --git a/megaparse/sdk/megaparse_sdk/endpoints/file_upload.py b/megaparse/sdk/megaparse_sdk/endpoints/file_upload.py
@@ -3,6 +3,7 @@
 from httpx import Response
 from megaparse_sdk.client import MegaParseClient
 from megaparse_sdk.utils.type import Language, ParserType, StrategyEnum
+import json
 
 
 class FileUpload:
@@ -19,14 +20,21 @@ async def upload(
         parsing_instruction: Optional[str] = None,
         model_name: str = "gpt-4o",
     ) -> Response:
+        mc_data = {
+            "method": method,
+            "strategy": strategy,
+            "check_table": check_table,
+            "language": language.value,
+            "parsing_instruction": parsing_instruction,
+            "model_name": model_name,
+        }
         with open(file_path, "rb") as file:
-            files = {"file": (file_path, file)}
-            data = {
-                "method": method,
-                "strategy": strategy,
-                "check_table": check_table,
-                "language": language.value,
-                "parsing_instruction": parsing_instruction,
-                "model_name": model_name,
+            multipart_data = {
+                "parser_config": (None, json.dumps(mc_data), "application/json"),
+                "file": (file_path, file),
             }
-            return await self.client.request("POST", "/v1/file", files=files, data=data)
+            return await self.client.request(
+                "POST",
+                "/v1/file",
+                files=multipart_data,
+            )