From ae56308d5a896b87de8c2995d7ec3014140994af Mon Sep 17 00:00:00 2001 From: phv2312 Date: Sun, 15 Dec 2024 15:35:39 +0700 Subject: [PATCH] feat: update docling reader into extension manager --- .../kotaemon/indices/ingests/files.py | 2 +- libs/ktem/ktem/app.py | 3 +- libs/ktem/ktem/extensions/__init__.py | 0 .../ktem/extensions}/extensions.py | 125 +++++++++++++----- libs/ktem/ktem/index/file/pipelines.py | 23 ++-- libs/ktem/ktem/pages/chat/__init__.py | 2 +- libs/ktem/ktem/pages/settings.py | 3 +- 7 files changed, 105 insertions(+), 53 deletions(-) create mode 100644 libs/ktem/ktem/extensions/__init__.py rename libs/{kotaemon/kotaemon/indices/ingests => ktem/ktem/extensions}/extensions.py (51%) diff --git a/libs/kotaemon/kotaemon/indices/ingests/files.py b/libs/kotaemon/kotaemon/indices/ingests/files.py index f289567b7..2cb4a7178 100644 --- a/libs/kotaemon/kotaemon/indices/ingests/files.py +++ b/libs/kotaemon/kotaemon/indices/ingests/files.py @@ -2,13 +2,13 @@ from typing import Type from decouple import config +from ktem.extensions.extensions import extension_manager from llama_index.core.readers.base import BaseReader from llama_index.readers.file import PDFReader from theflow.settings import settings as flowsettings from kotaemon.base import BaseComponent, Document, Param from kotaemon.indices.extractors import BaseDocParser -from kotaemon.indices.ingests.extensions import extension_manager from kotaemon.indices.splitters import BaseSplitter, TokenSplitter from kotaemon.loaders import ( AdobeReader, diff --git a/libs/ktem/ktem/app.py b/libs/ktem/ktem/app.py index 53ec58d80..81a68ed12 100644 --- a/libs/ktem/ktem/app.py +++ b/libs/ktem/ktem/app.py @@ -7,13 +7,12 @@ from ktem.assets import PDFJS_PREBUILT_DIR, KotaemonTheme from ktem.components import reasonings from ktem.exceptions import HookAlreadyDeclared, HookNotDeclared +from ktem.extensions.extensions import extension_manager from ktem.index import IndexManager from ktem.settings import BaseSettingGroup, SettingGroup, SettingReasoningGroup from theflow.settings import settings from theflow.utils.modules import import_dotted_string -from kotaemon.indices.ingests.extensions import extension_manager - class BaseApp: """The main app of Kotaemon diff --git a/libs/ktem/ktem/extensions/__init__.py b/libs/ktem/ktem/extensions/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/libs/kotaemon/kotaemon/indices/ingests/extensions.py b/libs/ktem/ktem/extensions/extensions.py similarity index 51% rename from libs/kotaemon/kotaemon/indices/ingests/extensions.py rename to libs/ktem/ktem/extensions/extensions.py index 0d956b96b..2cad24a9b 100644 --- a/libs/kotaemon/kotaemon/indices/ingests/extensions.py +++ b/libs/ktem/ktem/extensions/extensions.py @@ -1,4 +1,5 @@ from copy import deepcopy +from functools import cached_property from typing import Any from decouple import config @@ -8,6 +9,7 @@ from kotaemon.loaders import ( AdobeReader, AzureAIDocumentIntelligenceLoader, + DoclingReader, GOCR2ImageReader, HtmlReader, MhtmlReader, @@ -15,24 +17,69 @@ PDFThumbnailReader, TxtReader, UnstructuredReader, + WebReader, ) -unstructured = UnstructuredReader() -adobe_reader = AdobeReader() -azure_reader = AzureAIDocumentIntelligenceLoader( - endpoint=str(config("AZURE_DI_ENDPOINT", default="")), - credential=str(config("AZURE_DI_CREDENTIAL", default="")), - cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None), -) -adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr( - flowsettings, "KH_VLM_ENDPOINT", "" -) + +class ReaderFactory: + @cached_property + def web(self) -> WebReader: + return WebReader() + + @cached_property + def unstructured(self) -> UnstructuredReader: + return UnstructuredReader() + + @cached_property + def adobe(self) -> AdobeReader: + adobe_reader = AdobeReader() + adobe_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "") + return adobe_reader + + @cached_property + def azuredi(self) -> AzureAIDocumentIntelligenceLoader: + azuredi_reader = AzureAIDocumentIntelligenceLoader( + endpoint=str(config("AZURE_DI_ENDPOINT", default="")), + credential=str(config("AZURE_DI_CREDENTIAL", default="")), + cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None), + ) + azuredi_reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "") + return azuredi_reader + + @cached_property + def pandas_excel(self) -> PandasExcelReader: + return PandasExcelReader() + + @cached_property + def html(self) -> HtmlReader: + return HtmlReader() + + @cached_property + def mhtml(self) -> MhtmlReader: + return MhtmlReader() + + @cached_property + def gocr(self) -> GOCR2ImageReader: + return GOCR2ImageReader() + + @cached_property + def txt(self) -> TxtReader: + return TxtReader() + + @cached_property + def docling(self) -> DoclingReader: + return DoclingReader() + + @cached_property + def pdf_thumbnail(self) -> PDFThumbnailReader: + return PDFThumbnailReader() class ExtensionManager: """Pool of loaders for extensions""" - def __init__(self): + def __init__(self, factory: ReaderFactory | None = None): + self.factory = factory or ReaderFactory() self._supported, self._default_index = self._init_supported() def get_current_loader(self) -> dict[str, BaseReader]: @@ -43,26 +90,40 @@ def get_current_loader(self) -> dict[str, BaseReader]: } ) - @staticmethod - def _init_supported() -> tuple[dict[str, list[BaseReader]], dict[str, str]]: - gocr = GOCR2ImageReader() - + def _init_supported(self) -> tuple[dict[str, list[BaseReader]], dict[str, str]]: supported: dict[str, list[BaseReader]] = { - ".xlsx": [PandasExcelReader()], - ".docx": [unstructured], - ".pptx": [unstructured], - ".xls": [unstructured], - ".doc": [unstructured], - ".html": [HtmlReader()], - ".mhtml": [MhtmlReader()], - ".png": [unstructured, gocr], - ".jpeg": [unstructured, gocr], - ".jpg": [unstructured, gocr], - ".tiff": [unstructured], - ".tif": [unstructured], - ".pdf": [PDFThumbnailReader(), adobe_reader, azure_reader], - ".txt": [TxtReader()], - ".md": [TxtReader()], + ".xlsx": [self.factory.pandas_excel], + ".docx": [self.factory.unstructured], + ".pptx": [self.factory.unstructured], + ".xls": [self.factory.unstructured], + ".doc": [self.factory.unstructured], + ".html": [self.factory.html], + ".mhtml": [self.factory.mhtml], + ".png": [ + self.factory.unstructured, + self.factory.gocr, + self.factory.docling, + ], + ".jpeg": [ + self.factory.unstructured, + self.factory.gocr, + self.factory.docling, + ], + ".jpg": [ + self.factory.unstructured, + self.factory.gocr, + self.factory.docling, + ], + ".tiff": [self.factory.unstructured, self.factory.docling], + ".tif": [self.factory.unstructured, self.factory.docling], + ".pdf": [ + self.factory.pdf_thumbnail, + self.factory.adobe, + self.factory.azuredi, + self.factory.docling, + ], + ".txt": [self.factory.txt], + ".md": [self.factory.txt], } default_index = { @@ -136,7 +197,3 @@ def generate_gradio_settings(self) -> dict[str, Any]: extension_manager = ExtensionManager() - - -if __name__ == "__main__": - print(extension_manager.get_loaders_by_extension(".xlsx")) diff --git a/libs/ktem/ktem/index/file/pipelines.py b/libs/ktem/ktem/index/file/pipelines.py index e74f1af0e..c6b555930 100644 --- a/libs/ktem/ktem/index/file/pipelines.py +++ b/libs/ktem/ktem/index/file/pipelines.py @@ -15,6 +15,7 @@ import tiktoken from ktem.db.models import engine from ktem.embeddings.manager import embedding_models_manager +from ktem.extensions.extensions import extension_manager from ktem.llms.manager import llms from ktem.rerankings.manager import reranking_models_manager from llama_index.core.readers.base import BaseReader @@ -34,14 +35,10 @@ from kotaemon.base import BaseComponent, Document, Node, Param, RetrievedDocument from kotaemon.embeddings import BaseEmbeddings from kotaemon.indices import VectorIndexing, VectorRetrieval -from kotaemon.indices.ingests.extensions import extension_manager -from kotaemon.indices.ingests.files import ( # KH_DEFAULT_FILE_EXTRACTORS, - adobe_reader, - azure_reader, - docling_reader, - unstructured, - web_reader, -) + +# from kotaemon.indices.ingests.files import ( # KH_DEFAULT_FILE_EXTRACTORS, +# web_reader, +# ) from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring from kotaemon.indices.splitters import BaseSplitter, TokenSplitter @@ -674,11 +671,11 @@ def readers(self): readers: dict[str, BaseReader] = extension_manager.get_current_loader() print("reader_mode", self.reader_mode) if self.reader_mode == "adobe": - readers[".pdf"] = adobe_reader + readers[".pdf"] = extension_manager.factory.adobe elif self.reader_mode == "azure-di": - readers[".pdf"] = azure_reader + readers[".pdf"] = extension_manager.factory.azuredi elif self.reader_mode == "docling": - readers[".pdf"] = docling_reader + readers[".pdf"] = extension_manager.factory.docling dev_readers, _, _ = dev_settings() readers.update(dev_readers) @@ -737,11 +734,11 @@ def route(self, file_path: str | Path) -> IndexPipeline: # check if file_path is a URL if self.is_url(file_path): - reader = web_reader + reader = extension_manager.factory.web else: assert isinstance(file_path, Path) ext = file_path.suffix.lower() - reader = self.readers.get(ext, unstructured) + reader = self.readers.get(ext, extension_manager.factory.unstructured) if reader is None: raise NotImplementedError( f"No supported pipeline to index {file_path.name}. Please specify " diff --git a/libs/ktem/ktem/pages/chat/__init__.py b/libs/ktem/ktem/pages/chat/__init__.py index 970706591..b1f04de1f 100644 --- a/libs/ktem/ktem/pages/chat/__init__.py +++ b/libs/ktem/ktem/pages/chat/__init__.py @@ -8,6 +8,7 @@ from ktem.app import BasePage from ktem.components import reasonings from ktem.db.models import Conversation, engine +from ktem.extensions.extensions import extension_manager from ktem.index.file.ui import File from ktem.reasoning.prompt_optimization.suggest_conversation_name import ( SuggestConvNamePipeline, @@ -20,7 +21,6 @@ from theflow.settings import settings as flowsettings from kotaemon.base import Document -from kotaemon.indices.ingests.extensions import extension_manager from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex from .chat_panel import ChatPanel diff --git a/libs/ktem/ktem/pages/settings.py b/libs/ktem/ktem/pages/settings.py index 001dcaef7..acbff117c 100644 --- a/libs/ktem/ktem/pages/settings.py +++ b/libs/ktem/ktem/pages/settings.py @@ -4,10 +4,9 @@ from ktem.app import BasePage from ktem.components import reasonings from ktem.db.models import Settings, User, engine +from ktem.extensions.extensions import extension_manager from sqlmodel import Session, select -from kotaemon.indices.ingests.extensions import extension_manager - signout_js = """ function(u, c, pw, pwc) { removeFromStorage('username');