Skip to content

Commit

Permalink
feat: update docling reader into extension manager
Browse files: browse the repository at this point in the history
  • Loading branch information
phv2312 committed Dec 15, 2024
1 parent 609a1f0 commit ae56308
Show file tree
Hide file tree
Showing 7 changed files with 105 additions and 53 deletions.
2 changes: 1 addition & 1 deletion libs/kotaemon/kotaemon/indices/ingests/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
from typing import Type

from decouple import config
from ktem.extensions.extensions import extension_manager
from llama_index.core.readers.base import BaseReader
from llama_index.readers.file import PDFReader
from theflow.settings import settings as flowsettings

from kotaemon.base import BaseComponent, Document, Param
from kotaemon.indices.extractors import BaseDocParser
from kotaemon.indices.ingests.extensions import extension_manager
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter
from kotaemon.loaders import (
AdobeReader,
Expand Down
3 changes: 1 addition & 2 deletions libs/ktem/ktem/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,12 @@
from ktem.assets import PDFJS_PREBUILT_DIR, KotaemonTheme
from ktem.components import reasonings
from ktem.exceptions import HookAlreadyDeclared, HookNotDeclared
from ktem.extensions.extensions import extension_manager
from ktem.index import IndexManager
from ktem.settings import BaseSettingGroup, SettingGroup, SettingReasoningGroup
from theflow.settings import settings
from theflow.utils.modules import import_dotted_string

from kotaemon.indices.ingests.extensions import extension_manager


class BaseApp:
"""The main app of Kotaemon
Expand Down
Empty file.
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from copy import deepcopy
from functools import cached_property
from typing import Any

from decouple import config
Expand All @@ -8,31 +9,77 @@
from kotaemon.loaders import (
AdobeReader,
AzureAIDocumentIntelligenceLoader,
DoclingReader,
GOCR2ImageReader,
HtmlReader,
MhtmlReader,
PandasExcelReader,
PDFThumbnailReader,
TxtReader,
UnstructuredReader,
WebReader,
)

unstructured = UnstructuredReader()
adobe_reader = AdobeReader()
azure_reader = AzureAIDocumentIntelligenceLoader(
endpoint=str(config("AZURE_DI_ENDPOINT", default="")),
credential=str(config("AZURE_DI_CREDENTIAL", default="")),
cache_dir=getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None),
)
adobe_reader.vlm_endpoint = azure_reader.vlm_endpoint = getattr(
flowsettings, "KH_VLM_ENDPOINT", ""
)

class ReaderFactory:
    """Lazily build the document readers used by the extension manager.

    Every reader is exposed as a ``cached_property``: it is constructed on
    first access and the same instance is returned on later accesses of the
    same factory object.
    """

    @cached_property
    def web(self) -> WebReader:
        """Reader built from ``WebReader``."""
        return WebReader()

    @cached_property
    def unstructured(self) -> UnstructuredReader:
        """Reader built from ``UnstructuredReader``."""
        return UnstructuredReader()

    @cached_property
    def adobe(self) -> AdobeReader:
        """``AdobeReader`` with ``vlm_endpoint`` set from ``KH_VLM_ENDPOINT``.

        The endpoint falls back to an empty string when the setting is absent.
        """
        reader = AdobeReader()
        reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
        return reader

    @cached_property
    def azuredi(self) -> AzureAIDocumentIntelligenceLoader:
        """``AzureAIDocumentIntelligenceLoader`` configured from settings.

        Endpoint and credential are read through ``config`` (``AZURE_DI_ENDPOINT``
        and ``AZURE_DI_CREDENTIAL``, both defaulting to ""), the cache directory
        from ``flowsettings.KH_MARKDOWN_OUTPUT_DIR`` (default ``None``) and the
        VLM endpoint from ``flowsettings.KH_VLM_ENDPOINT`` (default "").
        """
        endpoint = str(config("AZURE_DI_ENDPOINT", default=""))
        credential = str(config("AZURE_DI_CREDENTIAL", default=""))
        cache_dir = getattr(flowsettings, "KH_MARKDOWN_OUTPUT_DIR", None)

        reader = AzureAIDocumentIntelligenceLoader(
            endpoint=endpoint,
            credential=credential,
            cache_dir=cache_dir,
        )
        reader.vlm_endpoint = getattr(flowsettings, "KH_VLM_ENDPOINT", "")
        return reader

    @cached_property
    def pandas_excel(self) -> PandasExcelReader:
        """Reader built from ``PandasExcelReader``."""
        return PandasExcelReader()

    @cached_property
    def html(self) -> HtmlReader:
        """Reader built from ``HtmlReader``."""
        return HtmlReader()

    @cached_property
    def mhtml(self) -> MhtmlReader:
        """Reader built from ``MhtmlReader``."""
        return MhtmlReader()

    @cached_property
    def gocr(self) -> GOCR2ImageReader:
        """Reader built from ``GOCR2ImageReader``."""
        return GOCR2ImageReader()

    @cached_property
    def txt(self) -> TxtReader:
        """Reader built from ``TxtReader``."""
        return TxtReader()

    @cached_property
    def docling(self) -> DoclingReader:
        """Reader built from ``DoclingReader``."""
        return DoclingReader()

    @cached_property
    def pdf_thumbnail(self) -> PDFThumbnailReader:
        """Reader built from ``PDFThumbnailReader``."""
        return PDFThumbnailReader()


class ExtensionManager:
"""Pool of loaders for extensions"""

def __init__(self):
def __init__(self, factory: ReaderFactory | None = None):
self.factory = factory or ReaderFactory()
self._supported, self._default_index = self._init_supported()

def get_current_loader(self) -> dict[str, BaseReader]:
Expand All @@ -43,26 +90,40 @@ def get_current_loader(self) -> dict[str, BaseReader]:
}
)

@staticmethod
def _init_supported() -> tuple[dict[str, list[BaseReader]], dict[str, str]]:
gocr = GOCR2ImageReader()

def _init_supported(self) -> tuple[dict[str, list[BaseReader]], dict[str, str]]:
supported: dict[str, list[BaseReader]] = {
".xlsx": [PandasExcelReader()],
".docx": [unstructured],
".pptx": [unstructured],
".xls": [unstructured],
".doc": [unstructured],
".html": [HtmlReader()],
".mhtml": [MhtmlReader()],
".png": [unstructured, gocr],
".jpeg": [unstructured, gocr],
".jpg": [unstructured, gocr],
".tiff": [unstructured],
".tif": [unstructured],
".pdf": [PDFThumbnailReader(), adobe_reader, azure_reader],
".txt": [TxtReader()],
".md": [TxtReader()],
".xlsx": [self.factory.pandas_excel],
".docx": [self.factory.unstructured],
".pptx": [self.factory.unstructured],
".xls": [self.factory.unstructured],
".doc": [self.factory.unstructured],
".html": [self.factory.html],
".mhtml": [self.factory.mhtml],
".png": [
self.factory.unstructured,
self.factory.gocr,
self.factory.docling,
],
".jpeg": [
self.factory.unstructured,
self.factory.gocr,
self.factory.docling,
],
".jpg": [
self.factory.unstructured,
self.factory.gocr,
self.factory.docling,
],
".tiff": [self.factory.unstructured, self.factory.docling],
".tif": [self.factory.unstructured, self.factory.docling],
".pdf": [
self.factory.pdf_thumbnail,
self.factory.adobe,
self.factory.azuredi,
self.factory.docling,
],
".txt": [self.factory.txt],
".md": [self.factory.txt],
}

default_index = {
Expand Down Expand Up @@ -136,7 +197,3 @@ def generate_gradio_settings(self) -> dict[str, Any]:


extension_manager = ExtensionManager()


if __name__ == "__main__":
print(extension_manager.get_loaders_by_extension(".xlsx"))
23 changes: 10 additions & 13 deletions libs/ktem/ktem/index/file/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import tiktoken
from ktem.db.models import engine
from ktem.embeddings.manager import embedding_models_manager
from ktem.extensions.extensions import extension_manager
from ktem.llms.manager import llms
from ktem.rerankings.manager import reranking_models_manager
from llama_index.core.readers.base import BaseReader
Expand All @@ -34,14 +35,10 @@
from kotaemon.base import BaseComponent, Document, Node, Param, RetrievedDocument
from kotaemon.embeddings import BaseEmbeddings
from kotaemon.indices import VectorIndexing, VectorRetrieval
from kotaemon.indices.ingests.extensions import extension_manager
from kotaemon.indices.ingests.files import ( # KH_DEFAULT_FILE_EXTRACTORS,
adobe_reader,
azure_reader,
docling_reader,
unstructured,
web_reader,
)

# from kotaemon.indices.ingests.files import ( # KH_DEFAULT_FILE_EXTRACTORS,
# web_reader,
# )
from kotaemon.indices.rankings import BaseReranking, LLMReranking, LLMTrulensScoring
from kotaemon.indices.splitters import BaseSplitter, TokenSplitter

Expand Down Expand Up @@ -674,11 +671,11 @@ def readers(self):
readers: dict[str, BaseReader] = extension_manager.get_current_loader()
print("reader_mode", self.reader_mode)
if self.reader_mode == "adobe":
readers[".pdf"] = adobe_reader
readers[".pdf"] = extension_manager.factory.adobe
elif self.reader_mode == "azure-di":
readers[".pdf"] = azure_reader
readers[".pdf"] = extension_manager.factory.azuredi
elif self.reader_mode == "docling":
readers[".pdf"] = docling_reader
readers[".pdf"] = extension_manager.factory.docling

dev_readers, _, _ = dev_settings()
readers.update(dev_readers)
Expand Down Expand Up @@ -737,11 +734,11 @@ def route(self, file_path: str | Path) -> IndexPipeline:

# check if file_path is a URL
if self.is_url(file_path):
reader = web_reader
reader = extension_manager.factory.web
else:
assert isinstance(file_path, Path)
ext = file_path.suffix.lower()
reader = self.readers.get(ext, unstructured)
reader = self.readers.get(ext, extension_manager.factory.unstructured)
if reader is None:
raise NotImplementedError(
f"No supported pipeline to index {file_path.name}. Please specify "
Expand Down
2 changes: 1 addition & 1 deletion libs/ktem/ktem/pages/chat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from ktem.app import BasePage
from ktem.components import reasonings
from ktem.db.models import Conversation, engine
from ktem.extensions.extensions import extension_manager
from ktem.index.file.ui import File
from ktem.reasoning.prompt_optimization.suggest_conversation_name import (
SuggestConvNamePipeline,
Expand All @@ -20,7 +21,6 @@
from theflow.settings import settings as flowsettings

from kotaemon.base import Document
from kotaemon.indices.ingests.extensions import extension_manager

from ...utils import SUPPORTED_LANGUAGE_MAP, get_file_names_regex
from .chat_panel import ChatPanel
Expand Down
3 changes: 1 addition & 2 deletions libs/ktem/ktem/pages/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
from ktem.app import BasePage
from ktem.components import reasonings
from ktem.db.models import Settings, User, engine
from ktem.extensions.extensions import extension_manager
from sqlmodel import Session, select

from kotaemon.indices.ingests.extensions import extension_manager

signout_js = """
function(u, c, pw, pwc) {
removeFromStorage('username');
Expand Down

0 comments on commit ae56308

Please sign in to comment.