diff --git a/libs/ktem/ktem/index/file/base.py b/libs/ktem/ktem/index/file/base.py
index 427a3965..d57943ba 100644
--- a/libs/ktem/ktem/index/file/base.py
+++ b/libs/ktem/ktem/index/file/base.py
@@ -55,6 +55,8 @@ class BaseFileIndexIndexing(BaseComponent):
     FSPath = Param(help="The file storage path")
     user_id = Param(help="The user id")
     private = Param(False, help="Whether this is private index")
+    chunk_size = Param(help="Chunk size for this index")
+    chunk_overlap = Param(help="Chunk overlap for this index")
 
     def run(
         self, file_paths: str | Path | list[str | Path], *args, **kwargs
diff --git a/libs/ktem/ktem/index/file/index.py b/libs/ktem/ktem/index/file/index.py
index f202a6a8..9092d488 100644
--- a/libs/ktem/ktem/index/file/index.py
+++ b/libs/ktem/ktem/index/file/index.py
@@ -404,6 +404,25 @@ def get_admin_settings(cls):
                 "choices": [("Yes", True), ("No", False)],
                 "info": "If private, files will not be accessible across users.",
             },
+            "chunk_size": {
+                "name": "Size of chunk (number of tokens)",
+                "value": 0,
+                "component": "number",
+                "info": (
+                    "Number of tokens of each text segment. "
+                    "Set 0 to use developer setting."
+                ),
+            },
+            "chunk_overlap": {
+                "name": "Number of overlapping tokens between chunks",
+                "value": 0,
+                "component": "number",
+                "info": (
+                    "Number of tokens that consecutive text segments "
+                    "should overlap with each other. "
+                    "Set 0 to use developer setting."
+                ),
+            },
         }
 
     def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
@@ -423,6 +442,8 @@ def get_indexing_pipeline(self, settings, user_id) -> BaseFileIndexIndexing:
         obj.FSPath = self._fs_path
         obj.user_id = user_id
         obj.private = self.config.get("private", False)
+        obj.chunk_size = self.config.get("chunk_size", 0)
+        obj.chunk_overlap = self.config.get("chunk_overlap", 0)
 
         return obj
 
diff --git a/libs/ktem/ktem/index/file/pipelines.py b/libs/ktem/ktem/index/file/pipelines.py
index 6b0033cc..4d53e653 100644
--- a/libs/ktem/ktem/index/file/pipelines.py
+++ b/libs/ktem/ktem/index/file/pipelines.py
@@ -729,7 +729,13 @@ def route(self, file_path: str | Path) -> IndexPipeline:
 
         Can subclass this method for a more elaborate pipeline routing strategy.
         """
-        _, chunk_size, chunk_overlap = dev_settings()
+
+        _, dev_chunk_size, dev_chunk_overlap = dev_settings()
+
+        # Index-level settings win; 0 (the admin default) or None means "not set",
+        # so fall back to the developer settings.
+        chunk_size = self.chunk_size or dev_chunk_size
+        chunk_overlap = self.chunk_overlap or dev_chunk_overlap
 
         # check if file_path is a URL
         if self.is_url(file_path):
@@ -744,11 +750,14 @@ def route(self, file_path: str | Path) -> IndexPipeline:
                 "the suitable pipeline for this file type in the settings."
             )
 
+        print(f"Chunk size: {chunk_size}, chunk overlap: {chunk_overlap}")
+
         print("Using reader", reader)
         pipeline: IndexPipeline = IndexPipeline(
             loader=reader,
             splitter=TokenSplitter(
                 chunk_size=chunk_size or 1024,
+                # `is not None`, not `or`: a developer-set overlap of 0 must stay 0
                 chunk_overlap=chunk_overlap if chunk_overlap is not None else 256,
                 separator="\n\n",
                 backup_separators=["\n", ".", "\u200B"],