-
-
Notifications
You must be signed in to change notification settings - Fork 57
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #315 from ohdearquant/merge_direct
v0.0.315 chunk / load
- Loading branch information
Showing
15 changed files
with
823 additions
and
18 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# TODO: chain of thoughts |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# TODO: sentiment analysis |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
from typing import Union, Callable | ||
|
||
from lionagi.libs import func_call | ||
from lionagi.core.schema import DataNode | ||
from ..bridge.langchain_.langchain_bridge import LangchainBridge | ||
from ..bridge.llamaindex_.llama_index_bridge import LlamaIndexBridge | ||
|
||
|
||
from ..loader.load_util import ChunkerType, file_to_chunks, _datanode_parser | ||
|
||
|
||
def datanodes_convert(documents, chunker_type):
    """Convert DataNode entries in-place to the representation the chunker backend expects.

    Args:
        documents: List of documents; DataNode entries are converted, all
            other entries are left untouched.
        chunker_type: ChunkerType selecting the target representation
            (LLAMAINDEX or LANGCHAIN; any other type is a no-op).

    Returns:
        The same list object, with DataNode entries replaced where applicable.
    """
    for i, doc in enumerate(documents):
        # isinstance (rather than an exact `type(...) ==` comparison) also
        # accepts DataNode subclasses.
        if isinstance(doc, DataNode):
            if chunker_type == ChunkerType.LLAMAINDEX:
                documents[i] = doc.to_llama_index()
            elif chunker_type == ChunkerType.LANGCHAIN:
                documents[i] = doc.to_langchain()
    return documents
|
||
|
||
def text_chunker(documents, args, kwargs):
    """Split each document into chunks and rebuild the pieces as DataNode objects.

    Args:
        documents: Iterable of node-like objects exposing ``to_dict()``.
        args: Positional arguments forwarded to ``file_to_chunks``.
        kwargs: Keyword arguments forwarded to ``file_to_chunks``.

    Returns:
        A list with one entry per document; each entry is the list of
        DataNode chunks produced from that document.
    """

    def _to_nodes(doc):
        pieces = file_to_chunks(doc.to_dict(), *args, **kwargs)
        # Drop the stale node_id so every chunk gets a fresh identity on creation.
        func_call.lcall(pieces, lambda piece: piece.pop("node_id"))
        return [DataNode.from_obj({**piece}) for piece in pieces]

    return [_to_nodes(document) for document in documents]
|
||
|
||
def chunk(
    documents,
    chunker,
    chunker_type=ChunkerType.PLAIN,
    chunker_args=None,
    chunker_kwargs=None,
    chunking_kwargs=None,
    documents_convert_func=None,
    to_datanode: bool | Callable = True,
):
    """Chunk documents with the backend selected by ``chunker_type``.

    Args:
        documents: The documents to chunk.
        chunker: The chunker implementation (name, callable, or class,
            depending on the backend).
        chunker_type: Which backend to dispatch to (default PLAIN).
        chunker_args: Positional arguments for the chunker (default ``[]``).
        chunker_kwargs: Keyword arguments for the chunker (default ``{}``).
        chunking_kwargs: Extra kwargs, used only by SELFDEFINED (default ``{}``).
        documents_convert_func: Optional converter applied to ``documents``
            by the LANGCHAIN/LLAMAINDEX backends.
        to_datanode: ``True`` to convert results to DataNode, or a callable
            parser; passed through to the backend.

    Returns:
        The chunked output of the selected backend.

    Raises:
        ValueError: If ``chunker_type`` is not a supported ChunkerType.
    """
    chunker_args = [] if chunker_args is None else chunker_args
    chunker_kwargs = {} if chunker_kwargs is None else chunker_kwargs
    chunking_kwargs = {} if chunking_kwargs is None else chunking_kwargs

    # Each backend takes a different argument list, so the call is built per type.
    if chunker_type == ChunkerType.PLAIN:
        return chunk_funcs[ChunkerType.PLAIN](
            documents, chunker, chunker_args, chunker_kwargs
        )
    if chunker_type in (ChunkerType.LANGCHAIN, ChunkerType.LLAMAINDEX):
        # Both bridge backends share the same signature.
        return chunk_funcs[chunker_type](
            documents,
            documents_convert_func,
            chunker,
            chunker_args,
            chunker_kwargs,
            to_datanode,
        )
    if chunker_type == ChunkerType.SELFDEFINED:
        return chunk_funcs[ChunkerType.SELFDEFINED](
            documents,
            chunker,
            chunker_args,
            chunker_kwargs,
            chunking_kwargs,
            to_datanode,
        )
    raise ValueError(
        f"{chunker_type} is not supported. Please choose from {list(ChunkerType)}"
    )
|
||
|
||
def _self_defined_chunker( | ||
documents, | ||
chunker, | ||
chunker_args, | ||
chunker_kwargs, | ||
chunking_kwargs, | ||
to_datanode: bool | Callable, | ||
): | ||
try: | ||
splitter = chunker(*chunker_args, **chunker_kwargs) | ||
nodes = splitter.split(documents, **chunking_kwargs) | ||
except Exception as e: | ||
raise ValueError( | ||
f"Self defined chunker {chunker} is not valid. Error: {e}" | ||
) from e | ||
|
||
if isinstance(to_datanode, bool) and to_datanode is True: | ||
raise ValueError("Please define a valid parser to DataNode.") | ||
elif isinstance(to_datanode, Callable): | ||
nodes = _datanode_parser(nodes, to_datanode) | ||
return nodes | ||
|
||
|
||
def _llama_index_chunker(
    documents,
    documents_convert_func,
    chunker,
    chunker_args,
    chunker_kwargs,
    to_datanode: bool | Callable,
):
    """Chunk documents through the LlamaIndex bridge.

    Args:
        documents: The documents to chunk.
        documents_convert_func: Optional converter called as
            ``documents_convert_func(documents, "llama_index")`` first.
        chunker: The LlamaIndex node parser to use.
        chunker_args: Positional arguments for the parser.
        chunker_kwargs: Keyword arguments for the parser.
        to_datanode: ``True`` converts nodes via ``DataNode.from_llama_index``;
            a callable is used as a custom parser; otherwise nodes are
            returned as-is.

    Returns:
        The parsed nodes, optionally converted to DataNode.
    """
    if documents_convert_func:
        documents = documents_convert_func(documents, "llama_index")
    nodes = LlamaIndexBridge.llama_index_parse_node(
        documents, chunker, chunker_args, chunker_kwargs
    )

    if to_datanode is True:
        return [DataNode.from_llama_index(node) for node in nodes]
    if callable(to_datanode):
        return _datanode_parser(nodes, to_datanode)
    return nodes
|
||
|
||
def _langchain_chunker(
    documents,
    documents_convert_func,
    chunker,
    chunker_args,
    chunker_kwargs,
    to_datanode: bool | Callable,
):
    """Chunk documents through the Langchain bridge.

    Args:
        documents: The documents to chunk (a plain string or Langchain documents).
        documents_convert_func: Optional converter called as
            ``documents_convert_func(documents, "langchain")`` first.
        chunker: The Langchain text splitter to use.
        chunker_args: Positional arguments for the splitter.
        chunker_kwargs: Keyword arguments for the splitter.
        to_datanode: ``True`` converts chunks to DataNode; a callable is used
            as a custom parser; otherwise chunks are returned as-is.

    Returns:
        The chunks, optionally converted to DataNode.
    """
    if documents_convert_func:
        documents = documents_convert_func(documents, "langchain")
    nodes = LangchainBridge.langchain_text_splitter(
        documents, chunker, chunker_args, chunker_kwargs
    )
    if to_datanode is True:
        # A plain-string input yields bare text chunks, which are wrapped
        # directly; document inputs go through the Langchain converter.
        if isinstance(documents, str):
            return [DataNode(content=chunk) for chunk in nodes]
        return [DataNode.from_langchain(chunk) for chunk in nodes]
    if callable(to_datanode):
        return _datanode_parser(nodes, to_datanode)
    return nodes
|
||
|
||
def _plain_chunker(documents, chunker, chunker_args, chunker_kwargs): | ||
try: | ||
if chunker == "text_chunker": | ||
chunker = text_chunker | ||
return chunker(documents, chunker_args, chunker_kwargs) | ||
except Exception as e: | ||
raise ValueError( | ||
f"Reader {chunker} is currently not supported. Error: {e}" | ||
) from e | ||
|
||
|
||
# Dispatch table: maps each ChunkerType to its backend-specific chunking function.
# Used by `chunk()` to route the call.
chunk_funcs = {
    ChunkerType.PLAIN: _plain_chunker,
    ChunkerType.LANGCHAIN: _langchain_chunker,
    ChunkerType.LLAMAINDEX: _llama_index_chunker,
    ChunkerType.SELFDEFINED: _self_defined_chunker,
}
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,152 @@ | ||
from typing import Callable | ||
|
||
from lionagi.core.schema import DataNode | ||
from ..bridge.langchain_.langchain_bridge import LangchainBridge | ||
from ..bridge.llamaindex_.llama_index_bridge import LlamaIndexBridge | ||
|
||
from .load_util import dir_to_nodes, ReaderType, _datanode_parser | ||
|
||
|
||
def text_reader(args, kwargs):
    """Read text files from a directory into DataNode instances.

    Args:
        args: Positional arguments forwarded to ``dir_to_nodes``.
        kwargs: Keyword arguments forwarded to ``dir_to_nodes``.

    Returns:
        A list of DataNode instances.

    Example:
        >>> nodes = text_reader(['path/to/text/files'], {'file_extension': 'txt'})
    """
    # Thin wrapper: unpack the bundled argument containers for dir_to_nodes.
    return dir_to_nodes(*args, **kwargs)
|
||
|
||
def load(
    reader: str | Callable = "SimpleDirectoryReader",
    input_dir=None,
    input_files=None,
    recursive: bool = False,
    required_exts: list[str] = None,
    reader_type=ReaderType.LLAMAINDEX,
    reader_args=None,
    reader_kwargs=None,
    load_args=None,
    load_kwargs=None,
    to_datanode: bool | Callable = True,
):
    """Load documents with the backend selected by ``reader_type``.

    Args:
        reader: The reader implementation (name, callable, or class,
            depending on the backend).
        input_dir: Directory to read (LLAMAINDEX backend only).
        input_files: Explicit file list (LLAMAINDEX backend only).
        recursive: Recurse into subdirectories (LLAMAINDEX backend only).
        required_exts: File-extension filter (LLAMAINDEX backend only).
        reader_type: Which backend to dispatch to (default LLAMAINDEX).
        reader_args: Positional arguments for the reader (default ``[]``).
        reader_kwargs: Keyword arguments for the reader (default ``{}``).
        load_args: Positional arguments for the load step (default ``[]``).
        load_kwargs: Keyword arguments for the load step (default ``{}``).
        to_datanode: ``True`` to convert results to DataNode, or a callable
            parser; passed through to the backend.

    Returns:
        The loaded output of the selected backend.

    Raises:
        ValueError: If ``reader_type`` is not a supported ReaderType.
    """
    reader_args = [] if reader_args is None else reader_args
    reader_kwargs = {} if reader_kwargs is None else reader_kwargs
    load_args = [] if load_args is None else load_args
    load_kwargs = {} if load_kwargs is None else load_kwargs

    if reader_type == ReaderType.PLAIN:
        return read_funcs[ReaderType.PLAIN](reader, reader_args, reader_kwargs)

    if reader_type == ReaderType.LANGCHAIN:
        return read_funcs[ReaderType.LANGCHAIN](
            reader, reader_args, reader_kwargs, to_datanode
        )

    if reader_type == ReaderType.LLAMAINDEX:
        # Fold the convenience parameters into the reader kwargs so they reach
        # the LlamaIndex reader (e.g. SimpleDirectoryReader).
        for key, value in (
            ("input_dir", input_dir),
            ("input_files", input_files),
            ("required_exts", required_exts),
        ):
            if value is not None:
                reader_kwargs[key] = value
        if recursive:
            reader_kwargs["recursive"] = True

        return read_funcs[ReaderType.LLAMAINDEX](
            reader, reader_args, reader_kwargs, load_args, load_kwargs, to_datanode
        )

    if reader_type == ReaderType.SELFDEFINED:
        return read_funcs[ReaderType.SELFDEFINED](
            reader, reader_args, reader_kwargs, load_args, load_kwargs, to_datanode
        )

    raise ValueError(
        f"{reader_type} is not supported. Please choose from {list(ReaderType)}"
    )
|
||
|
||
def _plain_reader(reader, reader_args, reader_kwargs): | ||
try: | ||
if reader == "text_reader": | ||
reader = text_reader | ||
return reader(reader_args, reader_kwargs) | ||
except Exception as e: | ||
raise ValueError( | ||
f"Reader {reader} is currently not supported. Error: {e}" | ||
) from e | ||
|
||
|
||
def _langchain_reader(reader, reader_args, reader_kwargs, to_datanode: bool | Callable):
    """Load documents through the Langchain bridge.

    Args:
        reader: The Langchain loader to use.
        reader_args: Positional arguments for the loader.
        reader_kwargs: Keyword arguments for the loader.
        to_datanode: ``True`` converts results via ``DataNode.from_langchain``;
            a callable is used as a custom parser; otherwise results are
            returned as-is.

    Returns:
        The loaded nodes, optionally converted to DataNode.
    """
    nodes = LangchainBridge.langchain_loader(reader, reader_args, reader_kwargs)
    if to_datanode is True:
        return [DataNode.from_langchain(node) for node in nodes]
    if callable(to_datanode):
        return _datanode_parser(nodes, to_datanode)
    return nodes
|
||
|
||
def _llama_index_reader(
    reader,
    reader_args,
    reader_kwargs,
    load_args,
    load_kwargs,
    to_datanode: bool | Callable,
):
    """Load documents through the LlamaIndex bridge.

    Args:
        reader: The LlamaIndex reader to use.
        reader_args: Positional arguments for constructing the reader.
        reader_kwargs: Keyword arguments for constructing the reader.
        load_args: Positional arguments for the load step.
        load_kwargs: Keyword arguments for the load step.
        to_datanode: ``True`` converts results via ``DataNode.from_llama_index``;
            a callable is used as a custom parser; otherwise results are
            returned as-is.

    Returns:
        The loaded nodes, optionally converted to DataNode.
    """
    nodes = LlamaIndexBridge.llama_index_read_data(
        reader, reader_args, reader_kwargs, load_args, load_kwargs
    )
    if to_datanode is True:
        return [DataNode.from_llama_index(node) for node in nodes]
    if callable(to_datanode):
        return _datanode_parser(nodes, to_datanode)
    return nodes
|
||
|
||
def _self_defined_reader( | ||
reader, | ||
reader_args, | ||
reader_kwargs, | ||
load_args, | ||
load_kwargs, | ||
to_datanode: bool | Callable, | ||
): | ||
try: | ||
loader = reader(*reader_args, **reader_kwargs) | ||
nodes = loader.load(*load_args, **load_kwargs) | ||
except Exception as e: | ||
raise ValueError( | ||
f"Self defined reader {reader} is not valid. Error: {e}" | ||
) from e | ||
|
||
if isinstance(to_datanode, bool) and to_datanode is True: | ||
raise ValueError("Please define a valid parser to DataNode.") | ||
elif isinstance(to_datanode, Callable): | ||
nodes = _datanode_parser(nodes, to_datanode) | ||
return nodes | ||
|
||
|
||
# Dispatch table: maps each ReaderType to its backend-specific reader function.
# Used by `load()` to route the call.
read_funcs = {
    ReaderType.PLAIN: _plain_reader,
    ReaderType.LANGCHAIN: _langchain_reader,
    ReaderType.LLAMAINDEX: _llama_index_reader,
    ReaderType.SELFDEFINED: _self_defined_reader,
}
Oops, something went wrong.