From dfe5399c58b2c4710011ff3accd045439e3a1b67 Mon Sep 17 00:00:00 2001
From: Egil
Date: Sat, 5 Oct 2024 22:14:27 +0200
Subject: [PATCH] Added llama-index based parsers

---
 docetl/parsing_tools.py | 19 +++++++++++++++++++
 pyproject.toml          |  2 ++
 2 files changed, 21 insertions(+)

diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py
index 72cfdbf7..b8a60c96 100644
--- a/docetl/parsing_tools.py
+++ b/docetl/parsing_tools.py
@@ -5,6 +5,25 @@ from litellm import transcription
 
 
 
+def llama_index_simple_directory_reader(filename: str) -> List[str]:
+    from llama_index.core import SimpleDirectoryReader
+
+    documents = SimpleDirectoryReader(filename).load_data()
+    # FIXME: What about doc.metadata? Would be good to include that too...
+    return [doc.text for doc in documents]
+
+def llama_index_wikipedia_reader(filename: str) -> List[str]:
+    from llama_index.readers.wikipedia import WikipediaReader
+
+    loader = WikipediaReader()
+    pages = [filename]
+    documents = loader.load_data(pages=pages, auto_suggest=False)
+    # The wikipedia reader does not include the page url in the metadata, which is impractical...
+    for name, doc in zip(pages, documents):
+        doc.metadata["source"] = "https://en.wikipedia.org/wiki/" + name
+
+    # FIXME: What about doc.metadata? Would be good to include that too...
+    return [doc.text for doc in documents]
 
 def whisper_speech_to_text(filename: str) -> List[str]:
     """
diff --git a/pyproject.toml b/pyproject.toml
index 8e6210b3..b45a3794 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -88,6 +88,8 @@ resolve = "docetl.operations.resolve:ResolveOperation"
 gather = "docetl.operations.gather:GatherOperation"
 
 [tool.poetry.plugins."docetl.parser"]
+llama_index_simple_directory_reader = "docetl.parsing_tools:llama_index_simple_directory_reader"
+llama_index_wikipedia_reader = "docetl.parsing_tools:llama_index_wikipedia_reader"
 whisper_speech_to_text = "docetl.parsing_tools:whisper_speech_to_text"
 xlsx_to_string = "docetl.parsing_tools:xlsx_to_string"
 txt_to_string = "docetl.parsing_tools:txt_to_string"