From dfe5399c58b2c4710011ff3accd045439e3a1b67 Mon Sep 17 00:00:00 2001
From: Egil <egil.moller@freecode.no>
Date: Sat, 5 Oct 2024 22:14:27 +0200
Subject: [PATCH] Added llama-index based parsers

---
 docetl/parsing_tools.py | 19 +++++++++++++++++++
 pyproject.toml          |  2 ++
 2 files changed, 21 insertions(+)

diff --git a/docetl/parsing_tools.py b/docetl/parsing_tools.py
index 72cfdbf7..b8a60c96 100644
--- a/docetl/parsing_tools.py
+++ b/docetl/parsing_tools.py
@@ -5,6 +5,25 @@
 
 from litellm import transcription
 
+def llama_index_simple_directory_reader(filename: str) -> List[str]:
+    """Return the text of every document SimpleDirectoryReader loads from `filename`."""
+    from llama_index.core import SimpleDirectoryReader
+    documents = SimpleDirectoryReader(filename).load_data()
+    # FIXME: only doc.text is returned; doc.metadata is dropped and should be exposed too.
+    return [doc.text for doc in documents]
+
+def llama_index_wikipedia_reader(filename: str) -> List[str]:
+    """Return the text of the Wikipedia page named by `filename` (one entry per document)."""
+    from llama_index.readers.wikipedia import WikipediaReader
+
+    loader = WikipediaReader()
+    pages = [filename]
+    documents = loader.load_data(pages=pages, auto_suggest=False)
+    # WikipediaReader leaves the page URL out of doc.metadata, so record it under "source".
+    for name, doc in zip(pages, documents):
+        doc.metadata["source"] = "https://en.wikipedia.org/wiki/" + name
+    # FIXME: only doc.text is returned, so doc.metadata (incl. "source") is dropped here.
+    return [doc.text for doc in documents]
 
 def whisper_speech_to_text(filename: str) -> List[str]:
     """
diff --git a/pyproject.toml b/pyproject.toml
index 8e6210b3..b45a3794 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -88,6 +88,8 @@ resolve = "docetl.operations.resolve:ResolveOperation"
 gather = "docetl.operations.gather:GatherOperation"
 
 [tool.poetry.plugins."docetl.parser"]
+llama_index_simple_directory_reader = "docetl.parsing_tools:llama_index_simple_directory_reader"
+llama_index_wikipedia_reader = "docetl.parsing_tools:llama_index_wikipedia_reader"
 whisper_speech_to_text = "docetl.parsing_tools:whisper_speech_to_text"
 xlsx_to_string = "docetl.parsing_tools:xlsx_to_string"
 txt_to_string = "docetl.parsing_tools:txt_to_string"