Skip to content

Commit

Permalink
Merge pull request #71 from redhog/llama-index-parsers
Browse files Browse the repository at this point in the history
Added llama-index based parsers
  • Loading branch information
shreyashankar authored Oct 5, 2024
2 parents 87fa9ae + dfe5399 commit da282aa
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 0 deletions.
19 changes: 19 additions & 0 deletions docetl/parsing_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,25 @@

from litellm import transcription

def llama_index_simple_directory_reader(filename: str) -> List[str]:
    """
    Load documents with llama-index's SimpleDirectoryReader and return their text.

    Args:
        filename (str): Path to a directory to read, or to a single file.

    Returns:
        List[str]: The ``text`` of every document the reader loaded.
    """
    import os

    from llama_index.core import SimpleDirectoryReader

    # SimpleDirectoryReader takes a directory as its first positional
    # argument; a single file has to be passed through ``input_files``.
    # Anything that is not an existing file keeps the original call, so
    # directory handling and the error for a missing path are unchanged.
    if os.path.isfile(filename):
        reader = SimpleDirectoryReader(input_files=[filename])
    else:
        reader = SimpleDirectoryReader(filename)

    documents = reader.load_data()
    # FIXME: What about doc.metadata? Would be good to include that too...
    return [doc.text for doc in documents]

def llama_index_wikipedia_reader(filename: str) -> List[str]:
    """
    Fetch a Wikipedia page through llama-index's WikipediaReader and return its text.

    Args:
        filename (str): Value passed to the reader as the single page to load
            (also appended to the English Wikipedia URL prefix as the source).

    Returns:
        List[str]: The ``text`` of every document the reader returned.
    """
    from llama_index.readers.wikipedia import WikipediaReader

    wiki_url_prefix = "https://en.wikipedia.org/wiki/"
    titles = [filename]
    docs = WikipediaReader().load_data(pages=titles, auto_suggest=False)

    # The wikipedia reader does not put the page url in the metadata, so
    # attach it to each document here.
    for title, document in zip(titles, docs):
        document.metadata["source"] = wiki_url_prefix + title

    # FIXME: What about the metadata? Only the text is returned for now.
    return [document.text for document in docs]

def whisper_speech_to_text(filename: str) -> List[str]:
"""
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ resolve = "docetl.operations.resolve:ResolveOperation"
gather = "docetl.operations.gather:GatherOperation"

[tool.poetry.plugins."docetl.parser"]
llama_index_simple_directory_reader = "docetl.parsing_tools:llama_index_simple_directory_reader"
llama_index_wikipedia_reader = "docetl.parsing_tools:llama_index_wikipedia_reader"
whisper_speech_to_text = "docetl.parsing_tools:whisper_speech_to_text"
xlsx_to_string = "docetl.parsing_tools:xlsx_to_string"
txt_to_string = "docetl.parsing_tools:txt_to_string"
Expand Down

0 comments on commit da282aa

Please sign in to comment.