Merge staging to main (after parsers refactor) (#82)
* Parsers can now return any number of fields, and can access the whole item

* nit: change gpt-4o to gpt-4o-mini in tests

* feat: add verbose parameter for gleaning

* fix: tokenizers should be wrapped in try catch

* fix: resort to eval if ast eval does not work

* docs: update docs to reflect new custom parsing API

---------

Co-authored-by: Egil <egil.moller@freecode.no>
shreyashankar and Egil authored Oct 8, 2024
1 parent 090981f commit 2e6997d
Showing 4 changed files with 92 additions and 84 deletions.
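The heart of this refactor is the parser contract: a parser used to receive one string (the value at `input_key`) and return a list of strings stored under `output_key`; it now receives the whole item dict and returns a list of dicts that can carry any number of fields. A hedged before/after sketch (toy parsers for illustration, not code from this diff):

```python
# Old contract: string in, list of strings out; docetl supplied the
# input_key/output_key plumbing around the call.
def old_style_parser(text: str) -> list:
    return [text.upper()]

# New contract: the parser sees the whole item and returns a list of
# dicts; downstream, each dict is merged over a copy of the item.
def new_style_parser(item: dict) -> list:
    return [{"text": item["text"].upper(), "length": len(item["text"])}]

print(new_style_parser({"text": "hi", "path": "a.txt"}))
# [{'text': 'HI', 'length': 2}]
```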
37 changes: 13 additions & 24 deletions docetl/dataset.py
@@ -148,20 +148,14 @@ def _validate_parsing(
         for tool in parsing_tools:
             if (
                 not isinstance(tool, dict)
-                or "input_key" not in tool
                 or "function" not in tool
-                or "output_key" not in tool
             ):
                 raise ValueError(
-                    "Each parsing tool must be a dictionary with 'input_key', 'function', and 'output_key' keys"
+                    "Each parsing tool must be a dictionary with a 'function' key and any arguments required by that function"
                 )
-            if (
-                not isinstance(tool["input_key"], str)
-                or not isinstance(tool["function"], str)
-                or not isinstance(tool["output_key"], str)
-            ):
+            if not isinstance(tool["function"], str):
                 raise ValueError(
-                    "'input_key', 'function', and 'output_key' in parsing tools must be strings"
+                    "'function' in parsing tools must be a string"
                 )
             if "function_kwargs" in tool and not isinstance(
                 tool["function_kwargs"], dict
@@ -213,19 +207,12 @@ def load(self) -> List[Dict]:
     def _process_item(
         self,
         item: Dict[str, Any],
-        input_key: str,
-        output_key: str,
         func: Callable,
         **function_kwargs: Dict[str, Any],
     ):
-        if input_key not in item:
-            raise ValueError(f"Input key {input_key} not found in item: {item}")
-        result = func(item[input_key], **function_kwargs)
-        if isinstance(result, list):
-            return [item.copy() | {output_key: res} for res in result]
-        else:
-            return [item | {output_key: result}]
+        result = func(item, **function_kwargs)
+        return [item.copy() | res for res in result]
 
     def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]:
         """
         Apply parsing tools to the data.
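The new `_process_item` leans on the dict-union operator (`|`, Python 3.9+): every dict the parser returns is merged over a copy of the original item, so a parser can add or override any number of fields. A small sketch of that merge with made-up field names:

```python
item = {"path": "report.xlsx", "id": 7}
parser_results = [{"text": "Q1 summary"}, {"text": "Q2 summary"}]

# Same merge as the new _process_item: original fields are kept,
# parser fields are layered on top, one output doc per result.
merged = [item.copy() | res for res in parser_results]
print(merged)
# [{'path': 'report.xlsx', 'id': 7, 'text': 'Q1 summary'},
#  {'path': 'report.xlsx', 'id': 7, 'text': 'Q2 summary'}]
```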
@@ -240,7 +227,13 @@ def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]:
             ValueError: If a parsing tool is not found or if an input key is missing from an item.
         """
         for tool in self.parsing:
-            input_key = tool["input_key"]
+            function_kwargs = dict(tool)
+            function_kwargs.pop("function")
+            # FIXME: The following is just for backwards compatibility
+            # with the existing yaml format...
+            if "function_kwargs" in function_kwargs:
+                function_kwargs.update(function_kwargs.pop("function_kwargs"))
+
             try:
                 func = get_parser(tool["function"])
             except KeyError:
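So every key in a tool entry other than `function` is now forwarded to the parser, and a legacy nested `function_kwargs` block is flattened into the same mapping. A sketch of that flattening with a hypothetical tool entry:

```python
# Hypothetical YAML tool entry mixing the new flat style with the
# legacy nested function_kwargs block.
tool = {
    "function": "xlsx_to_string",
    "input_key": "excel_path",
    "function_kwargs": {"orientation": "row"},
}

function_kwargs = dict(tool)
function_kwargs.pop("function")
if "function_kwargs" in function_kwargs:
    function_kwargs.update(function_kwargs.pop("function_kwargs"))

print(function_kwargs)  # {'input_key': 'excel_path', 'orientation': 'row'}
```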
@@ -261,17 +254,13 @@ def _apply_parsing_tools(self, data: List[Dict]) -> List[Dict]:
                     f"Parsing tool {tool['function']} not found. Please define it or use one of our existing parsing tools: {get_parsing_tools()}"
                 )
 
-            output_key = tool["output_key"]
-            function_kwargs = tool.get("function_kwargs", {})
             new_data = []
 
             with ThreadPoolExecutor() as executor:
                 futures = [
                     executor.submit(
                         self._process_item,
                         item,
-                        input_key,
-                        output_key,
                         func,
                         **function_kwargs,
                     )
47 changes: 36 additions & 11 deletions docetl/parsing_tools.py
@@ -1,31 +1,49 @@
 import importlib
 import io
 import os
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Any
 
 
-def llama_index_simple_directory_reader(filename: str) -> List[str]:
+def with_input_output_key(fn):
+    """Decorator that wraps a parser function that takes a single
+    string parameter and returns a list of strings, and makes it a full
+    parser function that takes an item as a dictionary and returns a
+    list of dictionaries."""
+    def wrapper(item, input_key="text", output_key="text", **kw):
+        if input_key not in item:
+            raise ValueError(f"Input key {input_key} not found in item: {item}")
+        result = fn(item[input_key], **kw)
+        if not isinstance(result, list):
+            result = [result]
+        return [{output_key: res} for res in result]
+    return wrapper
+
+
+def llama_index_simple_directory_reader(item: dict[str, Any], input_key: str = "path") -> List[dict[str, Any]]:
     from llama_index.core import SimpleDirectoryReader
 
-    documents = SimpleDirectoryReader(filename).load_data()
-    # FIXME: What about doc.metadata? Would be good to include that too...
-    return [doc.text for doc in documents]
+    documents = SimpleDirectoryReader(item[input_key]).load_data()
+    return [{"text": doc.text,
+             "metadata": doc.metadata}
+            for doc in documents]
 
 
-def llama_index_wikipedia_reader(filename: str) -> List[str]:
+def llama_index_wikipedia_reader(item: dict[str, Any], input_key: str = "pages") -> List[dict[str, Any]]:
     from llama_index.readers.wikipedia import WikipediaReader
 
     loader = WikipediaReader()
-    pages = [filename]
+    pages = item[input_key]
+    if not isinstance(pages, list):
+        pages = [pages]
     documents = loader.load_data(pages=pages, auto_suggest=False)
+    # The wikipedia reader does not include the page url in the metadata, which is impractical...
+    for name, doc in zip(pages, documents):
+        doc.metadata["source"] = "https://en.wikipedia.org/wiki/" + name
 
-    # FIXME: What about doc.metadata? Would be good to include that too...
-    return [doc.text for doc in documents]
+    return [{"text": doc.text,
+             "metadata": doc.metadata}
+            for doc in documents]
 
 
+@with_input_output_key
 def whisper_speech_to_text(filename: str) -> List[str]:
     """
     Transcribe speech from an audio file to text using Whisper model via litellm.
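The decorator lets the remaining single-string parsers keep their old bodies while conforming to the new item-in, dicts-out contract. A usage sketch with a toy parser (`shout` is made up for illustration and assumes the `with_input_output_key` defined above):

```python
@with_input_output_key
def shout(text: str) -> str:
    return text.upper()

item = {"text": "hello", "id": 1}
print(shout(item))                          # [{'text': 'HELLO'}]
print(shout(item, output_key="loud_text"))  # [{'loud_text': 'HELLO'}]
# Note the output dicts carry only the parser's fields; Dataset._process_item
# merges them back over the original item.
```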
@@ -72,6 +90,7 @@ def whisper_speech_to_text(filename: str) -> List[str]:
     return [response.text]
 
 
+@with_input_output_key
 def xlsx_to_string(
     filename: str,
     orientation: str = "col",
@@ -128,6 +147,7 @@ def process_sheet(sheet):
     return [process_sheet(wb.active)]
 
 
+@with_input_output_key
 def txt_to_string(filename: str) -> List[str]:
     """
     Read the content of a text file and return it as a list of strings (only one element).
@@ -142,6 +162,7 @@ def txt_to_string(filename: str) -> List[str]:
     return [file.read()]
 
 
+@with_input_output_key
 def docx_to_string(filename: str) -> List[str]:
     """
     Extract text from a Word document.
@@ -158,6 +179,7 @@ def docx_to_string(filename: str) -> List[str]:
     return ["\n".join([paragraph.text for paragraph in doc.paragraphs])]
 
 
+@with_input_output_key
 def pptx_to_string(filename: str, doc_per_slide: bool = False) -> List[str]:
     """
     Extract text from a PowerPoint presentation.
@@ -195,6 +217,7 @@ def pptx_to_string(filename: str, doc_per_slide: bool = False) -> List[str]:
     return result
 
 
+@with_input_output_key
 def azure_di_read(
     filename: str,
     use_url: bool = False,
@@ -334,6 +357,7 @@ def azure_di_read(
     ]
 
 
+@with_input_output_key
 def paddleocr_pdf_to_string(
     input_path: str,
     doc_per_page: bool = False,
@@ -399,6 +423,7 @@ def paddleocr_pdf_to_string(
     return pdf_content
 
 
+@with_input_output_key
 def gptpdf_to_string(
     input_path: str,
     gpt_model: str,
82 changes: 39 additions & 43 deletions docs/examples/custom-parsing.md
@@ -35,10 +35,11 @@ To use custom parsing, you need to define parsing tools in your DocETL configuration
 parsing_tools:
   - name: top_products_report
     function_code: |
-      def top_products_report(filename: str) -> List[str]:
+      def top_products_report(document: Dict) -> List[Dict]:
           import pandas as pd
           # Read the Excel file
+          filename = document["excel_path"]
           df = pd.read_excel(filename)
           # Calculate total sales
@@ -61,17 +62,18 @@ parsing_tools:
             mom_growth.to_string()
           ]
-          return ["\n".join(report)]
+          # Return a list of dicts representing the output
+          # The input document will be merged into each output doc,
+          # so we can access all original fields from the input doc.
+          return [{"sales_analysis": "\n".join(report)}]
 
 datasets:
   sales_reports:
     type: file
     source: local
     path: "sales_data/sales_paths.json"
     parsing:
-      - input_key: excel_path
-        function: top_products_report
-        output_key: sales_analysis
+      - function: top_products_report
 
   receipts:
     type: file
@@ -81,9 +83,8 @@ datasets:
       - input_key: pdf_path
         function: paddleocr_pdf_to_string
         output_key: receipt_text
-        function_kwargs:
-          ocr_enabled: true
-          lang: "en"
+        ocr_enabled: true
+        lang: "en"
 ```
In this configuration:
@@ -111,8 +112,6 @@ pipeline:
 
 This pipeline will use the parsed data from both Excel files and PDFs for further processing.
 
-
-
 ### How Data Gets Parsed and Formatted
 
 When you run your DocETL pipeline, the parsing tools you've specified in your configuration file are applied to the external files referenced in your dataset JSONs. Here's what happens:
@@ -205,45 +204,45 @@ When you run this command:
 DocETL provides several built-in parsing tools to handle common file formats and data processing tasks. These tools can be used directly in your configuration by specifying their names in the `function` field of your parsing tools configuration. Here's an overview of the available built-in parsing tools:
 
 ::: docetl.parsing_tools.xlsx_to_string
     options:
       show_root_heading: true
      heading_level: 3
 
 ::: docetl.parsing_tools.txt_to_string
    options:
       show_root_heading: true
       heading_level: 3
 
 ::: docetl.parsing_tools.docx_to_string
     options:
       show_root_heading: true
       heading_level: 3
 
 ::: docetl.parsing_tools.whisper_speech_to_text
     options:
       show_root_heading: true
       heading_level: 3
 
 ::: docetl.parsing_tools.pptx_to_string
     options:
       show_root_heading: true
       heading_level: 3
 
 ::: docetl.parsing_tools.azure_di_read
     options:
       heading_level: 3
       show_root_heading: true
 
 ::: docetl.parsing_tools.paddleocr_pdf_to_string
     options:
       heading_level: 3
       show_root_heading: true
 
 ### Using Function Arguments with Parsing Tools
 
-When using parsing tools in your DocETL configuration, you can pass additional arguments to the parsing functions using the function_kwargs field. This allows you to customize the behavior of the parsing tools without modifying their implementation.
+When using parsing tools in your DocETL configuration, you can pass additional arguments to the parsing functions.
 
-For example, when using the xlsx_to_string parsing tool, you can specify options like the orientation of the data, the order of columns, or whether to process each sheet separately. Here's an example of how to use function_kwargs in your configuration:
+For example, when using the xlsx_to_string parsing tool, you can specify options like the orientation of the data, the order of columns, or whether to process each sheet separately. Here's an example of how to use such kwargs in your configuration:

```yaml
datasets:
@@ -254,10 +253,9 @@ datasets:
 parsing_tools:
   - name: excel_parser
     function: xlsx_to_string
-    function_kwargs:
-      orientation: row
-      col_order: ["Date", "Product", "Quantity", "Price"]
-      doc_per_sheet: true
+    orientation: row
+    col_order: ["Date", "Product", "Quantity", "Price"]
+    doc_per_sheet: true
 ```
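With the decorator in place, those flat kwargs are passed straight into the parser call: `input_key`/`output_key` are consumed by `with_input_output_key`, and everything else reaches the function itself. A self-contained sketch of that flow (the decorator is re-stated and `fake_xlsx_to_string` stands in for the real parser):

```python
def with_input_output_key(fn):
    # Minimal re-statement of the decorator from docetl/parsing_tools.py.
    def wrapper(item, input_key="text", output_key="text", **kw):
        result = fn(item[input_key], **kw)
        if not isinstance(result, list):
            result = [result]
        return [{output_key: res} for res in result]
    return wrapper

@with_input_output_key
def fake_xlsx_to_string(filename, orientation="col", col_order=None, doc_per_sheet=False):
    return f"parsed {filename} ({orientation}, cols={col_order}, per_sheet={doc_per_sheet})"

item = {"text": "sales_data/january.xlsx"}  # default input_key is "text"
print(fake_xlsx_to_string(item, orientation="row",
                          col_order=["Date", "Product", "Quantity", "Price"],
                          doc_per_sheet=True))
```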

## Contributing Built-in Parsing Tools
@@ -285,7 +283,7 @@ While DocETL provides several built-in parsing tools, the community can always benefit
 If the built-in tools don't meet your needs, you can create your own custom parsing tools. Here's how:
 
 1. Define your parsing function in the `parsing_tools` section of your configuration.
-2. Ensure your function takes a filename as input and returns a list of strings.
+2. Ensure your function takes a document (dict) as input and returns a list of documents (dicts).
 3. Use your custom parser in the `parsing` section of your dataset configuration.

For example:
@@ -294,7 +292,7 @@ For example:
 parsing_tools:
   - name: my_custom_parser
     function_code: |
-      def my_custom_parser(filename: str) -> List[str]:
+      def my_custom_parser(document: Dict) -> List[Dict]:
           # Your custom parsing logic here
           return [processed_data]
@@ -304,7 +302,5 @@ datasets:
     source: local
     path: "data/paths.json"
     parsing:
-      - input_key: file_path
-        function: my_custom_parser
-        output_key: processed_data
-```
+      - function: my_custom_parser
+```
10 changes: 4 additions & 6 deletions tests/basic/test_pipeline_with_parsing.py
@@ -1,4 +1,4 @@
-from typing import List
+from typing import Dict, List
 import pytest
 import json
 import os
@@ -129,9 +129,9 @@ def test_pipeline_with_parsing(config_file):
     os.remove(sample_data_file.name)
 
 
-def custom_exploder(text: str) -> List[str]:
-
-    return [t for t in text]
+def custom_exploder(doc: Dict) -> List[Dict]:
+    text = doc["text"]
+    return [{"text": t} for t in text]
 
 
 def test_pipeline_with_custom_parsing():
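Under the new contract the exploder emits one document per character, each carrying only the field it sets; the original item's fields are merged back in by `Dataset._process_item`. A quick illustration:

```python
def custom_exploder(doc):
    text = doc["text"]
    return [{"text": t} for t in text]

print(custom_exploder({"text": "abc"}))
# [{'text': 'a'}, {'text': 'b'}, {'text': 'c'}]
```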
@@ -160,9 +160,7 @@ def test_pipeline_with_custom_parsing():
         path=tmp_input.name,
         parsing=[
             {
-                "input_key": "text",
                 "function": "custom_exploder",
-                "output_key": "parsed_content",
             }
         ],
     )