Skip to content

Commit

Permalink
fix unstructured-text example (#277)
Browse files Browse the repository at this point in the history
  • Loading branch information
mattseddon authored Aug 13, 2024
1 parent 720f38c commit 1fa9465
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 15 deletions.
28 changes: 14 additions & 14 deletions examples/llm_and_nlp/unstructured-text.py
Original file line number Diff line number Diff line change
@@ -1,26 +1,27 @@
#
# pip install unstructured[all-docs]
# libmagic
# pip install unstructured[pdf] nltk==3.8.1 huggingface_hub[hf_transfer]
#
# partition_object supports via unstructured library:
#
# "csv", "doc", "docx", "epub", "image", "md", "msg", "odt", "org",
# "pdf", "ppt", "pptx", "rtf", "rst", "tsv", "xlsx"
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from transformers import pipeline
from unstructured.partition.pdf import PartitionStrategy
from unstructured.partition.pdf import partition_pdf as partition
from unstructured.staging.base import convert_to_dataframe

from datachain import C, DataChain
from datachain import DataChain

device = "cpu"
model = "pszemraj/led-large-book-summary"
source = "gs://datachain-demo/nlp-infobooks/"
source = "gs://datachain-demo/nlp-infobooks/*.pdf"


def partition_object(file):
with file.open() as raw:
elements = partition(file=raw, metadata_filename=file.name)
elements = partition(
file=raw, metadata_filename=file.name, strategy=PartitionStrategy.FAST
)
title = str(elements[0])
text = "\n\n".join([str(el) for el in elements])
df = convert_to_dataframe(elements)
Expand All @@ -39,9 +40,8 @@ def summarize(clean):
return (summary,)


ds = (
dc = (
DataChain.from_storage(source)
.filter(C("file.path").glob("*.pdf"))
.limit(1)
.map(
partition_object,
Expand All @@ -50,9 +50,9 @@ def summarize(clean):
)
)

ds = ds.map(cleanse, output={"clean": str})
ds = ds.map(summarize, output={"summary": str})
results = ds.select("text", "summary").collect()
dc = dc.map(cleanse, output={"clean": str})
dc = dc.map(summarize, output={"summary": str})
results = dc.select("text", "summary").collect()

for story in results:
print("\n *********** the original: ********** ")
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,11 +95,13 @@ dev = [
]
examples = [
"datachain[tests]",
"numpy>=1,<2",
"defusedxml",
"accelerate",
"unstructured[pdf]",
"pdfplumber==0.11.1",
"huggingface_hub[hf_transfer]"
"huggingface_hub[hf_transfer]",
"nltk==3.8.1"
]

[project.urls]
Expand Down

0 comments on commit 1fa9465

Please sign in to comment.