Skip to content

Commit

Permalink
update unstructured and associated examples (#688)
Browse files: browse the repository at this point in the history
  • Loading branch information
mattseddon authored Dec 11, 2024
1 parent df2e0fa commit 8391d4e
Showing 3 changed files with 9 additions and 17 deletions.
12 changes: 7 additions & 5 deletions examples/llm_and_nlp/unstructured-embeddings-gen.py
Original file line number Diff line number Diff line change
@@ -12,11 +12,11 @@
group_broken_paragraphs,
replace_unicode_quotes,
)
- from unstructured.embed.huggingface import (
+ from unstructured.partition.pdf import partition_pdf
+ from unstructured_ingest.embed.huggingface import (
HuggingFaceEmbeddingConfig,
HuggingFaceEmbeddingEncoder,
)
- from unstructured.partition.pdf import partition_pdf

from datachain import C, DataChain, DataModel, File

@@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]:
chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")

# Clean the chunks and add new columns
+ text_chunks = []
for chunk in chunks:
chunk.apply(
lambda text: clean(
@@ -51,16 +52,17 @@ def process_pdf(file: File) -> Iterator[Chunk]:
)
chunk.apply(replace_unicode_quotes)
chunk.apply(group_broken_paragraphs)
+ text_chunks.append({"text": str(chunk)})

# create embeddings
- chunks_embedded = embedding_encoder.embed_documents(chunks)
+ chunks_embedded = embedding_encoder.embed_documents(text_chunks)

# Add new rows to DataChain
for chunk in chunks_embedded:
yield Chunk(
key=file.path,
- text=chunk.text,
- embeddings=chunk.embeddings,
+ text=chunk.get("text"),
+ embeddings=chunk.get("embeddings"),
)


3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -107,7 +107,8 @@ examples = [
"numpy>=1,<2",
"defusedxml",
"accelerate",
- "unstructured[pdf,embed-huggingface]<0.16.0",
+ "unstructured_ingest[embed-huggingface]",
+ "unstructured[pdf]",
"pdfplumber==0.11.4",
"huggingface_hub[hf_transfer]",
"onnx==1.16.1",
11 changes: 0 additions & 11 deletions tests/examples/test_examples.py
Original file line number Diff line number Diff line change
@@ -74,17 +74,6 @@ def test_llm_and_nlp_examples(example):
pytest.skip("Hugging Face token not set")
if "claude" in name and "ANTHROPIC_API_KEY" not in os.environ:
pytest.skip("ANTHROPIC_API_KEY not set")
- if "unstructured-summary-map" in name:
- import nltk
-
- # pre-download nltk data manually
- # Older version of unstructured uses their own hosted dataset, which is down.
- # See: https://github.com/Unstructured-IO/unstructured/issues/3795.
- # We cannot update to the latest version of unstructured because of https://github.com/Unstructured-IO/unstructured/issues/3731.
-
- nltk.download("averaged_perceptron_tagger_eng", quiet=True)
- nltk.download("punkt_tab", quiet=True)
-
smoke_test(example)


0 comments on commit 8391d4e

Please sign in to comment.