From 605f900398376377e8142029e18876690bbe3588 Mon Sep 17 00:00:00 2001
From: Matt Seddon
Date: Wed, 11 Dec 2024 09:56:37 +1100
Subject: [PATCH] update unstructured and associated examples

---
 examples/llm_and_nlp/unstructured-embeddings-gen.py | 12 +++++++-----
 pyproject.toml                                      |  3 ++-
 tests/examples/test_examples.py                     | 11 -----------
 3 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/examples/llm_and_nlp/unstructured-embeddings-gen.py b/examples/llm_and_nlp/unstructured-embeddings-gen.py
index 0c0dc5f0e..193d5e159 100644
--- a/examples/llm_and_nlp/unstructured-embeddings-gen.py
+++ b/examples/llm_and_nlp/unstructured-embeddings-gen.py
@@ -12,11 +12,11 @@
     group_broken_paragraphs,
     replace_unicode_quotes,
 )
-from unstructured.embed.huggingface import (
+from unstructured.partition.pdf import partition_pdf
+from unstructured_ingest.embed.huggingface import (
     HuggingFaceEmbeddingConfig,
     HuggingFaceEmbeddingEncoder,
 )
-from unstructured.partition.pdf import partition_pdf
 
 from datachain import C, DataChain, DataModel, File
 
@@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]:
         chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
 
     # Clean the chunks and add new columns
+    text_chunks = []
     for chunk in chunks:
         chunk.apply(
             lambda text: clean(
@@ -51,16 +52,17 @@
         )
         chunk.apply(replace_unicode_quotes)
         chunk.apply(group_broken_paragraphs)
+        text_chunks.append({"text": str(chunk)})
 
     # create embeddings
-    chunks_embedded = embedding_encoder.embed_documents(chunks)
+    chunks_embedded = embedding_encoder.embed_documents(text_chunks)
 
     # Add new rows to DataChain
     for chunk in chunks_embedded:
         yield Chunk(
             key=file.path,
-            text=chunk.text,
-            embeddings=chunk.embeddings,
+            text=chunk.get("text"),
+            embeddings=chunk.get("embeddings"),
         )
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 37604bbba..3f2af75f2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,7 +107,8 @@ examples = [
     "numpy>=1,<2",
     "defusedxml",
     "accelerate",
-    "unstructured[pdf,embed-huggingface]<0.16.0",
+    "unstructured_ingest[embed-huggingface]",
+    "unstructured[pdf]",
     "pdfplumber==0.11.4",
     "huggingface_hub[hf_transfer]",
     "onnx==1.16.1",
diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py
index 1525b1b8f..701d1307f 100644
--- a/tests/examples/test_examples.py
+++ b/tests/examples/test_examples.py
@@ -74,17 +74,6 @@ def test_llm_and_nlp_examples(example):
         pytest.skip("Hugging Face token not set")
     if "claude" in name and "ANTHROPIC_API_KEY" not in os.environ:
         pytest.skip("ANTHROPIC_API_KEY not set")
-    if "unstructured-summary-map" in name:
-        import nltk
-
-        # pre-download nltk data manually
-        # Older version of unstructured uses their own hosted dataset, which is down.
-        # See: https://github.com/Unstructured-IO/unstructured/issues/3795.
-        # We cannot update to the latest version of unstructured because of https://github.com/Unstructured-IO/unstructured/issues/3731.
-
-        nltk.download("averaged_perceptron_tagger_eng", quiet=True)
-        nltk.download("punkt_tab", quiet=True)
-
     smoke_test(example)
 
 
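
For context, the core API change in the example is that `embed_documents` from `unstructured_ingest` is now fed plain `{"text": ...}` dicts built from the cleaned chunks, and returns dicts rather than objects with `.text`/`.embeddings` attributes. A minimal sketch of that flow, outside of DataChain, is below; the encoder construction (`HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())`) is assumed from the unchanged part of the example and is not shown in the hunks above, while the dict-based input/output shape is taken from the patch itself.

# Sketch of the unstructured_ingest embedding flow used by the updated example.
# Assumption: the encoder is built with a default HuggingFaceEmbeddingConfig,
# as in the unchanged part of the example (not visible in the hunks above).
from unstructured_ingest.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)

embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())

# embed_documents takes a list of {"text": ...} dicts instead of unstructured
# Element objects, which is why the patch collects str(chunk) into text_chunks.
text_chunks = [{"text": "First cleaned chunk."}, {"text": "Second cleaned chunk."}]
chunks_embedded = embedding_encoder.embed_documents(text_chunks)

# The results are dicts carrying the original text plus an "embeddings" key,
# matching the chunk.get("text") / chunk.get("embeddings") access in the patch.
for chunk in chunks_embedded:
    print(chunk.get("text"), len(chunk.get("embeddings", [])))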