From 605f900398376377e8142029e18876690bbe3588 Mon Sep 17 00:00:00 2001
From: Matt Seddon
Date: Wed, 11 Dec 2024 09:56:37 +1100
Subject: [PATCH] update unstructured and associated examples

---
 examples/llm_and_nlp/unstructured-embeddings-gen.py | 12 +++++++-----
 pyproject.toml                                      |  3 ++-
 tests/examples/test_examples.py                     | 11 -----------
 3 files changed, 9 insertions(+), 17 deletions(-)

diff --git a/examples/llm_and_nlp/unstructured-embeddings-gen.py b/examples/llm_and_nlp/unstructured-embeddings-gen.py
index 0c0dc5f0e..193d5e159 100644
--- a/examples/llm_and_nlp/unstructured-embeddings-gen.py
+++ b/examples/llm_and_nlp/unstructured-embeddings-gen.py
@@ -12,11 +12,11 @@
     group_broken_paragraphs,
     replace_unicode_quotes,
 )
-from unstructured.embed.huggingface import (
+from unstructured.partition.pdf import partition_pdf
+from unstructured_ingest.embed.huggingface import (
     HuggingFaceEmbeddingConfig,
     HuggingFaceEmbeddingEncoder,
 )
-from unstructured.partition.pdf import partition_pdf
 
 from datachain import C, DataChain, DataModel, File
 
@@ -43,6 +43,7 @@ def process_pdf(file: File) -> Iterator[Chunk]:
         chunks = partition_pdf(file=f, chunking_strategy="by_title", strategy="fast")
 
     # Clean the chunks and add new columns
+    text_chunks = []
     for chunk in chunks:
         chunk.apply(
             lambda text: clean(
@@ -51,16 +52,17 @@
         )
         chunk.apply(replace_unicode_quotes)
         chunk.apply(group_broken_paragraphs)
+        text_chunks.append({"text": str(chunk)})
 
     # create embeddings
-    chunks_embedded = embedding_encoder.embed_documents(chunks)
+    chunks_embedded = embedding_encoder.embed_documents(text_chunks)
 
     # Add new rows to DataChain
     for chunk in chunks_embedded:
         yield Chunk(
             key=file.path,
-            text=chunk.text,
-            embeddings=chunk.embeddings,
+            text=chunk.get("text"),
+            embeddings=chunk.get("embeddings"),
         )
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 37604bbba..3f2af75f2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -107,7 +107,8 @@ examples = [
     "numpy>=1,<2",
     "defusedxml",
     "accelerate",
-    "unstructured[pdf,embed-huggingface]<0.16.0",
+    "unstructured_ingest[embed-huggingface]",
+    "unstructured[pdf]",
     "pdfplumber==0.11.4",
     "huggingface_hub[hf_transfer]",
     "onnx==1.16.1",
diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py
index 1525b1b8f..701d1307f 100644
--- a/tests/examples/test_examples.py
+++ b/tests/examples/test_examples.py
@@ -74,17 +74,6 @@ def test_llm_and_nlp_examples(example):
         pytest.skip("Hugging Face token not set")
     if "claude" in name and "ANTHROPIC_API_KEY" not in os.environ:
         pytest.skip("ANTHROPIC_API_KEY not set")
-    if "unstructured-summary-map" in name:
-        import nltk
-
-        # pre-download nltk data manually
-        # Older version of unstructured uses their own hosted dataset, which is down.
-        # See: https://github.com/Unstructured-IO/unstructured/issues/3795.
-        # We cannot update to the latest version of unstructured because of https://github.com/Unstructured-IO/unstructured/issues/3731.
-
-        nltk.download("averaged_perceptron_tagger_eng", quiet=True)
-        nltk.download("punkt_tab", quiet=True)
-
     smoke_test(example)
 
 
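
For context, the core API change in the example is that `embed_documents` from `unstructured_ingest` is now fed plain `{"text": ...}` dicts built from the cleaned chunks, and returns dicts rather than objects with `.text`/`.embeddings` attributes. A minimal sketch of that flow, outside of DataChain, is below; the encoder construction (`HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())`) is assumed from the unchanged part of the example and is not shown in the hunks above, while the dict-based input/output shape is taken from the patch itself.

# Sketch of the unstructured_ingest embedding flow used by the updated example.
# Assumption: the encoder is built with a default HuggingFaceEmbeddingConfig,
# as in the unchanged part of the example (not visible in the hunks above).
from unstructured_ingest.embed.huggingface import (
    HuggingFaceEmbeddingConfig,
    HuggingFaceEmbeddingEncoder,
)

embedding_encoder = HuggingFaceEmbeddingEncoder(config=HuggingFaceEmbeddingConfig())

# embed_documents takes a list of {"text": ...} dicts instead of unstructured
# Element objects, which is why the patch collects str(chunk) into text_chunks.
text_chunks = [{"text": "First cleaned chunk."}, {"text": "Second cleaned chunk."}]
chunks_embedded = embedding_encoder.embed_documents(text_chunks)

# The results are dicts carrying the original text plus an "embeddings" key,
# matching the chunk.get("text") / chunk.get("embeddings") access in the patch.
for chunk in chunks_embedded:
    print(chunk.get("text"), len(chunk.get("embeddings", [])))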