From 11aa7b63870a50547254e62de7841925e4ed257e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Tue, 10 Dec 2024 20:30:29 +0545 Subject: [PATCH] test-examples: pre-download nltk data --- tests/examples/test_examples.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tests/examples/test_examples.py b/tests/examples/test_examples.py index 701d1307f..1525b1b8f 100644 --- a/tests/examples/test_examples.py +++ b/tests/examples/test_examples.py @@ -74,6 +74,17 @@ def test_llm_and_nlp_examples(example): pytest.skip("Hugging Face token not set") if "claude" in name and "ANTHROPIC_API_KEY" not in os.environ: pytest.skip("ANTHROPIC_API_KEY not set") + if "unstructured-summary-map" in name: + import nltk + + # Pre-download the nltk data manually. + # Older versions of unstructured use their own hosted dataset, which is down. + # See: https://github.com/Unstructured-IO/unstructured/issues/3795. + # We cannot update to the latest version of unstructured because of https://github.com/Unstructured-IO/unstructured/issues/3731. + + nltk.download("averaged_perceptron_tagger_eng", quiet=True) + nltk.download("punkt_tab", quiet=True) + smoke_test(example)