From a014c3bcda13c9f09f90ff6f2a97ab70645f5315 Mon Sep 17 00:00:00 2001 From: davidmezzetti <561939+davidmezzetti@users.noreply.github.com> Date: Fri, 16 Aug 2024 11:58:27 -0400 Subject: [PATCH] Update NLTK model downloads, closes #760 --- .github/workflows/build.yml | 2 +- docker/base/Dockerfile | 2 +- examples/10_Extract_text_from_documents.ipynb | 2 +- examples/40_Text_to_Speech_Generation.ipynb | 6 +++++- examples/52_Build_RAG_pipelines_with_txtai.ipynb | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index cf9def6c9..0542b413c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -54,7 +54,7 @@ jobs: run: | pip install -U wheel pip install .[all,dev] fasttext==0.9.2 faiss-cpu==1.8.0 "numpy < 2.0.0" - python -c "import nltk; nltk.download('punkt')" + python -c "import nltk; nltk.download(['punkt', 'punkt_tab', 'averaged_perceptron_tagger_eng'])" python --version make data coverage diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile index 42eaa762c..3cad841d0 100644 --- a/docker/base/Dockerfile +++ b/docker/base/Dockerfile @@ -29,7 +29,7 @@ RUN \ python -m pip install --no-cache-dir -U pip wheel setuptools && \ if [ -z ${GPU} ] && { [ -z ${TARGETARCH} ] || [ ${TARGETARCH} = "amd64" ] ;}; then pip install --no-cache-dir torch==2.3.1+cpu torchvision==0.18.1+cpu -f https://download.pytorch.org/whl/torch_stable.html; fi && \ python -m pip install --no-cache-dir txtai${COMPONENTS} && \ - python -c "import sys, importlib.util as util; 1 if util.find_spec('nltk') else sys.exit(); import nltk; nltk.download('punkt')" && \ + python -c "import sys, importlib.util as util; 1 if util.find_spec('nltk') else sys.exit(); import nltk; nltk.download(['punkt', 'punkt_tab', 'averaged_perceptron_tagger_eng'])" && \ \ # Cleanup build packages apt-get -y purge gcc g++ python${PYTHON_VERSION}-dev && apt-get -y autoremove diff --git a/examples/10_Extract_text_from_documents.ipynb b/examples/10_Extract_text_from_documents.ipynb index e6463a049..b44f8a7d7 100644 --- a/examples/10_Extract_text_from_documents.ipynb +++ b/examples/10_Extract_text_from_documents.ipynb @@ -50,7 +50,7 @@ "\n", "# Install NLTK\n", "import nltk\n", - "nltk.download('punkt')" + "nltk.download(['punkt', 'punkt_tab'])" ], "execution_count": 19, "outputs": [] diff --git a/examples/40_Text_to_Speech_Generation.ipynb b/examples/40_Text_to_Speech_Generation.ipynb index 4f884b4f8..215d8cab8 100644 --- a/examples/40_Text_to_Speech_Generation.ipynb +++ b/examples/40_Text_to_Speech_Generation.ipynb @@ -52,7 +52,11 @@ }, "source": [ "%%capture\n", - "!pip install git+https://github.com/neuml/txtai#egg=txtai[pipeline] onnxruntime-gpu librosa" + "!pip install git+https://github.com/neuml/txtai#egg=txtai[pipeline] onnxruntime-gpu librosa\n", + "\n", + "# Install NLTK\n", + "import nltk\n", + "nltk.download('averaged_perceptron_tagger_eng')" ], "execution_count": 1, "outputs": [] diff --git a/examples/52_Build_RAG_pipelines_with_txtai.ipynb b/examples/52_Build_RAG_pipelines_with_txtai.ipynb index 77eb90a07..d1cc9c947 100644 --- a/examples/52_Build_RAG_pipelines_with_txtai.ipynb +++ b/examples/52_Build_RAG_pipelines_with_txtai.ipynb @@ -63,7 +63,7 @@ "\n", "# Install NLTK\n", "import nltk\n", - "nltk.download('punkt')" + "nltk.download(['punkt', 'punkt_tab'])" ] }, {