From b29159b7f75894b658d0a3fe3a782d33da3f6ae9 Mon Sep 17 00:00:00 2001 From: Hugo Abonizio Date: Fri, 16 Oct 2020 15:56:40 -0300 Subject: [PATCH 1/2] Lazy load stopwords module to prevent downloading when it's not needed --- texthero/preprocessing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index f603705..27d42cc 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -12,7 +12,6 @@ import pandas as pd import unidecode -from texthero import stopwords as _stopwords from texthero._types import TokenSeries, TextSeries, InputSeries from typing import List, Callable, Union @@ -329,6 +328,7 @@ def replace_stopwords( """ if stopwords is None: + from texthero import stopwords as _stopwords stopwords = _stopwords.DEFAULT return s.apply(_replace_stopwords, args=(stopwords, symbol)) From 159eedec311a31546a183f53418d976bd95d302e Mon Sep 17 00:00:00 2001 From: Hugo Abonizio Date: Sat, 17 Oct 2020 18:54:00 -0300 Subject: [PATCH 2/2] Download spaCy English model by default since it's required by many functions --- texthero/nlp.py | 10 +++++++++- texthero/preprocessing.py | 1 + texthero/stopwords.py | 9 --------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/texthero/nlp.py b/texthero/nlp.py index 748f0cd..9e8cfbe 100644 --- a/texthero/nlp.py +++ b/texthero/nlp.py @@ -4,10 +4,18 @@ import spacy import pandas as pd -import en_core_web_sm from nltk.stem import PorterStemmer, SnowballStemmer from texthero._types import TextSeries, InputSeries +try: + # If not present, download 'en_core_web_sm' + import en_core_web_sm +except ModuleNotFoundError: + from spacy.cli.download import download as spacy_download + + spacy_download("en_core_web_sm") + import en_core_web_sm + @InputSeries(TextSeries) def named_entities(s: TextSeries, package="spacy") -> pd.Series: diff --git a/texthero/preprocessing.py b/texthero/preprocessing.py index 27d42cc..f2c1984 100644 --- a/texthero/preprocessing.py +++ b/texthero/preprocessing.py @@ -329,6 +329,7 @@ def replace_stopwords( if stopwords is None: from texthero import stopwords as _stopwords + stopwords = _stopwords.DEFAULT return s.apply(_replace_stopwords, args=(stopwords, symbol)) diff --git a/texthero/stopwords.py b/texthero/stopwords.py index 379e222..d85a95e 100644 --- a/texthero/stopwords.py +++ b/texthero/stopwords.py @@ -8,15 +8,6 @@ nltk.download("stopwords") from nltk.corpus import stopwords as nltk_en_stopwords - -try: - # If not present, download 'en_core_web_sm' - spacy_model = spacy.load("en_core_web_sm") -except OSError: - from spacy.cli.download import download as spacy_download - - spacy_download("en_core_web_sm") - from spacy.lang.en import stop_words as spacy_en_stopwords DEFAULT = set(nltk_en_stopwords.words("english"))