def word_summary_vc(
    vc: pd.Series,
    stop_words: List[str] = [],
    remove_default_stopwords: bool = True,
    keep_stopwords: List[str] = [],
) -> dict:
    """Count word occurrences across all lines of the data Series.

    Words are lowercased and stripped of surrounding punctuation and
    whitespace before counting; counts are weighted by each category's
    frequency and sorted from most to least frequent occurrence.

    Args:
        vc: Series containing all unique categories as index and their
            frequency as value. Sorted from the most frequent down.
        stop_words: Custom stop words to exclude, matched
            case-insensitively. Empty by default.
        remove_default_stopwords: If True (default), also exclude NLTK's
            default English stopwords.
        keep_stopwords: Words to keep even if they appear in the custom
            or default stop words.

    Returns:
        A dict with key ``"word_counts"`` mapping to a Series with unique
        words as index and the computed frequency as value, or an empty
        dict if no words remain after filtering.
    """
    # NOTE: the list defaults are never mutated below (we build new sets),
    # so sharing them across calls is safe.
    excluded = {word.lower() for word in stop_words}

    if remove_default_stopwords:
        # Import and fetch the corpus lazily so the (network) download only
        # happens when default-stopword removal is actually requested — not
        # unconditionally on every module import.
        import nltk
        from nltk.corpus import stopwords as nltk_stopwords

        try:
            nltk.data.find("corpora/stopwords")
        except LookupError:
            nltk.download("stopwords", quiet=True)
        excluded |= set(nltk_stopwords.words("english"))

    # Words explicitly kept win over both custom and default stop words.
    excluded -= {word.lower() for word in keep_stopwords}

    # Explode each category into lowercase words, strip punctuation, then
    # aggregate each word's total frequency from the category frequencies.
    series = pd.Series(vc.index, index=vc)
    word_lists = series.str.lower().str.split()
    words = word_lists.explode().str.strip(string.punctuation + string.whitespace)
    word_counts = pd.Series(words.index, index=words)
    word_counts = word_counts[word_counts.index.notnull()]  # fix for pandas 1.0.5
    word_counts = word_counts.groupby(level=0, sort=False).sum()
    word_counts = word_counts.sort_values(ascending=False)

    # Exclude stop words (indices are already lowercase).
    word_counts = word_counts.loc[~word_counts.index.isin(excluded)]

    return {"word_counts": word_counts} if not word_counts.empty else {}
import pandas as pd
import pytest

from ydata_profiling.model.pandas.describe_categorical_pandas import word_summary_vc

value_counts_w_words = pd.Series(index=["The dog", "is hungry"], data=[2, 1])


def test_word_summary_vc():
    """Plain word counts, with default-stopword removal disabled."""
    counts = word_summary_vc(
        vc=value_counts_w_words, remove_default_stopwords=False
    )["word_counts"]
    expected = pd.Series(index=["the", "dog", "is", "hungry"], data=[2, 2, 1, 1])
    assert counts.to_dict() == expected.to_dict()


@pytest.mark.parametrize("stop_words", [["the"], ["the", "a"]])
def test_word_summary_vc_with_stop_words(stop_words):
    """Custom stop words are excluded from the counts."""
    counts = word_summary_vc(
        vc=value_counts_w_words,
        stop_words=stop_words,
        remove_default_stopwords=False,
    )["word_counts"]
    expected = pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1])
    assert counts.to_dict() == expected.to_dict()


def test_word_summary_vc_with_default_stopwords():
    """Default English stopwords ("the", "is") are removed when enabled."""
    counts = word_summary_vc(
        vc=value_counts_w_words, remove_default_stopwords=True
    )["word_counts"]
    expected = pd.Series(index=["dog", "hungry"], data=[2, 1])
    assert counts.to_dict() == expected.to_dict()


@pytest.mark.parametrize(
    "stop_words, expected",
    [
        # "dog" removed as a custom stop word; "the"/"is" as defaults.
        (["dog"], {"hungry": 1}),
        # Custom stop words "the" and "is" overlap with the defaults.
        (["the", "is"], {"dog": 2, "hungry": 1}),
    ],
)
def test_word_summary_vc_with_custom_and_default_stop_words(stop_words, expected):
    """Custom and default stop words combine before filtering."""
    counts = word_summary_vc(
        vc=value_counts_w_words,
        stop_words=stop_words,
        remove_default_stopwords=True,
    )["word_counts"]
    assert counts.to_dict() == expected


def test_word_summary_vc_with_keep_stopwords():
    """keep_stopwords rescues a word from default stopword removal."""
    counts = word_summary_vc(
        vc=value_counts_w_words,
        remove_default_stopwords=True,
        keep_stopwords=["is"],
    )["word_counts"]
    expected = pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1])
    assert counts.to_dict() == expected.to_dict()