Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add option to remove default stopwords from word summary #1676

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ sphinx_rtd_theme>=0.4.3
sphinx-autodoc-typehints>=1.10.3
sphinx-multiversion>=0.2.3
autodoc_pydantic
nltk
3 changes: 2 additions & 1 deletion requirements-test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ pytest-spark
nbval
pyarrow
twine>=3.1.1
kaggle
kaggle
nltk
42 changes: 32 additions & 10 deletions src/ydata_profiling/model/pandas/describe_categorical_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import numpy as np
import pandas as pd
from nltk.corpus import stopwords
import nltk

from ydata_profiling.config import Settings
from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score
Expand All @@ -18,6 +20,9 @@
)


# NOTE(review): this call executes every time the module is imported.
# Presumably it may reach the network to fetch or verify the NLTK
# "stopwords" corpus — confirm this is acceptable for offline or
# restricted environments before merging.
nltk.download('stopwords')


def get_character_counts_vc(vc: pd.Series) -> pd.Series:
series = pd.Series(vc.index, index=vc)
characters = series[series != ""].apply(list)
Expand Down Expand Up @@ -151,41 +156,58 @@ def unicode_summary_vc(vc: pd.Series) -> dict:
return summary


def word_summary_vc(
    vc: pd.Series,
    stop_words: List[str] = [],
    remove_default_stopwords: bool = True,
    keep_stopwords: List[str] = [],
) -> dict:
    """Count the occurrences of each individual word in a value-counts Series.

    Every category in ``vc`` is lower-cased and split on whitespace, and each
    resulting token has leading/trailing punctuation and whitespace stripped.
    Word frequencies are summed using the category frequencies and sorted
    from the most frequent word down. Stop words are then dropped; matching
    is case-insensitive because both sides are lower-cased.

    Args:
        vc: Series containing all unique categories as index and their
            frequency as value. Sorted from the most frequent down.
        stop_words: Custom stop words to ignore, empty by default. ``None``
            is treated as empty.
        remove_default_stopwords: If True (default), the NLTK English
            stop word list is ignored as well. This needs the NLTK
            "stopwords" corpus to be installed.
        keep_stopwords: Words that must never be removed, even if they
            appear in the custom or default stop words. ``None`` is treated
            as empty.

    Returns:
        ``{"word_counts": Series}`` with unique words as index and the
        computed frequency as value, or an empty dict when no word is left.
    """
    # TODO: configurable lowercase/punctuation etc.
    # TODO: remove punctuation in words

    # The list defaults are only read, never mutated, so sharing them across
    # calls is harmless; `or ()` additionally tolerates an explicit None.
    ignored = {word.lower() for word in (stop_words or ())}

    # Merge the default English stop words only when requested, so the NLTK
    # corpus is not touched otherwise.
    if remove_default_stopwords:
        ignored.update(stopwords.words("english"))

    # Words the caller explicitly wants to keep win over every stop list.
    ignored.difference_update(word.lower() for word in (keep_stopwords or ()))

    # Swap index and values: categories become values, frequencies the index.
    series = pd.Series(vc.index, index=vc)
    word_lists = series.str.lower().str.split()
    words = word_lists.explode().str.strip(string.punctuation + string.whitespace)
    # Swap back: words become the index, the carried frequencies the values.
    word_counts = pd.Series(words.index, index=words)
    # fix for pandas 1.0.5
    word_counts = word_counts[word_counts.index.notnull()]
    word_counts = word_counts.groupby(level=0, sort=False).sum()
    word_counts = word_counts.sort_values(ascending=False)

    # Exclude stop words (a single pass; nothing to do when the set is empty).
    if ignored:
        word_counts = word_counts.loc[~word_counts.index.isin(ignored)]

    return {"word_counts": word_counts} if not word_counts.empty else {}



def length_summary_vc(vc: pd.Series) -> dict:
series = pd.Series(vc.index, index=vc)
length = series.str.len()
Expand Down
37 changes: 31 additions & 6 deletions tests/unit/test_pandas/test_describe_categorical_pandas.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,48 @@
import pandas as pd
import pytest

from ydata_profiling.model.pandas.describe_categorical_pandas import word_summary_vc

# Shared value-counts fixture: phrases as index, their frequencies as values.
value_counts_w_words = pd.Series(index=["The dog", "is hungry"], data=[2, 1])


# Test the basic word summary function
def test_word_summary_vc():
    """Every word is counted when no stop words are removed at all."""
    summary = word_summary_vc(
        vc=value_counts_w_words, remove_default_stopwords=False
    )
    expected = pd.Series(index=["the", "dog", "is", "hungry"], data=[2, 2, 1, 1])
    assert summary["word_counts"].to_dict() == expected.to_dict()


# Test word summary function with custom stop words.
# The capitalised "The" case is kept so case-insensitive matching of custom
# stop words stays covered.
@pytest.mark.parametrize("stop_words", [["The"], ["the"], ["the", "a"]])
def test_word_summary_vc_with_stop_words(stop_words):
    """Custom stop words are dropped while default stop words are kept."""
    summary = word_summary_vc(
        vc=value_counts_w_words,
        stop_words=stop_words,
        remove_default_stopwords=False,
    )
    expected = pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1])
    assert summary["word_counts"].to_dict() == expected.to_dict()

# Test word summary function with default stopwords removed
def test_word_summary_vc_with_default_stopwords():
    """Default English stop words ("the", "is") are removed when enabled."""
    summary = word_summary_vc(
        vc=value_counts_w_words, remove_default_stopwords=True
    )
    observed = summary["word_counts"].to_dict()
    expected = pd.Series(index=["dog", "hungry"], data=[2, 1]).to_dict()
    assert observed == expected

# Test word summary function with both custom and default stop words
@pytest.mark.parametrize(
    "stop_words, expected",
    [
        # Custom stop word "dog"; "the" and "is" go as default stop words.
        (["dog"], {"hungry": 1}),
        # Custom stop words "the" and "is" overlap with the default list.
        (["the", "is"], {"dog": 2, "hungry": 1}),
    ],
)
def test_word_summary_vc_with_custom_and_default_stop_words(stop_words, expected):
    """Custom and default stop words are removed together."""
    summary = word_summary_vc(
        vc=value_counts_w_words,
        stop_words=stop_words,
        remove_default_stopwords=True,
    )
    observed = summary["word_counts"].to_dict()
    assert observed == expected

# Test word summary function with keep_stopwords
def test_word_summary_vc_with_keep_stopwords():
    """A word listed in keep_stopwords survives default stop word removal."""
    summary = word_summary_vc(
        vc=value_counts_w_words,
        remove_default_stopwords=True,
        keep_stopwords=["is"],
    )
    observed = summary["word_counts"].to_dict()
    expected = pd.Series(index=["dog", "is", "hungry"], data=[2, 1, 1]).to_dict()
    assert observed == expected