Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ability to internally store counts of each category in a CategoricalColumn #296

Merged
merged 5 commits into from
Jun 28, 2021
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 13 additions & 8 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import defaultdict
from . import BaseColumnProfiler
from .profiler_options import CategoricalOptions
from . import utils
Expand Down Expand Up @@ -29,7 +30,7 @@ def __init__(self, name, options=None):
raise ValueError("CategoricalColumn parameter 'options' must be of"
" type CategoricalOptions.")
super(CategoricalColumn, self).__init__(name)
self._categories = list()
self._categories = defaultdict(int)
self.__calculations = {}
self._filter_properties_w_options(self.__calculations, options)

Expand All @@ -49,8 +50,8 @@ def __add__(self, other):
other.__class__.__name__))

merged_profile = CategoricalColumn(None)
merged_profile._categories = self._categories.copy()
merged_profile._update_categories(other._categories)
merged_profile._categories = \
utils.add_nested_dictionaries(self._categories, other._categories)
BaseColumnProfiler._add_helper(merged_profile, self, other)
self._merge_calculations(merged_profile.__calculations,
self.__calculations,
Expand Down Expand Up @@ -82,7 +83,7 @@ def categories(self):
"""
Property for categories.
"""
return self._categories
return list(self._categories.keys())

@property
def unique_ratio(self):
Expand Down Expand Up @@ -126,11 +127,8 @@ def _update_categories(self, df_series, prev_dependent_properties=None,
:type df_series: pandas.DataFrame
:return: None
"""
if hasattr(df_series, 'tolist'):
df_series = df_series.tolist()

self._categories = utils._combine_unique_sets(
self._categories, df_series)
self._categories.update(df_series.value_counts(dropna=False).to_dict())

def _update_helper(self, df_series_clean, profile):
"""
Expand Down Expand Up @@ -167,3 +165,10 @@ def update(self, df_series):
self._update_helper(df_series, profile)

return self

def dict_of_categories(self):
    """
    Returns the dict of categories with the number of occurrences
    for each category.

    The result is a new, plain ``dict`` (a shallow copy of the internal
    ``_categories`` mapping), so callers may modify it without altering
    the counts stored on this object.

    :return: mapping of category -> number of occurrences
    :rtype: dict
    """
    # dict(mapping) already copies key/value pairs; no need for .items().
    return dict(self._categories)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Instead of `dict_of_categories`, we could expose a `category_counts` property that returns a copy of the dict.
However, I suggest adding that in the subsequent PR, which adds it to the profile.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For now we can just remove this function.

26 changes: 26 additions & 0 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,3 +335,29 @@ def test_categorical_column_with_wrong_options(self):
"CategoricalColumn parameter 'options' must"
" be of type CategoricalOptions."):
profiler = CategoricalColumn("Categorical", options="wrong_data_type")

def test_categorical_column_is_counted(self):
    """
    Verify that per-category occurrence counts are tracked after
    ``update`` and are combined when two profiles are added together.
    """
    df_categorical = pd.Series([
        "a", "a", "a", "b", "b", "b", "b", "c", "c", "c", "c", "c",
    ])
    profile = CategoricalColumn(df_categorical.name)
    # Check that the category counts are empty right after init.
    self.assertEqual(0, len(profile.dict_of_categories()))

    profile.update(df_categorical)
    # Check that the category counts are correct after update.
    self.assertEqual(profile.dict_of_categories()['a'], 3)
    self.assertEqual(profile.dict_of_categories()['b'], 4)
    self.assertEqual(profile.dict_of_categories()['c'], 5)

    df_categorical2 = pd.Series([
        "a", "a", "d", "d"
    ])
    profile2 = CategoricalColumn(df_categorical2.name)
    profile2.update(df_categorical2)
    profile3 = profile + profile2
    # Check that the category counts are correct after merge/add.
    self.assertEqual(profile3.dict_of_categories()['a'], 5)
    self.assertEqual(profile3.dict_of_categories()['b'], 4)
    self.assertEqual(profile3.dict_of_categories()['c'], 5)
    # assertEqual, not assertTrue: assertTrue(x, 2) treats 2 as the
    # failure message and would pass for any non-zero count.
    self.assertEqual(profile3.dict_of_categories()['d'], 2)