Skip to content

Commit

Permalink
Add gini_impurity property and its test cases (capitalone#308)
Browse files Browse the repository at this point in the history
* added the gini_impurity property and test cases for it

* fixed documentation for gini impurity

* fixed syntax and test cases related to gini_impurity

* edited test cases and code related to gini_impurity

* deleted extra code and simplified variable names
  • Loading branch information
az85252 authored Jul 7, 2021
1 parent 9155777 commit fb2bf1d
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 7 deletions.
29 changes: 24 additions & 5 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,9 +78,8 @@ def profile(self):
times=self.times
)
if self.is_match:
profile["statistics"].update(
dict(categories=self.categories)
)
profile["statistics"]['categories'] = self.categories
profile["statistics"]['gini_impurity'] = self.gini_impurity
profile["statistics"]['categorical_count'] = dict(
sorted(self._categories.items(), key=itemgetter(1),
reverse=True)[:top_k_categories])
Expand Down Expand Up @@ -162,7 +161,7 @@ def update(self, df_series):
"""
if len(df_series) == 0:
return self

profile = dict(
sample_size=len(df_series)
)
Expand All @@ -173,4 +172,24 @@ def update(self, df_series):

self._update_helper(df_series, profile)

return self
return self

@property
def gini_impurity(self):
    """
    Gini impurity of the column's category distribution.

    The value is G = sum(p * (1 - p)) taken over every entry of
    ``self._categories``, where p is that entry's stored value divided
    by ``self.sample_size``.

    :return: None when ``sample_size`` is 0, otherwise the Gini impurity
    """
    total = self.sample_size
    if total == 0:
        # Nothing has been profiled yet, so no probability can be formed.
        return None

    impurity = 0
    for count in self._categories.values():
        prob = count / total
        impurity += prob * (1 - prob)
    return impurity
34 changes: 32 additions & 2 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,6 @@ def test_categorical_mapping(self):

column_profile = StructuredColProfiler(df1)
cat_profiler = column_profile.profiles['data_stats_profile']._profiles["category"]

num_null_types = 1
num_nan_count = 1
categories = df1.apply(str).unique().tolist()
Expand Down Expand Up @@ -155,14 +154,16 @@ def test_true_categorical_report(self):
profile = CategoricalColumn(df_categorical.name)
profile.update(df_categorical)
report = profile.profile

self.assertIsNotNone(report.pop("times", None))
expected_profile = dict(
categorical=True,
statistics=dict([
('unique_count', 3),
('unique_ratio', .25),
('categories', ["a", "b", "c"]),
('categorical_count', {"a": 3, "b": 4, "c": 5})
('categorical_count', {"a": 3, "b": 4, "c": 5}),
('gini_impurity', (27/144) + (32/144) + (35/144))
]),
)
# We have to pop these values because sometimes the order changes
Expand Down Expand Up @@ -261,13 +262,20 @@ def test_categorical_merge(self):
self.assertIsNotNone(report.pop("times", None))
report_categories = report['statistics'].pop('categories')
report_count = report['statistics'].pop('categorical_count')
report_gini = report['statistics'].pop('gini_impurity')
expected_profile = dict(
categorical=True,
statistics=dict([
('unique_count', 16),
('unique_ratio', 16 / 1000)
]),
)
expected_gini = (1*((5/1000) * (995/1000))) + \
(2*((4/1000) * (996/1000))) + \
(1*((3/1000) * (997/1000))) + \
(5*((2/1000) * (998/1000))) + \
(7*((1/1000) * (999/1000)))
self.assertAlmostEqual(report_gini, expected_gini)
self.assertEqual(report, expected_profile)
self.assertCountEqual(report_categories, ['abcd', 'aa', '2', np.nan,
'4', 'b', '3', 'dfd', 'ee',
Expand All @@ -280,6 +288,28 @@ def test_categorical_merge(self):
self.assertEqual(report_count['abcd'], expected_dict['abcd'])
self.assertEqual(report_count['b'], expected_dict['b'])

def test_gini_impurity(self):
    """
    Check ``CategoricalColumn.gini_impurity`` for a two-class column,
    a single-class column, and an empty column.

    Each scenario runs in its own ``subTest`` so a failure in one does
    not prevent the others from being reported.
    """
    with self.subTest("two classes"):
        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        # G = sum(p * (1 - p)) with p("y") = 4/7 and p("n") = 3/7
        expected_val = ((4/7) * (1 - 4/7)) + ((3/7) * (1 - 3/7))
        self.assertAlmostEqual(profile.gini_impurity, expected_val)

    with self.subTest("one class only"):
        df_categorical = pd.Series(["y", "y", "y", "y", "y", "y", "y"])
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        # p = 1 for the only class, so p * (1 - p) is 0
        self.assertEqual(profile.gini_impurity, 0)

    with self.subTest("empty column"):
        # Explicit dtype: an empty Series without one emits a warning on
        # pandas 1.x and its default dtype differs between versions.
        df_categorical = pd.Series([], dtype=object)
        profile = CategoricalColumn(df_categorical.name)
        profile.update(df_categorical)
        self.assertIsNone(profile.gini_impurity)


class TestCategoricalSentence(unittest.TestCase):

def setUp(self):
Expand Down

0 comments on commit fb2bf1d

Please sign in to comment.