Add gini_impurity property and test cases for it (capitalone#308)

* Added the gini_impurity property and test cases for it * Fixed the documentation for Gini impurity * Fixed syntax and test cases related to gini_impurity * Edited test cases and code related to gini_impurity * Deleted extra code and simplified variable names
stevensecreti · Jul 7, 2021 · fb2bf1d · fb2bf1d
1 parent 9155777
commit fb2bf1d
Show file tree

Hide file tree

Showing 2 changed files with 56 additions and 7 deletions.
diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
@@ -78,9 +78,8 @@ def profile(self):
             times=self.times
         )
         if self.is_match:
-            profile["statistics"].update(
-                dict(categories=self.categories)
-            )
+            profile["statistics"]['categories'] = self.categories
+            profile["statistics"]['gini_impurity'] = self.gini_impurity
             profile["statistics"]['categorical_count'] = dict(
                 sorted(self._categories.items(), key=itemgetter(1),
                        reverse=True)[:top_k_categories])
@@ -162,7 +161,7 @@ def update(self, df_series):
         """
         if len(df_series) == 0:
             return self
-        
+
         profile = dict(
             sample_size=len(df_series)
         )
@@ -173,4 +172,24 @@ def update(self, df_series):
 
         self._update_helper(df_series, profile)
 
-        return self
+        return self
+
+    @property
+    def gini_impurity(self):
+        """
+        Gini impurity of the column's category distribution.
+
+        Every category i contributes P(i) * (1 - P(i)) to the total, where
+        P(i) is the count stored for that category in self._categories
+        divided by self.sample_size:
+
+        G = Σ(i=1; J): P(i) * (1 - P(i)), where J is the number of categories.
+
+        :return: None when sample_size is 0, otherwise the summed impurity
+        """
+        total = self.sample_size
+        if total == 0:
+            # No samples were profiled, so no probability can be formed.
+            return None
+        impurity = 0
+        # Accumulate term by term, in the dict's iteration order.
+        for count in self._categories.values():
+            ratio = count / total
+            impurity += ratio * (1 - ratio)
+        return impurity
diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -106,7 +106,6 @@ def test_categorical_mapping(self):
 
         column_profile = StructuredColProfiler(df1)
         cat_profiler = column_profile.profiles['data_stats_profile']._profiles["category"]
-
         num_null_types = 1
         num_nan_count = 1
         categories = df1.apply(str).unique().tolist()
@@ -155,14 +154,16 @@ def test_true_categorical_report(self):
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
         report = profile.profile
+
         self.assertIsNotNone(report.pop("times", None))
         expected_profile = dict(
             categorical=True,
             statistics=dict([
                 ('unique_count', 3),
                 ('unique_ratio', .25),
                 ('categories', ["a", "b", "c"]),
-                ('categorical_count', {"a": 3, "b": 4, "c": 5})
+                ('categorical_count', {"a": 3, "b": 4, "c": 5}),
+                ('gini_impurity', (27/144) + (32/144) + (35/144))
             ]),
         )
         # We have to pop these values because sometimes the order changes
@@ -261,13 +262,20 @@ def test_categorical_merge(self):
         self.assertIsNotNone(report.pop("times", None))
         report_categories = report['statistics'].pop('categories')
         report_count = report['statistics'].pop('categorical_count')
+        report_gini = report['statistics'].pop('gini_impurity')
         expected_profile = dict(
             categorical=True,
             statistics=dict([
                 ('unique_count', 16),
                 ('unique_ratio', 16 / 1000)
             ]),
         )
+        expected_gini = (1*((5/1000) * (995/1000))) + \
+                        (2*((4/1000) * (996/1000))) + \
+                        (1*((3/1000) * (997/1000))) + \
+                        (5*((2/1000) * (998/1000))) + \
+                        (7*((1/1000) * (999/1000)))
+        self.assertAlmostEqual(report_gini, expected_gini)
         self.assertEqual(report, expected_profile)
         self.assertCountEqual(report_categories, ['abcd', 'aa', '2', np.nan,
                                                   '4', 'b', '3', 'dfd', 'ee',
@@ -280,6 +288,28 @@ def test_categorical_merge(self):
         self.assertEqual(report_count['abcd'], expected_dict['abcd'])
         self.assertEqual(report_count['b'], expected_dict['b'])
 
+    def test_gini_impurity(self):
+        # Normal test: 4 "y" and 3 "n" out of 7 samples, one term per class
+        df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
+        profile = CategoricalColumn(df_categorical.name)
+        profile.update(df_categorical)
+        expected_val = ((4/7) * (3/7)) + ((3/7) * (4/7))
+        self.assertAlmostEqual(profile.gini_impurity, expected_val)
+
+        # One class only test: P = 1, so the single term is 1 * (1 - 1) = 0
+        df_categorical = pd.Series(["y", "y", "y", "y", "y", "y", "y"])
+        profile = CategoricalColumn(df_categorical.name)
+        profile.update(df_categorical)
+        expected_val = 0
+        self.assertEqual(profile.gini_impurity, expected_val)
+
+        # Empty test: the dtype is given explicitly because pandas warns
+        # about (and has changed) the default dtype of an empty Series
+        df_categorical = pd.Series([], dtype=object)
+        profile = CategoricalColumn(df_categorical.name)
+        profile.update(df_categorical)
+        self.assertIsNone(profile.gini_impurity)
+
+
 class TestCategoricalSentence(unittest.TestCase):
 
     def setUp(self):