Move profile to numeric stats mixin (#337)

* move profile to numeric stats
* clean code
capitalone · Jul 19, 2021 · 00933e6 · 00933e6
1 parent 2524613
commit 00933e6
Show file tree

Hide file tree

Showing 5 changed files with 101 additions and 58 deletions.
diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py
@@ -131,33 +131,21 @@ def profile(self):
         Property for profile. Returns the profile of the column.
         :return:
         """
-
-        profile = dict(
-            min=self.np_type_to_type(self.min),
-            max=self.np_type_to_type(self.max),
-            sum=self.np_type_to_type(self.sum),
-            mean=self.np_type_to_type(self.mean),
-            variance=self.np_type_to_type(self.variance),
-            stddev=self.np_type_to_type(self.stddev),
-            skewness=self.np_type_to_type(self.skewness),
-            kurtosis=self.np_type_to_type(self.kurtosis),
-            num_zeros=self.np_type_to_type(self.num_zeros),
-            num_negatives=self.np_type_to_type(self.num_negatives),
-            histogram=self._get_best_histogram_for_profile(),
-            quantiles=self.quantiles,
-            times=self.times,
-            precision=dict(
-                min=self.np_type_to_type(self.precision['min']),
-                max=self.np_type_to_type(self.precision['max']),
-                mean=self.np_type_to_type(self.precision['mean']),
-                var=self.np_type_to_type(self.precision['var']),
-                std=self.np_type_to_type(self.precision['std']),
-                sample_size=self.np_type_to_type(self.precision['sample_size']),
-                margin_of_error=self.np_type_to_type(self.precision['margin_of_error']),
-                confidence_level=self.np_type_to_type(self.precision['confidence_level'])
+        profile = NumericStatsMixin.profile(self)
+        profile.update(
+            dict(
+                precision=dict(
+                    min=self.np_type_to_type(self.precision['min']),
+                    max=self.np_type_to_type(self.precision['max']),
+                    mean=self.np_type_to_type(self.precision['mean']),
+                    var=self.np_type_to_type(self.precision['var']),
+                    std=self.np_type_to_type(self.precision['std']),
+                    sample_size=self.np_type_to_type(self.precision['sample_size']),
+                    margin_of_error=self.np_type_to_type(self.precision['margin_of_error']),
+                    confidence_level=self.np_type_to_type(self.precision['confidence_level'])
+                )
             )
         )
-
         return profile
 
     @property

diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py
@@ -60,24 +60,7 @@ def profile(self):
         
         :return:
         """
-        profile = dict(
-            min=self.np_type_to_type(self.min),
-            max=self.np_type_to_type(self.max),
-            sum=self.np_type_to_type(self.sum),
-            mean=self.np_type_to_type(self.mean),
-            variance=self.np_type_to_type(self.variance),
-            stddev=self.np_type_to_type(self.stddev),
-            skewness=self.np_type_to_type(self.skewness),
-            kurtosis=self.np_type_to_type(self.kurtosis),
-            histogram=self._get_best_histogram_for_profile(),
-            num_zeros=self.np_type_to_type(self.num_zeros),
-            num_negatives=self.np_type_to_type(self.num_negatives),
-            quantiles=self.quantiles,
-            times=self.times
-        )
-
-        return profile
-
+        return NumericStatsMixin.profile(self)
 
     @property
     def data_type_ratio(self):

diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
@@ -247,6 +247,29 @@ def _add_helper(self, other1, other2):
         if "num_negatives" in self.__calculations.keys():
             self.num_negatives = other1.num_negatives + other2.num_negatives
 
+    def profile(self):
+        """
+        Property for profile. Returns the profile of the column.
+        :return:
+        """
+        profile = dict(
+            min=self.np_type_to_type(self.min),
+            max=self.np_type_to_type(self.max),
+            sum=self.np_type_to_type(self.sum),
+            mean=self.np_type_to_type(self.mean),
+            variance=self.np_type_to_type(self.variance),
+            stddev=self.np_type_to_type(self.stddev),
+            skewness=self.np_type_to_type(self.skewness),
+            kurtosis=self.np_type_to_type(self.kurtosis),
+            histogram=self._get_best_histogram_for_profile(),
+            quantiles=self.quantiles,
+            num_zeros=self.np_type_to_type(self.num_zeros),
+            num_negatives=self.np_type_to_type(self.num_negatives),
+            times=self.times,
+        )
+
+        return profile
+
     def diff(self, other_profile, options=None):
         """
         Finds the differences for several numerical stats.

diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py
@@ -64,21 +64,12 @@ def profile(self):
         
         :return:
         """
-
-        profile = dict(
-            min=self.min,
-            max=self.max,
-            sum=self.sum,
-            mean=self.mean,
-            variance=self.variance,
-            stddev=self.stddev,
-            skewness=self.skewness,
-            kurtosis=self.kurtosis,
-            histogram=self._get_best_histogram_for_profile(),
-            quantiles=self.quantiles,
-            vocab=self.vocab,
-            times=self.times
-        )
+        profile = NumericStatsMixin.profile(self)
+        # remove num_zeros and num_negative updated from numeric profile
+        profile.pop('num_zeros')
+        profile.pop('num_negatives')
+        # and add the vocab update for text profile
+        profile.update(dict(vocab=self.vocab))
         return profile
 
     def diff(self, other_profile, options=None):

diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
@@ -503,6 +503,64 @@ def test_merge_num_zeros_and_negatives(self):
         self.assertEqual(num_profiler.num_zeros, 0)
         self.assertEqual(num_profiler.num_negatives, 0)
 
+    def test_profile(self):
+        num_profiler = TestColumn()
+
+        mock_profile = dict(
+            min=1.0,
+            max=1.0,
+            sum=1.0,
+            mean=0, # default
+            variance=np.nan, # default
+            skewness=np.nan, # default
+            kurtosis=np.nan, # default
+            stddev=np.nan, # default
+            histogram={
+                'bin_counts': np.array([1, 1, 1]),
+                'bin_edges': np.array([1.0, 2.0, 3.0, 4.0])
+            },
+            quantiles={
+                0: 2.0,
+                1: 3.0,
+                2: 4.0,
+            },
+            num_zeros=0, # default
+            num_negatives=0, # default
+            times=defaultdict(float), # default
+        )
+
+        num_profiler.match_count = 0
+        num_profiler.min = mock_profile['min']
+        num_profiler.max = mock_profile['max']
+        num_profiler.sum = mock_profile['sum']
+        num_profiler.histogram_selection = 'auto'
+        num_profiler.histogram_methods['auto']['histogram'] = \
+            mock_profile['histogram']
+        num_profiler.quantiles = mock_profile['quantiles']
+        num_profiler.times = mock_profile['times']
+
+        time_array = [float(i) for i in range(100, 0, -1)]
+        with mock.patch('time.time', side_effect=lambda: time_array.pop()):
+            # Validate that the times dictionary is empty
+            self.assertEqual(defaultdict(float), num_profiler.times)
+
+            profile = num_profiler.profile()
+            # pop out the histogram and quartiles to test separately from the
+            # rest of the dict as we need comparison with some precision
+            histogram = profile.pop('histogram')
+            expected_histogram = mock_profile.pop('histogram')
+            quartiles = profile.pop('quantiles')
+            expected_quartiles = mock_profile.pop('quantiles')
+
+            self.assertDictEqual(mock_profile, profile)
+            self.assertEqual(expected_histogram['bin_counts'].tolist(),
+                             histogram['bin_counts'].tolist())
+            self.assertCountEqual(np.round(expected_histogram['bin_edges'], 12),
+                                  np.round(histogram['bin_edges'], 12))
+            self.assertAlmostEqual(expected_quartiles[0], quartiles[0])
+            self.assertAlmostEqual(expected_quartiles[1], quartiles[1])
+            self.assertAlmostEqual(expected_quartiles[2], quartiles[2])
+
     def test_diff(self):
         """
         Checks _diff_helper() works appropriately.