Skip to content

Commit

Permalink
Move profile to numeric stats mixin (#337)
Browse files Browse the repository at this point in the history
* move profile to numeric stats

* clean code
  • Loading branch information
AnhTruong authored Jul 19, 2021
1 parent 2524613 commit 00933e6
Show file tree
Hide file tree
Showing 5 changed files with 101 additions and 58 deletions.
38 changes: 13 additions & 25 deletions dataprofiler/profilers/float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,33 +131,21 @@ def profile(self):
Property for profile. Returns the profile of the column.
:return:
"""

profile = dict(
min=self.np_type_to_type(self.min),
max=self.np_type_to_type(self.max),
sum=self.np_type_to_type(self.sum),
mean=self.np_type_to_type(self.mean),
variance=self.np_type_to_type(self.variance),
stddev=self.np_type_to_type(self.stddev),
skewness=self.np_type_to_type(self.skewness),
kurtosis=self.np_type_to_type(self.kurtosis),
num_zeros=self.np_type_to_type(self.num_zeros),
num_negatives=self.np_type_to_type(self.num_negatives),
histogram=self._get_best_histogram_for_profile(),
quantiles=self.quantiles,
times=self.times,
precision=dict(
min=self.np_type_to_type(self.precision['min']),
max=self.np_type_to_type(self.precision['max']),
mean=self.np_type_to_type(self.precision['mean']),
var=self.np_type_to_type(self.precision['var']),
std=self.np_type_to_type(self.precision['std']),
sample_size=self.np_type_to_type(self.precision['sample_size']),
margin_of_error=self.np_type_to_type(self.precision['margin_of_error']),
confidence_level=self.np_type_to_type(self.precision['confidence_level'])
profile = NumericStatsMixin.profile(self)
profile.update(
dict(
precision=dict(
min=self.np_type_to_type(self.precision['min']),
max=self.np_type_to_type(self.precision['max']),
mean=self.np_type_to_type(self.precision['mean']),
var=self.np_type_to_type(self.precision['var']),
std=self.np_type_to_type(self.precision['std']),
sample_size=self.np_type_to_type(self.precision['sample_size']),
margin_of_error=self.np_type_to_type(self.precision['margin_of_error']),
confidence_level=self.np_type_to_type(self.precision['confidence_level'])
)
)
)

return profile

@property
Expand Down
19 changes: 1 addition & 18 deletions dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,24 +60,7 @@ def profile(self):
:return:
"""
profile = dict(
min=self.np_type_to_type(self.min),
max=self.np_type_to_type(self.max),
sum=self.np_type_to_type(self.sum),
mean=self.np_type_to_type(self.mean),
variance=self.np_type_to_type(self.variance),
stddev=self.np_type_to_type(self.stddev),
skewness=self.np_type_to_type(self.skewness),
kurtosis=self.np_type_to_type(self.kurtosis),
histogram=self._get_best_histogram_for_profile(),
num_zeros=self.np_type_to_type(self.num_zeros),
num_negatives=self.np_type_to_type(self.num_negatives),
quantiles=self.quantiles,
times=self.times
)

return profile

return NumericStatsMixin.profile(self)

@property
def data_type_ratio(self):
Expand Down
23 changes: 23 additions & 0 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,29 @@ def _add_helper(self, other1, other2):
if "num_negatives" in self.__calculations.keys():
self.num_negatives = other1.num_negatives + other2.num_negatives

def profile(self):
    """
    Return the profile of the column's numeric statistics.

    This is a regular method and must be called, i.e. ``profile()``.

    :return: dictionary with the keys ``min``, ``max``, ``sum``, ``mean``,
        ``variance``, ``stddev``, ``skewness``, ``kurtosis``, ``histogram``,
        ``quantiles``, ``num_zeros``, ``num_negatives`` and ``times``
    :rtype: dict
    """
    # Every scalar statistic is passed through np_type_to_type before being
    # reported (presumably to turn numpy scalars into builtin Python
    # numbers -- confirm against the helper's definition).
    convert = self.np_type_to_type

    return dict(
        min=convert(self.min),
        max=convert(self.max),
        sum=convert(self.sum),
        mean=convert(self.mean),
        variance=convert(self.variance),
        stddev=convert(self.stddev),
        skewness=convert(self.skewness),
        kurtosis=convert(self.kurtosis),
        # histogram, quantiles and times are reported as-is, unconverted.
        histogram=self._get_best_histogram_for_profile(),
        quantiles=self.quantiles,
        num_zeros=convert(self.num_zeros),
        num_negatives=convert(self.num_negatives),
        times=self.times,
    )

def diff(self, other_profile, options=None):
"""
Finds the differences for several numerical stats.
Expand Down
21 changes: 6 additions & 15 deletions dataprofiler/profilers/text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,21 +64,12 @@ def profile(self):
:return:
"""

profile = dict(
min=self.min,
max=self.max,
sum=self.sum,
mean=self.mean,
variance=self.variance,
stddev=self.stddev,
skewness=self.skewness,
kurtosis=self.kurtosis,
histogram=self._get_best_histogram_for_profile(),
quantiles=self.quantiles,
vocab=self.vocab,
times=self.times
)
profile = NumericStatsMixin.profile(self)
# remove num_zeros and num_negatives, which the numeric profile adds
profile.pop('num_zeros')
profile.pop('num_negatives')
# and add the vocab update for text profile
profile.update(dict(vocab=self.vocab))
return profile

def diff(self, other_profile, options=None):
Expand Down
58 changes: 58 additions & 0 deletions dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,64 @@ def test_merge_num_zeros_and_negatives(self):
self.assertEqual(num_profiler.num_zeros, 0)
self.assertEqual(num_profiler.num_negatives, 0)

def test_profile(self):
    """
    Checks that profile() reports the statistics set on the profiler.
    """
    profiler = TestColumn()

    expected = {
        'min': 1.0,
        'max': 1.0,
        'sum': 1.0,
        'mean': 0,  # default
        'variance': np.nan,  # default
        'skewness': np.nan,  # default
        'kurtosis': np.nan,  # default
        'stddev': np.nan,  # default
        'histogram': {
            'bin_counts': np.array([1, 1, 1]),
            'bin_edges': np.array([1.0, 2.0, 3.0, 4.0]),
        },
        'quantiles': {0: 2.0, 1: 3.0, 2: 4.0},
        'num_zeros': 0,  # default
        'num_negatives': 0,  # default
        'times': defaultdict(float),  # default
    }

    # Seed the profiler with the values the profile is expected to report.
    profiler.match_count = 0
    profiler.min = expected['min']
    profiler.max = expected['max']
    profiler.sum = expected['sum']
    profiler.histogram_selection = 'auto'
    auto_method = profiler.histogram_methods['auto']
    auto_method['histogram'] = expected['histogram']
    profiler.quantiles = expected['quantiles']
    profiler.times = expected['times']

    # time.time is patched to hand out a fixed sequence of float values.
    fake_clock = list(map(float, range(100, 0, -1)))
    with mock.patch('time.time', side_effect=lambda: fake_clock.pop()):
        # Validate that the times dictionary is empty
        self.assertEqual(defaultdict(float), profiler.times)

        actual = profiler.profile()
        # Pull the histogram and quantiles out of both dicts so they can
        # be compared separately, with some precision, from the rest.
        actual_histogram = actual.pop('histogram')
        expected_histogram = expected.pop('histogram')
        actual_quantiles = actual.pop('quantiles')
        expected_quantiles = expected.pop('quantiles')

        self.assertDictEqual(expected, actual)
        self.assertEqual(expected_histogram['bin_counts'].tolist(),
                         actual_histogram['bin_counts'].tolist())
        self.assertCountEqual(
            np.round(expected_histogram['bin_edges'], 12),
            np.round(actual_histogram['bin_edges'], 12))
        for key in range(3):
            self.assertAlmostEqual(expected_quantiles[key],
                                   actual_quantiles[key])

def test_diff(self):
"""
Checks _diff_helper() works appropriately.
Expand Down

0 comments on commit 00933e6

Please sign in to comment.