Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move profile to numeric stats mixin #337

Merged
merged 9 commits into from
Jul 19, 2021
38 changes: 13 additions & 25 deletions dataprofiler/profilers/float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,33 +131,21 @@ def profile(self):
Property for profile. Returns the profile of the column.
:return:
"""

profile = dict(
min=self.np_type_to_type(self.min),
max=self.np_type_to_type(self.max),
sum=self.np_type_to_type(self.sum),
mean=self.np_type_to_type(self.mean),
variance=self.np_type_to_type(self.variance),
stddev=self.np_type_to_type(self.stddev),
skewness=self.np_type_to_type(self.skewness),
kurtosis=self.np_type_to_type(self.kurtosis),
num_zeros=self.np_type_to_type(self.num_zeros),
num_negatives=self.np_type_to_type(self.num_negatives),
histogram=self._get_best_histogram_for_profile(),
quantiles=self.quantiles,
times=self.times,
precision=dict(
min=self.np_type_to_type(self.precision['min']),
max=self.np_type_to_type(self.precision['max']),
mean=self.np_type_to_type(self.precision['mean']),
var=self.np_type_to_type(self.precision['var']),
std=self.np_type_to_type(self.precision['std']),
sample_size=self.np_type_to_type(self.precision['sample_size']),
margin_of_error=self.np_type_to_type(self.precision['margin_of_error']),
confidence_level=self.np_type_to_type(self.precision['confidence_level'])
profile = NumericStatsMixin.profile(self)
profile.update(
dict(
precision=dict(
min=self.np_type_to_type(self.precision['min']),
max=self.np_type_to_type(self.precision['max']),
mean=self.np_type_to_type(self.precision['mean']),
var=self.np_type_to_type(self.precision['var']),
std=self.np_type_to_type(self.precision['std']),
sample_size=self.np_type_to_type(self.precision['sample_size']),
margin_of_error=self.np_type_to_type(self.precision['margin_of_error']),
confidence_level=self.np_type_to_type(self.precision['confidence_level'])
)
)
)

return profile

@property
Expand Down
19 changes: 1 addition & 18 deletions dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,24 +60,7 @@ def profile(self):

:return:
"""
profile = dict(
min=self.np_type_to_type(self.min),
max=self.np_type_to_type(self.max),
sum=self.np_type_to_type(self.sum),
mean=self.np_type_to_type(self.mean),
variance=self.np_type_to_type(self.variance),
stddev=self.np_type_to_type(self.stddev),
skewness=self.np_type_to_type(self.skewness),
kurtosis=self.np_type_to_type(self.kurtosis),
histogram=self._get_best_histogram_for_profile(),
num_zeros=self.np_type_to_type(self.num_zeros),
num_negatives=self.np_type_to_type(self.num_negatives),
quantiles=self.quantiles,
times=self.times
)

return profile

return NumericStatsMixin.profile(self)

@property
def data_type_ratio(self):
Expand Down
23 changes: 23 additions & 0 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,29 @@ def _add_helper(self, other1, other2):
if "num_negatives" in self.__calculations.keys():
self.num_negatives = other1.num_negatives + other2.num_negatives

def profile(self):
    """
    Build the dictionary of numeric statistics reported for the column.

    Scalar statistics are passed through ``self.np_type_to_type`` before
    being stored; the histogram comes from
    ``self._get_best_histogram_for_profile()``; ``quantiles`` and ``times``
    are stored as they are held on the instance.

    Note: this is a plain method here (no property decorator on this
    definition), so callers invoke it as ``profile()``.

    :return: dict keyed by statistic name, in the order min, max, sum,
        mean, variance, stddev, skewness, kurtosis, histogram, quantiles,
        num_zeros, num_negatives, times
    """
    convert = self.np_type_to_type
    report = {}

    # Scalar statistics listed ahead of the histogram entry.
    leading_stats = (
        "min", "max", "sum", "mean",
        "variance", "stddev", "skewness", "kurtosis",
    )
    for stat_name in leading_stats:
        report[stat_name] = convert(getattr(self, stat_name))

    # Histogram and quantiles are stored without conversion.
    report["histogram"] = self._get_best_histogram_for_profile()
    report["quantiles"] = self.quantiles

    # Count statistics follow the distribution entries.
    for stat_name in ("num_zeros", "num_negatives"):
        report[stat_name] = convert(getattr(self, stat_name))

    report["times"] = self.times
    return report

def diff(self, other_profile, options=None):
"""
Finds the differences for several numerical stats.
Expand Down
19 changes: 4 additions & 15 deletions dataprofiler/profilers/text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,21 +64,10 @@ def profile(self):

:return:
"""

profile = dict(
min=self.min,
max=self.max,
sum=self.sum,
mean=self.mean,
variance=self.variance,
stddev=self.stddev,
skewness=self.skewness,
kurtosis=self.kurtosis,
histogram=self._get_best_histogram_for_profile(),
quantiles=self.quantiles,
vocab=self.vocab,
times=self.times
)
profile = NumericStatsMixin.profile(self)
profile.pop('num_zeros')
profile.pop('num_negatives')
Comment on lines +69 to +70
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should add a comment here

profile.update(dict(vocab=self.vocab))
return profile

def diff(self, other_profile, options=None):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,64 @@ def test_merge_num_zeros_and_negatives(self):
self.assertEqual(num_profiler.num_zeros, 0)
self.assertEqual(num_profiler.num_negatives, 0)

def test_profile(self):
    """
    Check that profile() reports the values set on the column together
    with the untouched defaults.
    """
    column = TestColumn()

    expected = {
        'min': 1.0,
        'max': 1.0,
        'sum': 1.0,
        'mean': 0,  # default
        'variance': np.nan,  # default
        'skewness': np.nan,  # default
        'kurtosis': np.nan,  # default
        'stddev': np.nan,  # default
        'histogram': {
            'bin_counts': np.array([1, 1, 1]),
            'bin_edges': np.array([1.0, 2.0, 3.0, 4.0]),
        },
        'quantiles': {
            0: 2.0,
            1: 3.0,
            2: 4.0,
        },
        'num_zeros': 0,  # default
        'num_negatives': 0,  # default
        'times': defaultdict(float),  # default
    }

    # Seed the column with the values the profile is expected to echo.
    column.match_count = 0
    for stat_name in ('min', 'max', 'sum'):
        setattr(column, stat_name, expected[stat_name])
    column.histogram_selection = 'auto'
    column.histogram_methods['auto']['histogram'] = expected['histogram']
    column.quantiles = expected['quantiles']
    column.times = expected['times']

    # Fake clock: successive time.time() calls yield 1.0, 2.0, 3.0, ...
    fake_ticks = [float(tick) for tick in range(100, 0, -1)]
    with mock.patch('time.time', side_effect=fake_ticks.pop):
        # Validate that the times dictionary is empty
        self.assertEqual(defaultdict(float), column.times)

        actual = column.profile()

        # The histogram and quantiles hold arrays / floats, so they are
        # removed from both dicts and compared separately with some
        # precision.
        actual_hist = actual.pop('histogram')
        expected_hist = expected.pop('histogram')
        actual_quantiles = actual.pop('quantiles')
        expected_quantiles = expected.pop('quantiles')

        self.assertDictEqual(expected, actual)
        self.assertEqual(expected_hist['bin_counts'].tolist(),
                         actual_hist['bin_counts'].tolist())
        self.assertCountEqual(np.round(expected_hist['bin_edges'], 12),
                              np.round(actual_hist['bin_edges'], 12))
        for position in (0, 1, 2):
            self.assertAlmostEqual(expected_quantiles[position],
                                   actual_quantiles[position])

def test_diff(self):
"""
Checks _diff_helper() works appropriately.
Expand Down