Skip to content

Commit

Permalink
Add diff for FloatColumn (#300)
Browse files Browse the repository at this point in the history
* Add tests

* Add precision to diff

* Add specificity to diff parent method call

* Simplified test
  • Loading branch information
Andrew Yin authored Jun 30, 2021
1 parent 470b706 commit 79e0ae7
Show file tree
Hide file tree
Showing 2 changed files with 62 additions and 0 deletions.
20 changes: 20 additions & 0 deletions dataprofiler/profilers/float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import math
import numpy as np

from . import utils
from .numerical_column_stats import NumericStatsMixin
from .base_column_profilers import BaseColumnProfiler, \
BaseColumnPrimitiveTypeProfiler
Expand Down Expand Up @@ -105,6 +106,25 @@ def __add__(self, other):

return merged_profile

def diff(self, other_profile, options=None):
    """
    Finds the differences for FloatColumns.

    The base numeric differences come from NumericStatsMixin.diff; this
    method adds a "precision" entry holding the per-key difference of the
    two profiles' precision statistics.

    :param other_profile: profile to find the difference with
    :type other_profile: FloatColumn
    :param options: passed through unchanged to NumericStatsMixin.diff
        (defaults to None)
    :return: the FloatColumn differences
    :rtype: dict
    """
    # Forward the caller's options; hard-coding None here silently
    # discarded whatever the caller supplied.
    # NOTE(review): type validation of other_profile presumably happens in
    # the parent call -- keep it first so bad input fails before any
    # attribute access below.
    differences = NumericStatsMixin.diff(self, other_profile, options=options)

    # Read each profile once instead of re-evaluating the `profile`
    # property on every loop iteration.
    other_precision = other_profile.profile['precision']
    own_precision = self.profile['precision']

    # "confidence_level" is intentionally left out of the precision report.
    differences["precision"] = {
        key: utils.find_diff_of_numbers(value, other_precision[key])
        for key, value in own_precision.items()
        if key != "confidence_level"
    }
    return differences

@property
def profile(self):
"""
Expand Down
42 changes: 42 additions & 0 deletions dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1299,3 +1299,45 @@ def test_insufficient_counts(self):
# to make sure NO warnings were thrown since we have
# a sufficient match count.
self.assertEqual(0, len(w))

def test_diff(self):
    """
    Check FloatColumn.diff against two small profiles.

    Verifies the full difference report (numeric statistics plus the nested
    precision statistics) and that an unsupported operand raises TypeError
    with the expected message.
    """
    # First column: three values parse as floats (2.5, 12.5, 5).
    data = [2.5, 12.5, 'not a float', 5, 'not a float']
    df = pd.Series(data).apply(str)
    profiler1 = FloatColumn(df.name)
    profiler1.update(df)
    profile1 = profiler1.profile

    # Second column: four values parse as floats (1, 15, 0.5, 0).
    data = [1, 15, 0.5, 0]
    df = pd.Series(data).apply(str)
    profiler2 = FloatColumn(df.name)
    profiler2.update(df)
    profile2 = profiler2.profile

    # Assert the difference report is correct
    diff = profiler1.diff(profiler2)
    precision1 = profile1['precision']
    precision2 = profile2['precision']
    expected_diff = {
        'max': -2.5,
        'mean': profile1['mean'] - profile2['mean'],
        'min': 2.5,
        'stddev': profile1['stddev'] - profile2['stddev'],
        'sum': 3.5,
        'variance': profile1['variance'] - profile2['variance'],
        'precision': {
            'min': 1,
            'max': 1,
            'mean': 1,
            'var': precision1['var'] - precision2['var'],
            'std': precision1['std'] - precision2['std'],
            'sample_size': -1,
            # Must subtract the profile dicts; the original subscripted
            # the profiler object (`profiler2[...]`) on the right side.
            'margin_of_error':
                precision1['margin_of_error']
                - precision2['margin_of_error'],
        },
    }
    self.assertDictEqual(expected_diff, diff)

    # Assert type error is properly called
    with self.assertRaises(TypeError) as exc:
        profiler1.diff("Inproper input")
    # Checked after the `with` block so the assertion actually runs.
    self.assertEqual(str(exc.exception),
                     "Unsupported operand type(s) for diff: 'FloatColumn' and"
                     " 'str'")

0 comments on commit 79e0ae7

Please sign in to comment.