Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add difference for median, mode and mad #400

Merged
merged 5 commits into from
Aug 24, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,19 +308,26 @@ def diff(self, other_profile, options=None):
raise TypeError("Unsupported operand type(s) for diff: '{}' "
"and '{}'".format(cls.__name__,
other_profile.__class__.__name__))

differences = {
"min": utils.find_diff_of_numbers(self.min, other_profile.min),
"max": utils.find_diff_of_numbers(self.max, other_profile.max),
"sum": utils.find_diff_of_numbers(self.sum, other_profile.sum),
"mean": utils.find_diff_of_numbers(self.mean, other_profile.mean),
"median": utils.find_diff_of_numbers(
self.median, other_profile.median),
"mode": utils.find_diff_of_lists_and_sets(
self.mode, other_profile.mode),
"median_absolute_deviation": utils.find_diff_of_numbers(
self.median_abs_deviation, other_profile.median_abs_deviation),
"variance": utils.find_diff_of_numbers(self.variance,
other_profile.variance),
"stddev": utils.find_diff_of_numbers(self.stddev,
other_profile.stddev),
"t-test": self._perform_t_test(self.mean, self.variance, self.match_count,
other_profile.mean, other_profile.variance,
other_profile.match_count)
"t-test": self._perform_t_test(
self.mean, self.variance, self.match_count,
other_profile.mean, other_profile.variance,
other_profile.match_count)
}
return differences

Expand Down
40 changes: 36 additions & 4 deletions dataprofiler/tests/profilers/test_column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import six
import unittest
import numpy as np
from unittest import mock
import pandas as pd

Expand Down Expand Up @@ -129,6 +130,9 @@ def test_diff_primitive_compilers(self):
'max': 10.0,
'sum': 12.0,
'mean': 2.0,
'median': -0.5,
'mode': [[-2, 15, 1, 2], [], [5, -1]],
'median_absolute_deviation': -1,
'variance': 38.666666666666664,
'stddev': 3.285085839971525,
't-test': {
Expand All @@ -144,8 +148,20 @@ def test_diff_primitive_compilers(self):
}
}
}
self.maxDiff = None
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
profile_diff = compiler1.diff(compiler2)
self.assertAlmostEqual(expected_diff['statistics'].pop('median'),
profile_diff['statistics'].pop('median'),
places=2)
expected_diff_mode = expected_diff['statistics'].pop('mode')
diff_mode = profile_diff['statistics'].pop('mode')
for i in range(len(expected_diff_mode)):
np.testing.assert_almost_equal(sorted(expected_diff_mode[i]),
sorted(diff_mode[i]), 2)
self.assertAlmostEqual(
expected_diff['statistics'].pop('median_absolute_deviation'),
profile_diff['statistics'].pop('median_absolute_deviation'),
places=2)
self.assertDictEqual(expected_diff, profile_diff)

# Test different compilers
data1 = pd.Series(['-2', '-1', '1', '2'])
Expand Down Expand Up @@ -196,7 +212,10 @@ def test_disabling_columns_during_primitive_diff(self):
'min': -7.0,
'max': -13.0,
'sum': -20.0,
'mean': -10.0,
'mean': -10.0,
'median': -10,
'mode': [[-2, -1, 1, 2], [], [5, 15]],
'median_absolute_deviation': -3.5,
'variance': -46.666666666666664,
'stddev': data1.astype(int).std() - data2.astype(int).std(),
'precision': {
Expand All @@ -221,7 +240,20 @@ def test_disabling_columns_during_primitive_diff(self):
}
}
}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
profile_diff = compiler1.diff(compiler2)
self.assertAlmostEqual(expected_diff['statistics'].pop('median'),
profile_diff['statistics'].pop('median'),
places=2)
expected_diff_mode = expected_diff['statistics'].pop('mode')
diff_mode = profile_diff['statistics'].pop('mode')
for i in range(len(expected_diff_mode)):
np.testing.assert_almost_equal(sorted(expected_diff_mode[i]),
sorted(diff_mode[i]), 2)
self.assertAlmostEqual(
expected_diff['statistics'].pop('median_absolute_deviation'),
profile_diff['statistics'].pop('median_absolute_deviation'),
places=2)
self.assertDictEqual(expected_diff, profile_diff)

# Test disabling all columns in one compiler
options.float.is_enabled = False
Expand Down
16 changes: 15 additions & 1 deletion dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1543,6 +1543,9 @@ def test_diff(self):
'stddev': profile1['stddev'] - profile2['stddev'],
'sum': 3.5,
'variance': profile1['variance'] - profile2['variance'],
'median': 4.25,
'mode': [[2.5, 12.5, 5], [], [1, 15, 0.5, 0]],
'median_absolute_deviation': 2,
'precision': {
'min': 1,
'max': 1,
Expand All @@ -1565,7 +1568,18 @@ def test_diff(self):
}
}
}
self.assertDictEqual(expected_diff, diff)
profile_diff = profiler1.diff(profiler2)
self.assertAlmostEqual(
expected_diff.pop('median'), profile_diff.pop('median'), places=2)
expected_diff_mode = expected_diff.pop('mode')
diff_mode = profile_diff.pop('mode')
for i in range(len(expected_diff_mode)):
np.testing.assert_almost_equal(sorted(expected_diff_mode[i]),
sorted(diff_mode[i]), 2)
self.assertAlmostEqual(expected_diff.pop('median_absolute_deviation'),
profile_diff.pop('median_absolute_deviation'),
places=2)
self.assertDictEqual(expected_diff, profile_diff)

# Assert type error is properly called
with self.assertRaises(TypeError) as exc:
Expand Down
16 changes: 15 additions & 1 deletion dataprofiler/tests/profilers/test_int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -990,6 +990,9 @@ def test_diff(self):
'stddev': -7.899494936611665,
'sum': -4.0,
'variance': -94.0,
'median': -4,
'mode': [[2, 6, 4], [], [1, 15]],
'median_absolute_deviation': -5,
't-test': {
't-statistic': -0.5638091828819275,
'conservative': {
Expand All @@ -1002,7 +1005,18 @@ def test_diff(self):
}
}
}
self.assertDictEqual(expected_diff, profiler1.diff(profiler2))
profile_diff = profiler1.diff(profiler2)
self.assertAlmostEqual(
expected_diff.pop('median'), profile_diff.pop('median'), places=2)
expected_diff_mode = expected_diff.pop('mode')
diff_mode = profile_diff.pop('mode')
for i in range(len(expected_diff_mode)):
np.testing.assert_almost_equal(sorted(expected_diff_mode[i]),
sorted(diff_mode[i]), 2)
self.assertAlmostEqual(expected_diff.pop('median_absolute_deviation'),
profile_diff.pop('median_absolute_deviation'),
places=2)
self.assertDictEqual(expected_diff, profile_diff)

# Assert type error is properly called
with self.assertRaises(TypeError) as exc:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,19 @@ def _filter_properties_w_options(self, calculations, options):
pass


class TestColumnWProps(TestColumn):
    """TestColumn variant exposing median, mode and MAD as plain attributes.

    The class-level names below override the property functions of the
    same names, so each test can assign fixed values for these statistics
    on an instance before calling ``diff``.
    """

    # class-level placeholders that override the property funcs
    median = mode = median_abs_deviation = None

    def __init__(self):
        super().__init__()
        # every new instance starts with the three statistics unset
        self.median = self.mode = self.median_abs_deviation = None


class TestNumericStatsMixin(unittest.TestCase):

@mock.patch.multiple(NumericStatsMixin, __abstractmethods__=set(),
Expand Down Expand Up @@ -640,18 +653,24 @@ def test_diff(self):
Checks _diff_helper() works appropriately.
"""

other1, other2 = TestColumn(), TestColumn()
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.min = 3
other1.max = 4
other1._biased_variance = 1
other1.sum = 6
other1.match_count = 10
other1.median = 5
other1.mode = [3]
other1.median_abs_deviation = 4

other2.min = 3
other2.max = None
other2._biased_variance = 9
other2.sum = 6
other2.match_count = 20
other2.median = 6
other2.mode = [2]
other2.median_abs_deviation = 3

# T-stat and Welch's df calculation can be found here:
# https://en.wikipedia.org/wiki/Welch%27s_t-test#Calculations
Expand All @@ -662,6 +681,9 @@ def test_diff(self):
'max': [4, None],
'sum': 'unchanged',
'mean': 0.3,
'median': -1,
'mode': [[3], [], [2]],
'median_absolute_deviation': 1,
'variance': 10 / 9 - (9 * 20 / 19),
'stddev': np.sqrt(10 / 9) - np.sqrt(9 * 20 / 19),
't-test': {
Expand All @@ -676,28 +698,39 @@ def test_diff(self):
}
}
}

difference = other1.diff(other2)
self.maxDiff = None
self.assertDictEqual(expected_diff, difference)

# Invalid statistics
other1, other2 = TestColumn(), TestColumn()
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.min = 3
other1.max = 4
other1._biased_variance = np.nan # NaN variance
other1.sum = 6
other1.match_count = 10
other1.median = 5
other1.mode = [3]
other1.median_abs_deviation = 4

other2.min = 3
other2.max = None
other2._biased_variance = 9
other2.sum = 6
other2.match_count = 20
other2.median = 6
other2.mode = [2]
other2.median_abs_deviation = 3

expected_diff = {
'min': 'unchanged',
'max': [4, None],
'sum': 'unchanged',
'mean': 0.3,
'median': -1,
'mode': [[3], [], [2]],
'median_absolute_deviation': 1,
'variance': np.nan,
'stddev': np.nan,
't-test': {
Expand All @@ -724,24 +757,33 @@ def test_diff(self):
self.assertTrue(np.isnan([expected_var, var, expected_stddev, stddev]).all())

# Insufficient match count
other1, other2 = TestColumn(), TestColumn()
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.min = 3
other1.max = 4
other1._biased_variance = 1
other1.sum = 6
other1.match_count = 10
other1.median = 5
other1.mode = [3]
other1.median_abs_deviation = 4

other2.min = 3
other2.max = None
other2._biased_variance = 9
other2.sum = 6
other2.match_count = 1 # Insufficient count
other2.median = 6
other2.mode = [2]
other2.median_abs_deviation = 3

expected_diff = {
'min': 'unchanged',
'max': [4, None],
'sum': 'unchanged',
'mean': -5.4,
'median': -1,
'mode': [[3], [], [2]],
'median_absolute_deviation': 1,
'variance': np.nan,
'stddev': np.nan,
't-test': {
Expand All @@ -768,24 +810,33 @@ def test_diff(self):
self.assertTrue(np.isnan([expected_var, var, expected_stddev, stddev]).all())

# Small p-value
other1, other2 = TestColumn(), TestColumn()
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.min = 3
other1.max = 4
other1._biased_variance = 1
other1.sum = 6
other1.match_count = 10
other1.median = 5
other1.mode = [3]
other1.median_abs_deviation = 4

other2.min = 3
other2.max = None
other2._biased_variance = 9
other2.sum = 60
other2.match_count = 20
other2.median = 6
other2.mode = [2]
other2.median_abs_deviation = 3

expected_diff = {
'min': 'unchanged',
'max': [4, None],
'sum': -54,
'mean': -2.4,
'median': -1,
'mode': [[3], [], [2]],
'median_absolute_deviation': 1,
'variance': 10 / 9 - (9 * 20 / 19),
'stddev': np.sqrt(10 / 9) - np.sqrt(9 * 20 / 19),
't-test': {
Expand All @@ -807,5 +858,5 @@ def test_diff(self):
with self.assertRaises(TypeError) as exc:
other1.diff("Inproper input")
self.assertEqual(str(exc.exception),
"Unsupported operand type(s) for diff: 'TestColumn' and"
"Unsupported operand type(s) for diff: 'TestColumnWProps' and"
" 'str'")
23 changes: 16 additions & 7 deletions dataprofiler/tests/profilers/test_text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -456,11 +456,6 @@ def test_diff(self):
["hello", "my", "name", "is", "Grant", "I", "have", "67", "dogs"]
).apply(str)

expected_vocab = [
'a', 'b', 'c', 'd', '4', '3', '2', 'f', 'h', 'e', 'l', 'o', 'm',
'y', 'n', 'i', 's', 'G', 'r', 't', 'I', 'v', '6', '7', 'g'
]

profiler1 = TextColumn(df.name)
profiler1.update(df)
profile1 = profiler1.profile
Expand All @@ -473,6 +468,9 @@ def test_diff(self):
'max': -1.0,
'sum': -9.0,
'mean': profile1['mean'] - profile2['mean'],
'median': -2.5,
'mode': [[1], [], [2, 4]],
'median_absolute_deviation': -0.5,
'variance': profile1['variance'] - profile2['variance'],
'stddev': profile1['stddev'] - profiler2['stddev'],
'vocab': utils.find_diff_of_lists_and_sets(
Expand All @@ -489,5 +487,16 @@ def test_diff(self):
}
}
}
diff = profiler1.diff(profiler2)
self.assertDictEqual(expected_diff, diff)

profile_diff = profiler1.diff(profiler2)
self.assertAlmostEqual(
expected_diff.pop('median'), profile_diff.pop('median'), places=2)
expected_diff_mode = expected_diff.pop('mode')
diff_mode = profile_diff.pop('mode')
for i in range(len(expected_diff_mode)):
np.testing.assert_almost_equal(sorted(expected_diff_mode[i]),
sorted(diff_mode[i]), 2)
self.assertAlmostEqual(expected_diff.pop('median_absolute_deviation'),
profile_diff.pop('median_absolute_deviation'),
places=2)
self.assertDictEqual(expected_diff, profile_diff)