Skip to content

Commit

Permalink
Added diff for the stats compiler (capitalone#324)
Browse files Browse the repository at this point in the history
* dataprofiler/profilers/column_profile_compilers.py

* Here we go

* Small updates

* Updated things
  • Loading branch information
grant-eden authored Jul 14, 2021
1 parent 6ba12ce commit a6aa450
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 0 deletions.
21 changes: 21 additions & 0 deletions dataprofiler/profilers/column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,27 @@ def profile(self):
profile.update(profiler.profile)
return profile

def diff(self, other, options=None):
    """
    Finds the difference between 2 compilers and returns the report

    The base-class ``diff`` is called first (it performs the compiler
    instance check) and its result is the starting report. Every profile
    that exists in both compilers then merges its own diff into that
    report; a profile that exists in only one compiler is skipped.

    :param other: profile compiler finding the difference with this one.
    :type other: ColumnStatsProfileCompiler
    :param options: forwarded unchanged to the base-class diff and to each
        per-profile diff call
    :return: difference of the profiles
    :rtype: dict
    """
    # Call super for compiler instance check
    diff_profile = super().diff(other, options)

    # Only profiles present in both compilers can be compared. Walking this
    # compiler's own profiles (instead of an unordered set union of both key
    # sets) visits exactly the shared names and merges them in this
    # compiler's profile order rather than in arbitrary set order.
    for name in self._profiles:
        if name in other._profiles:
            diff_profile.update(
                self._profiles[name].diff(other._profiles[name], options))

    return diff_profile

class ColumnDataLabelerCompiler(BaseCompiler):

Expand Down
45 changes: 45 additions & 0 deletions dataprofiler/tests/profilers/test_column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,51 @@ def test_disabling_columns_during_primitive_diff(self):
expected_diff = {}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

def test_compiler_stats_diff(self):
    """
    Checks ColumnStatsProfileCompiler.diff for four option setups:
    defaults on both sides, the category profile disabled on one side,
    disabled on both sides, and finally the order profile disabled too.
    """
    make_compiler = col_pro_compilers.ColumnStatsProfileCompiler
    first_data = pd.Series(['1', '9', '9'])
    second_data = pd.Series(['10', '9', '9', '9'])
    options = StructuredOptions()

    # Expected report whenever only the order profile can be compared
    order_only_diff = {'order': ['ascending', 'descending']}

    # Default options on both compilers: order and categorical diffs
    left = make_compiler(first_data)
    right = make_compiler(second_data)
    full_diff = {
        'order': ['ascending', 'descending'],
        'categorical': 'unchanged',
        'statistics': {
            'unique_count': 'unchanged',
            'unique_ratio': 0.16666666666666663,
            'categories': [['1'], ['9'], ['10']],
            'gini_impurity': 0.06944444444444448,
            'unalikeability': 0.16666666666666663,
            'categorical_count': {
                '9': -1,
                '1': [1, None],
                '10': [None, 1]
            }
        }
    }
    self.assertDictEqual(full_diff, left.diff(right))

    # Category profile disabled for the first compiler only
    options.category.is_enabled = False
    left = make_compiler(first_data, options)
    right = make_compiler(second_data)
    self.assertDictEqual(order_only_diff, left.diff(right))

    # Category profile disabled for both compilers
    left = make_compiler(first_data, options)
    right = make_compiler(second_data, options)
    self.assertDictEqual(order_only_diff, left.diff(right))

    # Order profile disabled as well: nothing is left to compare
    options.order.is_enabled = False
    left = make_compiler(first_data, options)
    right = make_compiler(second_data, options)
    self.assertDictEqual({}, left.diff(right))

@mock.patch.multiple(
col_pro_compilers.BaseCompiler, __abstractmethods__=set())
Expand Down

0 comments on commit a6aa450

Please sign in to comment.