Skip to content

Commit

Permalink
Report functionality added to BaseColumnProfiler (#497)
Browse files Browse the repository at this point in the history
* Fix: updated test_(int/float)_column_profile.py test_diff to cover json.dumps() functionality. Updated numerical_column_stats.py _perform_t_test to cast conservative and welch values to floats before assignment.

* Fix: addressed PR change requests.

* Fix: Reformatting

* Bumped version

* added report function to BaseColumnProfiler, added report to profile classes, added test coverage

* fixing formatting and naming
  • Loading branch information
micdavis authored Jun 24, 2022
1 parent 847d215 commit 54de73e
Show file tree
Hide file tree
Showing 15 changed files with 195 additions and 4 deletions.
11 changes: 11 additions & 0 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,17 @@ def profile(self):
"""
raise NotImplementedError()

@abc.abstractmethod
def report(self, remove_disabled_flag=False):
    """
    Abstract method for returning the report of a column profiler.

    Concrete profilers must override this method; this base
    implementation only raises.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :raises NotImplementedError: always, in this base implementation.
    """

    raise NotImplementedError()


class BaseColumnPrimitiveTypeProfiler(with_metaclass(abc.ABCMeta,
BaseColumnProfiler)):
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ def diff(self, other_profile, options=None):

return differences

def report(self, remove_disabled_flag=False):
    """
    Return the report for this categorical column profile.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

@property
def profile(self):
"""
Expand Down
36 changes: 36 additions & 0 deletions dataprofiler/profilers/column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,15 @@ class ColumnPrimitiveTypeProfileCompiler(BaseCompiler):
]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
    """
    Return the report for this compiler.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

@property
def profile(self):
profile = {
Expand Down Expand Up @@ -287,6 +296,15 @@ class ColumnStatsProfileCompiler(BaseCompiler):
]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
    """
    Return the report for this compiler.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

@property
def profile(self):
profile = dict()
Expand Down Expand Up @@ -325,6 +343,15 @@ class ColumnDataLabelerCompiler(BaseCompiler):
]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
    """
    Return the report for this compiler.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

@property
def profile(self):
profile = {
Expand Down Expand Up @@ -376,6 +403,15 @@ class UnstructuredCompiler(BaseCompiler):

_option_class = UnstructuredOptions

def report(self, remove_disabled_flag=False):
    """
    Return the report for this compiler.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

@property
def profile(self):
profile = {
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,15 @@ def profile(self):
}
return profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this data labeler column profile.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

def diff(self, other_profile, options=None):
"""
Generates the differences between the orders of two DataLabeler columns
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/datetime_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,15 @@ def __add__(self, other):
self.date_formats, other.date_formats)
return merged_profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this datetime column profile.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

@property
def profile(self):
"""
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ def __add__(self, other):
other.__calculations)
return merged_profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this int column profile.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

@property
def profile(self):
"""
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/order_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,15 @@ def __add__(self, other):
other.__calculations)
return merged_profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this order column profile.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

@property
def profile(self):
"""
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/unstructured_labeler_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ def __add__(self, other):

return merged_profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this unstructured labeler profile.

    This implementation returns ``self.profile`` unchanged;
    ``remove_disabled_flag`` is accepted but not used here.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    return self.profile

def diff(self, other_profile, options=None):
"""
Finds the differences for two unstructured labeler profiles
Expand Down
14 changes: 12 additions & 2 deletions dataprofiler/tests/profilers/test_base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def test_cannot_instantiate(self):
BaseColumnPrimitiveTypeProfiler()
self.assertEqual(
"Can't instantiate abstract class BaseColumnPrimitiveTypeProfiler "
"with abstract methods _update_helper, profile, update",
"with abstract methods _update_helper, profile, report, update",
str(e.exception)
)

Expand Down Expand Up @@ -181,7 +181,7 @@ def test_update_column_base_properties(self):
def test_update_match_are_abstract(self):
six.assertCountEqual(
self,
{'_update_helper', 'update', 'profile'},
{'_update_helper', 'update', 'report', 'profile'},
BaseColumnPrimitiveTypeProfiler.__abstractmethods__
)

Expand Down Expand Up @@ -284,6 +284,16 @@ def test_updated_profile(self):
self.assertEqual(1, profiler_mock.call_count)
self._delete_profiler_mocks()

def test_report(self):
    """
    Check that ``report`` matches ``profile`` for both values of
    ``remove_disabled_flag``.
    """
    self._create_profiler_mocks()
    try:
        profile = self.column_profiler(self.aws_dataset["datetime"])

        report1 = profile.profile
        report2 = profile.report(remove_disabled_flag=False)
        report3 = profile.report(remove_disabled_flag=True)
        self.assertDictEqual(report1, report2)
        self.assertDictEqual(report1, report3)
    finally:
        # Sibling tests pair _create_profiler_mocks() with
        # _delete_profiler_mocks(); do the same here, even when an
        # assertion fails, so the mocks do not outlive this test.
        self._delete_profiler_mocks()

def test_profile(self):
self._create_profiler_mocks()
profile = self.column_profiler(self.aws_dataset["datetime"])
Expand Down
12 changes: 12 additions & 0 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def test_true_categorical_report(self):
('unalikeability', 2*(12 + 15 + 20)/132)
]),
)

# We have to pop these values because sometimes the order changes
self.assertCountEqual(expected_profile['statistics'].pop('categories'),
report["statistics"].pop('categories'))
Expand All @@ -190,6 +191,17 @@ def test_false_categorical_report(self):
)
self.assertEqual(report, expected_profile)

def test_report(self):
    """
    Check that ``report`` of an updated CategoricalColumn matches its
    ``profile`` for both values of ``remove_disabled_flag``.
    """
    series = pd.Series([str(value) for value in range(20)])
    column = CategoricalColumn(series.name)
    column.update(series)

    expected = column.profile
    with_disabled = column.report(remove_disabled_flag=False)
    without_disabled = column.report(remove_disabled_flag=True)
    for actual in (with_disabled, without_disabled):
        self.assertDictEqual(expected, actual)

def test_categorical_merge(self):
df1 = pd.Series(["abcd", "aa", "abcd", "aa", "b", "4", "3", "2",
"dfd", "2", np.nan])
Expand Down
12 changes: 12 additions & 0 deletions dataprofiler/tests/profilers/test_data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,18 @@ def test_profile(self, mock_instance):
expected = defaultdict(float, {'data_labeler_predict': 2.0})
self.assertEqual(expected, profiler.profile['times'])

def test_report(self, mock_instance):
    """
    Check that ``report`` of a DataLabelerColumn matches its
    ``profile`` for both values of ``remove_disabled_flag``.

    :param mock_instance: mock handed to ``_setup_data_labeler_mock``
    """
    self._setup_data_labeler_mock(mock_instance)

    data = pd.Series(['1', '2', '3'])
    profile = DataLabelerColumn(data.name)
    # Feed the sample to the profiler: without this the data above is
    # never used and only an un-updated profile would be compared.
    profile.update(data)

    report1 = profile.profile
    report2 = profile.report(remove_disabled_flag=False)
    report3 = profile.report(remove_disabled_flag=True)
    self.assertDictEqual(report1, report2)
    self.assertDictEqual(report1, report3)

def test_label_match(self, mock_instance):
"""
Test label match between avg_prediction and data_label_representation
Expand Down
16 changes: 15 additions & 1 deletion dataprofiler/tests/profilers/test_datetime_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,28 @@ def test_profile(self):
self.assertEqual(expected, profiler.profile['times'])
profile = profiler.profile
self.assertCountEqual(expected_profile, profile)

# Validate time in datetime class has expected time after second
# update
profiler.update(df)
expected = defaultdict(float, {'datetime': 2.0})
self.assertEqual(expected, profiler.profile['times'])
self.assertEqual(expected_profile.pop('max'), profiler.profile['max'])

def test_report(self):
    """
    Check that ``report`` of a DateTimeColumn matches its ``profile``
    for both values of ``remove_disabled_flag``.
    """
    data = [
        2.5, 12.5, '2013-03-10 15:43:30', 5, '03/10/13 15:43',
        'Mar 11, 2013'
    ]
    df = pd.Series(data).apply(str)
    profile = DateTimeColumn(df.name)
    # Feed the sample to the profiler: without this the data above is
    # never used and only an un-updated profile would be compared.
    profile.update(df)

    report1 = profile.profile
    report2 = profile.report(remove_disabled_flag=False)
    report3 = profile.report(remove_disabled_flag=True)
    self.assertDictEqual(report1, report2)
    self.assertDictEqual(report1, report3)

def test_warning_for_bad_dates(self):

df = pd.Series(['03/10/2013 15:43'])
Expand Down
12 changes: 12 additions & 0 deletions dataprofiler/tests/profilers/test_order_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,18 @@ def test_profile(self):
# key and value populated correctly
self.assertDictEqual(expected_profile, profile)

def test_report(self):
    """
    Check that ``report`` of an OrderColumn matches its ``profile``
    for both values of ``remove_disabled_flag``.
    """
    data = [1]
    df = pd.Series(data).apply(str)

    profile = OrderColumn(df.name)
    # Feed the sample to the profiler: without this the data above is
    # never used and only an un-updated profile would be compared.
    profile.update(df)

    report1 = profile.profile
    report2 = profile.report(remove_disabled_flag=False)
    report3 = profile.report(remove_disabled_flag=True)
    self.assertDictEqual(report1, report2)
    self.assertDictEqual(report1, report3)

def test_profile_merge(self):
data = [1, 2, 3, 4, 5, 6]
df = pd.Series(data).apply(str)
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1836,7 +1836,7 @@ def test_column_names(self):
def test_update_match_are_abstract(self):
six.assertCountEqual(
self,
{'profile', '_update_helper', 'update'},
{'profile', '_update_helper', 'report', 'update'},
dp.profilers.BaseColumnProfiler.__abstractmethods__
)

Expand Down
30 changes: 30 additions & 0 deletions dataprofiler/tests/profilers/test_unstructured_labeler_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,36 @@ def test_profile(self, processor_class_mock, model_class_mock):
# key and value populated correctly
self.assertDictEqual(expected_profile, profile)

@mock.patch('dataprofiler.profilers.'
            'unstructured_labeler_profile.DataLabeler')
@mock.patch('dataprofiler.profilers.'
            'unstructured_labeler_profile.'
            'CharPostprocessor')
def test_report(self, processor_class_mock, model_class_mock):
    """
    Check that ``report`` of an updated UnstructuredLabelerProfile
    matches its ``profile`` for both values of
    ``remove_disabled_flag``.

    :param processor_class_mock: patch of ``CharPostprocessor``
    :param model_class_mock: patch of ``DataLabeler``
    """
    # stand-in for the labeler model returned by the patched class
    labeler_stub = mock.Mock()
    labeler_stub.reverse_label_mapping = {1: 'UNKNOWN'}
    labeler_stub.predict.return_value = {'pred': [[1]]}
    model_class_mock.return_value = labeler_stub

    # stand-in for the postprocessor returned by the patched class
    postprocessor_stub = mock.Mock()
    postprocessor_stub.process.return_value = {'pred': [[]]}
    processor_class_mock.return_value = postprocessor_stub

    labeler_profile = UnstructuredLabelerProfile()
    text_sample = pd.Series(["a"])

    # popped from the end, so time.time() yields 1.0, 2.0, 3.0, 4.0
    fake_clock = [4.0, 3.0, 2.0, 1.0]
    with mock.patch('time.time', side_effect=lambda: fake_clock.pop()):
        labeler_profile.update(text_sample)

    baseline = labeler_profile.profile
    with_disabled = labeler_profile.report(remove_disabled_flag=False)
    without_disabled = labeler_profile.report(remove_disabled_flag=True)
    for actual in (with_disabled, without_disabled):
        self.assertDictEqual(baseline, actual)

@mock.patch('dataprofiler.profilers.'
'unstructured_labeler_profile.DataLabeler')
@mock.patch('dataprofiler.profilers.'
Expand Down

0 comments on commit 54de73e

Please sign in to comment.