Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Report functionality added to BaseColumnProfiler #497

Merged
merged 16 commits into from
Jun 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,17 @@ def profile(self):
"""
raise NotImplementedError()

@abc.abstractmethod
def report(self, remove_disabled_flag=False):
    """
    Abstract method for returning the report of a column profiler.

    Subclasses must provide their own implementation; this base version
    only raises.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :raises NotImplementedError: whenever this base implementation is called
    """
    raise NotImplementedError()


class BaseColumnPrimitiveTypeProfiler(with_metaclass(abc.ABCMeta,
BaseColumnProfiler)):
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,15 @@ def diff(self, other_profile, options=None):

return differences

def report(self, remove_disabled_flag=False):
    """
    Return the report for this column profile.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    full_profile = self.profile
    return full_profile

@property
def profile(self):
"""
Expand Down
36 changes: 36 additions & 0 deletions dataprofiler/profilers/column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,15 @@ class ColumnPrimitiveTypeProfileCompiler(BaseCompiler):
]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
    """
    Return the report for this compiler.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    compiled_profile = self.profile
    return compiled_profile

@property
def profile(self):
profile = {
Expand Down Expand Up @@ -287,6 +296,15 @@ class ColumnStatsProfileCompiler(BaseCompiler):
]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
    """
    Return the report for this compiler.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    compiled_profile = self.profile
    return compiled_profile

@property
def profile(self):
profile = dict()
Expand Down Expand Up @@ -325,6 +343,15 @@ class ColumnDataLabelerCompiler(BaseCompiler):
]
_option_class = StructuredOptions

def report(self, remove_disabled_flag=False):
    """
    Return the report for this compiler.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    compiled_profile = self.profile
    return compiled_profile

@property
def profile(self):
profile = {
Expand Down Expand Up @@ -376,6 +403,15 @@ class UnstructuredCompiler(BaseCompiler):

_option_class = UnstructuredOptions

def report(self, remove_disabled_flag=False):
    """
    Return the report for this compiler.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    compiled_profile = self.profile
    return compiled_profile

@property
def profile(self):
profile = {
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,15 @@ def profile(self):
}
return profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this column profile.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    full_profile = self.profile
    return full_profile

def diff(self, other_profile, options=None):
"""
Generates the differences between the orders of two DataLabeler columns
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/datetime_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,15 @@ def __add__(self, other):
self.date_formats, other.date_formats)
return merged_profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this column profile.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    full_profile = self.profile
    return full_profile

@property
def profile(self):
"""
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,15 @@ def __add__(self, other):
other.__calculations)
return merged_profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this column profile.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    full_profile = self.profile
    return full_profile

@property
def profile(self):
"""
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/order_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,15 @@ def __add__(self, other):
other.__calculations)
return merged_profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this column profile.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    full_profile = self.profile
    return full_profile

@property
def profile(self):
"""
Expand Down
9 changes: 9 additions & 0 deletions dataprofiler/profilers/unstructured_labeler_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,15 @@ def __add__(self, other):

return merged_profile

def report(self, remove_disabled_flag=False):
    """
    Return the report for this labeler profile.

    The flag is accepted but not consulted here: the value of the
    ``profile`` property is returned unchanged in either case.

    :param remove_disabled_flag: flag to determine if disabled options
        should be excluded in the report.
    :type remove_disabled_flag: boolean
    :return: the value of ``self.profile``
    """
    full_profile = self.profile
    return full_profile

def diff(self, other_profile, options=None):
"""
Finds the differences for two unstructured labeler profiles
Expand Down
14 changes: 12 additions & 2 deletions dataprofiler/tests/profilers/test_base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def test_cannot_instantiate(self):
BaseColumnPrimitiveTypeProfiler()
self.assertEqual(
"Can't instantiate abstract class BaseColumnPrimitiveTypeProfiler "
"with abstract methods _update_helper, profile, update",
"with abstract methods _update_helper, profile, report, update",
str(e.exception)
)

Expand Down Expand Up @@ -181,7 +181,7 @@ def test_update_column_base_properties(self):
def test_update_match_are_abstract(self):
six.assertCountEqual(
self,
{'_update_helper', 'update', 'profile'},
{'_update_helper', 'update', 'report', 'profile'},
BaseColumnPrimitiveTypeProfiler.__abstractmethods__
)

Expand Down Expand Up @@ -284,6 +284,16 @@ def test_updated_profile(self):
self.assertEqual(1, profiler_mock.call_count)
self._delete_profiler_mocks()

def test_report(self):
    """
    Check that ``report`` matches ``profile`` for both flag values.

    The profiler mocks are created up front and released in a ``finally``
    clause so they are removed even when an assertion fails.
    """
    self._create_profiler_mocks()
    try:
        profile = self.column_profiler(self.aws_dataset["datetime"])

        report1 = profile.profile
        report2 = profile.report(remove_disabled_flag=False)
        report3 = profile.report(remove_disabled_flag=True)
        self.assertDictEqual(report1, report2)
        self.assertDictEqual(report1, report3)
    finally:
        # Pair the setup call with its teardown so mocks do not leak into
        # tests that run afterwards.
        self._delete_profiler_mocks()

def test_profile(self):
self._create_profiler_mocks()
profile = self.column_profiler(self.aws_dataset["datetime"])
Expand Down
12 changes: 12 additions & 0 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def test_true_categorical_report(self):
('unalikeability', 2*(12 + 15 + 20)/132)
]),
)

# We have to pop these values because sometimes the order changes
self.assertCountEqual(expected_profile['statistics'].pop('categories'),
report["statistics"].pop('categories'))
Expand All @@ -190,6 +191,17 @@ def test_false_categorical_report(self):
)
self.assertEqual(report, expected_profile)

def test_report(self):
    """
    Check that ``report`` matches ``profile`` for both flag values.

    The column is updated with twenty distinct string values first.
    """
    series = pd.Series([str(value) for value in range(0, 20)])
    column = CategoricalColumn(series.name)
    column.update(series)

    expected = column.profile
    # Build both reports before asserting, in the order False then True.
    reports = [
        column.report(remove_disabled_flag=flag) for flag in (False, True)
    ]
    for actual in reports:
        self.assertDictEqual(expected, actual)

def test_categorical_merge(self):
df1 = pd.Series(["abcd", "aa", "abcd", "aa", "b", "4", "3", "2",
"dfd", "2", np.nan])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,18 @@ def test_profile(self, mock_instance):
expected = defaultdict(float, {'data_labeler_predict': 2.0})
self.assertEqual(expected, profiler.profile['times'])

def test_report(self, mock_instance):
    """
    Check that ``report`` matches ``profile`` for both flag values.

    The series is only used for its name; the column is not updated with
    it before the comparison.
    """
    self._setup_data_labeler_mock(mock_instance)

    series = pd.Series(['1', '2', '3'])
    column = DataLabelerColumn(series.name)

    expected = column.profile
    # Build both reports before asserting, in the order False then True.
    reports = [
        column.report(remove_disabled_flag=flag) for flag in (False, True)
    ]
    for actual in reports:
        self.assertDictEqual(expected, actual)

def test_label_match(self, mock_instance):
"""
Test label match between avg_prediction and data_label_representation
Expand Down
16 changes: 15 additions & 1 deletion dataprofiler/tests/profilers/test_datetime_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,14 +261,28 @@ def test_profile(self):
self.assertEqual(expected, profiler.profile['times'])
profile = profiler.profile
self.assertCountEqual(expected_profile, profile)

# Validate time in datetime class has expected time after second
# update
profiler.update(df)
expected = defaultdict(float, {'datetime': 2.0})
self.assertEqual(expected, profiler.profile['times'])
self.assertEqual(expected_profile.pop('max'), profiler.profile['max'])

def test_report(self):
    """
    Check that ``report`` matches ``profile`` for both flag values.

    The series is only used for its name; the column is not updated with
    it before the comparison.
    """
    raw_values = [
        2.5, 12.5, '2013-03-10 15:43:30', 5, '03/10/13 15:43',
        'Mar 11, 2013'
    ]
    series = pd.Series(raw_values).apply(str)
    column = DateTimeColumn(series.name)

    expected = column.profile
    # Build both reports before asserting, in the order False then True.
    reports = [
        column.report(remove_disabled_flag=flag) for flag in (False, True)
    ]
    for actual in reports:
        self.assertDictEqual(expected, actual)

def test_warning_for_bad_dates(self):

df = pd.Series(['03/10/2013 15:43'])
Expand Down
12 changes: 12 additions & 0 deletions dataprofiler/tests/profilers/test_order_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,18 @@ def test_profile(self):
# key and value populated correctly
self.assertDictEqual(expected_profile, profile)

def test_report(self):
    """
    Check that ``report`` matches ``profile`` for both flag values.

    The series is only used for its name; the column is not updated with
    it before the comparison.
    """
    series = pd.Series([1]).apply(str)
    column = OrderColumn(series.name)

    expected = column.profile
    # Build both reports before asserting, in the order False then True.
    reports = [
        column.report(remove_disabled_flag=flag) for flag in (False, True)
    ]
    for actual in reports:
        self.assertDictEqual(expected, actual)

def test_profile_merge(self):
data = [1, 2, 3, 4, 5, 6]
df = pd.Series(data).apply(str)
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1836,7 +1836,7 @@ def test_column_names(self):
def test_update_match_are_abstract(self):
six.assertCountEqual(
self,
{'profile', '_update_helper', 'update'},
{'profile', '_update_helper', 'report', 'update'},
dp.profilers.BaseColumnProfiler.__abstractmethods__
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,36 @@ def test_profile(self, processor_class_mock, model_class_mock):
# key and value populated correctly
self.assertDictEqual(expected_profile, profile)

@mock.patch('dataprofiler.profilers.'
            'unstructured_labeler_profile.DataLabeler')
@mock.patch('dataprofiler.profilers.'
            'unstructured_labeler_profile.'
            'CharPostprocessor')
def test_report(self, processor_class_mock, model_class_mock):
    """
    Check that ``report`` matches ``profile`` for both flag values.

    ``DataLabeler`` and ``CharPostprocessor`` are patched in the
    unstructured labeler module. Patch decorators apply bottom-up, so the
    ``CharPostprocessor`` mock arrives as the first argument.
    """
    # Stand-in labeler with one label and a fixed prediction.
    labeler_stub = mock.Mock()
    labeler_stub.reverse_label_mapping = {1: 'UNKNOWN'}
    labeler_stub.predict.return_value = {'pred': [[1]]}
    model_class_mock.return_value = labeler_stub

    # Stand-in postprocessor returning an empty prediction.
    postprocessor_stub = mock.Mock()
    postprocessor_stub.process.return_value = {'pred': [[]]}
    processor_class_mock.return_value = postprocessor_stub

    labeler_profile = UnstructuredLabelerProfile()
    text_sample = pd.Series(["a"])

    # Fake clock: popping from the end yields 1.0, 2.0, 3.0, 4.0 in turn.
    fake_times = [float(tick) for tick in range(4, 0, -1)]
    with mock.patch('time.time', side_effect=lambda: fake_times.pop()):
        labeler_profile.update(text_sample)

    expected = labeler_profile.profile
    # Build both reports before asserting, in the order False then True.
    reports = [
        labeler_profile.report(remove_disabled_flag=flag)
        for flag in (False, True)
    ]
    for actual in reports:
        self.assertDictEqual(expected, actual)

@mock.patch('dataprofiler.profilers.'
'unstructured_labeler_profile.DataLabeler')
@mock.patch('dataprofiler.profilers.'
Expand Down