Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Profile Builder: add report for remove_disable_flag at the top level #496

Merged
merged 15 commits into from
Jun 27, 2022
3 changes: 2 additions & 1 deletion dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -235,7 +235,8 @@ def report(self, remove_disabled_flag=False):
"""
Private abstract method for returning report.

:param remove_disabled_flag: flag to determine if disabled options should be excluded in the report.
:param remove_disabled_flag: flag to determine if disabled
options should be excluded in the report.
:type remove_disabled_flag: boolean
"""

Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,8 @@ def report(self, remove_disabled_flag=False):
"""
Private abstract method for returning report.

:param remove_disabled_flag: flag to determine if disabled options should be excluded in the report.
:param remove_disabled_flag: flag to determine if disabled
options should be excluded in the report.
:type remove_disabled_flag: boolean
"""
return self.profile
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/profilers/data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,7 +243,8 @@ def report(self, remove_disabled_flag=False):
"""
Private abstract method for returning report.

:param remove_disabled_flag: flag to determine if disabled options should be excluded in the report.
:param remove_disabled_flag: flag to determine if disabled
options should be excluded in the report.
:type remove_disabled_flag: boolean
"""
return self.profile
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/profilers/datetime_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ def report(self, remove_disabled_flag=False):
"""
Private abstract method for returning report.

:param remove_disabled_flag: flag to determine if disabled options should be excluded in the report.
:param remove_disabled_flag: flag to determine if disabled
options should be excluded in the report.
:type remove_disabled_flag: boolean
"""
return self.profile
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ def report(self, remove_disabled_flag=False):
"""
Private abstract method for returning report.

:param remove_disabled_flag: flag to determine if disabled options should be excluded in the report.
:param remove_disabled_flag: flag to determine if disabled
options should be excluded in the report.
:type remove_disabled_flag: boolean
"""
return self.profile
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/profilers/order_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,8 @@ def report(self, remove_disabled_flag=False):
"""
Private abstract method for returning report.

:param remove_disabled_flag: flag to determine if disabled options should be excluded in the report.
:param remove_disabled_flag: flag to determine if disabled
options should be excluded in the report.
:type remove_disabled_flag: boolean
"""
return self.profile
Expand Down
19 changes: 13 additions & 6 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,12 +264,11 @@ def diff(self, other_profile, options=None):
profile[key] = None

return profile

@property
def profile(self):

def report(self, remove_disabled_flag=False):
unordered_profile = dict()
for profile in self.profiles.values():
utils.dict_merge(unordered_profile, profile.profile)
utils.dict_merge(unordered_profile, profile.report(remove_disabled_flag))

name = self.name
if isinstance(self.name, np.integer):
Expand Down Expand Up @@ -313,6 +312,10 @@ def profile(self):

return profile

@property
def profile(self):
    """
    Return the report built with `remove_disabled_flag` turned off.

    Thin read-only wrapper around `report`, always requesting the
    variant in which the flag is False.
    """
    full_report = self.report(remove_disabled_flag=False)
    return full_report

def _update_base_stats(self, base_stats):
self.sample_size += base_stats["sample_size"]
self._last_batch_size = base_stats["sample_size"]
Expand Down Expand Up @@ -1099,10 +1102,12 @@ def report(self, report_options=None):
report_options = {
"output_format": None,
"omit_keys": None,
"remove_disabled_flag": False,
}

output_format = report_options.get("output_format", None)
omit_keys = report_options.get("omit_keys", None)
remove_disabled_flag = report_options.get("remove_disabled_flag", False)

report = OrderedDict([
("global_stats", {
Expand All @@ -1115,7 +1120,7 @@ def report(self, report_options=None):
}),
("data_stats", OrderedDict()),
])
report["data_stats"] = self._profile.profile
report["data_stats"] = self._profile.report(remove_disabled_flag)
return _prepare_report(report, output_format, omit_keys)

@utils.method_timeit(name="clean_and_base_stats")
Expand Down Expand Up @@ -1550,11 +1555,13 @@ def report(self, report_options=None):
report_options = {
"output_format": None,
"num_quantile_groups": 4,
"remove_disabled_flag": False,
}

output_format = report_options.get("output_format", None)
omit_keys = report_options.get("omit_keys", [])
num_quantile_groups = report_options.get("num_quantile_groups", 4)
remove_disabled_flag = report_options.get("remove_disabled_flag", False)

report = OrderedDict([
("global_stats", {
Expand All @@ -1578,7 +1585,7 @@ def report(self, report_options=None):
for i in range(len(self._profile)):
col_name = self._profile[i].name
report["global_stats"]["profile_schema"][col_name].append(i)
report["data_stats"].append(self._profile[i].profile)
report["data_stats"].append(self._profile[i].report(remove_disabled_flag))
quantiles = report["data_stats"][i]["statistics"].get('quantiles')
if quantiles:
quantiles = calculate_quantiles(num_quantile_groups, quantiles)
Expand Down
70 changes: 70 additions & 0 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,63 @@ def test_omit_cols_preserves_schema(self):
for col_report in report["data_stats"]:
self.assertIsNone(col_report)

def test_report_remove_disabled_flag(self):
    """
    Check how `remove_disabled_flag` affects a StructuredProfiler report.

    The data has two float columns followed by one text column. Each
    scenario builds a fresh profiler and inspects the per-column
    `statistics` dict of the resulting report.
    """
    data = pd.DataFrame([[1.01, 2.02, 'if you'],
                         [10.01, 20.02, 'read this you'],
                         [100.01, 200.02, 'are cool']],
                        columns=["a", "b", "wordy_text_words"])
    num_columns = len(data.columns)
    # every column except the trailing text column
    num_float_columns = num_columns - 1

    # with options to disable FloatColumn `precision`
    # and with remove_disabled_flag == True
    profiler_options = ProfilerOptions()
    profiler_options.set({"precision.is_enabled": False})
    profiler = dp.StructuredProfiler(data=data, options=profiler_options)
    report = profiler.report(report_options={"remove_disabled_flag": True})

    for col_idx in range(num_float_columns):
        self.assertNotIn(
            "precision", report["data_stats"][col_idx]["statistics"])

    # with options to disable NumericalMixIn cal `min`
    # and with remove_disabled_flag == True
    profiler_options = ProfilerOptions()
    profiler_options.set({"min.is_enabled": False})
    profiler = dp.StructuredProfiler(data=data, options=profiler_options)
    report = profiler.report(report_options={"remove_disabled_flag": True})

    for col_idx in range(num_float_columns):
        self.assertNotIn(
            "min", report["data_stats"][col_idx]["statistics"])

    # with options to disable TextColumn cal `vocab`
    # and with remove_disabled_flag == True
    profiler_options = ProfilerOptions()
    profiler_options.set({"vocab.is_enabled": False})
    profiler = dp.StructuredProfiler(data=data, options=profiler_options)
    report = profiler.report(report_options={"remove_disabled_flag": True})

    # `vocab` is reported under "statistics" (see the default-options
    # check at the end), so look there; checking the column-level dict
    # would pass even if `vocab` were never removed.
    for col_idx in range(num_columns):
        self.assertNotIn(
            "vocab", report["data_stats"][col_idx]["statistics"])

    # with profiler options and default remove_disabled_flag
    profiler_options = ProfilerOptions()
    profiler_options.set({"min.is_enabled": False})
    profiler = dp.StructuredProfiler(data=data, options=profiler_options)
    report = profiler.report()

    # the flag defaults to False, so the disabled key is still present
    for col_idx in range(num_columns):
        self.assertIn("min", report["data_stats"][col_idx]["statistics"])

    # w/o profiler options and default remove_disabled_flag
    profiler = dp.StructuredProfiler(data=data)
    report = profiler.report()

    for col_idx in range(num_float_columns):
        col_stats = report["data_stats"][col_idx]["statistics"]
        self.assertIn("precision", col_stats)
        self.assertIn("min", col_stats)

    text_stats = report["data_stats"][2]["statistics"]
    self.assertNotIn("precision", text_stats)
    self.assertIn("min", text_stats)
    self.assertIn("vocab", text_stats)

def test_report_quantiles(self):
report_none = self.trained_schema.report(
report_options={"num_quantile_groups": None})
Expand Down Expand Up @@ -2607,6 +2664,19 @@ def test_update_profile(self):
expected_word_count,
report['data_stats']['statistics']['word_count'])

def test_report_remove_disabled_flag(self):
    """
    Check `remove_disabled_flag` on an UnstructuredProfiler report.

    With `vocab` disabled and the flag set, `vocab` must be absent from
    the statistics while `words` remains; with default options both
    keys must be present even though the flag is set.
    """
    # vocab disabled + flag set
    vocab_off_options = ProfilerOptions()
    vocab_off_options.set({"vocab.is_enabled": False})
    vocab_off_profiler = dp.UnstructuredProfiler(
        data=self.dataset, options=vocab_off_options)
    vocab_off_report = vocab_off_profiler.report(
        report_options={"remove_disabled_flag": True})
    self.assertNotIn('vocab', vocab_off_report['data_stats']['statistics'])
    self.assertIn('words', vocab_off_report['data_stats']['statistics'])

    # default options + flag set
    default_profiler = dp.UnstructuredProfiler(data=self.dataset)
    default_report = default_profiler.report(
        report_options={"remove_disabled_flag": True})
    self.assertIn('vocab', default_report['data_stats']['statistics'])
    self.assertIn('words', default_report['data_stats']['statistics'])

def test_save_and_load(self):
data_folder = "dataprofiler/tests/data/"
test_files = ["txt/code.txt", "txt/sentence-10x.txt"]
Expand Down