diff --git a/dataprofiler/profilers/base_column_profilers.py b/dataprofiler/profilers/base_column_profilers.py index 9ec5b8697..cadb61fa9 100644 --- a/dataprofiler/profilers/base_column_profilers.py +++ b/dataprofiler/profilers/base_column_profilers.py @@ -235,7 +235,8 @@ def report(self, remove_disabled_flag=False): """ Private abstract method for returning report. - :param remove_disabled_flag: flag to determine if disabled options should be excluded in the report. + :param remove_disabled_flag: flag to determine if disabled + options should be excluded in the report. :type remove_disabled_flag: boolean """ diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index dc45740ce..27049aa4d 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -121,7 +121,8 @@ def report(self, remove_disabled_flag=False): """ Private abstract method for returning report. - :param remove_disabled_flag: flag to determine if disabled options should be excluded in the report. + :param remove_disabled_flag: flag to determine if disabled + options should be excluded in the report. :type remove_disabled_flag: boolean """ return self.profile diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py index 12133c249..a7eece60a 100644 --- a/dataprofiler/profilers/data_labeler_column_profile.py +++ b/dataprofiler/profilers/data_labeler_column_profile.py @@ -243,7 +243,8 @@ def report(self, remove_disabled_flag=False): """ Private abstract method for returning report. - :param remove_disabled_flag: flag to determine if disabled options should be excluded in the report. + :param remove_disabled_flag: flag to determine if disabled + options should be excluded in the report. 
:type remove_disabled_flag: boolean """ return self.profile diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py index 0027ef34c..00c91cb37 100644 --- a/dataprofiler/profilers/datetime_column_profile.py +++ b/dataprofiler/profilers/datetime_column_profile.py @@ -115,7 +115,8 @@ def report(self, remove_disabled_flag=False): """ Private abstract method for returning report. - :param remove_disabled_flag: flag to determine if disabled options should be excluded in the report. + :param remove_disabled_flag: flag to determine if disabled + options should be excluded in the report. :type remove_disabled_flag: boolean """ return self.profile diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py index 32010d717..fc49dd8f1 100644 --- a/dataprofiler/profilers/int_column_profile.py +++ b/dataprofiler/profilers/int_column_profile.py @@ -57,7 +57,8 @@ def report(self, remove_disabled_flag=False): """ Private abstract method for returning report. - :param remove_disabled_flag: flag to determine if disabled options should be excluded in the report. + :param remove_disabled_flag: flag to determine if disabled + options should be excluded in the report. :type remove_disabled_flag: boolean """ return self.profile diff --git a/dataprofiler/profilers/order_column_profile.py b/dataprofiler/profilers/order_column_profile.py index 2bcdc2cb4..ec3614a9e 100644 --- a/dataprofiler/profilers/order_column_profile.py +++ b/dataprofiler/profilers/order_column_profile.py @@ -232,7 +232,8 @@ def report(self, remove_disabled_flag=False): """ Private abstract method for returning report. - :param remove_disabled_flag: flag to determine if disabled options should be excluded in the report. + :param remove_disabled_flag: flag to determine if disabled + options should be excluded in the report. 
:type remove_disabled_flag: boolean """ return self.profile diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index a75b0a873..55f3c8747 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -264,12 +264,11 @@ def diff(self, other_profile, options=None): profile[key] = None return profile - - @property - def profile(self): + + def report(self, remove_disabled_flag=False): unordered_profile = dict() for profile in self.profiles.values(): - utils.dict_merge(unordered_profile, profile.profile) + utils.dict_merge(unordered_profile, profile.report(remove_disabled_flag)) name = self.name if isinstance(self.name, np.integer): @@ -313,6 +312,10 @@ def profile(self): return profile + @property + def profile(self): + return self.report(remove_disabled_flag=False) + def _update_base_stats(self, base_stats): self.sample_size += base_stats["sample_size"] self._last_batch_size = base_stats["sample_size"] @@ -1099,10 +1102,12 @@ def report(self, report_options=None): report_options = { "output_format": None, "omit_keys": None, + "remove_disabled_flag": False, } output_format = report_options.get("output_format", None) omit_keys = report_options.get("omit_keys", None) + remove_disabled_flag = report_options.get("remove_disabled_flag", False) report = OrderedDict([ ("global_stats", { @@ -1115,7 +1120,7 @@ def report(self, report_options=None): }), ("data_stats", OrderedDict()), ]) - report["data_stats"] = self._profile.profile + report["data_stats"] = self._profile.report(remove_disabled_flag) return _prepare_report(report, output_format, omit_keys) @utils.method_timeit(name="clean_and_base_stats") @@ -1550,11 +1555,13 @@ def report(self, report_options=None): report_options = { "output_format": None, "num_quantile_groups": 4, + "remove_disabled_flag": False, } output_format = report_options.get("output_format", None) omit_keys = report_options.get("omit_keys", []) 
num_quantile_groups = report_options.get("num_quantile_groups", 4) + remove_disabled_flag = report_options.get("remove_disabled_flag", False) report = OrderedDict([ ("global_stats", { @@ -1578,7 +1585,7 @@ def report(self, report_options=None): for i in range(len(self._profile)): col_name = self._profile[i].name report["global_stats"]["profile_schema"][col_name].append(i) - report["data_stats"].append(self._profile[i].profile) + report["data_stats"].append(self._profile[i].report(remove_disabled_flag)) quantiles = report["data_stats"][i]["statistics"].get('quantiles') if quantiles: quantiles = calculate_quantiles(num_quantile_groups, quantiles) diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 56d29d072..32b646965 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -972,6 +972,63 @@ def test_omit_cols_preserves_schema(self): for col_report in report["data_stats"]: self.assertIsNone(col_report) + def test_report_remove_disabled_flag(self): + data = pd.DataFrame([[1.01, 2.02, 'if you'], + [10.01, 20.02,'read this you'], + [100.01,200.02, 'are cool']], + columns=["a", "b", "wordy_text_words"]) + + # with options to disable FloatColumn `precision` + # and with remove_disabled_flag == True + profiler_options = ProfilerOptions() + profiler_options.set({"precision.is_enabled": False}) + profiler = dp.StructuredProfiler(data=data, options=profiler_options) + report = profiler.report(report_options={"remove_disabled_flag": True}) + + for iter_value in range(0, len(data.columns)-1): + self.assertNotIn("precision", report["data_stats"][iter_value]["statistics"]) + + # with options to disable NumericalMixIn calculation of `min` + # and with remove_disabled_flag == True + profiler_options = ProfilerOptions() + profiler_options.set({"min.is_enabled": False}) + profiler = dp.StructuredProfiler(data=data, options=profiler_options) + 
report = 
profiler.report(report_options={"remove_disabled_flag": True}) + + for iter_value in range(0,len(data.columns)-1): + self.assertNotIn("min", report["data_stats"][iter_value]["statistics"]) + + # with options to disable TextColumn calculation of `vocab` + # and with remove_disabled_flag == True + profiler_options = ProfilerOptions() + profiler_options.set({"vocab.is_enabled": False}) + profiler = dp.StructuredProfiler(data=data, options=profiler_options) + report = profiler.report(report_options={"remove_disabled_flag": True}) + + for iter_value in range(0,len(data.columns)): + self.assertNotIn("vocab", report["data_stats"][iter_value]["statistics"]) + + # with profiler options and default remove_disabled_flag + profiler_options = ProfilerOptions() + profiler_options.set({"min.is_enabled": False}) + profiler = dp.StructuredProfiler(data=data, options=profiler_options) + report = profiler.report() + + for iter_value in range(0,len(data.columns)): + self.assertIn("min", report["data_stats"][iter_value]["statistics"]) + + # w/o profiler options and default remove_disabled_flag + profiler = dp.StructuredProfiler(data=data) + report = profiler.report() + + for iter_value in range(0, len(data.columns)-1): + self.assertIn("precision", report["data_stats"][iter_value]["statistics"]) + self.assertIn("min", report["data_stats"][iter_value]["statistics"]) + + self.assertNotIn("precision", report["data_stats"][2]["statistics"]) + self.assertIn("min", report["data_stats"][2]["statistics"]) + self.assertIn("vocab", report["data_stats"][2]["statistics"]) + def test_report_quantiles(self): report_none = self.trained_schema.report( report_options={"num_quantile_groups": None}) @@ -2607,6 +2664,19 @@ def test_update_profile(self): expected_word_count, report['data_stats']['statistics']['word_count']) + def test_report_remove_disabled_flag(self): + profiler_options = ProfilerOptions() + profiler_options.set({"vocab.is_enabled": False}) + profiler = 
dp.UnstructuredProfiler(data=self.dataset, 
options=profiler_options) + report = profiler.report(report_options={"remove_disabled_flag": True}) + self.assertNotIn('vocab', report['data_stats']['statistics']) + self.assertIn('words', report['data_stats']['statistics']) + + profiler = dp.UnstructuredProfiler(data=self.dataset) + report = profiler.report(report_options={"remove_disabled_flag": True}) + self.assertIn('vocab', report['data_stats']['statistics']) + self.assertIn('words', report['data_stats']['statistics']) + def test_save_and_load(self): data_folder = "dataprofiler/tests/data/" test_files = ["txt/code.txt", "txt/sentence-10x.txt"]