From ca3100140e8d76c6bd9d993a7702f59f11ab6235 Mon Sep 17 00:00:00 2001 From: micdavis Date: Tue, 29 Mar 2022 14:22:32 -0400 Subject: [PATCH 1/5] Fix: updated test_(int/float)_column_profile.py test_diff to cover json.dumps() functionality. Updated numerical_column_stats.py _perform_t_test to cast conservative and welch values to floats before assignment. --- dataprofiler/profilers/numerical_column_stats.py | 8 ++++---- dataprofiler/tests/profilers/test_float_column_profile.py | 6 +++++- dataprofiler/tests/profilers/test_int_column_profile.py | 6 +++++- 3 files changed, 14 insertions(+), 6 deletions(-) diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py index f8b2d5f86..5cfff6a48 100644 --- a/dataprofiler/profilers/numerical_column_stats.py +++ b/dataprofiler/profilers/numerical_column_stats.py @@ -422,16 +422,16 @@ def _perform_t_test(mean1, var1, n1, welch_df = s_delta ** 2 / ((var1 / n1) ** 2 / (n1 - 1) + (var2 / n2) ** 2 / (n2 - 1)) results['t-statistic'] = t - results['conservative']['df'] = conservative_df - results['welch']['df'] = welch_df + results['conservative']['df'] = float(conservative_df) + results['welch']['df'] = float(welch_df) conservative_t = scipy.stats.t(conservative_df) conservative_p_val = (1 - conservative_t.cdf(abs(t))) * 2 welch_t = scipy.stats.t(welch_df) welch_p_val = (1 - welch_t.cdf(abs(t))) * 2 - results['conservative']['p-value'] = conservative_p_val - results['welch']['p-value'] = welch_p_val + results['conservative']['p-value'] = float(conservative_p_val) + results['welch']['p-value'] = float(welch_p_val) return results def _update_variance(self, batch_mean, batch_var, batch_count): diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index 6372b85aa..e8e2746c6 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -6,7 +6,7 @@ import pandas as pd import numpy as np - +import json from dataprofiler.profilers import FloatColumn from dataprofiler.profilers.profiler_options import FloatOptions @@ -1569,6 +1569,10 @@ def test_diff(self): } } profile_diff = profiler1.diff(profiler2) + try: + json.dumps(profile_diff) + except TypeError: + self.fail('Object of type int64 is not JSON serializable in the diff') self.assertAlmostEqual( expected_diff.pop('median'), profile_diff.pop('median'), places=2) expected_diff_mode = expected_diff.pop('mode') diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 1e418c8dd..44343fa44 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -6,7 +6,7 @@ import pandas as pd import numpy as np import warnings - +import json from dataprofiler.profilers import IntColumn from dataprofiler.profilers.profiler_options import IntOptions @@ -1006,6 +1006,10 @@ def test_diff(self): } } profile_diff = profiler1.diff(profiler2) + try: + json.dumps(profile_diff) + except TypeError: + self.fail('Object of type int64 is not JSON serializable in the diff') self.assertAlmostEqual( expected_diff.pop('median'), profile_diff.pop('median'), places=2) expected_diff_mode = expected_diff.pop('mode') From 76ee8670cc6f44a4be17ceef84c724db31b22131 Mon Sep 17 00:00:00 2001 From: micdavis Date: Tue, 29 Mar 2022 16:27:43 -0400 Subject: [PATCH 2/5] Fix: addressed PR change requests. --- .../tests/profilers/test_float_column_profile.py | 8 ++++---- .../tests/profilers/test_int_column_profile.py | 14 +++++++------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index e8e2746c6..7819b3bdd 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -3,10 +3,10 @@ from collections import defaultdict from unittest import mock import warnings +import json import pandas as pd import numpy as np -import json from dataprofiler.profilers import FloatColumn from dataprofiler.profilers.profiler_options import FloatOptions @@ -294,7 +294,7 @@ def test_profiled_mode(self): profiler.update(df) np.testing.assert_array_almost_equal([1.9, 2.01], profiler.mode, decimal=2) - + # all unique values df = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).apply(str) profiler = FloatColumn(df.name) @@ -1571,8 +1571,8 @@ def test_diff(self): profile_diff = profiler1.diff(profiler2) try: json.dumps(profile_diff) - except TypeError: - self.fail('Object of type int64 is not JSON serializable in the diff') + except TypeError as e: + self.fail('There was an issue serializing the profile diff to JSON. Exception raised: {}'.format(str(e))) self.assertAlmostEqual( expected_diff.pop('median'), profile_diff.pop('median'), places=2) expected_diff_mode = expected_diff.pop('mode') diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 44343fa44..9ab78d5b2 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -2,11 +2,11 @@ import unittest from unittest import mock from collections import defaultdict +import json import pandas as pd import numpy as np import warnings -import json from dataprofiler.profilers import IntColumn from dataprofiler.profilers.profiler_options import IntOptions @@ -540,7 +540,7 @@ def test_profile(self): 'sum': 1.0, 'variance': 1.0, 'skewness': 1.0, 'kurtosis': 1.0, 'num_negatives': 1.0, 'num_zeros': 1.0}) - + ) time_array = [float(i) for i in range(100, 0, -1)] with mock.patch('time.time', side_effect=lambda: time_array.pop()): @@ -824,7 +824,7 @@ def test_profile_merge_no_bin_overlap(self): 'Profiles have no overlapping bin methods ' 'and therefore cannot be added together.'): profiler1 + profiler2 - + def test_profile_merge_with_different_options(self): # Creating first profiler with default options options = IntOptions() @@ -850,13 +850,13 @@ def test_profile_merge_with_different_options(self): "max is disabled because it is not enabled in" " both profiles."): profiler3 = profiler1 + profiler2 - + # Assert that these features are still merged profile = profiler3.profile self.assertIsNotNone(profiler3.histogram_selection) self.assertIsNotNone(profile['variance']) self.assertIsNotNone(profiler3.sum) - + # Assert that these features are not calculated self.assertIsNone(profiler3.max) self.assertIsNone(profiler3.min) @@ -1008,8 +1008,8 @@ def test_diff(self): profile_diff = profiler1.diff(profiler2) try: json.dumps(profile_diff) - except TypeError: - self.fail('Object of type int64 is not JSON serializable in the diff') + except TypeError as e: + self.fail('There was an issue serializing the profile diff to JSON. Exception raised: {}'.format(str(e))) self.assertAlmostEqual( expected_diff.pop('median'), profile_diff.pop('median'), places=2) expected_diff_mode = expected_diff.pop('mode') From 9201b56cecefad2647c229437758cd507d57612b Mon Sep 17 00:00:00 2001 From: micdavis Date: Wed, 30 Mar 2022 09:55:03 -0400 Subject: [PATCH 3/5] Fix: Reformatting --- dataprofiler/tests/profilers/test_float_column_profile.py | 5 ++++- dataprofiler/tests/profilers/test_int_column_profile.py | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/dataprofiler/tests/profilers/test_float_column_profile.py b/dataprofiler/tests/profilers/test_float_column_profile.py index 7819b3bdd..d5253d5a5 100644 --- a/dataprofiler/tests/profilers/test_float_column_profile.py +++ b/dataprofiler/tests/profilers/test_float_column_profile.py @@ -7,6 +7,7 @@ import pandas as pd import numpy as np + from dataprofiler.profilers import FloatColumn from dataprofiler.profilers.profiler_options import FloatOptions @@ -1572,7 +1573,9 @@ def test_diff(self): try: json.dumps(profile_diff) except TypeError as e: - self.fail('There was an issue serializing the profile diff to JSON. Exception raised: {}'.format(str(e))) + self.fail( + 'JSON Serializing issue with the profile diff. ' + 'Exception raised: {}'.format(str(e))) self.assertAlmostEqual( expected_diff.pop('median'), profile_diff.pop('median'), places=2) expected_diff_mode = expected_diff.pop('mode') diff --git a/dataprofiler/tests/profilers/test_int_column_profile.py b/dataprofiler/tests/profilers/test_int_column_profile.py index 9ab78d5b2..81cc16017 100644 --- a/dataprofiler/tests/profilers/test_int_column_profile.py +++ b/dataprofiler/tests/profilers/test_int_column_profile.py @@ -7,6 +7,7 @@ import pandas as pd import numpy as np import warnings + from dataprofiler.profilers import IntColumn from dataprofiler.profilers.profiler_options import IntOptions @@ -1009,7 +1010,9 @@ def test_diff(self): try: json.dumps(profile_diff) except TypeError as e: - self.fail('There was an issue serializing the profile diff to JSON. Exception raised: {}'.format(str(e))) + self.fail( + 'JSON Serializing issue with the profile diff. ' + 'Exception raised: {}'.format(str(e))) self.assertAlmostEqual( expected_diff.pop('median'), profile_diff.pop('median'), places=2) expected_diff_mode = expected_diff.pop('mode') From cdf2635b97eebad7863d0bdb905a031e66fa00c0 Mon Sep 17 00:00:00 2001 From: micdavis Date: Tue, 5 Apr 2022 12:19:49 -0400 Subject: [PATCH 4/5] Bumped version --- dataprofiler/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/version.py b/dataprofiler/version.py index 1be848a81..8f2b8bd9b 100644 --- a/dataprofiler/version.py +++ b/dataprofiler/version.py @@ -4,7 +4,7 @@ MAJOR = 0 MINOR = 7 -MICRO = 6 +MICRO = 7 VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO) From 26fda7fbbf0d4bded6605da3cd93dc8c7e91da55 Mon Sep 17 00:00:00 2001 From: micdavis Date: Fri, 24 Jun 2022 12:35:30 -0400 Subject: [PATCH 5/5] Added remove_disabled_flag for TextProfiler --- .../profilers/unstructured_text_profile.py | 20 ++++++++ .../test_unstructured_text_profile.py | 46 +++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/dataprofiler/profilers/unstructured_text_profile.py b/dataprofiler/profilers/unstructured_text_profile.py index 216d1d50f..365f19a2a 100644 --- a/dataprofiler/profilers/unstructured_text_profile.py +++ b/dataprofiler/profilers/unstructured_text_profile.py @@ -266,6 +266,26 @@ def diff(self, other_profile, options=None): return diff + def report(self, remove_disabled_flag=False): + """Report on profile attribute of the class and pop value + from self.profile if key not in self.__calculations + """ + calcs_dict_keys = self._TextProfiler__calculations.keys() + profile = self.profile + + if remove_disabled_flag: + profile_keys = list(profile.keys()) + for profile_key in profile_keys: + if profile_key == 'vocab': + if 'vocab' in calcs_dict_keys: + continue + if profile_key == 'words': + if 'words' in calcs_dict_keys: + continue + profile.pop(profile_key) + + return profile + @property def profile(self): """ diff --git a/dataprofiler/tests/profilers/test_unstructured_text_profile.py b/dataprofiler/tests/profilers/test_unstructured_text_profile.py index bf982ddd7..dc850e736 100644 --- a/dataprofiler/tests/profilers/test_unstructured_text_profile.py +++ b/dataprofiler/tests/profilers/test_unstructured_text_profile.py @@ -366,6 +366,52 @@ def test_options_default(self): self.assertDictEqual(expected_word_count, text_profile.word_count) self.assertDictEqual(expected_vocab, text_profile.vocab_count) + def test_report(self): + """Test report method in TextProfiler class under four (4) scenarios. + First, test under scenario of disabling vocab and word. Second, test with no options and + `remove_disabled_flag`=True. Third, test no options and default + `remove_disabled_flag`. Lastly, test under scenario of disabling vocab but not word. + """ + options = TextProfilerOptions() # With TextProfilerOptions as False and remove_disabled_flag == True + options.vocab.is_enabled = False + options.words.is_enabled = False + + profiler = TextProfiler("Name", options) + sample = pd.Series(["This is test, a Test sentence.!!!"]) + profiler.update(sample) + + report = profiler.report(remove_disabled_flag=True) + report_keys = list(report.keys()) + self.assertNotIn('vocab', report_keys) + self.assertNotIn('words', report_keys) + + profiler = TextProfiler("Name") # w/o TextProfilerOptions and remove_disabled_flag == True + report = profiler.report(remove_disabled_flag=True) + report_keys = list(report.keys()) + self.assertIn('vocab', report_keys) + self.assertIn('words', report_keys) + + profiler = TextProfiler("Name") # w/o TextProfilerOptions and remove_disabled_flag default + report = profiler.report() + report_keys = list(report.keys()) + self.assertIn('vocab', report_keys) + self.assertIn('words', report_keys) + + options = TextProfilerOptions() # With TextProfilerOptions True/False and remove_disabled_flag == True + options.vocab.is_enabled = True + options.words.is_enabled = False + + profiler = TextProfiler("Name", options) + sample = pd.Series(["This is test, a Test sentence.!!!"]) + profiler.update(sample) + + report = profiler.report(remove_disabled_flag=True) + report_keys = list(report.keys()) + + self.assertIn('vocab', report_keys) + self.assertNotIn('words', report_keys) + + def test_options_case_sensitive(self): # change is_case_sensitive, other options remain the same as default values options = TextProfilerOptions()