From bee16d3becc74187ba01bbf8529a05c2faee179b Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Fri, 4 Nov 2022 11:35:26 -0400 Subject: [PATCH 1/4] Fix bug when row is all null --- dataprofiler/profilers/profile_builder.py | 15 ++++++++++++++- .../tests/profilers/test_profile_builder.py | 13 +++++++++++++ 2 files changed, 27 insertions(+), 1 deletion(-) diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index 150135f40..f2c4643b6 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -2240,7 +2240,20 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None: :param clean_samples: input cleaned dataset :type clean_samples: dict """ - data = pd.DataFrame(clean_samples).apply(pd.to_numeric, errors="coerce") + data: pd.DataFrame = pd.DataFrame(clean_samples) + + # If the last row is all null, then add rows to the data DataFrame + max_null_index = max( + [max(i) for i in getattr(self._profile[0], "null_types_index").values()], + default=0, + ) + if max_null_index > data.index.max(): + data.loc[max_null_index] = {} + + # Fill in missing rows with NaN and convert types to numeric + data = data.reindex(range(data.index.max() + 1), fill_value=np.nan).apply( + pd.to_numeric, errors="coerce" + ) get_data_type = lambda profile: profile.profiles[ # NOQA: E731 "data_type_profile" diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index c90ef3eb2..1cbca2192 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -2081,6 +2081,19 @@ def test_null_replication_metrics_calculation(self): np.testing.assert_array_almost_equal([[np.nan], [18]], column["class_sum"]) np.testing.assert_array_almost_equal([[np.nan], [9]], column["class_mean"]) + # Test with all null in a row + data_4 = pd.DataFrame([[10, 10], [9999999, 9999999]]) + + profiler = dp.StructuredProfiler(data_4, options=profile_options) + report = profiler.report() + + self.assertTrue("null_replication_metrics" in report["data_stats"][0]) + column = report["data_stats"][0]["null_replication_metrics"] + + np.testing.assert_array_almost_equal([0.5, 0.5], column["class_prior"]) + np.testing.assert_array_almost_equal([[10], [0]], column["class_sum"]) + np.testing.assert_array_almost_equal([[10], [0]], column["class_mean"]) + def test_column_level_invalid_values(self): data = pd.DataFrame([[1, 1], [9999999, 2], [3, 3]]) From 7644b792eacda7db2ce13f6450f6f566d7ffd17f Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Fri, 4 Nov 2022 11:40:48 -0400 Subject: [PATCH 2/4] Improve test --- dataprofiler/tests/profilers/test_profile_builder.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 1cbca2192..a08fcad96 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -2082,7 +2082,9 @@ def test_null_replication_metrics_calculation(self): np.testing.assert_array_almost_equal([[np.nan], [9]], column["class_mean"]) # Test with all null in a row - data_4 = pd.DataFrame([[10, 10], [9999999, 9999999]]) + data_4 = pd.DataFrame( + [[10, 20], [9999999, 9999999], [30, 9999999], [9999999, 9999999]] + ) profiler = dp.StructuredProfiler(data_4, options=profile_options) report = profiler.report() @@ -2091,7 +2093,7 @@ def test_null_replication_metrics_calculation(self): column = report["data_stats"][0]["null_replication_metrics"] np.testing.assert_array_almost_equal([0.5, 0.5], column["class_prior"]) - np.testing.assert_array_almost_equal([[10], [0]], column["class_sum"]) + np.testing.assert_array_almost_equal([[20], [0]], column["class_sum"]) np.testing.assert_array_almost_equal([[10], [0]], column["class_mean"]) def test_column_level_invalid_values(self): From a3b0a168e53757e457924796291559aa54e62732 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Fri, 4 Nov 2022 15:18:02 -0400 Subject: [PATCH 3/4] Remove unnecessary type --- dataprofiler/profilers/profile_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index f2c4643b6..901755de0 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -2240,7 +2240,7 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None: :param clean_samples: input cleaned dataset :type clean_samples: dict """ - data: pd.DataFrame = pd.DataFrame(clean_samples) + data = pd.DataFrame(clean_samples) # If the last row is all null, then add rows to the data DataFrame max_null_index = max( From 00ee95ab93e32f89445dae96fd56f6b5cff10332 Mon Sep 17 00:00:00 2001 From: Tony Wu Date: Tue, 8 Nov 2022 16:53:05 -0500 Subject: [PATCH 4/4] Improve efficiency --- dataprofiler/profilers/profile_builder.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index 901755de0..54a2b44de 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -2240,20 +2240,7 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None: :param clean_samples: input cleaned dataset :type clean_samples: dict """ - data = pd.DataFrame(clean_samples) - - # If the last row is all null, then add rows to the data DataFrame - max_null_index = max( - [max(i) for i in getattr(self._profile[0], "null_types_index").values()], - default=0, - ) - if max_null_index > data.index.max(): - data.loc[max_null_index] = {} - - # Fill in missing rows with NaN and convert types to numeric - data = data.reindex(range(data.index.max() + 1), fill_value=np.nan).apply( - pd.to_numeric, errors="coerce" - ) + data = pd.DataFrame(clean_samples).apply(pd.to_numeric, errors="coerce") get_data_type = lambda profile: profile.profiles[ # NOQA: E731 "data_type_profile" @@ -2300,7 +2287,11 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None: # Partition data based on whether target column value is null or not # Calculate sum, mean of each partition without including current column # in calculation - sum_null = data.iloc[null_indices, data.columns != col_id].sum().to_numpy() + sum_null = ( + data.loc[data.index.intersection(null_indices), data.columns != col_id] + .sum() + .to_numpy() + ) # Add old sum_null if exists if col_id in self._null_replication_metrics: