From bee16d3becc74187ba01bbf8529a05c2faee179b Mon Sep 17 00:00:00 2001
From: Tony Wu <tonywu315@gmail.com>
Date: Fri, 4 Nov 2022 11:35:26 -0400
Subject: [PATCH 1/4] Fix bug when row is all null

---
 dataprofiler/profilers/profile_builder.py         | 15 ++++++++++++++-
 .../tests/profilers/test_profile_builder.py       | 13 +++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
index 150135f40..f2c4643b6 100644
--- a/dataprofiler/profilers/profile_builder.py
+++ b/dataprofiler/profilers/profile_builder.py
@@ -2240,7 +2240,20 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None:
         :param clean_samples: input cleaned dataset
         :type clean_samples: dict
         """
-        data = pd.DataFrame(clean_samples).apply(pd.to_numeric, errors="coerce")
+        data: pd.DataFrame = pd.DataFrame(clean_samples)
+
+        # If the last row is all null, then add rows to the data DataFrame
+        max_null_index = max(
+            [max(i) for i in getattr(self._profile[0], "null_types_index").values()],
+            default=0,
+        )
+        if max_null_index > data.index.max():
+            data.loc[max_null_index] = {}
+
+        # Fill in missing rows with NaN and convert types to numeric
+        data = data.reindex(range(data.index.max() + 1), fill_value=np.nan).apply(
+            pd.to_numeric, errors="coerce"
+        )
 
         get_data_type = lambda profile: profile.profiles[  # NOQA: E731
             "data_type_profile"
diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py
index c90ef3eb2..1cbca2192 100644
--- a/dataprofiler/tests/profilers/test_profile_builder.py
+++ b/dataprofiler/tests/profilers/test_profile_builder.py
@@ -2081,6 +2081,19 @@ def test_null_replication_metrics_calculation(self):
         np.testing.assert_array_almost_equal([[np.nan], [18]], column["class_sum"])
         np.testing.assert_array_almost_equal([[np.nan], [9]], column["class_mean"])
 
+        # Test with all null in a row
+        data_4 = pd.DataFrame([[10, 10], [9999999, 9999999]])
+
+        profiler = dp.StructuredProfiler(data_4, options=profile_options)
+        report = profiler.report()
+
+        self.assertTrue("null_replication_metrics" in report["data_stats"][0])
+        column = report["data_stats"][0]["null_replication_metrics"]
+
+        np.testing.assert_array_almost_equal([0.5, 0.5], column["class_prior"])
+        np.testing.assert_array_almost_equal([[10], [0]], column["class_sum"])
+        np.testing.assert_array_almost_equal([[10], [0]], column["class_mean"])
+
     def test_column_level_invalid_values(self):
         data = pd.DataFrame([[1, 1], [9999999, 2], [3, 3]])
 

From 7644b792eacda7db2ce13f6450f6f566d7ffd17f Mon Sep 17 00:00:00 2001
From: Tony Wu <tonywu315@gmail.com>
Date: Fri, 4 Nov 2022 11:40:48 -0400
Subject: [PATCH 2/4] Improve test

---
 dataprofiler/tests/profilers/test_profile_builder.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py
index 1cbca2192..a08fcad96 100644
--- a/dataprofiler/tests/profilers/test_profile_builder.py
+++ b/dataprofiler/tests/profilers/test_profile_builder.py
@@ -2082,7 +2082,9 @@ def test_null_replication_metrics_calculation(self):
         np.testing.assert_array_almost_equal([[np.nan], [9]], column["class_mean"])
 
         # Test with all null in a row
-        data_4 = pd.DataFrame([[10, 10], [9999999, 9999999]])
+        data_4 = pd.DataFrame(
+            [[10, 20], [9999999, 9999999], [30, 9999999], [9999999, 9999999]]
+        )
 
         profiler = dp.StructuredProfiler(data_4, options=profile_options)
         report = profiler.report()
@@ -2091,7 +2093,7 @@ def test_null_replication_metrics_calculation(self):
         column = report["data_stats"][0]["null_replication_metrics"]
 
         np.testing.assert_array_almost_equal([0.5, 0.5], column["class_prior"])
-        np.testing.assert_array_almost_equal([[10], [0]], column["class_sum"])
+        np.testing.assert_array_almost_equal([[20], [0]], column["class_sum"])
         np.testing.assert_array_almost_equal([[10], [0]], column["class_mean"])
 
     def test_column_level_invalid_values(self):

From a3b0a168e53757e457924796291559aa54e62732 Mon Sep 17 00:00:00 2001
From: Tony Wu <tonywu315@gmail.com>
Date: Fri, 4 Nov 2022 15:18:02 -0400
Subject: [PATCH 3/4] Remove unnecessary type

---
 dataprofiler/profilers/profile_builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
index f2c4643b6..901755de0 100644
--- a/dataprofiler/profilers/profile_builder.py
+++ b/dataprofiler/profilers/profile_builder.py
@@ -2240,7 +2240,7 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None:
         :param clean_samples: input cleaned dataset
         :type clean_samples: dict
         """
-        data: pd.DataFrame = pd.DataFrame(clean_samples)
+        data = pd.DataFrame(clean_samples)
 
         # If the last row is all null, then add rows to the data DataFrame
         max_null_index = max(

From 00ee95ab93e32f89445dae96fd56f6b5cff10332 Mon Sep 17 00:00:00 2001
From: Tony Wu <tonywu315@gmail.com>
Date: Tue, 8 Nov 2022 16:53:05 -0500
Subject: [PATCH 4/4] Improve efficiency

---
 dataprofiler/profilers/profile_builder.py | 21 ++++++---------------
 1 file changed, 6 insertions(+), 15 deletions(-)

diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
index 901755de0..54a2b44de 100644
--- a/dataprofiler/profilers/profile_builder.py
+++ b/dataprofiler/profilers/profile_builder.py
@@ -2240,20 +2240,7 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None:
         :param clean_samples: input cleaned dataset
         :type clean_samples: dict
         """
-        data = pd.DataFrame(clean_samples)
-
-        # If the last row is all null, then add rows to the data DataFrame
-        max_null_index = max(
-            [max(i) for i in getattr(self._profile[0], "null_types_index").values()],
-            default=0,
-        )
-        if max_null_index > data.index.max():
-            data.loc[max_null_index] = {}
-
-        # Fill in missing rows with NaN and convert types to numeric
-        data = data.reindex(range(data.index.max() + 1), fill_value=np.nan).apply(
-            pd.to_numeric, errors="coerce"
-        )
+        data = pd.DataFrame(clean_samples).apply(pd.to_numeric, errors="coerce")
 
         get_data_type = lambda profile: profile.profiles[  # NOQA: E731
             "data_type_profile"
@@ -2300,7 +2287,11 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None:
             # Partition data based on whether target column value is null or not
             # Calculate sum, mean of each partition without including current column
             # in calculation
-            sum_null = data.iloc[null_indices, data.columns != col_id].sum().to_numpy()
+            sum_null = (
+                data.loc[data.index.intersection(null_indices), data.columns != col_id]
+                .sum()
+                .to_numpy()
+            )
 
             # Add old sum_null if exists
             if col_id in self._null_replication_metrics: