Bugfix: Update drop invalid logic to handle multi-index dfs (#1320)

* Update drop invalid logic to handle multi-index cases

  Signed-off-by: Baden Ashford <baden.ashford@gmail.com>

* Remove un-needed code

  Signed-off-by: Baden Ashford <baden.ashford@gmail.com>

* Add note in docs about uniqueness

  Signed-off-by: Baden Ashford <baden.ashford@gmail.com>

* Fix typo

  Signed-off-by: Baden Ashford <baden.ashford@gmail.com>

---------

Signed-off-by: Baden Ashford <baden.ashford@gmail.com>
unionai-oss · Aug 18, 2023 · d43a11b · d43a11b
1 parent eda9a2e
commit d43a11b
Show file tree

Hide file tree

Showing 3 changed files with 149 additions and 3 deletions.
diff --git a/docs/source/drop_invalid_rows.rst b/docs/source/drop_invalid_rows.rst
@@ -18,6 +18,9 @@ remove the rows which causes the failure.
 This functionality is available on ``DataFrameSchema``, ``SeriesSchema``, ``Column``,
 as well as ``DataFrameModel`` schemas.
 
+**Note** that this functionality works by identifying the index or multi-index of the failing rows.
+If the index is not unique on the dataframe, valid rows that share an index value with a failing row will also be dropped.
+
 Dropping invalid rows with :class:`~pandera.api.pandas.container.DataFrameSchema`:
 
 .. testcode:: drop_invalid_rows_data_frame_schema

diff --git a/pandera/backends/pandas/base.py b/pandera/backends/pandas/base.py
@@ -155,7 +155,15 @@ def drop_invalid_rows(self, check_obj, error_handler: SchemaErrorHandler):
         """Remove invalid elements in a check obj according to failures in caught by the error handler."""
         errors = error_handler.collected_errors
         for err in errors:
-            check_obj = check_obj.loc[
-                ~check_obj.index.isin(err.failure_cases["index"])
-            ]
+            index_values = err.failure_cases["index"]
+            if isinstance(check_obj.index, pd.MultiIndex):
+                # MultiIndex values are saved on the error as stringified tuples, so they need to be parsed
+                # back to their original types. NOTE(review): eval runs arbitrary expressions; consider ast.literal_eval.
+                index_tuples = err.failure_cases["index"].apply(eval)
+                index_values = pd.MultiIndex.from_tuples(index_tuples)
+
+            mask = ~check_obj.index.isin(index_values)
+
+            check_obj = check_obj.loc[mask]
+
         return check_obj
diff --git a/tests/core/test_schemas.py b/tests/core/test_schemas.py
@@ -2330,6 +2330,141 @@ class Config:
         MySchema.validate(actual_obj, lazy=False)
 
 
+@pytest.mark.parametrize(
+    "schema, obj, expected_obj",
+    [
+        (
+            DataFrameSchema(
+                columns={
+                    "name": Column(str),
+                    "occupation": Column(str, nullable=False),
+                },
+                index=MultiIndex(
+                    [
+                        Index(str, name="state"),
+                        Index(str, name="city"),
+                    ]
+                ),
+                drop_invalid_rows=True,
+            ),
+            pd.DataFrame(
+                {
+                    "name": ["Frodo", "Boromir"],
+                    "occupation": ["Ring bearer", None],
+                },
+                index=pd.MultiIndex.from_tuples(
+                    (("MiddleEarth", "TheShire"), ("MiddleEarth", "Gondor")),
+                    names=["state", "city"],
+                ),
+            ),
+            pd.DataFrame(
+                {"name": ["Frodo"], "occupation": ["Ring bearer"]},
+                index=pd.MultiIndex.from_tuples(
+                    (("MiddleEarth", "TheShire"),), names=["state", "city"]
+                ),
+            ),
+        ),
+        (
+            DataFrameSchema(
+                columns={
+                    "path_description": Column(str, nullable=False),
+                    "days_to_travel": Column(float, nullable=False),
+                },
+                index=MultiIndex(
+                    [
+                        Index(str, name="character_name"),
+                        Index(int, name="path_id"),
+                    ]
+                ),
+                drop_invalid_rows=True,
+            ),
+            pd.DataFrame(
+                {
+                    "path_description": [
+                        "To Rivendell",
+                        "To Mordor",
+                        "To Gondor",
+                        None,
+                    ],
+                    "days_to_travel": [30.0, 60.5, None, 15.9],
+                },
+                index=pd.MultiIndex.from_tuples(
+                    (("Frodo", 1), ("Sam", 2), ("Boromir", 3), ("Legolas", 4)),
+                    names=["character_name", "path_id"],
+                ),
+            ),
+            pd.DataFrame(
+                {
+                    "path_description": [
+                        "To Rivendell",
+                        "To Mordor",
+                    ],
+                    "days_to_travel": [30.0, 60.5],
+                },
+                index=pd.MultiIndex.from_tuples(
+                    (("Frodo", 1), ("Sam", 2)),
+                    names=["character_name", "path_id"],
+                ),
+            ),
+        ),
+        (
+            DataFrameSchema(
+                columns={
+                    "battle_name": Column(str, nullable=False),
+                    "victor": Column(str, nullable=False),
+                },
+                index=MultiIndex(
+                    [
+                        Index(int, name="year"),
+                        Index(float, name="coordinates"),
+                    ]
+                ),
+                drop_invalid_rows=True,
+            ),
+            pd.DataFrame(
+                {
+                    "battle_name": [
+                        "Battle of Helm's Deep",
+                        "Battle of the Black Gate",
+                        "Siege of Gondor",
+                        "Skirmish at Weathertop",
+                    ],
+                    "victor": [
+                        "Rohan & Allies",
+                        "Free Peoples",
+                        None,
+                        "Nazgûl",
+                    ],
+                },
+                index=pd.MultiIndex.from_tuples(
+                    ((3019, 42.5), (3019, 42.6), (3019, 42.7), (3018, 42.8)),
+                    names=["year", "coordinates"],
+                ),
+            ),
+            pd.DataFrame(
+                {
+                    "battle_name": [
+                        "Battle of Helm's Deep",
+                        "Battle of the Black Gate",
+                        "Skirmish at Weathertop",
+                    ],
+                    "victor": ["Rohan & Allies", "Free Peoples", "Nazgûl"],
+                },
+                index=pd.MultiIndex.from_tuples(
+                    ((3019, 42.5), (3019, 42.6), (3018, 42.8)),
+                    names=["year", "coordinates"],
+                ),
+            ),
+        ),
+    ],
+)
+def test_drop_invalid_for_multi_index(schema, obj, expected_obj):
+    """Test drop_invalid_rows works as expected on multi-index dataframes"""
+    actual_obj = schema.validate(obj, lazy=True)
+
+    pd.testing.assert_frame_equal(actual_obj, expected_obj)
+
+
 def test_get_schema_metadata():
     """Test fetching schema metadata."""