Skip to content

Commit

Permalink
Bugfix: Update drop invalid logic to handle multi-index dfs (#1320)
Browse files Browse the repository at this point in the history
* Update drop invalid logic to handle multi-index cases

Signed-off-by: Baden Ashford <baden.ashford@gmail.com>

* Remove un-needed code

Signed-off-by: Baden Ashford <baden.ashford@gmail.com>

* Add note in docs about uniqueness

Signed-off-by: Baden Ashford <baden.ashford@gmail.com>

* Fix typo

Signed-off-by: Baden Ashford <baden.ashford@gmail.com>

---------

Signed-off-by: Baden Ashford <baden.ashford@gmail.com>
  • Loading branch information
kykyi authored Aug 18, 2023
1 parent eda9a2e commit d43a11b
Show file tree
Hide file tree
Showing 3 changed files with 149 additions and 3 deletions.
3 changes: 3 additions & 0 deletions docs/source/drop_invalid_rows.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@ remove the rows which cause the failure.
This functionality is available on ``DataFrameSchema``, ``SeriesSchema``, ``Column``,
as well as ``DataFrameModel`` schemas.

**Note** that this functionality identifies failing rows by their index (or multi-index) value.
If the index of the dataframe is not unique, every row that shares an index value with a failing row is dropped, including rows that are themselves valid.

Dropping invalid rows with :class:`~pandera.api.pandas.container.DataFrameSchema`:

.. testcode:: drop_invalid_rows_data_frame_schema
Expand Down
14 changes: 11 additions & 3 deletions pandera/backends/pandas/base.py
Original file line number Diff line number Diff line change
def drop_invalid_rows(self, check_obj, error_handler: SchemaErrorHandler):
    """Remove the rows of ``check_obj`` that the error handler recorded as failing.

    Rows are identified by index label: for every collected error, each row
    whose index value appears in the ``"index"`` column of that error's
    ``failure_cases`` is dropped. When the index is not unique, all rows that
    share a failing label are removed.

    :param check_obj: pandas object to filter; it is accessed through
        ``.index`` and ``.loc``.
    :param error_handler: handler whose ``collected_errors`` each expose a
        ``failure_cases`` table with an ``"index"`` column.
    :returns: ``check_obj`` without the rows flagged by any collected error.
    :raises ValueError: if a MultiIndex label is not the string form of a
        tuple of Python literals.
    :raises SyntaxError: if a MultiIndex label is not parseable Python.
    """
    import ast  # local import keeps this method self-contained

    for err in error_handler.collected_errors:
        index_values = err.failure_cases["index"]
        if isinstance(check_obj.index, pd.MultiIndex):
            # MultiIndex labels are saved on the error as strings such as
            # "('a', 1)" and have to be parsed back into tuples. Null
            # entries cannot be parsed and do not identify any row.
            serialized = index_values.dropna()
            if serialized.empty:
                # MultiIndex.from_tuples cannot infer levels from no tuples.
                continue
            # ast.literal_eval instead of eval: only literals are accepted,
            # so the text of a label can never be executed as code.
            index_values = pd.MultiIndex.from_tuples(
                [ast.literal_eval(label) for label in serialized]
            )

        check_obj = check_obj.loc[~check_obj.index.isin(index_values)]

    return check_obj
135 changes: 135 additions & 0 deletions tests/core/test_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -2330,6 +2330,141 @@ class Config:
MySchema.validate(actual_obj, lazy=False)


def _frame_with_multi_index(columns, index_tuples, level_names):
    """Build a DataFrame from ``columns`` indexed by a named MultiIndex."""
    return pd.DataFrame(
        columns,
        index=pd.MultiIndex.from_tuples(index_tuples, names=level_names),
    )


@pytest.mark.parametrize(
    "schema, obj, expected_obj",
    [
        # Case 1: (str, str) index levels; one row has a null occupation.
        (
            DataFrameSchema(
                columns={
                    "name": Column(str),
                    "occupation": Column(str, nullable=False),
                },
                index=MultiIndex(
                    [
                        Index(str, name="state"),
                        Index(str, name="city"),
                    ]
                ),
                drop_invalid_rows=True,
            ),
            _frame_with_multi_index(
                {
                    "name": ["Frodo", "Boromir"],
                    "occupation": ["Ring bearer", None],
                },
                (("MiddleEarth", "TheShire"), ("MiddleEarth", "Gondor")),
                ["state", "city"],
            ),
            _frame_with_multi_index(
                {"name": ["Frodo"], "occupation": ["Ring bearer"]},
                (("MiddleEarth", "TheShire"),),
                ["state", "city"],
            ),
        ),
        # Case 2: (str, int) index levels; nulls occur in two different
        # columns on two different rows.
        (
            DataFrameSchema(
                columns={
                    "path_description": Column(str, nullable=False),
                    "days_to_travel": Column(float, nullable=False),
                },
                index=MultiIndex(
                    [
                        Index(str, name="character_name"),
                        Index(int, name="path_id"),
                    ]
                ),
                drop_invalid_rows=True,
            ),
            _frame_with_multi_index(
                {
                    "path_description": [
                        "To Rivendell",
                        "To Mordor",
                        "To Gondor",
                        None,
                    ],
                    "days_to_travel": [30.0, 60.5, None, 15.9],
                },
                (("Frodo", 1), ("Sam", 2), ("Boromir", 3), ("Legolas", 4)),
                ["character_name", "path_id"],
            ),
            _frame_with_multi_index(
                {
                    "path_description": [
                        "To Rivendell",
                        "To Mordor",
                    ],
                    "days_to_travel": [30.0, 60.5],
                },
                (("Frodo", 1), ("Sam", 2)),
                ["character_name", "path_id"],
            ),
        ),
        # Case 3: (int, float) index levels; the first level repeats, so
        # only the full tuple pins down the failing row.
        (
            DataFrameSchema(
                columns={
                    "battle_name": Column(str, nullable=False),
                    "victor": Column(str, nullable=False),
                },
                index=MultiIndex(
                    [
                        Index(int, name="year"),
                        Index(float, name="coordinates"),
                    ]
                ),
                drop_invalid_rows=True,
            ),
            _frame_with_multi_index(
                {
                    "battle_name": [
                        "Battle of Helm's Deep",
                        "Battle of the Black Gate",
                        "Siege of Gondor",
                        "Skirmish at Weathertop",
                    ],
                    "victor": [
                        "Rohan & Allies",
                        "Free Peoples",
                        None,
                        "Nazgûl",
                    ],
                },
                ((3019, 42.5), (3019, 42.6), (3019, 42.7), (3018, 42.8)),
                ["year", "coordinates"],
            ),
            _frame_with_multi_index(
                {
                    "battle_name": [
                        "Battle of Helm's Deep",
                        "Battle of the Black Gate",
                        "Skirmish at Weathertop",
                    ],
                    "victor": ["Rohan & Allies", "Free Peoples", "Nazgûl"],
                },
                ((3019, 42.5), (3019, 42.6), (3018, 42.8)),
                ["year", "coordinates"],
            ),
        ),
    ],
)
def test_drop_invalid_for_multi_index(schema, obj, expected_obj):
    """Lazy validation with drop_invalid_rows removes only the failing multi-index rows."""
    validated = schema.validate(obj, lazy=True)

    pd.testing.assert_frame_equal(validated, expected_obj)


def test_get_schema_metadata():
"""Test fetching schema metadata."""

Expand Down

0 comments on commit d43a11b

Please sign in to comment.