Commit e36becd — Merge pull request #27 from getyourguide/improve-result-handling

feat: add suite_result and tagging

2 parents 509dcc3 + 6a08bd9

62 files changed (+2882 / −573 lines)


.github/workflows/main.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -33,7 +33,7 @@ jobs:
           uv run python scripts/sanity_checks.py
       - name: Run tests
         run: |
-          uv run pytest tests/ --cov=dataframe_expectations
+          uv run pytest tests/ -n auto --tb=line --cov=dataframe_expectations

   lint:
     runs-on: ubuntu-latest
```
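The new flags assume two pytest plugins: `-n auto` comes from pytest-xdist (one worker per available CPU core) and `--cov` from pytest-cov; `--tb=line` is built into pytest and compresses each failure traceback to a single line. A minimal sketch of the equivalent invocation from Python, assuming both plugins are installed:

```python
import pytest

# Runs the suite in parallel across all available cores with one-line
# tracebacks and coverage collection (requires pytest-xdist and pytest-cov).
exit_code = pytest.main(
    ["tests/", "-n", "auto", "--tb=line", "--cov=dataframe_expectations"]
)
raise SystemExit(exit_code)
```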

README.md

Lines changed: 38 additions & 0 deletions

@@ -185,6 +185,44 @@ Some examples of violations:

Added content:

**Tag-based filtering for selective execution:**

```python
from dataframe_expectations import DataFrameExpectationsSuite, TagMatchMode

# Tag expectations with priorities and environments
suite = (
    DataFrameExpectationsSuite()
    .expect_value_greater_than(column_name="age", value=18, tags=["priority:high", "env:prod"])
    .expect_value_not_null(column_name="name", tags=["priority:high"])
    .expect_min_rows(min_rows=1, tags=["priority:low", "env:test"])
)

# Run only high-priority checks (OR logic - matches ANY tag)
runner = suite.build(tags=["priority:high"], tag_match_mode=TagMatchMode.ANY)
runner.run(df)

# Run production-critical checks (AND logic - matches ALL tags)
runner = suite.build(tags=["priority:high", "env:prod"], tag_match_mode=TagMatchMode.ALL)
runner.run(df)
```

**Programmatic result inspection:**

```python
# Get detailed results without raising exceptions
result = runner.run(df, raise_on_failure=False)

# Inspect validation outcomes
print(f"Total: {result.total_expectations}, Passed: {result.total_passed}, Failed: {result.total_failed}")
print(f"Pass rate: {result.pass_rate:.2%}")
print(f"Duration: {result.total_duration_seconds:.2f}s")
print(f"Applied filters: {result.applied_filters}")

# Access individual results
for exp_result in result.results:
    if exp_result.status == "failed":
        print(f"Failed: {exp_result.description} - {exp_result.violation_count} violations")
```

Unchanged context that follows:

### How to contribute?
Contributions are welcome! You can enhance the library by adding new expectations, refining existing ones, or improving the testing framework.

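The README snippet passes `raise_on_failure=False`; for completeness, here is a minimal sketch of the default path, assuming (as the exported name suggests) that a failing run otherwise raises `DataFrameExpectationsSuiteFailure`:

```python
from dataframe_expectations import DataFrameExpectationsSuiteFailure

# Sketch: with raise_on_failure left at its default, a failing suite is
# assumed to raise DataFrameExpectationsSuiteFailure instead of returning.
try:
    runner.run(df)
except DataFrameExpectationsSuiteFailure as failure:
    print(f"Suite failed: {failure}")  # e.g. forward to alerting/logging
```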
dataframe_expectations/__init__.py

Lines changed: 21 additions & 1 deletion

```diff
@@ -9,4 +9,24 @@
 # Catch all exceptions to handle various edge cases in different environments
 __version__ = "0.0.0.dev0"

-__all__ = []
+from dataframe_expectations.core.suite_result import (
+    ExpectationResult,
+    SuiteExecutionResult,
+    serialize_violations,
+)
+from dataframe_expectations.core.types import TagMatchMode
+from dataframe_expectations.suite import (
+    DataFrameExpectationsSuite,
+    DataFrameExpectationsSuiteRunner,
+    DataFrameExpectationsSuiteFailure,
+)
+
+__all__ = [
+    "ExpectationResult",
+    "SuiteExecutionResult",
+    "serialize_violations",
+    "DataFrameExpectationsSuite",
+    "DataFrameExpectationsSuiteRunner",
+    "DataFrameExpectationsSuiteFailure",
+    "TagMatchMode",
+]
```
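With these re-exports, callers can import the whole public surface from the package root rather than from internal modules. A short sketch using only names exported above:

```python
# All public names resolve from the package root after this change.
from dataframe_expectations import (
    DataFrameExpectationsSuite,
    SuiteExecutionResult,
    TagMatchMode,
)

suite = DataFrameExpectationsSuite()  # build up expectations as usual
```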
dataframe_expectations/core/__init__.py

Lines changed: 13 additions & 1 deletion

```diff
@@ -1,3 +1,15 @@
 """Core base classes and interfaces for DataFrame expectations."""

-__all__ = []
+from dataframe_expectations.core.suite_result import (
+    ExpectationResult,
+    ExpectationStatus,
+    SuiteExecutionResult,
+    serialize_violations,
+)
+
+__all__ = [
+    "ExpectationResult",
+    "ExpectationStatus",
+    "SuiteExecutionResult",
+    "serialize_violations",
+]
```

dataframe_expectations/core/aggregation_expectation.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import List, Union
+from typing import List, Optional, Union

 from dataframe_expectations.core.types import DataFrameLike, DataFrameType
 from dataframe_expectations.core.expectation import DataFrameExpectation
@@ -20,6 +20,7 @@ def __init__(
         expectation_name: str,
         column_names: List[str],
         description: str,
+        tags: Optional[List[str]] = None,
     ):
         """
         Template for implementing DataFrame aggregation expectations, where data is first aggregated
@@ -28,7 +29,10 @@ def __init__(
         :param expectation_name: The name of the expectation. This will be used during logging.
         :param column_names: The list of column names to aggregate on.
         :param description: A description of the expectation used in logging.
+        :param tags: Optional tags as list of strings in "key:value" format.
+            Example: ["priority:high", "env:test"]
         """
+        super().__init__(tags=tags)
         self.expectation_name = expectation_name
         self.column_names = column_names
         self.description = description
```
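For illustration, a hypothetical subclass shows how the new parameter threads through: `tags` is simply forwarded to the base class, which stores it as a `TagSet`. The subclass name and arguments are invented, and the base class is assumed to be named `DataFrameAggregationExpectation` (the diff does not show the class name):

```python
from typing import List, Optional

from dataframe_expectations.core.aggregation_expectation import (
    DataFrameAggregationExpectation,
)

# Hypothetical subclass, for illustration only.
class MyGroupedCheck(DataFrameAggregationExpectation):
    def __init__(self, column_names: List[str], tags: Optional[List[str]] = None):
        super().__init__(
            expectation_name="MyGroupedCheck",
            column_names=column_names,
            description="Example aggregation expectation",
            tags=tags,  # forwarded up to DataFrameExpectation.__init__
        )
```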

dataframe_expectations/core/column_expectation.py

Lines changed: 5 additions & 1 deletion

```diff
@@ -1,4 +1,4 @@
-from typing import Callable
+from typing import Callable, List, Optional

 from dataframe_expectations.core.types import DataFrameLike, DataFrameType
 from dataframe_expectations.core.expectation import DataFrameExpectation
@@ -23,6 +23,7 @@ def __init__(
         fn_violations_pyspark: Callable,
         description: str,
         error_message: str,
+        tags: Optional[List[str]] = None,
     ):
         """
         Template for implementing DataFrame column expectations, where a column value is tested against a
@@ -34,7 +35,10 @@ def __init__(
         :param fn_violations_pyspark: Function to find violations in a PySpark DataFrame.
         :param description: A description of the expectation used in logging.
         :param error_message: The error message to return if the expectation fails.
+        :param tags: Optional tags as list of strings in "key:value" format.
+            Example: ["priority:high", "env:test"]
         """
+        super().__init__(tags=tags)
         self.column_name = column_name
         self.expectation_name = expectation_name
         self.fn_violations_pandas = fn_violations_pandas
```

dataframe_expectations/core/expectation.py

Lines changed: 34 additions & 17 deletions

```diff
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import cast
+from typing import List, Optional, cast

 from pandas import DataFrame as PandasDataFrame
 from pyspark.sql import DataFrame as PySparkDataFrame
@@ -12,6 +12,7 @@
     PySparkConnectDataFrame = None  # type: ignore[misc,assignment]

 from dataframe_expectations.core.types import DataFrameLike, DataFrameType
+from dataframe_expectations.core.tagging import TagSet
 from dataframe_expectations.result_message import (
     DataFrameExpectationResultMessage,
 )
@@ -22,6 +23,20 @@ class DataFrameExpectation(ABC):
     Base class for DataFrame expectations.
     """

+    def __init__(self, tags: Optional[List[str]] = None):
+        """
+        Initialize the base expectation with optional tags.
+        :param tags: Optional tags as list of strings in "key:value" format.
+            Example: ["priority:high", "env:test"]
+        """
+        self.__tags = TagSet(tags)
+
+    def get_tags(self) -> TagSet:
+        """
+        Returns the tags for this expectation.
+        """
+        return self.__tags
+
     def get_expectation_name(self) -> str:
         """
         Returns the class name as the expectation name.
@@ -48,29 +63,31 @@ def infer_data_frame_type(cls, data_frame: DataFrameLike) -> DataFrameType:
         """
         Infer the DataFrame type based on the provided DataFrame.
         """
-        if isinstance(data_frame, PandasDataFrame):
-            return DataFrameType.PANDAS
-        elif isinstance(data_frame, PySparkDataFrame):
-            return DataFrameType.PYSPARK
-        elif PySparkConnectDataFrame is not None and isinstance(
-            data_frame, PySparkConnectDataFrame
-        ):
-            return DataFrameType.PYSPARK
-        else:
-            raise ValueError(f"Unsupported DataFrame type: {type(data_frame)}")
+        match data_frame:
+            case PandasDataFrame():
+                return DataFrameType.PANDAS
+            case PySparkDataFrame():
+                return DataFrameType.PYSPARK
+            case _ if PySparkConnectDataFrame is not None and isinstance(
+                data_frame, PySparkConnectDataFrame
+            ):
+                return DataFrameType.PYSPARK
+            case _:
+                raise ValueError(f"Unsupported DataFrame type: {type(data_frame)}")

     def validate(self, data_frame: DataFrameLike, **kwargs):
         """
         Validate the DataFrame against the expectation.
         """
         data_frame_type = self.infer_data_frame_type(data_frame)

-        if data_frame_type == DataFrameType.PANDAS:
-            return self.validate_pandas(data_frame=data_frame, **kwargs)
-        elif data_frame_type == DataFrameType.PYSPARK:
-            return self.validate_pyspark(data_frame=data_frame, **kwargs)
-        else:
-            raise ValueError(f"Unsupported DataFrame type: {data_frame_type}")
+        match data_frame_type:
+            case DataFrameType.PANDAS:
+                return self.validate_pandas(data_frame=data_frame, **kwargs)
+            case DataFrameType.PYSPARK:
+                return self.validate_pyspark(data_frame=data_frame, **kwargs)
+            case _:
+                raise ValueError(f"Unsupported DataFrame type: {data_frame_type}")

     @abstractmethod
     def validate_pandas(
```
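This refactor swaps isinstance chains for structural pattern matching, which requires Python 3.10+. A class pattern such as `case PandasDataFrame():` matches any instance of that class, so behaviour is unchanged, and the guarded wildcard mirrors the optional Spark Connect branch. A standalone sketch of the idiom:

```python
import pandas as pd

def kind(obj: object) -> str:
    # `case pd.DataFrame():` succeeds for any pd.DataFrame instance,
    # exactly like isinstance(obj, pd.DataFrame).
    match obj:
        case pd.DataFrame():
            return "pandas"
        case _:
            raise ValueError(f"Unsupported type: {type(obj)}")

print(kind(pd.DataFrame({"a": [1]})))  # -> pandas
```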