Skip to content

Commit 3fe9307

Browse files
implement bq are_unique (#115)
1 parent 398cb62 commit 3fe9307

File tree

2 files changed

+64
-0
lines changed

2 files changed

+64
-0
lines changed

cuallee/bigquery_validation.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,18 @@ def is_unique(self, rule: Rule):
6060
)
6161
return self.compute_instruction
6262

63+
def are_unique(self, rule: Rule):
    """Build the SQL instruction checking uniqueness over a group of columns.

    The entries of ``rule.column`` are concatenated with a literal ``'_'``
    between them and the number of distinct concatenated values is counted.
    The resulting instruction is stored on ``self.compute_instruction`` and
    also returned.
    """
    # NOTE(review): per the BigQuery docs CONCAT returns NULL when any
    # argument is NULL, so such rows would not be counted; the '_' separator
    # can also make different value pairs collide -- confirm both are intended.
    columns_sql = ", '_', ".join(f"{name}" for name in rule.column)
    expression = "COUNT(DISTINCT CONCAT(" + columns_sql + "))"

    # No predicate is supplied for this rule; only the SQL expression is used.
    instruction = ComputeInstruction(None, expression, ComputeMethod.SQL)
    self.compute_instruction = instruction
    return self.compute_instruction
74+
6375

6476
def _get_expressions(compute_set: Dict[str, ComputeInstruction]) -> str:
6577
"""Get the expression for all the rules in check in one string"""

test/unit/bigquery/test_are_unique.py

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
import pytest
2+
3+
from google.cloud import bigquery
4+
5+
from cuallee import Check, CheckLevel
6+
7+
8+
def test_positive():
    """Expect PASS with zero violations for the (unique_key, taxi_id) pair."""
    table = bigquery.dataset.Table(
        "bigquery-public-data.chicago_taxi_trips.taxi_trips"
    )
    check = Check(CheckLevel.WARNING, "pytest")
    check.are_unique(("unique_key", "taxi_id"))

    result = check.validate(table)

    # Results are read at index label 1, as in the other tests of this module.
    assert result.status.str.match("PASS")[1]
    assert result.violations[1] == 0
    assert result.pass_rate[1] == 1.0
16+
17+
18+
def test_negative():
    """Expect FAIL for the (taxi_id, pickup_community_area) pair."""
    # Presumably the table's row count and the number of distinct pairs at
    # the time the test was written -- verify against the public dataset.
    total = 208943621
    distinct = 411496

    table = bigquery.dataset.Table(
        "bigquery-public-data.chicago_taxi_trips.taxi_trips"
    )
    check = Check(CheckLevel.WARNING, "pytest")
    check.are_unique(("taxi_id", "pickup_community_area"))

    result = check.validate(table)

    assert result.status.str.match("FAIL")[1]
    # total - distinct == 208532125
    assert result.violations[1] == total - distinct
    assert result.pass_threshold[1] == 1.0
    assert result.pass_rate[1] == distinct / total
27+
28+
29+
@pytest.mark.parametrize(
    "rule_column",
    [("unique_key", "taxi_id"), ["unique_key", "taxi_id"]],
    ids=("tuple", "list"),
)
def test_parameters(rule_column):
    """The column group may be given as a tuple or as a list."""
    table = bigquery.dataset.Table(
        "bigquery-public-data.chicago_taxi_trips.taxi_trips"
    )
    check = Check(CheckLevel.WARNING, "pytest")
    check.are_unique(rule_column)

    result = check.validate(table)

    assert result.status.str.match("PASS")[1]
    assert result.violations[1] == 0
    assert result.pass_rate[1] == 1.0
42+
43+
44+
def test_coverage():
    """Expect PASS for a non-unique pair once the threshold is 0.001."""
    # Presumably the table's row count and the number of distinct pairs at
    # the time the test was written -- verify against the public dataset.
    total = 208943621
    distinct = 411496
    threshold = 0.001

    table = bigquery.dataset.Table(
        "bigquery-public-data.chicago_taxi_trips.taxi_trips"
    )
    check = Check(CheckLevel.WARNING, "pytest")
    check.are_unique(("taxi_id", "pickup_community_area"), threshold)

    result = check.validate(table)

    assert result.status.str.match("PASS")[1]
    # total - distinct == 208532125
    assert result.violations[1] == total - distinct
    assert result.pass_threshold[1] == threshold
    assert result.pass_rate[1] == distinct / total

0 commit comments

Comments
 (0)