Skip to content

Commit

Permalink
Fix group by logic in grouped recency and stdev tests (#205)
Browse files Browse the repository at this point in the history
* Fix group by logic in grouped recency tests

* Remove table aliases

* Add a negative test

* Add a negative test

* Add divide by zero check
  • Loading branch information
clausherther authored Oct 12, 2022
1 parent 51e1a01 commit 3759b63
Show file tree
Hide file tree
Showing 8 changed files with 64 additions and 35 deletions.
31 changes: 24 additions & 7 deletions integration_tests/models/schema_tests/schema.yml
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ models:
tests:
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
sigma_threshold: 6
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
group_by: ["date_day"]
sigma_threshold: 6
- dbt_expectations.expect_column_values_to_be_within_n_moving_stdevs:
date_column_name: date_day
sigma_threshold: 6
Expand Down Expand Up @@ -395,25 +398,27 @@ models:
strictly: True
- dbt_expectations.expect_grouped_row_values_to_have_recent_data:
group_by: [group_id]
timestamp_column: date_day
timestamp_column: date_timestamp
datepart: day
interval: 1
- dbt_expectations.expect_grouped_row_values_to_have_recent_data:
group_by: [group_id, row_value]
timestamp_column: date_day
group_by: [group_id]
timestamp_column: date_timestamp
datepart: day
interval: 1
row_condition: group_id = 4
- dbt_expectations.expect_grouped_row_values_to_have_recent_data:
group_by: [group_id]
timestamp_column: date_day
timestamp_column: date_timestamp
datepart: day
interval: 1
row_condition: group_id = 4
- dbt_expectations.expect_grouped_row_values_to_have_recent_data:
group_by: [group_id]
group_by: [date_day]
timestamp_column: date_timestamp
datepart: day
interval: 1
config:
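# negative test: the underlying check is expected to return failing rows,
# so fail_calc is inverted and the test fails only when zero rows are returned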
fail_calc: 'cast((count(*)=0) as int)'

columns:
- name: row_value
Expand All @@ -424,6 +429,18 @@ models:
sigma_threshold: 6
take_logs: true
severity: warn
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
group_by: [group_id]
sigma_threshold: 6
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
sigma_threshold: 6
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
group_by: [group_id]
sigma_threshold: 1
config:
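# negative test: the underlying check is expected to return failing rows,
# so fail_calc is inverted and the test fails only when zero rows are returned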
fail_calc: 'cast((count(*)=0) as int)'


- name: window_function_test
columns:
Expand Down
3 changes: 2 additions & 1 deletion integration_tests/models/schema_tests/timeseries_data.sql
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ add_row_values as (
date_day,
cast(date_day as {{ dbt_expectations.type_datetime() }}) as date_datetime,
cast(date_day as {{ type_timestamp() }}) as date_timestamp,
cast(abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value
cast(100 * abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value

from
dates

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ add_row_values as (

select
cast(dates.date_day as {{ dbt_expectations.type_datetime() }}) as date_day,
cast(abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value
cast(100 * abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value

from
dates
cross join row_values
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ row_values as (
add_row_values as (

select
cast(d.date_day as {{ dbt_expectations.type_datetime() }}) as date_day,
cast(d.date_day as {{ dbt_expectations.type_timestamp() }}) as date_timestamp,
cast(g.generated_number as {{ type_int() }}) as group_id,
cast(floor(100 * r.generated_number) as {{ type_int() }}) as row_value
cast(dates.date_day as {{ dbt_expectations.type_datetime() }}) as date_day,
cast(dates.date_day as {{ dbt_expectations.type_timestamp() }}) as date_timestamp,
cast(groupings.generated_number as {{ type_int() }}) as group_id,
cast(100 * abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value
from
dates d
cross join groupings g
cross join row_values r
dates
cross join groupings
cross join row_values

),
add_logs as (
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ add_row_values as (

select
cast(dates.date_hour as {{ dbt_expectations.type_datetime() }}) as date_hour,
cast(abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value
cast(100 * abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value

from
dates
cross join row_values
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,8 @@ metric_sigma as (
select
*,
(metric_test_value - metric_test_rolling_average) as metric_test_delta,
(metric_test_value - metric_test_rolling_average)/nullif(metric_test_rolling_stddev, 0) as metric_test_sigma
(metric_test_value - metric_test_rolling_average)/
nullif(metric_test_rolling_stddev, 0) as metric_test_sigma
from
metric_moving_calcs

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,28 @@
group_by=None,
sigma_threshold=3
) -%}
{{ adapter.dispatch('test_expect_column_values_to_be_within_n_stdevs', 'dbt_expectations') (model, column_name, group_by, sigma_threshold) }}
{{
adapter.dispatch('test_expect_column_values_to_be_within_n_stdevs', 'dbt_expectations') (
model, column_name, group_by, sigma_threshold
)
}}
{%- endtest %}

{% macro default__test_expect_column_values_to_be_within_n_stdevs(model,
column_name,
group_by,
sigma_threshold
) %}

with metric_values as (

{% if group_by -%}
select
{{ group_by }} as metric_date,
{{ group_by | join(",") ~ "," if group_by }}
sum({{ column_name }}) as {{ column_name }}
from
{{ model }}
group by
1
{%- else -%}
select
{{ column_name }} as {{ column_name }}
from
{{ model }}
{% if group_by -%}
{{ dbt_utils.group_by(group_by | length) }}
{%- endif %}

),
Expand All @@ -43,7 +42,8 @@ metric_values_z_scores as (

select
*,
({{ column_name }} - {{ column_name }}_average)/{{ column_name }}_stddev as {{ column_name }}_sigma
({{ column_name }} - {{ column_name }}_average)/
nullif({{ column_name }}_stddev, 0) as {{ column_name }}_sigma
from
metric_values_with_statistics

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,7 @@
with latest_grouped_timestamps as (

select
{%- for g in group_by %}
{{ g }},
{%- endfor %}
{{ group_by | join(",") ~ "," if group_by }}
max(1) as join_key,
max(cast({{ timestamp_column }} as {{ type_timestamp() }})) as latest_timestamp_column
from
Expand All @@ -37,16 +35,22 @@ with latest_grouped_timestamps as (
and {{ row_condition }}
{% endif %}

{{ dbt_utils.group_by(group_by | length )}}

{% if group_by -%}
{{ dbt_utils.group_by(group_by | length) }}
{%- endif %}
),
total_row_counts as (

select
{{ group_by | join(",") ~ "," if group_by }}
max(1) as join_key,
count(*) as row_count
from
latest_grouped_timestamps
{% if group_by -%}
{{ dbt_utils.group_by(group_by | length) }}
{%- endif %}


),
outdated_grouped_timestamps as (
Expand All @@ -72,7 +76,11 @@ validation_errors as (
total_row_counts r
left join
outdated_grouped_timestamps t
on r.join_key = t.join_key
on
{% for g in group_by %}
r.{{ g }} = t.{{ g }} and
{% endfor %}
r.join_key = t.join_key
where
-- fail if either no rows were returned due to row_condition,
-- or the recency test returned failed rows
Expand Down

0 comments on commit 3759b63

Please sign in to comment.