Fix group by logic in grouped recency and stdev tests #205

Merged: 6 commits, Oct 12, 2022
31 changes: 24 additions & 7 deletions integration_tests/models/schema_tests/schema.yml
@@ -98,6 +98,9 @@ models:
tests:
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
sigma_threshold: 6
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
group_by: ["date_day"]
sigma_threshold: 6
- dbt_expectations.expect_column_values_to_be_within_n_moving_stdevs:
date_column_name: date_day
sigma_threshold: 6
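For context on the grouped variant added above: when `group_by` is set, the test first sums the column per group and then z-scores each group total against the mean and standard deviation across all groups. A rough sketch of the check, assuming a `timeseries_data` table with `date_day` and `row_value` columns (not the macro's exact compiled SQL):

```sql
with metric_values as (
    select
        date_day,
        sum(row_value) as metric_value
    from timeseries_data
    group by date_day
),
metric_stats as (
    select
        *,
        avg(metric_value) over () as metric_avg,
        stddev(metric_value) over () as metric_stddev
    from metric_values
)
-- any rows returned are failures: group totals more than 6 sigma from the mean
select *
from metric_stats
where abs((metric_value - metric_avg) / nullif(metric_stddev, 0)) > 6
```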
@@ -395,25 +398,27 @@ models:
strictly: True
- dbt_expectations.expect_grouped_row_values_to_have_recent_data:
group_by: [group_id]
timestamp_column: date_day
timestamp_column: date_timestamp
datepart: day
interval: 1
- dbt_expectations.expect_grouped_row_values_to_have_recent_data:
group_by: [group_id, row_value]
timestamp_column: date_day
group_by: [group_id]
timestamp_column: date_timestamp
datepart: day
interval: 1
row_condition: group_id = 4
- dbt_expectations.expect_grouped_row_values_to_have_recent_data:
group_by: [group_id]
timestamp_column: date_day
timestamp_column: date_timestamp
datepart: day
interval: 1
row_condition: group_id = 4
- dbt_expectations.expect_grouped_row_values_to_have_recent_data:
group_by: [group_id]
group_by: [date_day]
timestamp_column: date_timestamp
datepart: day
interval: 1
config:
# this should fail, so we flip the fail condition
fail_calc: 'cast((count(*)=0) as int)'
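A note on this pattern, used for the intentionally failing cases in this file: dbt fails a test when its `fail_calc` expression evaluates to a nonzero value over the test's result set, and the default `fail_calc` is `count(*)`. Flipping it as above makes the test fail exactly when the expectation returns no rows, i.e. when a case that should fail unexpectedly passes:

```sql
-- default fail_calc: any returned row counts as a failure
select count(*) as failures from test_results;

-- flipped fail_calc: 1 (failure) only when the test query returned no rows
select cast((count(*) = 0) as int) as failures from test_results;
```

(`test_results` is a hypothetical stand-in for the test's compiled query.)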

columns:
- name: row_value
@@ -424,6 +429,18 @@
sigma_threshold: 6
take_logs: true
severity: warn
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
group_by: [group_id]
sigma_threshold: 6
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
sigma_threshold: 6
- dbt_expectations.expect_column_values_to_be_within_n_stdevs:
group_by: [group_id]
sigma_threshold: 1
config:
# this should fail, so we flip the fail condition
fail_calc: 'cast((count(*)=0) as int)'


- name: window_function_test
columns:
3 changes: 2 additions & 1 deletion integration_tests/models/schema_tests/timeseries_data.sql
@@ -9,7 +9,8 @@ add_row_values as (
date_day,
cast(date_day as {{ dbt_expectations.type_datetime() }}) as date_datetime,
cast(date_day as {{ type_timestamp() }}) as date_timestamp,
cast(abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value
cast(100 * abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value

from
dates

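The fixture change multiplies the random value by 100 so `row_value` spans a usable range; values confined to [0, 1) have a standard deviation too small for the stdev tests to exercise meaningfully. Assuming `dbt_expectations.rand()` compiles to the warehouse's uniform [0, 1) random function (e.g. `random()` on Postgres), the templated expression renders roughly as:

```sql
-- sketch of the compiled expression on Postgres
select cast(100 * abs(random()) as float) as row_value;  -- uniform over [0, 100)
```

The same scaling is applied to the other test fixtures below.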
@@ -8,7 +8,8 @@ add_row_values as (

select
cast(dates.date_day as {{ dbt_expectations.type_datetime() }}) as date_day,
cast(abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value
cast(100 * abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value

from
dates
cross join row_values
@@ -10,14 +10,14 @@ row_values as (
add_row_values as (

select
cast(d.date_day as {{ dbt_expectations.type_datetime() }}) as date_day,
cast(d.date_day as {{ dbt_expectations.type_timestamp() }}) as date_timestamp,
cast(g.generated_number as {{ type_int() }}) as group_id,
cast(floor(100 * r.generated_number) as {{ type_int() }}) as row_value
cast(dates.date_day as {{ dbt_expectations.type_datetime() }}) as date_day,
cast(dates.date_day as {{ dbt_expectations.type_timestamp() }}) as date_timestamp,
cast(groupings.generated_number as {{ type_int() }}) as group_id,
cast(100 * abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value
from
dates d
cross join groupings g
cross join row_values r
dates
cross join groupings
cross join row_values

),
add_logs as (
@@ -10,7 +10,8 @@ add_row_values as (

select
cast(dates.date_hour as {{ dbt_expectations.type_datetime() }}) as date_hour,
cast(abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value
cast(100 * abs({{ dbt_expectations.rand() }}) as {{ type_float() }}) as row_value

from
dates
cross join row_values
@@ -126,7 +126,8 @@ metric_sigma as (
select
*,
(metric_test_value - metric_test_rolling_average) as metric_test_delta,
(metric_test_value - metric_test_rolling_average)/nullif(metric_test_rolling_stddev, 0) as metric_test_sigma
(metric_test_value - metric_test_rolling_average)/
nullif(metric_test_rolling_stddev, 0) as metric_test_sigma
from
metric_moving_calcs

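Routing the divisor through `nullif` guards the sigma calculation against a zero rolling standard deviation (e.g. a window of identical values): `nullif(x, 0)` returns NULL when `x` is 0, so the division yields NULL instead of raising a divide-by-zero error. A minimal illustration:

```sql
select
    10.0 / nullif(2, 0) as normal_sigma,  -- 5.0
    10.0 / nullif(0, 0) as safe_sigma;    -- NULL rather than an error
```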
@@ -3,29 +3,28 @@
group_by=None,
sigma_threshold=3
) -%}
{{ adapter.dispatch('test_expect_column_values_to_be_within_n_stdevs', 'dbt_expectations') (model, column_name, group_by, sigma_threshold) }}
{{
adapter.dispatch('test_expect_column_values_to_be_within_n_stdevs', 'dbt_expectations') (
model, column_name, group_by, sigma_threshold
)
}}
{%- endtest %}

{% macro default__test_expect_column_values_to_be_within_n_stdevs(model,
column_name,
group_by,
sigma_threshold
) %}

with metric_values as (

{% if group_by -%}
select
{{ group_by }} as metric_date,
{{ group_by | join(",") ~ "," if group_by }}
sum({{ column_name }}) as {{ column_name }}
from
{{ model }}
group by
1
{%- else -%}
select
{{ column_name }} as {{ column_name }}
from
{{ model }}
{% if group_by -%}
{{ dbt_utils.group_by(group_by | length) }}
{%- endif %}

),
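The rewrite collapses the old if/else branches into a single query: `{{ group_by | join(",") ~ "," if group_by }}` emits the group columns plus a trailing comma only when `group_by` is provided, and `dbt_utils.group_by(n)` expands to `group by 1, 2, ..., n`. For `group_by=['group_id']`, the CTE body would compile to roughly the following (a sketch, with `my_model` standing in for `{{ model }}`):

```sql
select
    group_id,
    sum(row_value) as row_value
from my_model
group by 1
```

With `group_by=None`, both Jinja fragments render to nothing, leaving a single grand-total row (`select sum(row_value) as row_value from my_model`).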
@@ -43,7 +42,8 @@ metric_values_z_scores as (

select
*,
({{ column_name }} - {{ column_name }}_average)/{{ column_name }}_stddev as {{ column_name }}_sigma
({{ column_name }} - {{ column_name }}_average)/
nullif({{ column_name }}_stddev, 0) as {{ column_name }}_sigma
from
metric_values_with_statistics

@@ -23,9 +23,7 @@
with latest_grouped_timestamps as (

select
{%- for g in group_by %}
{{ g }},
{%- endfor %}
{{ group_by | join(",") ~ "," if group_by }}
max(1) as join_key,
max(cast({{ timestamp_column }} as {{ type_timestamp() }})) as latest_timestamp_column
from
@@ -37,16 +35,22 @@ with latest_grouped_timestamps as (
and {{ row_condition }}
{% endif %}

{{ dbt_utils.group_by(group_by | length )}}

{% if group_by -%}
{{ dbt_utils.group_by(group_by | length) }}
{%- endif %}
),
total_row_counts as (

select
{{ group_by | join(",") ~ "," if group_by }}
max(1) as join_key,
count(*) as row_count
from
latest_grouped_timestamps
{% if group_by -%}
{{ dbt_utils.group_by(group_by | length) }}
{%- endif %}


),
outdated_grouped_timestamps as (
@@ -72,7 +76,11 @@ validation_errors as (
total_row_counts r
left join
outdated_grouped_timestamps t
on r.join_key = t.join_key
on
{% for g in group_by %}
r.{{ g }} = t.{{ g }} and
{% endfor %}
r.join_key = t.join_key
where
-- fail if either no rows were returned due to row_condition,
-- or the recency test returned failed rows
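The key fix in this macro: the old join matched rows only on the constant `join_key`, which conflated row counts and staleness across groups; the new condition also equi-joins every `group_by` column. For `group_by: [group_id]`, the join compiles to roughly:

```sql
select r.*
from
    total_row_counts r
left join
    outdated_grouped_timestamps t
on
    r.group_id = t.group_id and
    r.join_key = t.join_key
```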