Skip to content

Commit

Permalink
Migrated DIM checks over to ETL checks for telemetry_derived.active_u…
Browse files Browse the repository at this point in the history
…sers_aggregates_v1
  • Loading branch information
kik-kik committed Dec 1, 2023
1 parent 9409d2b commit 27d6a6c
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#fail
{{ previous_row_count_avg_within_expected_delta(days=7, expected_percentage_delta=5) }}
66 changes: 66 additions & 0 deletions tests/checks/previous_row_count_avg_within_expected_delta.jinja
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{% macro previous_row_count_avg_within_expected_delta(days, expected_percentage_delta, partition_field) %}
{% set partition_field = partition_field|default('submission_date') %}

WITH CTE AS (
SELECT
ARRAY_AGG(
STRUCT(
PARSE_DATE('%Y%m%d', partition_id) AS table_partition,
total_rows
)
) AS included_partition_info
FROM
`{{ project_id }}.{{ dataset_id }}.INFORMATION_SCHEMA.PARTITIONS`
WHERE
table_name = "{{ table_name }}"
AND partition_id <> "__NULL__"
AND PARSE_DATE('%Y%m%d', partition_id)
BETWEEN DATE_SUB(@{{ partition_field }}, INTERVAL {{ days }} + 1 DAY) AND DATE(@{{ partition_field }})
)

, DATE_RANGE_ROW_COUNT_AVG AS (
SELECT
IF(
date_range_avg_row_count IS NULL,
ERROR("looks like no data was found for past partitions. Make sure you're using a table and not a view."),
CAST(date_range_avg_row_count AS INTEGER)
) AS date_range_avg_row_count
FROM (
SELECT
AVG(partition_info.total_rows) AS date_range_avg_row_count,
FROM CTE,
UNNEST (included_partition_info) AS partition_info
WHERE partition_info.table_partition <> DATE(@{{ partition_field }})
)
)

, ROW_COUNT_BOUNDARIES AS (
SELECT
CAST(date_range_avg_row_count - (date_range_avg_row_count / 100 * {{ expected_percentage_delta }}) AS INTEGER) AS min_expected,
CAST(date_range_avg_row_count + (date_range_avg_row_count / 100 * {{ expected_percentage_delta }}) AS INTEGER) AS max_expected,
FROM DATE_RANGE_ROW_COUNT_AVG
)

, CURRENT_PARTITION_ROW_COUNT AS (
SELECT
CAST(partition_info.total_rows AS INTEGER) AS total_rows
FROM CTE,
UNNEST (included_partition_info) AS partition_info
WHERE partition_info.table_partition = DATE(@{{ partition_field }})
)

SELECT
IF(
(SELECT total_rows FROM CURRENT_PARTITION_ROW_COUNT) NOT BETWEEN (SELECT min_expected FROM ROW_COUNT_BOUNDARIES) AND (SELECT max_expected FROM ROW_COUNT_BOUNDARIES),
ERROR(
CONCAT(
"The row count for partition ", @{{ partition_field }}, " is outside of the expected boundaries. ",
"Row count for the current partition: ",
(SELECT total_rows FROM CURRENT_PARTITION_ROW_COUNT),
". Expected boundary: ",
(SELECT min_expected FROM ROW_COUNT_BOUNDARIES), " - ", (SELECT max_expected FROM ROW_COUNT_BOUNDARIES)
)
),
null
);
{% endmacro %}

1 comment on commit 27d6a6c

@dataops-ci-bot
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Integration report for "Migrated DIM checks over to ETL checks for telemetry_derived.active_users_aggregates_v1"

sql.diff

Click to expand!
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/dags/bqetl_analytics_aggregations.py /tmp/workspace/generated-sql/dags/bqetl_analytics_aggregations.py
--- /tmp/workspace/main-generated-sql/dags/bqetl_analytics_aggregations.py	2023-12-01 16:34:49.000000000 +0000
+++ /tmp/workspace/generated-sql/dags/bqetl_analytics_aggregations.py	2023-12-01 16:33:11.000000000 +0000
@@ -106,6 +106,23 @@
         depends_on_past=False,
     )
 
+    checks__fail_telemetry_derived__active_users_aggregates__v1 = bigquery_dq_check(
+        task_id="checks__fail_telemetry_derived__active_users_aggregates__v1",
+        source_table="active_users_aggregates_v1",
+        dataset_id="telemetry_derived",
+        project_id="moz-fx-data-shared-prod",
+        is_dq_check_fail=True,
+        owner="lvargas@mozilla.com",
+        email=[
+            "gkaberere@mozilla.com",
+            "lvargas@mozilla.com",
+            "telemetry-alerts@mozilla.com",
+        ],
+        depends_on_past=False,
+        parameters=["submission_date:DATE:{{ds}}"],
+        retries=0,
+    )
+
     fenix_active_users_aggregates = bigquery_etl_query(
         task_id="fenix_active_users_aggregates",
         destination_table="active_users_aggregates_v2",
@@ -269,6 +286,10 @@
         wait_for_telemetry_derived__unified_metrics__v1
     )
 
+    checks__fail_telemetry_derived__active_users_aggregates__v1.set_upstream(
+        active_users_aggregates_v1
+    )
+
     fenix_active_users_aggregates.set_upstream(
         wait_for_checks__fail_fenix_derived__firefox_android_clients__v1
     )
Only in /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/active_users_aggregates_v1: checks.sql
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/fenix/startup_timeline/metadata.yaml /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/fenix/startup_timeline/metadata.yaml
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/fenix/startup_timeline/metadata.yaml	2023-12-01 16:33:54.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/fenix/startup_timeline/metadata.yaml	2023-12-01 16:30:56.000000000 +0000
@@ -2,7 +2,7 @@
 description: |-
   This a view that UNIONs the stable ping tables
   across all channels of the Glean application "Firefox for Android"
-  (org_mozilla_firefox.startup_timeline, org_mozilla_fenix.startup_timeline, org_mozilla_fenix_nightly.startup_timeline, org_mozilla_fennec_aurora.startup_timeline).
+  (org_mozilla_firefox.startup_timeline, org_mozilla_firefox_beta.startup_timeline, org_mozilla_fenix.startup_timeline, org_mozilla_fenix_nightly.startup_timeline, org_mozilla_fennec_aurora.startup_timeline).
 
   It is used by Looker.
 owners: []
@@ -18,4 +18,5 @@
   - moz-fx-data-shared-prod.org_mozilla_fenix_nightly.startup_timeline
   - moz-fx-data-shared-prod.org_mozilla_fennec_aurora.startup_timeline
   - moz-fx-data-shared-prod.org_mozilla_firefox.startup_timeline
+  - moz-fx-data-shared-prod.org_mozilla_firefox_beta.startup_timeline
 deprecated: false
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/fenix/startup_timeline/view.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/fenix/startup_timeline/view.sql
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/fenix/startup_timeline/view.sql	2023-12-01 16:33:54.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/fenix/startup_timeline/view.sql	2023-12-01 16:29:04.000000000 +0000
@@ -25,6 +25,28 @@
   `moz-fx-data-shared-prod.org_mozilla_firefox.startup_timeline`
 UNION ALL
 SELECT
+  "org_mozilla_firefox_beta" AS normalized_app_id,
+  mozfun.norm.fenix_app_info(
+    "org_mozilla_firefox_beta",
+    client_info.app_build
+  ).channel AS normalized_channel,
+  additional_properties,
+  client_info,
+  document_id,
+  events,
+  metadata,
+  metrics,
+  normalized_app_name,
+  normalized_country_code,
+  normalized_os,
+  normalized_os_version,
+  ping_info,
+  sample_id,
+  submission_timestamp
+FROM
+  `moz-fx-data-shared-prod.org_mozilla_firefox_beta.startup_timeline`
+UNION ALL
+SELECT
   "org_mozilla_fenix" AS normalized_app_id,
   mozfun.norm.fenix_app_info(
     "org_mozilla_fenix",
diff -bur --no-dereference --new-file /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/active_users_aggregates_v1/checks.sql /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/active_users_aggregates_v1/checks.sql
--- /tmp/workspace/main-generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/active_users_aggregates_v1/checks.sql	1970-01-01 00:00:00.000000000 +0000
+++ /tmp/workspace/generated-sql/sql/moz-fx-data-shared-prod/telemetry_derived/active_users_aggregates_v1/checks.sql	2023-12-01 16:29:23.000000000 +0000
@@ -0,0 +1,76 @@
+
+#fail
+WITH CTE AS (
+  SELECT
+    ARRAY_AGG(
+      STRUCT(PARSE_DATE('%Y%m%d', partition_id) AS table_partition, total_rows)
+    ) AS included_partition_info
+  FROM
+    `moz-fx-data-shared-prod.telemetry_derived.INFORMATION_SCHEMA.PARTITIONS`
+  WHERE
+    table_name = "active_users_aggregates_v1"
+    AND partition_id <> "__NULL__"
+    AND PARSE_DATE('%Y%m%d', partition_id)
+    BETWEEN DATE_SUB(@submission_date, INTERVAL 7 + 1 DAY)
+    AND DATE(@submission_date)
+),
+DATE_RANGE_ROW_COUNT_AVG AS (
+  SELECT
+    IF(
+      date_range_avg_row_count IS NULL,
+      ERROR(
+        "looks like no data was found for past partitions. Make sure you're using a table and not a view."
+      ),
+      CAST(date_range_avg_row_count AS INTEGER)
+    ) AS date_range_avg_row_count
+  FROM
+    (
+      SELECT
+        AVG(partition_info.total_rows) AS date_range_avg_row_count,
+      FROM
+        CTE,
+        UNNEST(included_partition_info) AS partition_info
+      WHERE
+        partition_info.table_partition <> DATE(@submission_date)
+    )
+),
+ROW_COUNT_BOUNDARIES AS (
+  SELECT
+    CAST(
+      date_range_avg_row_count - (date_range_avg_row_count / 100 * 5) AS INTEGER
+    ) AS min_expected,
+    CAST(
+      date_range_avg_row_count + (date_range_avg_row_count / 100 * 5) AS INTEGER
+    ) AS max_expected,
+  FROM
+    DATE_RANGE_ROW_COUNT_AVG
+),
+CURRENT_PARTITION_ROW_COUNT AS (
+  SELECT
+    CAST(partition_info.total_rows AS INTEGER) AS total_rows
+  FROM
+    CTE,
+    UNNEST(included_partition_info) AS partition_info
+  WHERE
+    partition_info.table_partition = DATE(@submission_date)
+)
+SELECT
+  IF(
+    (SELECT total_rows FROM CURRENT_PARTITION_ROW_COUNT) NOT
+    BETWEEN (SELECT min_expected FROM ROW_COUNT_BOUNDARIES)
+    AND (SELECT max_expected FROM ROW_COUNT_BOUNDARIES),
+    ERROR(
+      CONCAT(
+        "The row count for partition ",
+        @submission_date,
+        " is outside of the expected boundaries. ",
+        "Row count for the current partition: ",
+        (SELECT total_rows FROM CURRENT_PARTITION_ROW_COUNT),
+        ". Expected boundary: ",
+        (SELECT min_expected FROM ROW_COUNT_BOUNDARIES),
+        " - ",
+        (SELECT max_expected FROM ROW_COUNT_BOUNDARIES)
+      )
+    ),
+    NULL
+  );

Link to full diff

Please sign in to comment.