diff --git a/.circleci/config.yml b/.circleci/config.yml
index 0a77a15..191ae8e 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -5,6 +5,7 @@ jobs:
   build:
     docker:
       - image: circleci/python:3.6.2-stretch
+      - image: circleci/postgres:9.6.5-alpine-ram
 
     steps:
       - checkout
@@ -28,19 +29,20 @@ jobs:
             cp integration_tests/ci/sample.profiles.yml ~/.dbt/profiles.yml
 
       - run:
-          name: "Run Tests - BigQuery"
+          name: "Run Tests - Postgres"
           environment:
-            GCLOUD_SERVICE_KEY_PATH: "/home/circleci/gcloud-service-key.json"
-
+            CI_DBT_USER: root
+            CI_DBT_PASS: ''
+            CI_DBT_PORT: 5432
+            CI_DBT_DBNAME: circle_test
          command: |
             . venv/bin/activate
-            echo `pwd`
             cd integration_tests
-            dbt deps --target bigquery
-            dbt seed --target bigquery --full-refresh
-            dbt run --target bigquery --full-refresh --vars 'update: false'
-            dbt run --target bigquery --vars 'update: true'
-            dbt test --target bigquery
+            dbt deps --target postgres
+            dbt seed --target postgres --full-refresh
+            dbt run --target postgres --full-refresh --vars 'update: false'
+            dbt run --target postgres --vars 'update: true'
+            dbt test --target postgres
 
       - run:
           name: "Run Tests - Redshift"
@@ -66,6 +68,20 @@
             dbt run --target snowflake --vars 'update: true'
             dbt test --target snowflake
 
+      - run:
+          name: "Run Tests - BigQuery"
+          environment:
+            GCLOUD_SERVICE_KEY_PATH: "/home/circleci/gcloud-service-key.json"
+
+          command: |
+            . venv/bin/activate
+            echo `pwd`
+            cd integration_tests
+            dbt deps --target bigquery
+            dbt seed --target bigquery --full-refresh
+            dbt run --target bigquery --full-refresh --vars 'update: false'
+            dbt run --target bigquery --vars 'update: true'
+            dbt test --target bigquery
       - save_cache:
           key: deps1-{{ .Branch }}
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000..0082d7c
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1 @@
+* @jtcohen6
\ No newline at end of file
diff --git a/.github/bug_report.md b/.github/bug_report.md
new file mode 100644
index 0000000..a42456c
--- /dev/null
+++ b/.github/bug_report.md
@@ -0,0 +1,56 @@
+---
+name: Bug report
+about: Report a bug or an issue you've found with this package
+title: ''
+labels: bug, triage
+assignees: ''
+
+---
+
+### Describe the bug
+
+
+### Steps To Reproduce
+
+
+### Expected results
+
+
+### Actual results
+
+
+### Screenshots and log output
+
+
+### System information
+**Which database are you using dbt with?**
+- [ ] postgres
+- [ ] redshift
+- [ ] bigquery
+- [ ] snowflake
+- [ ] other (specify: ____________)
+
+
+**The output of `dbt --version`:**
+```
+
+```
+
+**The operating system you're using:**
+
+**The output of `python --version`:**
+
+### Additional context
+
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 0000000..4b02d46
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,4 @@
+## Description & motivation
+
diff --git a/README.md b/README.md
index ee6449c..000ba4a 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,11 @@
 # Snowplow sessionization
 
-dbt data models for sessionizing Snowplow data. Adapted from Snowplow's [web model](https://github.com/snowplow/web-data-model).
+This dbt package:
+* Rolls up `page_view` and `page_ping` events into page views and sessions
+* Performs "user stitching" to tie all historical events associated with an
+anonymous cookie (`domain_userid`) to the same `user_id`
+
+Adapted from Snowplow's [web model](https://github.com/snowplow/web-data-model).
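For readers new to user stitching, the core idea reduces to a window function over the raw event stream. A minimal sketch, illustrative only (the `events` source name is an assumption, and `ignore nulls` requires a warehouse that supports it, such as BigQuery or Snowflake):

```sql
-- Illustrative sketch of user stitching, not the package's exact model:
-- backfill each anonymous cookie with the most recent known user_id.
select
    domain_userid,
    last_value(user_id ignore nulls) over (
        partition by domain_userid
        order by collector_tstamp
        rows between unbounded preceding and unbounded following
    ) as user_id
from events  -- hypothetical source table
```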
 ### Models ###
@@ -14,6 +19,29 @@ several intermediate models used to create these two models.
 
 ![snowplow graph](/etc/snowplow_graph.png)
 
+
+## Prerequisites
+
+This package takes the Snowplow JavaScript tracker as its foundation. It assumes
+that all Snowplow events are sent with a
+[`web_page` context](https://github.com/snowplow/snowplow/wiki/1-General-parameters-for-the-Javascript-tracker#webPage).
+
+### Mobile
+
+It _is_ possible to sessionize mobile (app) events by including two predefined contexts with all events:
+* [`client_session`](https://github.com/snowplow/iglu-central/blob/master/schemas/com.snowplowanalytics.snowplow/client_session/jsonschema/1-0-1) ([iOS](https://docs.snowplowanalytics.com/docs/collecting-data/collecting-from-own-applications/objective-c-tracker/objective-c-1-2-0/#session-tracking), [Android](https://github.com/snowplow/snowplow/wiki/Android-Tracker#12-client-sessions))
+* [`screen`](https://github.com/snowplow/iglu-central/blob/master/schemas/com.snowplowanalytics.mobile/screen/jsonschema/1-0-0)
+
+As long as all events are associated with an anonymous user, a session, and a
+screen/page view, they can be made to fit the same canonical data model as web
+events fired from the JavaScript tracker. Whether this is the desired outcome
+will vary significantly; mobile-first analytics often makes different
+assumptions about user identity, engagement, referral, and inactivity cutoffs.
+
+For specific implementation details:
+* [iOS](https://docs.snowplowanalytics.com/docs/collecting-data/collecting-from-own-applications/objective-c-tracker/)
+* [Android](https://docs.snowplowanalytics.com/docs/collecting-data/collecting-from-own-applications/android-tracker/)
+
 ## Installation Instructions
 
 Check [dbt Hub](https://hub.getdbt.com/fishtown-analytics/snowplow/latest/) for the latest installation instructions, or [read the docs](https://docs.getdbt.com/docs/package-management)
@@ -65,12 +93,14 @@ models:
 * Redshift
 * Snowflake
 * BigQuery
-* Postgres, with the creation of [these UDFs](pg_udfs.sql)
+* Postgres
 
 ### Contributions ###
 
 Additional contributions to this package are very welcome! Please create issues
-or open PRs against `master`.
+or open PRs against `master`. Check out
+[this post](https://discourse.getdbt.com/t/contributing-to-a-dbt-package/657)
+on the best workflow for contributing to a package.
 
 Much of tracking can be the Wild West. Snowplow's canonical event model is
 a major asset in our ability to perform consistent analysis atop predictably structured
@@ -78,19 +108,20 @@ data, but any detailed implementation is bound to diverge.
 
 To that end, we aim to keep this package rooted in a garden-variety Snowplow web deployment.
 All PRs should seek to add or improve functionality that is contained
-within a plurality of snowplow deployments.
+within a plurality of Snowplow deployments.
 
 If you need to change implementation-specific details, you have two avenues:
 
 * Override models from this package with versions that feature your custom logic.
-Create a model with the same name locally (e.g. `snowplow_id_map`) and disable the `snowplow`
-package's version in `dbt_project.yml`:
+Create a model with the same name locally (e.g. `snowplow_id_map`) and disable
+the `snowplow` package's version in `dbt_project.yml`:
 
 ```yml
 snowplow:
   ...
   identification:
-    snowplow_id_map:
-      enabled: false
+    default:
+      snowplow_id_map:
+        enabled: false
 ```
 
 * Fork this repository :)
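As a concrete illustration of the override approach the README describes, a local model file could look something like the sketch below. This is hypothetical: the custom mapping column is a stand-in for your own logic, but the name `snowplow_id_map` must match so that `ref('snowplow_id_map')` resolves to your version once the package's is disabled.

```sql
-- models/snowplow_id_map.sql in your own project
select
    domain_userid,
    internal_user_id as user_id,           -- hypothetical implementation-specific mapping
    max(collector_tstamp) as max_tstamp
from {{ ref('snowplow_base_events') }}     -- or wherever your custom logic starts
group by 1, 2
```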
diff --git a/data/snowplow_seeds.yml b/data/snowplow_seeds.yml
new file mode 100644
index 0000000..b869bfb
--- /dev/null
+++ b/data/snowplow_seeds.yml
@@ -0,0 +1,13 @@
+version: 2
+
+seeds:
+  - name: country_codes
+    description: >
+      English names for countries based on their two-letter ISO code, which is
+      stored in the `geo_country` column of `snowplow_page_views` and
+      `snowplow_sessions`. Not directly used in any of the snowplow package's
+      sessionization logic.
+    columns:
+      - name: name
+      - name: two_letter_iso_code
+      - name: three_letter_iso_code
diff --git a/dbt_project.yml b/dbt_project.yml
index 963c066..4375b40 100644
--- a/dbt_project.yml
+++ b/dbt_project.yml
@@ -11,6 +11,8 @@ analysis-paths: ["analysis"]
 data-paths: ["data"]
 macro-paths: ["macros"]
 
+require-dbt-version: ">=0.16.0"
+
 models:
   snowplow:
     base:
diff --git a/etc/snowplow_graph.png b/etc/snowplow_graph.png
index 7a778c7..ccb0efd 100644
Binary files a/etc/snowplow_graph.png and b/etc/snowplow_graph.png differ
diff --git a/integration_tests/Makefile b/integration_tests/Makefile
index 0b5483d..325ed1d 100644
--- a/integration_tests/Makefile
+++ b/integration_tests/Makefile
@@ -22,5 +22,5 @@ test-bigquery:
 	dbt run --target bigquery --vars 'update: true'
 	dbt test --target bigquery
 
-test-all: test-redshift test-snowflake test-bigquery
+test-all: test-postgres test-redshift test-snowflake test-bigquery
 	echo "Completed successfully"
diff --git a/integration_tests/ci/sample.profiles.yml b/integration_tests/ci/sample.profiles.yml
index 5749b41..f51f76e 100644
--- a/integration_tests/ci/sample.profiles.yml
+++ b/integration_tests/ci/sample.profiles.yml
@@ -7,8 +7,19 @@ config:
   use_colors: True
 
 integration_tests:
-  target: redshift
+  target: postgres
   outputs:
+
+    postgres:
+      type: postgres
+      host: localhost
+      user: "{{ env_var('CI_DBT_USER') }}"
+      pass: "{{ env_var('CI_DBT_PASS') }}"
+      port: "{{ env_var('CI_DBT_PORT') }}"
+      dbname: "{{ env_var('CI_DBT_DBNAME') }}"
+      schema: snowplow_integration_tests_postgres
+      threads: 1
+
     redshift:
       type: redshift
       host: "{{ env_var('CI_REDSHIFT_DBT_HOST') }}"
diff --git a/integration_tests/dbt_project.yml b/integration_tests/dbt_project.yml
index 7e4bdf3..0f79d4e 100644
--- a/integration_tests/dbt_project.yml
+++ b/integration_tests/dbt_project.yml
@@ -30,4 +30,16 @@ models:
       'snowplow:context:performance_timing': FALSE
       'snowplow:context:useragent': FALSE
       'snowplow:pass_through_columns': ['test_add_col']
-
+
+seeds:
+  quote_columns: false
+  snowplow_integration_tests:
+    snowplow:
+      sp_event_update:
+        column_types:
+          collector_tstamp: timestamp
+          derived_tstamp: timestamp
+      sp_event:
+        column_types:
+          collector_tstamp: timestamp
+          derived_tstamp: timestamp
diff --git a/macros/adapters/convert_timezone.sql b/macros/adapters/convert_timezone.sql
new file mode 100644
index 0000000..051eb5d
--- /dev/null
+++ b/macros/adapters/convert_timezone.sql
@@ -0,0 +1,11 @@
+{%- macro convert_timezone(in_tz, out_tz, in_timestamp) -%}
+    {{ adapter_macro('convert_timezone', in_tz, out_tz, in_timestamp) }}
+{%- endmacro -%}
+
+{% macro default__convert_timezone(in_tz, out_tz, in_timestamp) %}
+    convert_timezone({{in_tz}}, {{out_tz}}, {{in_timestamp}})
+{% endmacro %}
+
+{% macro postgres__convert_timezone(in_tz, out_tz, in_timestamp) %}
+    ({{in_timestamp}} at time zone {{in_tz}} at time zone {{out_tz}})::timestamptz
+{% endmacro %}
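The new `convert_timezone` macro dispatches via `adapter_macro`: Postgres targets get the `postgres__` implementation, everything else falls back to `default__`. A sketch of what a single call renders to, assuming example timezone names and a `collector_tstamp` column:

```sql
-- {{ convert_timezone("'UTC'", "'America/New_York'", 'collector_tstamp') }}

-- on Postgres (postgres__convert_timezone):
(collector_tstamp at time zone 'UTC' at time zone 'America/New_York')::timestamptz

-- on Redshift/Snowflake (default__convert_timezone):
convert_timezone('UTC', 'America/New_York', collector_tstamp)
```

Note that the timezone arguments are passed pre-quoted, which is why callers wrap literals as `"'UTC'"`.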
diff --git a/macros/adapters/get_start_ts.sql b/macros/adapters/get_start_ts.sql
new file mode 100644
index 0000000..b510d7d
--- /dev/null
+++ b/macros/adapters/get_start_ts.sql
@@ -0,0 +1,68 @@
+
+{%- macro get_max_sql(relation, field = 'collector_tstamp') -%}
+
+    select
+
+      coalesce(
+        max({{field}}),
+        '0001-01-01' -- a long, long time ago
+      ) as start_ts
+
+    from {{ relation }}
+
+{%- endmacro -%}
+
+
+{%- macro get_most_recent_record(relation, field = 'collector_tstamp') -%}
+
+    {%- set result = run_query(get_max_sql(relation, field)) -%}
+
+    {% if execute %}
+        {% set start_ts = result.columns['start_ts'].values()[0] %}
+    {% else %}
+        {% set start_ts = '' %}
+    {% endif %}
+
+    {{ return(start_ts) }}
+
+{%- endmacro -%}
+
+
+{%- macro get_start_ts(relation, field = 'collector_tstamp') -%}
+    {{ adapter_macro('get_start_ts', relation, field) }}
+{%- endmacro -%}
+
+
+{%- macro default__get_start_ts(relation, field = 'collector_tstamp') -%}
+    ({{get_max_sql(relation, field)}})
+{%- endmacro -%}
+
+
+{%- macro bigquery__get_start_ts(relation, field = 'collector_tstamp') -%}
+
+    {%- set partition_by = config.get('partition_by', none) -%}
+    {%- set partitions = config.get('partitions', none) -%}
+
+    {%- set start_ts -%}
+        {%- if config.incremental_strategy == 'insert_overwrite' -%}
+
+            {%- if partitions -%} least({{partitions|join(',')}})
+            {%- elif partition_by.data_type == 'date' -%} _dbt_max_partition
+            {%- else -%} date(_dbt_max_partition)
+            {%- endif -%}
+
+        {%- else -%}
+
+            {%- set rendered -%}
+                {%- if partition_by.data_type == 'date' -%} {{partition_by.field}}
+                {%- else -%} date({{partition_by.field}}) {%- endif -%}
+            {%- endset -%}
+            {%- set record = get_most_recent_record(relation, rendered) -%}
+            '{{record}}'
+
+        {%- endif -%}
+    {%- endset -%}
+
+    {%- do return(start_ts) -%}
+
+{%- endmacro -%}
\ No newline at end of file
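In practice, models call `get_start_ts` inside an `is_incremental()` block; on the default adapters it simply inlines `get_max_sql` as a parenthesized subquery. A sketch of how a call compiles, with a hypothetical target relation:

```sql
-- model SQL:
select * from {{ ref('snowplow_base_events') }}
{% if is_incremental() %}
where collector_tstamp > {{ get_start_ts(this, 'max_tstamp') }}
{% endif %}

-- compiles on Postgres/Redshift/Snowflake to roughly:
-- where collector_tstamp > (
--     select coalesce(max(max_tstamp), '0001-01-01') as start_ts
--     from analytics.snowplow_id_map  -- hypothetical resolved {{ this }}
-- )
```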
+-#} + + {% if context.get(ref.config.project_name, {}).get('set_default_adapters') %} + {% set default_adapters=context[ref.config.project_name].set_default_adapters() %} + {% else %} + {% set default_adapters=snowplow.set_default_adapters() %} + {% endif %} + + {% if adapter == 'default' %} + {% set adapters = default_adapters %} + {% elif adapter is string %} + {% set adapters = [adapter] %} + {% else %} + {% set adapters = adapter %} + {% endif %} + + {% set result = (target.type in adapters) %} + + {{return(result)}} + +{% endmacro %} diff --git a/macros/timestamp_ntz.sql b/macros/adapters/timestamp_ntz.sql similarity index 100% rename from macros/timestamp_ntz.sql rename to macros/adapters/timestamp_ntz.sql diff --git a/macros/is_adapter.sql b/macros/is_adapter.sql deleted file mode 100644 index c33e420..0000000 --- a/macros/is_adapter.sql +++ /dev/null @@ -1,15 +0,0 @@ -{% macro is_adapter(adapter='default') %} - - {% if adapter == 'default' %} - {% set adapters = ['redshift', 'snowflake', 'postgres'] %} - {% elif adapter is string %} - {% set adapters = [adapter] %} - {% else %} - {% set adapters = adapter %} - {% endif %} - - {% set result = (target.type in adapters) %} - - {{return(result)}} - -{% endmacro %} diff --git a/macros/most_recent_record.sql b/macros/most_recent_record.sql deleted file mode 100644 index 9209a6e..0000000 --- a/macros/most_recent_record.sql +++ /dev/null @@ -1,26 +0,0 @@ - - -{% macro get_most_recent_record(rel, field, default) %} - - {#-- do not run the query in parsing mode #} - {% if not execute %} - {{ return(default) }} - {% endif %} - - {% if not is_incremental() %} - {{ return(default) }} - {% endif %} - - {# fix for tmp suffix #} - {%- set rel = api.Relation.create(identifier=rel.name, schema=rel.schema) -%} - - {% call statement('_', fetch_result=True) %} - - select cast(coalesce(max({{field}}), '{{ default }}') as date) as ts from {{ rel }} - - {% endcall %} - - {% set data = load_result('_')['table'].rows %} - {{ return(data[0]['ts']) }} - -{% endmacro %} diff --git a/macros/similar_to.sql b/macros/similar_to.sql deleted file mode 100644 index a9ac004..0000000 --- a/macros/similar_to.sql +++ /dev/null @@ -1,11 +0,0 @@ -{% macro similar_to(values) %} - {{ adapter_macro('snowplow.similar_to', values) }} -{% endmacro %} - -{% macro default__similar_to(values) %} - similar to '%({{ values }})%' -{%- endmacro %} - -{% macro snowflake__similar_to(values) %} - rlike '.*({{ values }}).*' -{% endmacro %} diff --git a/macros/snowplow_macros.yml b/macros/snowplow_macros.yml new file mode 100644 index 0000000..b94c3f0 --- /dev/null +++ b/macros/snowplow_macros.yml @@ -0,0 +1,103 @@ +version: 2 + +macros: + + - name: bot_any + description: > + List of text values to check if user agents are `like`. If matched, + the page view is probably by a bot visitor. + arguments: [] + + - name: convert_timezone + description: > + Adapter macro that supports default and Postgres syntax for converting + timestamps from one timezone into another. + arguments: + - name: in_tz + type: string + description: time zone the timestamp is currently in + - name: out_tz + type: string + description: time zone of the output + - name: in_timestamp + type: timestamp + description: timestamp to convert + + - name: get_max_sql + description: > + Basically, `select max(`field`) from {{relation}}`. + Used in `get_start_ts`. 
diff --git a/macros/snowplow_macros.yml b/macros/snowplow_macros.yml
new file mode 100644
index 0000000..b94c3f0
--- /dev/null
+++ b/macros/snowplow_macros.yml
@@ -0,0 +1,103 @@
+version: 2
+
+macros:
+
+  - name: bot_any
+    description: >
+      List of text values to check if user agents are `like`. If matched,
+      the page view is probably by a bot visitor.
+    arguments: []
+
+  - name: convert_timezone
+    description: >
+      Adapter macro that supports default and Postgres syntax for converting
+      timestamps from one timezone into another.
+    arguments:
+      - name: in_tz
+        type: string
+        description: time zone the timestamp is currently in
+      - name: out_tz
+        type: string
+        description: time zone of the output
+      - name: in_timestamp
+        type: timestamp
+        description: timestamp to convert
+
+  - name: get_max_sql
+    description: >
+      Basically, `select max(<field>) from {{ relation }}`.
+      Used in `get_start_ts`.
+    arguments:
+      - name: relation
+        type: relation
+      - name: field
+        type: string
+
+  - name: get_most_recent_record
+    description: >
+      Runs `get_max_sql` using `run_query` and parses the result.
+      Used in `get_start_ts`.
+    arguments:
+      - name: relation
+        type: relation
+      - name: field
+        type: string
+
+  - name: get_start_ts
+    description: >
+      {% raw %}
+      Determine the start timestamp/date for this incremental run based on the
+      max timestamp/date in the current version of this table.
+
+
+      In the default implementation (Postgres/Redshift/Snowflake), this
+      resolves to a subquery (`select max(timestamp) from {{ this }}`).
+
+
+      On BigQuery, the method for executing this macro depends on the
+      incremental strategy:
+      - **Merge:** Uses `run_query` to get the result of `select max(partition_field) from {{ this }}`,
+      and feeds the result into the model SQL.
+      - **Insert overwrite:** If the user supplies a `partitions` config, resolves to
+      the `least()` of all static partition values. Otherwise, resolves to
+      `_dbt_max_partition` or `date(_dbt_max_partition)`, which dbt will generate
+      as part of the materialization script.
+      {% endraw %}
+
+    arguments:
+      - name: relation
+        type: relation
+        description: always pass `this`
+      - name: field
+        type: string
+        description: >
+          Differs depending on the adapter:
+          - name of the timestamp/date column to take the max value of, with a default value of `collector_tstamp`
+          - not needed on BigQuery, which always uses the model's configured partition field
+
+  - name: set_default_adapters
+    description: >
+      Which adapters should use the `default` implementation of Snowplow package
+      models? By default, includes Postgres, Redshift, and Snowflake.
+      Override by creating a macro named `set_default_adapters` in your
+      own project.
+
+  - name: is_adapter
+    description: >
+      Determine whether a model should be enabled depending on the `target.type`
+      of the current run. Returns `true` or `false`. All `default` models run on
+      Postgres, Redshift, and Snowflake. All `bigquery` models run on BigQuery.
+    arguments:
+      - name: adapter
+        type: string
+        description: "*default* or *bigquery*"
+
+  - name: timestamp_ntz
+    description: >
+      Adapter macro that supports Snowflake's good-yet-eccentric behavior
+      around timestamps and timezones. The default implementation returns the
+      input unchanged.
+    arguments:
+      - name: field
+        type: timestamp
+        description: timestamp to return as `timestamp_ntz`
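To make the BigQuery branches of the documented `get_start_ts` concrete, here is roughly what the incremental filter resolves to under each strategy. The date literal and column names below are examples, not values from this repo:

```sql
-- merge strategy: get_most_recent_record runs a query during compilation
-- and splices the literal result into the model SQL:
where date(collector_tstamp) >= '2020-04-01'  -- hypothetical max partition value

-- insert_overwrite strategy (no static partitions config): no introspective
-- query is needed; dbt declares _dbt_max_partition in its materialization
-- script, so the macro resolves to a reference to that variable:
where date(collector_tstamp) >= date(_dbt_max_partition)
```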
diff --git a/macros/url_query.sql b/macros/url_query.sql
deleted file mode 100644
index 88fe4cc..0000000
--- a/macros/url_query.sql
+++ /dev/null
@@ -1,10 +0,0 @@
-
-{% macro get_utm_parameter(query_field, utm_param) -%}
-
-nullif(
-    split_part(
-        split_part(({{ query_field }})::text, '{{ utm_param }}='::text, 2),
-        '&'::text, 1),
-    '')
-
-{%- endmacro %}
diff --git a/models/identification/bigquery/snowplow_id_map.sql b/models/identification/bigquery/snowplow_id_map.sql
index d7b3529..7fc09a5 100644
--- a/models/identification/bigquery/snowplow_id_map.sql
+++ b/models/identification/bigquery/snowplow_id_map.sql
@@ -5,19 +5,23 @@
 {{
     config(
         materialized='incremental',
-        partition_by='DATE(max_tstamp)',
-        unique_key="domain_userid",
+        partition_by={
+            'field': 'max_tstamp',
+            'data_type': 'timestamp'
+        },
+        unique_key='domain_userid',
+        cluster_by='domain_userid',
         enabled=is_adapter('bigquery')
     )
 }}
 
-{% set start_date = get_most_recent_record(this, "max_tstamp", "2001-01-01") %}
-
 with all_events as (
 
-    select *
-    from {{ ref('snowplow_base_events') }}
-    where DATE(collector_tstamp) >= date_sub('{{ start_date }}', interval 1 day)
+    select * from {{ ref('snowplow_base_events') }}
+
+    {% if is_incremental() %}
+    where date(collector_tstamp) >= {{get_start_ts(this)}}
+    {% endif %}
 
 ),
 
@@ -27,7 +31,6 @@ new_sessions as (
         domain_sessionid
 
     from all_events
-    where DATE(collector_tstamp) >= '{{ start_date }}'
 
 ),
 
@@ -59,7 +62,7 @@ prep as (
             rows between unbounded preceding and unbounded following
         ) as user_id,
 
-        max(timestamp(collector_tstamp)) over (
+        max(collector_tstamp) over (
             partition by domain_userid
         ) as max_tstamp
 
diff --git a/models/identification/default/snowplow_id_map.sql b/models/identification/default/snowplow_id_map.sql
index 6232ea6..e5a0183 100644
--- a/models/identification/default/snowplow_id_map.sql
+++ b/models/identification/default/snowplow_id_map.sql
@@ -20,13 +20,10 @@ with all_events as (
 
 new_events as (
 
-    select *
-    from all_events
+    select * from all_events
 
     {% if is_incremental() %}
-    where collector_tstamp > (
-        select coalesce(max(max_tstamp), '0001-01-01') from {{ this }}
-    )
+    where collector_tstamp > {{get_start_ts(this, 'max_tstamp')}}
    {% endif %}
 
 ),
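Because the BigQuery `snowplow_id_map` above is incremental with `unique_key='domain_userid'`, dbt reconciles new rows into the existing table with a merge statement; clustering on the same key helps prune that scan. A simplified sketch of what dbt generates, not verbatim, with a hypothetical target relation:

```sql
merge into analytics.snowplow_id_map as dbt_internal_dest  -- hypothetical {{ this }}
using (
    /* the model's select, filtered to new events */
) as dbt_internal_source
on dbt_internal_source.domain_userid = dbt_internal_dest.domain_userid
when matched then update set
    user_id = dbt_internal_source.user_id,
    max_tstamp = dbt_internal_source.max_tstamp
when not matched then insert (domain_userid, user_id, max_tstamp)
    values (domain_userid, user_id, max_tstamp)
```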
diff --git a/models/page_views/bigquery/snowplow_page_views.sql b/models/page_views/bigquery/snowplow_page_views.sql
index b7b99ca..2bc30f1 100644
--- a/models/page_views/bigquery/snowplow_page_views.sql
+++ b/models/page_views/bigquery/snowplow_page_views.sql
@@ -1,14 +1,17 @@
 {{
     config(
         materialized='incremental',
-        partition_by='DATE(page_view_start)',
-        unique_key="page_view_id",
+        partition_by={
+            'field': 'page_view_start',
+            'data_type': 'timestamp'
+        },
+        unique_key='page_view_id',
+        cluster_by='page_view_id',
         enabled=is_adapter('bigquery')
     )
 }}
 
 {% set timezone = var('snowplow:timezone', 'UTC') %}
-{% set start_date = get_most_recent_record(this, "page_view_start", "2001-01-01") %}
 
 /*
     General approach: find sessions that happened since the last time
@@ -25,9 +28,15 @@ with all_events as (
 
     select * from {{ ref('snowplow_base_events') }}
 
-    -- load up events from the start date, and the day before it, to ensure
-    -- that we capture pageviews that span midnight
-    where DATE(collector_tstamp) >= date_sub('{{ start_date }}', interval 1 day)
+    {% if is_incremental() %}
+
+    where date(collector_tstamp) >=
+        date_sub(
+            {{get_start_ts(this)}},
+            interval {{var('snowplow:page_view_lookback_days')}} day
+        )
+
+    {% endif %}
 
 ),
 
@@ -38,9 +47,6 @@ new_sessions as (
        session_id
 
     from all_events
 
-    -- only consider events for sessions that occurred on or after the start_date
-    where DATE(collector_tstamp) >= '{{ start_date }}'
-
 ),
 
 relevant_events as (
diff --git a/models/page_views/default/snowplow_page_views.sql b/models/page_views/default/snowplow_page_views.sql
index ae0d1b3..c36cbd9 100644
--- a/models/page_views/default/snowplow_page_views.sql
+++ b/models/page_views/default/snowplow_page_views.sql
@@ -20,20 +20,26 @@ with all_events as (
 
     select * from {{ ref('snowplow_web_events') }}
+
     {% if is_incremental() %}
-    where collector_tstamp > (
-        DATEADD('day', -1 * {{var('snowplow:page_view_lookback_days')}}, (select coalesce(max(max_tstamp), '0001-01-01') from {{ this }}))
-    )
+
+    where collector_tstamp >
+        {{dbt_utils.dateadd(
+            'day',
+            -1 * var('snowplow:page_view_lookback_days'),
+            get_start_ts(this, 'max_tstamp')
+        )}}
+
     {% endif %}
+
 ),
 
 filtered_events as (
 
     select * from all_events
+
     {% if is_incremental() %}
-    where collector_tstamp > (
-        select coalesce(max(max_tstamp), '0001-01-01') from {{ this }}
-    )
+    where collector_tstamp > {{get_start_ts(this, 'max_tstamp')}}
     {% endif %}
 
 ),
 
@@ -101,12 +107,15 @@ prep as (
         count(*) over (partition by domain_sessionid) as max_session_page_view_index,
 
         -- page view: time
-        CONVERT_TIMEZONE('UTC', '{{ timezone }}', b.min_tstamp) as page_view_start,
-        CONVERT_TIMEZONE('UTC', '{{ timezone }}', b.max_tstamp) as page_view_end,
+        {{convert_timezone("'UTC'", "'" ~ timezone ~ "'", 'b.min_tstamp')}} as page_view_start,
+        {{convert_timezone("'UTC'", "'" ~ timezone ~ "'", 'b.max_tstamp')}} as page_view_end,
 
         -- page view: time in the user's local timezone
-        convert_timezone('UTC', coalesce(a.os_timezone, '{{ timezone }}'), b.min_tstamp) as page_view_start_local,
-        convert_timezone('UTC', coalesce(a.os_timezone, '{{ timezone }}'), b.max_tstamp) as page_view_end_local,
+
+        {%- set local_timezone -%} coalesce(a.os_timezone, '{{timezone}}') {%- endset -%}
+
+        {{convert_timezone("'UTC'", local_timezone, 'b.min_tstamp')}} as page_view_start_local,
+        {{convert_timezone("'UTC'", local_timezone, 'b.max_tstamp')}} as page_view_end_local,
 
         -- engagement
         b.time_engaged_in_s,
diff --git a/models/page_views/default/snowplow_web_events.sql b/models/page_views/default/snowplow_web_events.sql
index a58e9be..b24f946 100644
--- a/models/page_views/default/snowplow_web_events.sql
+++ b/models/page_views/default/snowplow_web_events.sql
@@ -22,10 +22,9 @@ with all_events as (
 
 events as (
 
     select * from all_events
+
     {% if is_incremental() %}
-    where collector_tstamp > (
-        select coalesce(max(collector_tstamp), '0001-01-01') from {{ this }}
-    )
+    where collector_tstamp > {{get_start_ts(this, 'collector_tstamp')}}
     {% endif %}
 
 ),
diff --git a/models/page_views/default/snowplow_web_events_scroll_depth.sql b/models/page_views/default/snowplow_web_events_scroll_depth.sql
index 767fdc9..75dd825 100644
--- a/models/page_views/default/snowplow_web_events_scroll_depth.sql
+++ b/models/page_views/default/snowplow_web_events_scroll_depth.sql
@@ -17,10 +17,9 @@ with all_events as (
 
 events as (
 
     select * from all_events
+
     {% if is_incremental() %}
-    where collector_tstamp > (
-        select coalesce(max(max_tstamp), '0001-01-01') from {{ this }}
-    )
+    where collector_tstamp > {{get_start_ts(this, 'max_tstamp')}}
     {% endif %}
 
 ),
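The lookback filter in the default `snowplow_page_views` above is worth unpacking: `dbt_utils.dateadd` renders warehouse-appropriate date math around the subquery that `get_start_ts` inlines. Assuming `snowplow:page_view_lookback_days: 3` and a hypothetical target relation, the compiled filter looks roughly like this on Redshift:

```sql
where collector_tstamp >
    dateadd(
        day,
        -3,
        (select coalesce(max(max_tstamp), '0001-01-01') as start_ts
         from analytics.snowplow_page_views)  -- hypothetical resolved {{ this }}
    )
```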
diff --git a/models/page_views/default/snowplow_web_events_time.sql b/models/page_views/default/snowplow_web_events_time.sql
index ca4bf2e..1177606 100644
--- a/models/page_views/default/snowplow_web_events_time.sql
+++ b/models/page_views/default/snowplow_web_events_time.sql
@@ -18,10 +18,9 @@ with all_events as (
 
 events as (
 
     select * from all_events
+
     {% if is_incremental() %}
-    where collector_tstamp > (
-        select coalesce(max(max_tstamp), '0001-01-01') from {{ this }}
-    )
+    where collector_tstamp > {{get_start_ts(this, 'max_tstamp')}}
     {% endif %}
 
 ),
diff --git a/models/page_views/optional/snowplow_web_timing_context.sql b/models/page_views/optional/snowplow_web_timing_context.sql
index 2cb25b0..d3e35be 100644
--- a/models/page_views/optional/snowplow_web_timing_context.sql
+++ b/models/page_views/optional/snowplow_web_timing_context.sql
@@ -1,5 +1,11 @@
 
-{{ config(materialized='table', sort='page_view_id', dist='page_view_id') }}
+{{
+    config(
+        materialized='table',
+        sort='page_view_id',
+        dist='page_view_id'
+    )
+}}
 
 with performance_timing_context as (
diff --git a/models/sessions/bigquery/snowplow_sessions.sql b/models/sessions/bigquery/snowplow_sessions.sql
index c52e2ee..86eb919 100644
--- a/models/sessions/bigquery/snowplow_sessions.sql
+++ b/models/sessions/bigquery/snowplow_sessions.sql
@@ -1,7 +1,10 @@
 {{
     config(
         materialized='table',
-        partition_by='DATE(session_start)',
+        partition_by={
+            'field': 'session_start',
+            'data_type': 'timestamp'
+        },
         enabled=is_adapter('bigquery')
     )
 }}
diff --git a/models/sessions/bigquery/snowplow_sessions_tmp.sql b/models/sessions/bigquery/snowplow_sessions_tmp.sql
index cfab990..f0e5ca4 100644
--- a/models/sessions/bigquery/snowplow_sessions_tmp.sql
+++ b/models/sessions/bigquery/snowplow_sessions_tmp.sql
@@ -1,18 +1,23 @@
 {{
     config(
         materialized='incremental',
-        partition_by='DATE(session_start)',
-        unique_key="session_id",
+        partition_by={
+            'field': 'session_start',
+            'data_type': 'timestamp'
+        },
+        unique_key='session_id',
+        cluster_by='session_id',
         enabled=is_adapter('bigquery')
     )
 }}
 
-{% set start_date = get_most_recent_record(this, "session_start", "2001-01-01") %}
-
 with all_page_views as (
 
     select * from {{ ref('snowplow_page_views') }}
-    where DATE(page_view_start) >= date_sub('{{ start_date }}', interval 1 day)
+
+    {% if is_incremental() %}
+    where date(page_view_start) >= {{get_start_ts(this)}}
+    {% endif %}
 
 ),
 
@@ -22,7 +27,6 @@ new_page_views as (
         session_id
 
     from all_page_views
-    where DATE(page_view_start) >= '{{ start_date }}'
 
 ),
diff --git a/models/sessions/default/snowplow_sessions_tmp.sql b/models/sessions/default/snowplow_sessions_tmp.sql
index 44c632a..00d6749 100644
--- a/models/sessions/default/snowplow_sessions_tmp.sql
+++ b/models/sessions/default/snowplow_sessions_tmp.sql
@@ -20,7 +20,7 @@ relevant_sessions as (
 
     from all_web_page_views
 
     {% if is_incremental() %}
-    where page_view_start > (select max(session_start) from {{ this }})
+    where page_view_start > {{get_start_ts(this, 'session_start')}}
     {% endif %}
 
 ),
diff --git a/packages.yml b/packages.yml
new file mode 100644
index 0000000..4e57658
--- /dev/null
+++ b/packages.yml
@@ -0,0 +1,3 @@
+packages:
+  - package: fishtown-analytics/dbt_utils
+    version: ">=0.2.5"
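Adding `dbt_utils` is what allows the package to drop the handwritten Postgres UDFs deleted below: cross-database date math now comes from macros rather than `plpgsql`. A sketch of the macro-based equivalent, with hypothetical column names:

```sql
-- renders warehouse-appropriate datediff SQL on Postgres, Redshift, etc.
select
    {{ dbt_utils.datediff('session_start', 'session_end', 'minute') }} as duration_minutes
from {{ ref('snowplow_sessions') }}
```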
diff --git a/pg_udfs.sql b/pg_udfs.sql
deleted file mode 100644
index 4a6eb75..0000000
--- a/pg_udfs.sql
+++ /dev/null
@@ -1,67 +0,0 @@
--- convert_timezone
-create function convert_timezone(
-    in_tzname text,
-    out_tzname text,
-    in_t timestamptz
-  ) returns timestamptz
-as $$
-declare
-begin
-  return in_t at time zone out_tzname at time zone in_tzname;
-end;
-$$ language plpgsql;
-
--- datediff
-create or replace function datediff(
-    units varchar(30),
-    start_t timestamp,
-    end_t timestamp) returns int
-as $$
-declare
-  diff_interval interval;
-  diff int = 0;
-  years_diff int = 0;
-begin
-  if units in ('yy', 'yyyy', 'year', 'mm', 'm', 'month') then
-    years_diff = date_part('year', end_t) - date_part('year', start_t);
-
-    if units in ('yy', 'yyyy', 'year') then
-      -- sql server does not count full years passed (only difference between year parts)
-      return years_diff;
-    else
-      -- if end month is less than start month it will be subtracted
-      return years_diff * 12 + (date_part('month', end_t) - date_part('month', start_t));
-    end if;
-  end if;
-
-  -- minus operator returns interval 'DDD days HH:MI:SS'
-  diff_interval = end_t - start_t;
-
-  diff = diff + date_part('day', diff_interval);
-
-  if units in ('wk', 'ww', 'week') then
-    diff = diff/7;
-    return diff;
-  end if;
-
-  if units in ('dd', 'd', 'day') then
-    return diff;
-  end if;
-
-  diff = diff * 24 + date_part('hour', diff_interval);
-
-  if units in ('hh', 'hour') then
-    return diff;
-  end if;
-
-  diff = diff * 60 + date_part('minute', diff_interval);
-
-  if units in ('mi', 'n', 'minute') then
-    return diff;
-  end if;
-
-  diff = diff * 60 + date_part('second', diff_interval);
-
-  return diff;
-end;
-$$ language plpgsql;
\ No newline at end of file