From 8964814c2d7ff4163c7a87c33b901ddf4053bf8d Mon Sep 17 00:00:00 2001
From: markzegarelli
Date: Fri, 1 Nov 2024 10:18:17 -0700
Subject: [PATCH] DOC-324 add examples

---
 .../source-catalog/en/snowflake.md | 242 +++++++++++++++++-
 1 file changed, 233 insertions(+), 9 deletions(-)

diff --git a/content/collections/source-catalog/en/snowflake.md b/content/collections/source-catalog/en/snowflake.md
index 25e765155..d799c5335 100644
--- a/content/collections/source-catalog/en/snowflake.md
+++ b/content/collections/source-catalog/en/snowflake.md
@@ -229,7 +229,7 @@ To use a group property:

## SQL query examples

-To make the data selection step a bit easier, here are few example SQL snippets to get you started.
+To make the data selection step easier, here are a few example SQL snippets to get you started.

### Event data example

@@ -263,18 +263,242 @@ FROM DATABASE_NAME.SCHEMA_NAME.TABLE_OR_VIEW_NAME

### Common snippets

-Creating a JSON Object:
+Create a JSON object:

-`OBJECT_CONSTRUCT('city', CITY, 'state', STATE) as "user_properties"`

```sql
OBJECT_CONSTRUCT('city', CITY, 'state', STATE) as "user_properties"
```

Convert a timestamp column to milliseconds:

```sql
DATE_PART('EPOCH_MILLISECOND', TIMESTAMP_COLUMN) as "time"
```

Convert milliseconds to the `TIMESTAMP_NTZ` format needed for time-based import. This example sets the `scale` argument to `3`, which tells Snowflake to interpret the input value as milliseconds. See the [Snowflake documentation](https://docs.snowflake.com/en/sql-reference/functions/to_timestamp.html) for more details.

```sql
TO_TIMESTAMP_NTZ(TIME_COLUMN_IN_MILLIS, 3) as "update_time_column"
```

Convert a timestamp column with a timezone to the `TIMESTAMP_NTZ` format needed for time-based import.

```sql
TO_TIMESTAMP_NTZ(CONVERT_TIMEZONE('UTC', TIMESTAMP_TZ_COLUMN)) as "update_time_column"
```
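If your source can deliver the same row more than once, it can also help to emit a deterministic `insert_id`, which Amplitude can use to deduplicate repeated events. This is a sketch that hashes the same placeholder columns used above:

```sql
-- Hash stable identifying columns so retried loads produce the same insert_id
SHA1(CONCAT(user_id, event_type, TO_VARCHAR(event_timestamp))) as "insert_id"
```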
## SQL troubleshooting

The following sections provide example SQL queries you can use to configure your import connectors.

### Required event properties

The Snowflake SQL queries you write for Amplitude's data warehouse import connectors must return specific columns that match Amplitude's Event API schema. Use the following examples to help structure your query.

{{partial:tabs tabs="Basic template, Complete template"}}
{{partial:tab name="Basic template"}}
```sql
SELECT
    event_type,  -- String: Name of the event
    user_id,     -- String: Unique identifier for the user
    EXTRACT(EPOCH_MILLISECOND FROM event_timestamp) as time  -- Timestamp in milliseconds
FROM your_events_table
```
{{/partial:tab}}
{{partial:tab name="Complete template"}}
```sql
SELECT
    event_name as event_type,
    user_identifier as user_id,
    EXTRACT(EPOCH_MILLISECOND FROM event_timestamp) as time,
    device_id,
    session_id,

    -- Event Properties as JSON object using OBJECT_CONSTRUCT
    OBJECT_CONSTRUCT(
        'property1', property1_value,
        'property2', property2_value,
        'category', category,
        'value', amount
    ) as event_properties,
    -- [tl! collapse:start ]
    -- User Properties as JSON object
    OBJECT_CONSTRUCT(
        'user_type', user_type,
        'subscription_status', subscription_status,
        'city', data:address:city::string,
        'last_updated', TO_VARCHAR(last_updated)
    ) as user_properties,

    app_version,
    platform,
    os_name,
    os_version,
    device_brand,
    device_manufacturer,
    device_model,
    carrier,
    country,
    region,
    city,
    dma,
    language,
    price::FLOAT as price,
    quantity::INTEGER as quantity,
    revenue::FLOAT as revenue,
    product_id as productId,
    revenue_type as revenueType,
    location_lat::FLOAT as location_lat,
    location_lng::FLOAT as location_lng,
    ip
    -- [tl! collapse:end ]
FROM your_events_table
WHERE event_timestamp >= DATEADD(day, -7, CURRENT_DATE())
```
{{/partial:tab}}
{{/partial:tabs}}

### Basic event query with properties

```sql
SELECT
    event_name as event_type,
    user_id,
    EXTRACT(EPOCH_MILLISECOND FROM event_timestamp) as time,
    device_id,
    -- Construct event properties from multiple columns
    OBJECT_CONSTRUCT(
        'page_name', page_name,
        'button_id', button_id,
        'interaction_type', interaction_type,
        'duration_ms', duration_ms
    ) as event_properties,
    -- Construct user properties
    OBJECT_CONSTRUCT(
        'account_type', account_type,
        'subscription_tier', subscription_tier,
        'last_login', TO_VARCHAR(last_login_date)
    ) as user_properties,
    platform,
    app_version
FROM app_events
WHERE event_timestamp >= DATEADD(day, -7, CURRENT_DATE())
```
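### Deduplicated event query

If the source table can hold duplicate rows, Snowflake's `QUALIFY` clause lets you filter on a window function without wrapping the query in a subquery. This is a sketch, assuming a hypothetical `ingested_at` column that records load time:

```sql
SELECT
    event_name as event_type,
    user_id,
    EXTRACT(EPOCH_MILLISECOND FROM event_timestamp) as time,
    device_id
FROM app_events
WHERE event_timestamp >= DATEADD(day, -7, CURRENT_DATE())
-- Keep only the most recently loaded copy of each logical event
QUALIFY ROW_NUMBER() OVER (
    PARTITION BY user_id, event_name, event_timestamp
    ORDER BY ingested_at DESC
) = 1
```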
### Snowflake-specific features and best practices

The following are examples of Snowflake-specific features and best practices.

#### Working with JSON

-Converting timestamp column to milliseconds:

-`DATE_PART('EPOCH_MILLISECOND', TIMESTAMP_COLUMN) as "time"`

-Converting milliseconds to TIMESTAMP_NTZ format needed for time-based import. This example uses the `scale` argument set to `3` to convert to milliseconds. See the [Snowflake documentation](https://docs.snowflake.com/en/sql-reference/functions/to_timestamp.html) for more details.

```sql
-- Combining multiple JSON objects
SELECT
    event_type,
    user_id,
    EXTRACT(EPOCH_MILLISECOND FROM event_timestamp) as time,
    OBJECT_CONSTRUCT(
        'base_properties', base_properties,  -- existing JSON column
        'additional_data', OBJECT_CONSTRUCT(
            'new_field1', value1,
            'new_field2', value2
        )
    ) as event_properties
FROM events;

-- Parsing JSON fields
SELECT
    event_type,
    user_id,
    time,
    PARSE_JSON(raw_properties):field_name::string as extracted_value
FROM events;
```

#### Handling timestamps

```sql
-- Converting different timestamp formats
SELECT
    event_type,
    user_id,
    CASE
        WHEN TRY_TO_TIMESTAMP(timestamp_string) IS NOT NULL
            THEN EXTRACT(EPOCH_MILLISECOND FROM TRY_TO_TIMESTAMP(timestamp_string))
        WHEN TRY_TO_TIMESTAMP_NTZ(timestamp_string) IS NOT NULL
            THEN EXTRACT(EPOCH_MILLISECOND FROM TRY_TO_TIMESTAMP_NTZ(timestamp_string))
        ELSE NULL
    END as time
FROM events
```

#### Data validation queries

```sql
-- Validate required fields
SELECT COUNT(*)
FROM (
    YOUR_QUERY_HERE
) t
WHERE event_type IS NULL
    OR user_id IS NULL
    OR time IS NULL;

-- Validate JSON structure
-- (TRY_CAST can't target OBJECT; IS_OBJECT over TO_VARIANT works for OBJECT and VARIANT columns)
SELECT COUNT(*)
FROM (
    YOUR_QUERY_HERE
) t
WHERE NOT (
    IS_OBJECT(TO_VARIANT(event_properties))
    AND IS_OBJECT(TO_VARIANT(user_properties))
);

-- Validate timestamp range
SELECT
    MIN(time) as min_time,
    MAX(time) as max_time,
    TIMEADD(millisecond, MIN(time), '1970-01-01'::timestamp) as min_readable_time,
    TIMEADD(millisecond, MAX(time), '1970-01-01'::timestamp) as max_readable_time
FROM (
    YOUR_QUERY_HERE
) t;
```

## Performance optimization tips

Use the following examples to help optimize the performance of your integration.

-`TO_TIMESTAMP_NTZ(TIME_COLUMN_IN_MILLIS, 3) as "update_time_column"`

### Use clustering keys

-Converting a timestamp column with a timezone to TIMESTAMP_NTZ format needed for time-based import.

Use appropriate clustering keys on your source tables.

-`TO_TIMESTAMP_NTZ(CONVERT_TIMEZONE('UTC', TIMESTAMP_TZ_COLUMN)) as "update_time_column"`
\ No newline at end of file

```sql
ALTER TABLE your_events_table CLUSTER BY (event_timestamp, user_id);
```

### Use materialized views

Use materialized views for complex transformations.

```sql
CREATE MATERIALIZED VIEW amplitude_ready_events AS
SELECT
    -- Your transformed columns here
FROM source_events;
```

### Date partitioning in WHERE clauses

Filter on the timestamp column so Snowflake scans only the date range you need:

```sql
WHERE event_timestamp >= DATEADD(day, -7, CURRENT_DATE())
    AND event_timestamp < CURRENT_DATE()
```

### Micro-partitions

Date-range predicates let Snowflake prune micro-partitions that can't contain matching rows:

```sql
SELECT ...
FROM your_table
WHERE TO_DATE(event_timestamp) BETWEEN '2024-01-01' AND '2024-01-31'
```
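### Check clustering quality

To confirm that a clustering key is paying off, ask Snowflake how well the table is clustered on those columns. This sketch reuses the placeholder table and columns from the clustering example above:

```sql
-- Returns a JSON summary of clustering depth and overlap for the given columns
SELECT SYSTEM$CLUSTERING_INFORMATION('your_events_table', '(event_timestamp, user_id)');
```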