From 7aae3b96c9acef1ad297afed3d4362ecf2f52e2c Mon Sep 17 00:00:00 2001 From: Kacper Muda Date: Tue, 20 May 2025 15:14:40 +0200 Subject: [PATCH] fix: Duplicate region in Snowflake URI no longer breaks OpenLineage --- .../providers/snowflake/utils/openlineage.py | 19 +++++++++++++++---- .../unit/snowflake/utils/test_openlineage.py | 3 +++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/providers/snowflake/src/airflow/providers/snowflake/utils/openlineage.py b/providers/snowflake/src/airflow/providers/snowflake/utils/openlineage.py index bae4fa5bb0ab1..0e3acf5d2542a 100644 --- a/providers/snowflake/src/airflow/providers/snowflake/utils/openlineage.py +++ b/providers/snowflake/src/airflow/providers/snowflake/utils/openlineage.py @@ -52,7 +52,15 @@ def fix_account_name(name: str) -> str: account, region = spl cloud = "aws" else: - account, region, cloud = spl + # region can easily get duplicated without crashing snowflake, so we need to handle that as well + # eg. account_locator.europe-west3.gcp.europe-west3.gcp will be ok for snowflake + account, region, cloud, *rest = spl + rest = [x for x in rest if x not in (region, cloud)] + if rest: # Not sure what could be left here, but leaving this just in case + log.warning( + "Unexpected parts found in Snowflake uri hostname and will be ignored by OpenLineage: %s", + rest, + ) return f"{account}.{region}.{cloud}" # Check for existing accounts with cloud names @@ -72,13 +80,16 @@ def fix_snowflake_sqlalchemy_uri(uri: str) -> str: """ Fix snowflake sqlalchemy connection URI to OpenLineage structure. - Snowflake sqlalchemy connection URI has following structure: + Snowflake sqlalchemy connection URI has the following structure: 'snowflake://<user_login_name>:<password>@<account_identifier>/<database_name>/<schema_name>?warehouse=<warehouse_name>&role=<role_name>' We want account identifier normalized. It can have two forms: - - newer, in form of <organization>-<id>. In this case we want to do nothing. - - older, composed of <id>-<region>-<cloud> where region and cloud can be + - newer, in form of <organization_id>-<account_id>. In this case we want to do nothing. 
+ - older, composed of <account_locator>.<region>.<cloud> where region and cloud can be optional in some cases. If <cloud> is omitted, it's AWS. If region and cloud are omitted, it's AWS us-west-1 + + Current doc on Snowflake account identifiers: + https://docs.snowflake.com/en/user-guide/admin-account-identifier """ try: parts = urlparse(uri) diff --git a/providers/snowflake/tests/unit/snowflake/utils/test_openlineage.py b/providers/snowflake/tests/unit/snowflake/utils/test_openlineage.py index 2127948b3b969..bba8d317032c2 100644 --- a/providers/snowflake/tests/unit/snowflake/utils/test_openlineage.py +++ b/providers/snowflake/tests/unit/snowflake/utils/test_openlineage.py @@ -93,6 +93,9 @@ def test_snowflake_sqlite_account_urls(source, target): ("xy12345", "xy12345.us-west-1.aws"), # No '-' or '_' in name ("xy12345.us-west-1.aws", "xy12345.us-west-1.aws"), # Already complete locator ("xy12345.us-west-2.gcp", "xy12345.us-west-2.gcp"), # Already complete locator for GCP + ("xy12345.us-west-2.gcp.us-west-2.gcp", "xy12345.us-west-2.gcp"), # Duplicated region + ("xy12345.us-west-2.gcp.us-west-2.gcp.us-west-2.gcp", "xy12345.us-west-2.gcp"), # Triple region + ("xy12345.us-west-2.gcp.some_random_part", "xy12345.us-west-2.gcp"), # Suffix to locator, ignored ("xy12345aws", "xy12345aws.us-west-1.aws"), # AWS without '-' or '_' ("xy12345-aws", "xy12345-aws"), # AWS with '-' ("xy12345_gcp-europe-west1", "xy12345.europe-west1.gcp"), # GCP with '_'