From bac87f5a2c6fdc14c84557fcf8624248ed9d5467 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Fri, 30 Jun 2023 21:28:15 -0400 Subject: [PATCH 1/7] Contracts: Handle struct column specified both at root and nested levels + arrays of structs --- dbt/adapters/bigquery/column.py | 35 +++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/dbt/adapters/bigquery/column.py b/dbt/adapters/bigquery/column.py index cfa298b82..02a3e8182 100644 --- a/dbt/adapters/bigquery/column.py +++ b/dbt/adapters/bigquery/column.py @@ -5,6 +5,8 @@ from google.cloud.bigquery import SchemaField +_PARENT_DATA_TYPE_KEY = "__parent_data_type" + Self = TypeVar("Self", bound="BigQueryColumn") @@ -215,15 +217,29 @@ def _update_nested_column_data_types( if len(column_name_parts) == 1: # Base case: column is not nested - store its data_type concatenated with constraint if provided. - nested_column_data_types[root_column_name] = ( + column_data_type_and_constraints = ( column_data_type if column_rendered_constraint is None else f"{column_data_type} {column_rendered_constraint}" ) + if root_column_name not in nested_column_data_types: + nested_column_data_types[root_column_name] = column_data_type_and_constraints + else: + # entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY" + existing_nested_column_data_type = nested_column_data_types[root_column_name] + assert isinstance(existing_nested_column_data_type, dict) # keeping mypy happy + existing_nested_column_data_type[ + _PARENT_DATA_TYPE_KEY + ] = column_data_type_and_constraints else: # Initialize nested dictionary if root_column_name not in nested_column_data_types: nested_column_data_types[root_column_name] = {} + elif not isinstance(nested_column_data_types[root_column_name], dict): + # a parent specified its base type -- preserve its data_type and potential rendered constraints + # this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc + parent_data_type = nested_column_data_types[root_column_name] + nested_column_data_types[root_column_name] = {_PARENT_DATA_TYPE_KEY: parent_data_type} # Recursively process rest of remaining column name remaining_column_name = ".".join(column_name_parts[1:]) @@ -252,8 +268,23 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, if isinstance(unformatted_nested_data_type, str): return unformatted_nested_data_type else: + parent_data_type = unformatted_nested_data_type.pop(_PARENT_DATA_TYPE_KEY, None) + parent_constraints = None + if parent_data_type: + parent_data_type_flat = parent_data_type.split() + if len(parent_data_type_flat) > 1: + parent_data_type = parent_data_type_flat[0] + parent_constraints = " ".join(parent_data_type_flat[1:]) + formatted_nested_types = [ f"{column_name} {_format_nested_data_type(column_type)}" for column_name, column_type in unformatted_nested_data_type.items() ] - return f"""struct<{", ".join(formatted_nested_types)}>""" + + formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>""" + if parent_data_type and parent_data_type.lower() == "array": + formatted_nested_type = f"""array<{formatted_nested_type}>""" + if parent_constraints: + formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}""" + + return formatted_nested_type From 41725f6c1d0bcda997b39f333a357c0ce8b89c71 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Fri, 30 Jun 2023 21:31:21 -0400 Subject: [PATCH 2/7] changelog entry --- .changes/unreleased/Fixes-20230630-213112.yaml | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 .changes/unreleased/Fixes-20230630-213112.yaml diff --git a/.changes/unreleased/Fixes-20230630-213112.yaml b/.changes/unreleased/Fixes-20230630-213112.yaml new file mode 100644 index 000000000..7238c0cb1 --- /dev/null +++ b/.changes/unreleased/Fixes-20230630-213112.yaml @@ -0,0 +1,7 @@ +kind: Fixes +body: 'Contracts: Handle struct column specified both at root and nested levels + + arrays of structs' +time: 2023-06-30T21:31:12.63257-04:00 +custom: + Author: michelleark + Issue: 781 782 From 51654f4e7d7f0586d233bb2d1f1f05b17b1f409c Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Fri, 30 Jun 2023 21:47:15 -0400 Subject: [PATCH 3/7] add parent test cases to test_get_nested_column_data_types --- tests/unit/test_column.py | 71 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/tests/unit/test_column.py b/tests/unit/test_column.py index 8c3c645bb..821a711df 100644 --- a/tests/unit/test_column.py +++ b/tests/unit/test_column.py @@ -44,6 +44,51 @@ {"b.nested": "not null"}, {"b": {"name": "b", "data_type": "struct"}}, ), + # Single nested column, 1 level - with corresponding parent column + ( + { + "b": {"name": "b", "data_type": "struct"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with corresponding parent column specified last + ( + { + "b.nested": {"name": "b.nested", "data_type": "string"}, + "b": {"name": "b", "data_type": "struct"}, + }, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), + # Single nested column, 1 level - with corresponding parent column + parent constraint + ( + { + "b": {"name": "b", "data_type": "struct"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + {"b": "not null"}, + {"b": {"name": "b", "data_type": "struct not null"}}, + ), + # Single nested column, 1 level - with corresponding parent column as array + ( + { + "b": {"name": "b", "data_type": "array"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + None, + {"b": {"name": "b", "data_type": "array>"}}, + ), + # Single nested column, 1 level - with corresponding parent column as array + constraint + ( + { + "b": {"name": "b", "data_type": "array"}, + "b.nested": {"name": "b.nested", "data_type": "string"}, + }, + {"b": "not null"}, + {"b": {"name": "b", "data_type": "array> not null"}}, + ), # Multiple nested columns, 1 level ( { @@ -128,6 +173,32 @@ }, }, ), + # Nested columns, multiple levels - with parent arrays and constraints! + ( + { + "b.user.names": { + "name": "b.user.names", + "data_type": "array", + }, + "b.user.names.first": { + "name": "b.user.names.first", + "data_type": "string", + }, + "b.user.names.last": { + "name": "b.user.names.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country", "data_type": "string"}, + }, + {"b.user.names.first": "not null", "b.user.id": "unique"}, + { + "b": { + "name": "b", + "data_type": "struct>, id int64 unique, country string>>", + }, + }, + ), ], ) def test_get_nested_column_data_types(columns, constraints, expected_nested_columns): From b76108943c7925b3ccd967dfb8872068542a41ed Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Wed, 5 Jul 2023 17:27:11 -0400 Subject: [PATCH 4/7] refactoring feedback --- dbt/adapters/bigquery/column.py | 46 ++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/dbt/adapters/bigquery/column.py b/dbt/adapters/bigquery/column.py index 02a3e8182..2b8f9a1e9 100644 --- a/dbt/adapters/bigquery/column.py +++ b/dbt/adapters/bigquery/column.py @@ -222,24 +222,29 @@ def _update_nested_column_data_types( if column_rendered_constraint is None else f"{column_data_type} {column_rendered_constraint}" ) - if root_column_name not in nested_column_data_types: - nested_column_data_types[root_column_name] = column_data_type_and_constraints - else: - # entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY" - existing_nested_column_data_type = nested_column_data_types[root_column_name] + + if existing_nested_column_data_type := nested_column_data_types.get(root_column_name): assert isinstance(existing_nested_column_data_type, dict) # keeping mypy happy - existing_nested_column_data_type[ - _PARENT_DATA_TYPE_KEY - ] = column_data_type_and_constraints + # entry could already exist if this is a parent column -- preserve the parent data type under "_PARENT_DATA_TYPE_KEY" + existing_nested_column_data_type.update( + {_PARENT_DATA_TYPE_KEY: column_data_type_and_constraints} + ) + else: + nested_column_data_types.update({root_column_name: column_data_type_and_constraints}) else: - # Initialize nested dictionary - if root_column_name not in nested_column_data_types: - nested_column_data_types[root_column_name] = {} - elif not isinstance(nested_column_data_types[root_column_name], dict): + parent_data_type = nested_column_data_types.get(root_column_name) + if isinstance(parent_data_type, dict): + # nested dictionary already initialized + pass + elif parent_data_type is None: + # initialize nested dictionary + nested_column_data_types.update({root_column_name: {}}) + else: # a parent specified its base type -- preserve its data_type and potential rendered constraints # this is used to specify a top-level 'struct' or 'array' field with its own description, constraints, etc - parent_data_type = nested_column_data_types[root_column_name] - nested_column_data_types[root_column_name] = {_PARENT_DATA_TYPE_KEY: parent_data_type} + nested_column_data_types.update( + {root_column_name: {_PARENT_DATA_TYPE_KEY: parent_data_type}} + ) # Recursively process rest of remaining column name remaining_column_name = ".".join(column_name_parts[1:]) @@ -268,13 +273,9 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, if isinstance(unformatted_nested_data_type, str): return unformatted_nested_data_type else: - parent_data_type = unformatted_nested_data_type.pop(_PARENT_DATA_TYPE_KEY, None) - parent_constraints = None - if parent_data_type: - parent_data_type_flat = parent_data_type.split() - if len(parent_data_type_flat) > 1: - parent_data_type = parent_data_type_flat[0] - parent_constraints = " ".join(parent_data_type_flat[1:]) + parent_data_type, *parent_constraints = unformatted_nested_data_type.pop( + _PARENT_DATA_TYPE_KEY, "" + ).split() or [None] formatted_nested_types = [ f"{column_name} {_format_nested_data_type(column_type)}" @@ -282,9 +283,12 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, ] formatted_nested_type = f"""struct<{", ".join(formatted_nested_types)}>""" + if parent_data_type and parent_data_type.lower() == "array": formatted_nested_type = f"""array<{formatted_nested_type}>""" + if parent_constraints: + parent_constraints = " ".join(parent_constraints) formatted_nested_type = f"""{formatted_nested_type} {parent_constraints}""" return formatted_nested_type From d45b973306556bb5da63de700b9eb4ce70ca7412 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Mon, 10 Jul 2023 08:43:07 -0700 Subject: [PATCH 5/7] handle missing data_type --- dbt/adapters/bigquery/column.py | 32 ++++++++++++++++---------- dbt/adapters/bigquery/impl.py | 2 +- tests/unit/test_column.py | 40 +++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 13 deletions(-) diff --git a/dbt/adapters/bigquery/column.py b/dbt/adapters/bigquery/column.py index 2b8f9a1e9..a5a60cfc0 100644 --- a/dbt/adapters/bigquery/column.py +++ b/dbt/adapters/bigquery/column.py @@ -133,7 +133,7 @@ def column_to_bq_schema(self) -> SchemaField: def get_nested_column_data_types( columns: Dict[str, Dict[str, Any]], constraints: Optional[Dict[str, str]] = None, -) -> Dict[str, Dict[str, str]]: +) -> Dict[str, Dict[str, Optional[str]]]: """ columns: * Dictionary where keys are of flat columns names and values are dictionary of column attributes @@ -161,16 +161,16 @@ def get_nested_column_data_types( """ constraints = constraints or {} - nested_column_data_types: Dict[str, Union[str, Dict]] = {} + nested_column_data_types: Dict[str, Optional[Union[str, Dict]]] = {} for column in columns.values(): _update_nested_column_data_types( column["name"], - column["data_type"], + column.get("data_type"), constraints.get(column["name"]), nested_column_data_types, ) - formatted_nested_column_data_types: Dict[str, Dict[str, str]] = {} + formatted_nested_column_data_types: Dict[str, Dict[str, Optional[str]]] = {} for column_name, unformatted_column_type in nested_column_data_types.items(): formatted_nested_column_data_types[column_name] = { "name": column_name, @@ -193,9 +193,9 @@ def get_nested_column_data_types( def _update_nested_column_data_types( column_name: str, - column_data_type: str, + column_data_type: Optional[str], column_rendered_constraint: Optional[str], - nested_column_data_types: Dict[str, Union[str, Dict]], + nested_column_data_types: Dict[str, Optional[Union[str, Dict]]], ) -> None: """ Recursively update nested_column_data_types given a column_name, column_data_type, and optional column_rendered_constraint. @@ -218,9 +218,13 @@ def _update_nested_column_data_types( if len(column_name_parts) == 1: # Base case: column is not nested - store its data_type concatenated with constraint if provided. column_data_type_and_constraints = ( - column_data_type - if column_rendered_constraint is None - else f"{column_data_type} {column_rendered_constraint}" + ( + column_data_type + if column_rendered_constraint is None + else f"{column_data_type} {column_rendered_constraint}" + ) + if column_data_type + else None ) if existing_nested_column_data_type := nested_column_data_types.get(root_column_name): @@ -258,7 +262,9 @@ def _update_nested_column_data_types( ) -def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, Any]]) -> str: +def _format_nested_data_type( + unformatted_nested_data_type: Optional[Union[str, Dict[str, Any]]] +) -> Optional[str]: """ Recursively format a (STRUCT) data type given an arbitrarily nested data type structure. @@ -270,7 +276,9 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, >>> BigQueryAdapter._format_nested_data_type({'c': 'string not_null', 'd': {'e': 'string'}}) 'struct>' """ - if isinstance(unformatted_nested_data_type, str): + if unformatted_nested_data_type is None: + return None + elif isinstance(unformatted_nested_data_type, str): return unformatted_nested_data_type else: parent_data_type, *parent_constraints = unformatted_nested_data_type.pop( @@ -278,7 +286,7 @@ def _format_nested_data_type(unformatted_nested_data_type: Union[str, Dict[str, ).split() or [None] formatted_nested_types = [ - f"{column_name} {_format_nested_data_type(column_type)}" + f"{column_name} {_format_nested_data_type(column_type) or ''}".strip() for column_name, column_type in unformatted_nested_data_type.items() ] diff --git a/dbt/adapters/bigquery/impl.py b/dbt/adapters/bigquery/impl.py index 5c965ca7c..353be08d8 100644 --- a/dbt/adapters/bigquery/impl.py +++ b/dbt/adapters/bigquery/impl.py @@ -300,7 +300,7 @@ def nest_column_data_types( cls, columns: Dict[str, Dict[str, Any]], constraints: Optional[Dict[str, str]] = None, - ) -> Dict[str, Dict[str, str]]: + ) -> Dict[str, Dict[str, Optional[str]]]: return get_nested_column_data_types(columns, constraints) def get_columns_in_relation(self, relation: BigQueryRelation) -> List[BigQueryColumn]: diff --git a/tests/unit/test_column.py b/tests/unit/test_column.py index 821a711df..10f30594e 100644 --- a/tests/unit/test_column.py +++ b/tests/unit/test_column.py @@ -14,6 +14,12 @@ None, {"a": {"name": "a", "data_type": "string"}}, ), + # Flat column - missing data_type + ( + {"a": {"name": "a"}}, + None, + {"a": {"name": "a", "data_type": None}}, + ), # Flat column - with constraints ( {"a": {"name": "a", "data_type": "string"}}, @@ -32,12 +38,24 @@ None, {"b": {"name": "b", "data_type": "struct"}}, ), + # Single nested column, 1 level - missing data_type + ( + {"b.nested": {"name": "b.nested"}}, + None, + {"b": {"name": "b", "data_type": "struct"}}, + ), # Single nested column, 1 level - with constraints ( {"b.nested": {"name": "b.nested", "data_type": "string"}}, {"b.nested": "not null"}, {"b": {"name": "b", "data_type": "struct"}}, ), + # Single nested column, 1 level - with constraints, missing data_type (constraints not valid without data_type) + ( + {"b.nested": {"name": "b.nested"}}, + {"b.nested": "not null"}, + {"b": {"name": "b", "data_type": "struct"}}, + ), # Single nested column, 1 level - with constraints + other keys ( {"b.nested": {"name": "b.nested", "data_type": "string", "other": "unpreserved"}}, @@ -151,6 +169,28 @@ }, }, ), + # Nested columns, multiple levels - missing data_type + ( + { + "b.user.name.first": { + "name": "b.user.name.first", + "data_type": "string", + }, + "b.user.name.last": { + "name": "b.user.name.last", + "data_type": "string", + }, + "b.user.id": {"name": "b.user.id", "data_type": "int64"}, + "b.user.country": {"name": "b.user.country"}, # missing data_type + }, + None, + { + "b": { + "name": "b", + "data_type": "struct, id int64, country>>", + }, + }, + ), # Nested columns, multiple levels - with constraints! ( { From 2ab4fef282d53d9fec04d05cbe8554cc7fe8ea51 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Mon, 10 Jul 2023 10:18:44 -0700 Subject: [PATCH 6/7] exceptions.column_type_missing when data_type missing in bigquery__get_empty_schema_sql on flattened columns representation --- .../bigquery/macros/utils/get_columns_spec_ddl.sql | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql b/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql index a1247dcd6..1a4193c71 100644 --- a/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql +++ b/dbt/include/bigquery/macros/utils/get_columns_spec_ddl.sql @@ -5,6 +5,16 @@ {%- endmacro -%} {% macro bigquery__get_empty_schema_sql(columns) %} + {%- set col_err = [] -%} + {% for col in columns.values() %} + {%- if col['data_type'] is not defined -%} + {{ col_err.append(col['name']) }} + {%- endif -%} + {%- endfor -%} + {%- if (col_err | length) > 0 -%} + {{ exceptions.column_type_missing(column_names=col_err) }} + {%- endif -%} + {%- set columns = adapter.nest_column_data_types(columns) -%} {{ return(dbt.default__get_empty_schema_sql(columns)) }} {% endmacro %} From f05c78421444b39554f4ff88e44ad65b32c15636 Mon Sep 17 00:00:00 2001 From: Michelle Ark Date: Mon, 10 Jul 2023 12:41:57 -0700 Subject: [PATCH 7/7] linting --- .github/pull_request_template.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index f3fe5ac83..a3c340cc3 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,5 +1,5 @@ -resolves # -[docs](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) dbt-labs/docs.getdbt.com/# +resolves # +[docs](https://github.com/dbt-labs/docs.getdbt.com/issues/new/choose) dbt-labs/docs.getdbt.com/#