From 51ce3e10933ed2f2a7fddb1b8274faeb1e867854 Mon Sep 17 00:00:00 2001 From: Bob De Schutter Date: Fri, 19 Jun 2020 16:24:01 +0200 Subject: [PATCH 1/4] Fix persist docs functionality for bigquery to include descriptions for nested columns --- .../bigquery/dbt/adapters/bigquery/impl.py | 44 +++++++++-- .../dbt/include/bigquery/macros/catalog.sql | 7 +- .../models-bigquery-nested/schema.yml | 19 +++++ .../table_model_nested.sql | 8 ++ .../view_model_nested.sql | 8 ++ .../test_persist_docs.py | 79 ++++++++++++++++++- 6 files changed, 153 insertions(+), 12 deletions(-) create mode 100644 test/integration/060_persist_docs_tests/models-bigquery-nested/schema.yml create mode 100644 test/integration/060_persist_docs_tests/models-bigquery-nested/table_model_nested.sql create mode 100644 test/integration/060_persist_docs_tests/models-bigquery-nested/view_model_nested.sql diff --git a/plugins/bigquery/dbt/adapters/bigquery/impl.py b/plugins/bigquery/dbt/adapters/bigquery/impl.py index caafcb0fbfb..0b07eb728bf 100644 --- a/plugins/bigquery/dbt/adapters/bigquery/impl.py +++ b/plugins/bigquery/dbt/adapters/bigquery/impl.py @@ -564,6 +564,36 @@ def get_table_ref_from_relation(self, conn, relation): relation.identifier, conn) + def _update_column_dict(self, bq_column_dict, dbt_columns, parent=''): + """ + Helper function to recursively traverse the schema of a table in the + update_column_descriptions function below. + + bq_column_dict should be a dict as obtained by the to_api_repr() + function of a SchemaField object. + """ + if parent: + dotted_column_name = '{}.{}'.format(parent, bq_column_dict['name']) + else: + dotted_column_name = bq_column_dict['name'] + + if dotted_column_name in dbt_columns: + column_config = dbt_columns[dotted_column_name] + bq_column_dict['description'] = column_config.get('description') + + new_fields = [] + for child_col_dict in bq_column_dict.get('fields', list()): + new_child_column_dict = self._update_column_dict( + child_col_dict, + dbt_columns, + parent=dotted_column_name + ) + new_fields.append(new_child_column_dict) + + bq_column_dict['fields'] = new_fields + + return bq_column_dict + @available.parse_none def update_column_descriptions(self, relation, columns): if len(columns) == 0: @@ -574,13 +604,13 @@ def update_column_descriptions(self, relation, columns): table = conn.handle.get_table(table_ref) new_schema = [] - for column in table.schema: - if column.name in columns: - column_config = columns[column.name] - column_dict = column.to_api_repr() - column_dict['description'] = column_config.get('description') - column = SchemaField.from_api_repr(column_dict) - new_schema.append(column) + for bq_column in table.schema: + bq_column_dict = bq_column.to_api_repr() + new_bq_column_dict = self._update_column_dict( + bq_column_dict, + columns + ) + new_schema.append(SchemaField.from_api_repr(new_bq_column_dict)) new_table = google.cloud.bigquery.Table(table_ref, schema=new_schema) conn.handle.update_table(new_table, ['schema']) diff --git a/plugins/bigquery/dbt/include/bigquery/macros/catalog.sql b/plugins/bigquery/dbt/include/bigquery/macros/catalog.sql index ed64af88173..343a1ee04e3 100644 --- a/plugins/bigquery/dbt/include/bigquery/macros/catalog.sql +++ b/plugins/bigquery/dbt/include/bigquery/macros/catalog.sql @@ -83,7 +83,7 @@ -- use the "real" column name from the paths query below column_name as base_column_name, ordinal_position as column_index, - cast(null as string) as column_comment, + -- cast(null as string) as column_comment, is_partitioning_column, clustering_ordinal_position @@ -99,10 +99,11 @@ concat(table_catalog, '.', table_schema, '.', table_name) as relation_id, field_path as column_name, data_type as column_type, - column_name as base_column_name + column_name as base_column_name, + description as column_comment from {{ information_schema.replace(information_schema_view='COLUMN_FIELD_PATHS') }} - where data_type not like 'STRUCT%' + -- where data_type not like 'STRUCT%' ), diff --git a/test/integration/060_persist_docs_tests/models-bigquery-nested/schema.yml b/test/integration/060_persist_docs_tests/models-bigquery-nested/schema.yml new file mode 100644 index 00000000000..0311dcb1449 --- /dev/null +++ b/test/integration/060_persist_docs_tests/models-bigquery-nested/schema.yml @@ -0,0 +1,19 @@ +version: 2 + +models: + - name: table_model_nested + columns: + - name: level_1 + description: level_1 column description + - name: level_1.level_2 + description: level_2 column description + - name: level_1.level_2.level_3_a + description: level_3 column description + - name: view_model_nested + columns: + - name: level_1 + description: level_1 column description + - name: level_1.level_2 + description: level_2 column description + - name: level_1.level_2.level_3_a + description: level_3 column description \ No newline at end of file diff --git a/test/integration/060_persist_docs_tests/models-bigquery-nested/table_model_nested.sql b/test/integration/060_persist_docs_tests/models-bigquery-nested/table_model_nested.sql new file mode 100644 index 00000000000..c2936d4f186 --- /dev/null +++ b/test/integration/060_persist_docs_tests/models-bigquery-nested/table_model_nested.sql @@ -0,0 +1,8 @@ +{{ config(materialized='table') }} +SELECT + STRUCT( + STRUCT( + 1 AS level_3_a, + 2 AS level_3_b + ) AS level_2 + ) AS level_1 \ No newline at end of file diff --git a/test/integration/060_persist_docs_tests/models-bigquery-nested/view_model_nested.sql b/test/integration/060_persist_docs_tests/models-bigquery-nested/view_model_nested.sql new file mode 100644 index 00000000000..e3323ddf4e6 --- /dev/null +++ b/test/integration/060_persist_docs_tests/models-bigquery-nested/view_model_nested.sql @@ -0,0 +1,8 @@ +{{ config(materialized='view') }} +SELECT + STRUCT( + STRUCT( + 1 AS level_3_a, + 2 AS level_3_b + ) AS level_2 + ) AS level_1 \ No newline at end of file diff --git a/test/integration/060_persist_docs_tests/test_persist_docs.py b/test/integration/060_persist_docs_tests/test_persist_docs.py index c5a6cc1689e..44e651701b6 100644 --- a/test/integration/060_persist_docs_tests/test_persist_docs.py +++ b/test/integration/060_persist_docs_tests/test_persist_docs.py @@ -33,13 +33,15 @@ def _assert_has_table_comments(self, table_node): assert table_id_comment.startswith('id Column description') table_name_comment = table_node['columns']['name']['comment'] - assert table_name_comment.startswith('Some stuff here and then a call to') + assert table_name_comment.startswith( + 'Some stuff here and then a call to') self._assert_common_comments( table_comment, table_id_comment, table_name_comment ) - def _assert_has_view_comments(self, view_node, has_node_comments=True, has_column_comments=True): + def _assert_has_view_comments(self, view_node, has_node_comments=True, + has_column_comments=True): view_comment = view_node['metadata']['comment'] if has_node_comments: assert view_comment.startswith('View model description') @@ -156,3 +158,76 @@ def test_snowflake_persist_docs(self): @use_profile('bigquery') def test_bigquery_persist_docs(self): self.run_dbt() + + +class TestPersistDocsNested(BasePersistDocsTest): + @property + def project_config(self): + return { + 'config-version': 2, + 'models': { + 'test': { + '+persist_docs': { + "relation": True, + "columns": True, + }, + } + } + } + + @property + def models(self): + return 'models-bigquery-nested' + + @use_profile('bigquery') + def test_bigquery_persist_docs(self): + """ + run dbt and use the bigquery client from the adapter to check if the + colunmn descriptions are persisted on the test model table and view. + + Next, generate the catalog and check if the comments are also included. + """ + self.run_dbt() + + self.run_dbt(['docs', 'generate']) + with open('target/catalog.json') as fp: + catalog_data = json.load(fp) + assert 'nodes' in catalog_data + assert len(catalog_data['nodes']) == 2 # table and view model + + for node_id in ['table_model_nested', 'view_model_nested']: + # check the descriptions using the api + with self.adapter.connection_named('_test'): + client = self.adapter.connections \ + .get_thread_connection().handle + + table_id = "{}.{}.{}".format( + self.default_database, + self.unique_schema(), + node_id + ) + bq_schema = client.get_table(table_id).schema + + level_1_field = bq_schema[0] + assert level_1_field.description == \ + "level_1 column description" + + level_2_field = level_1_field.fields[0] + assert level_2_field.description == \ + "level_2 column description" + + level_3_field = level_2_field.fields[0] + assert level_3_field.description == \ + "level_3 column description" + + # check the descriptions in the catalog + node = catalog_data['nodes']['model.test.{}'.format(node_id)] + + level_1_column = node['columns']['level_1'] + assert level_1_column['comment'] == "level_1 column description" + + level_2_column = node['columns']['level_1.level_2'] + assert level_2_column['comment'] == "level_2 column description" + + level_3_column = node['columns']['level_1.level_2.level_3_a'] + assert level_3_column['comment'] == "level_3 column description" From 3a5db3c16d4bbcc39f4981902f89f49a082afef2 Mon Sep 17 00:00:00 2001 From: Bob De Schutter Date: Fri, 19 Jun 2020 16:37:53 +0200 Subject: [PATCH 2/4] Updated changelog --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b6e7ddcaec9..b0b08af69e0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,11 @@ - `dbt deps` now respects the `--project-dir` flag, so using `dbt deps --project-dir=/some/path` and then `dbt run --project-dir=/some/path` will properly find dependencies ([#2519](https://github.com/fishtown-analytics/dbt/issues/2519), [#2534](https://github.com/fishtown-analytics/dbt/pull/2534)) - `packages.yml` revision/version fields can be float-like again (`revision: '1.0'` is valid). ([#2518](https://github.com/fishtown-analytics/dbt/issues/2518), [#2535](https://github.com/fishtown-analytics/dbt/pull/2535)) - Parallel RPC requests no longer step on each others' arguments ([[#2484](https://github.com/fishtown-analytics/dbt/issues/2484), [#2554](https://github.com/fishtown-analytics/dbt/pull/2554)]) +- `persist_docs` now takes into account descriptions for nested columns in bigquery ([#2549](https://github.com/fishtown-analytics/dbt/issues/2549), [#2550](https://github.com/fishtown-analytics/dbt/pull/2550)) +Contributors: + - [@bodschut](https://github.com/bodschut) ([#2550](https://github.com/fishtown-analytics/dbt/pull/2550)) + ## dbt 0.17.0 (June 08, 2020) ### Fixes From e99f0d12b815ca8ace8bcac9a08386207256b162 Mon Sep 17 00:00:00 2001 From: Bob De Schutter Date: Fri, 19 Jun 2020 16:56:17 +0200 Subject: [PATCH 3/4] Remove commented out code Co-authored-by: Jacob Beck --- plugins/bigquery/dbt/include/bigquery/macros/catalog.sql | 2 -- 1 file changed, 2 deletions(-) diff --git a/plugins/bigquery/dbt/include/bigquery/macros/catalog.sql b/plugins/bigquery/dbt/include/bigquery/macros/catalog.sql index 343a1ee04e3..6822d88a6a8 100644 --- a/plugins/bigquery/dbt/include/bigquery/macros/catalog.sql +++ b/plugins/bigquery/dbt/include/bigquery/macros/catalog.sql @@ -83,7 +83,6 @@ -- use the "real" column name from the paths query below column_name as base_column_name, ordinal_position as column_index, - -- cast(null as string) as column_comment, is_partitioning_column, clustering_ordinal_position @@ -103,7 +102,6 @@ description as column_comment from {{ information_schema.replace(information_schema_view='COLUMN_FIELD_PATHS') }} - -- where data_type not like 'STRUCT%' ), From 0d2593e0e3a26dcb6c4eda9d88ab165bfdbbb324 Mon Sep 17 00:00:00 2001 From: Bob De Schutter Date: Fri, 19 Jun 2020 18:19:50 +0200 Subject: [PATCH 4/4] Fix docs_generate test --- .../bq_models/nested_table.sql | 4 ++-- .../029_docs_generate_tests/test_docs_generate.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/test/integration/029_docs_generate_tests/bq_models/nested_table.sql b/test/integration/029_docs_generate_tests/bq_models/nested_table.sql index 61309296267..22f9048d629 100644 --- a/test/integration/029_docs_generate_tests/bq_models/nested_table.sql +++ b/test/integration/029_docs_generate_tests/bq_models/nested_table.sql @@ -10,6 +10,6 @@ select 3 as field_3, struct( - 4 as field_4, - 5 as field_5 + 5 as field_5, + 6 as field_6 ) as nested_field diff --git a/test/integration/029_docs_generate_tests/test_docs_generate.py b/test/integration/029_docs_generate_tests/test_docs_generate.py index 18066105620..cb899806eac 100644 --- a/test/integration/029_docs_generate_tests/test_docs_generate.py +++ b/test/integration/029_docs_generate_tests/test_docs_generate.py @@ -619,10 +619,10 @@ def expected_bigquery_complex_catalog(self): 'type': 'INT64', 'comment': None }, - 'nested_field.field_4': { - 'name': 'nested_field.field_4', + 'nested_field': { + 'name': 'nested_field', 'index': 4, - 'type': 'INT64', + 'type': 'STRUCT', 'comment': None }, 'nested_field.field_5': { @@ -630,6 +630,12 @@ def expected_bigquery_complex_catalog(self): 'index': 5, 'type': 'INT64', 'comment': None + }, + 'nested_field.field_6': { + 'name': 'nested_field.field_6', + 'index': 6, + 'type': 'INT64', + 'comment': None } }