From 318bd2fa67687c7d2d69c8852ba73c00dbca8333 Mon Sep 17 00:00:00 2001 From: ali-tny Date: Fri, 23 Jul 2021 10:27:10 +0100 Subject: [PATCH 1/3] Add test for failing get_columns_in_relation Specifically, it fails when it's called on a model that was created in the same run. At the start of the run, the cache is populated, setting Relation.information to be the string output of a DESCRIBE EXTENDED query, which allows columns and metadata to be parsed. However, when models are created, a Relation with information=None is saved in the cache (since columns and metadata aren't returned from a CREATE TABLE / VIEW statement). This means that an `expected string or bytes-like object` error is raised when attempting to regex-parse None. --- .../get_columns_in_relation/models/child.sql | 1 + .../models/get_columns_from_child.sql | 6 +++++ .../test_get_columns_in_relation.py | 27 +++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 test/custom/get_columns_in_relation/models/child.sql create mode 100644 test/custom/get_columns_in_relation/models/get_columns_from_child.sql create mode 100644 test/custom/get_columns_in_relation/test_get_columns_in_relation.py diff --git a/test/custom/get_columns_in_relation/models/child.sql b/test/custom/get_columns_in_relation/models/child.sql new file mode 100644 index 000000000..2e3761f7a --- /dev/null +++ b/test/custom/get_columns_in_relation/models/child.sql @@ -0,0 +1 @@ +SELECT 1 diff --git a/test/custom/get_columns_in_relation/models/get_columns_from_child.sql b/test/custom/get_columns_in_relation/models/get_columns_from_child.sql new file mode 100644 index 000000000..5118ae034 --- /dev/null +++ b/test/custom/get_columns_in_relation/models/get_columns_from_child.sql @@ -0,0 +1,6 @@ +SELECT + {% set cols = adapter.get_columns_in_relation(ref('child')) %} + {% for col in cols %} + {{ adapter.quote(col.column) }}{%- if not loop.last %},{{ '\n ' }}{% endif %} + {% endfor %} +FROM {{ ref('child') }} diff --git a/test/custom/get_columns_in_relation/test_get_columns_in_relation.py b/test/custom/get_columns_in_relation/test_get_columns_in_relation.py new file mode 100644 index 000000000..e2c1d7a48 --- /dev/null +++ b/test/custom/get_columns_in_relation/test_get_columns_in_relation.py @@ -0,0 +1,27 @@ +from test.custom.base import DBTSparkIntegrationTest, use_profile + + +class TestGetColumnInRelationInSameRun(DBTSparkIntegrationTest): + @property + def schema(self): + return "get_columns_in_relation" + + @property + def models(self): + return "models" + + def run_and_test(self): + self.run_dbt(["run"]) + self.assertTablesEqual("child", "get_columns_from_child") + + @use_profile("apache_spark") + def test_get_columns_in_relation_in_same_run_apache_spark(self): + self.run_and_test() + + @use_profile("databricks_cluster") + def test_get_columns_in_relation_in_same_run_databricks_cluster(self): + self.run_and_test() + + @use_profile("databricks_sql_endpoint") + def test_get_columns_in_relation_in_same_run_databricks_sql_endpoint(self): + self.run_and_test() From 4f12570219f8159a5b1e5751418a90c7662c6f42 Mon Sep 17 00:00:00 2001 From: ali-tny Date: Fri, 23 Jul 2021 11:19:20 +0100 Subject: [PATCH 2/3] Only parse cols from cache if there's information If the `information` attribute is not yet set, we fall back on the non-cached version to find column information. We could _also_ cache the output of that query, but given that it wasn't cached originally, I leave it as it is. --- dbt/adapters/spark/impl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dbt/adapters/spark/impl.py b/dbt/adapters/spark/impl.py index 9f4ae514c..f8e72449a 100644 --- a/dbt/adapters/spark/impl.py +++ b/dbt/adapters/spark/impl.py @@ -212,7 +212,7 @@ def get_columns_in_relation(self, relation: Relation) -> List[SparkColumn]: for cached_relation in cached_relations if str(cached_relation) == str(relation)), None) - if cached_relation is None: + if cached_relation is None or cached_relation.information is None: rows: List[agate.Row] = super().get_columns_in_relation(relation) columns = self.parse_describe_extended(relation, rows) else: From 183557e78e099d7d88c0a5c09984ccf1201d8982 Mon Sep 17 00:00:00 2001 From: ali-tny Date: Fri, 23 Jul 2021 11:54:03 +0100 Subject: [PATCH 3/3] Add get_columns_in_relation fix to CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0b8344f8..0f8705ad5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ ### Fixes - Fix column-level `persist_docs` on Delta tables, add tests ([#180](https://github.com/fishtown-analytics/dbt-spark/pull/180)) +- Fix `get_columns_in_relation` when called on models created in the same run ([#197](https://github.com/dbt-labs/dbt-spark/pull/197)) ## dbt-spark 0.20.0rc1 (June 8, 2021)