Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix issue parsing structs #204

Merged
merged 4 commits into from
Aug 23, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,20 @@

### Fixes
- Enhanced get_columns_in_relation method to handle a bug in open source deltalake which doesn't return schema details in `show table extended in databasename like '*'` query output. This impacts dbt snapshots if file format is open source deltalake ([#207](https://github.com/dbt-labs/dbt-spark/pull/207))
- Properly parse columns when there are struct fields, so that inner struct fields are not treated as top-level columns: Issue ([#202](https://github.com/dbt-labs/dbt-spark/issues/202))

### Contributors
- [@harryharanb](https://github.com/harryharanb) ([#207](https://github.com/dbt-labs/dbt-spark/pull/207))
- [@SCouto](https://github.com/SCouto) ([#204](https://github.com/dbt-labs/dbt-spark/pull/204))

## dbt-spark 0.21.0b2 (August 20, 2021)

### Fixes
- Add pyodbc import error message to dbt.exceptions.RuntimeException to get more detailed information when running `dbt debug` ([#192](https://github.com/dbt-labs/dbt-spark/pull/192))
- Add support for ODBC Server Side Parameters, allowing options that need to be set with the `SET` statement to be used ([#201](https://github.com/dbt-labs/dbt-spark/pull/201))
- Add `retry_all` configuration setting to retry all connection issues, not just when the `_is_retryable_error` function determines ([#194](https://github.com/dbt-labs/dbt-spark/pull/194))

### Contributors
- [@harryharanb](https://github.com/harryharanb) ([#207](https://github.com/dbt-labs/dbt-spark/pull/207))
- [@JCZuurmond](https://github.com/JCZuurmond) ([#192](https://github.com/fishtown-analytics/dbt-spark/pull/192))
- [@jethron](https://github.com/jethron) ([#201](https://github.com/fishtown-analytics/dbt-spark/pull/201))
- [@gregingenii](https://github.com/gregingenii) ([#194](https://github.com/dbt-labs/dbt-spark/pull/194))
Expand All @@ -24,6 +32,7 @@
### Contributors
- [@ali-tny](https://github.com/ali-tny) ([#197](https://github.com/fishtown-analytics/dbt-spark/pull/197))


## dbt-spark 0.20.0 (July 12, 2021)

## dbt-spark 0.20.0rc2 (July 7, 2021)
Expand Down
2 changes: 1 addition & 1 deletion dbt/adapters/spark/impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class SparkAdapter(SQLAdapter):
'stats:rows:include',
)
INFORMATION_COLUMNS_REGEX = re.compile(
r"\|-- (.*): (.*) \(nullable = (.*)\b", re.MULTILINE)
r"^ \|-- (.*): (.*) \(nullable = (.*)\b", re.MULTILINE)
INFORMATION_OWNER_REGEX = re.compile(r"^Owner: (.*)$", re.MULTILINE)
INFORMATION_STATISTICS_REGEX = re.compile(
r"^Statistics: (.*)$", re.MULTILINE)
Expand Down
86 changes: 82 additions & 4 deletions test/unit/test_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,7 @@ def test_parse_relation(self):
('col1', 'decimal(22,0)'),
('col2', 'string',),
('dt', 'date'),
('struct_col', 'struct<struct_inner_col:string>'),
('# Partition Information', 'data_type'),
('# col_name', 'data_type'),
('dt', 'date'),
Expand All @@ -299,7 +300,7 @@ def test_parse_relation(self):
config = self._get_target_http(self.project_cfg)
rows = SparkAdapter(config).parse_describe_extended(
relation, input_cols)
self.assertEqual(len(rows), 3)
self.assertEqual(len(rows), 4)
self.assertEqual(rows[0].to_column_dict(omit_none=False), {
'table_database': None,
'table_schema': relation.schema,
Expand Down Expand Up @@ -342,6 +343,20 @@ def test_parse_relation(self):
'char_size': None
})

self.assertEqual(rows[3].to_column_dict(omit_none=False), {
'table_database': None,
'table_schema': relation.schema,
'table_name': relation.name,
'table_type': rel_type,
'table_owner': 'root',
'column': 'struct_col',
'column_index': 3,
'dtype': 'struct<struct_inner_col:string>',
'numeric_scale': None,
'numeric_precision': None,
'char_size': None
})

def test_parse_relation_with_integer_owner(self):
self.maxDiff = None
rel_type = SparkRelation.get_relation_type.Table
Expand Down Expand Up @@ -507,6 +522,8 @@ def test_parse_columns_from_information_with_table_type_and_delta_provider(self)
" |-- col1: decimal(22,0) (nullable = true)\n"
" |-- col2: string (nullable = true)\n"
" |-- dt: date (nullable = true)\n"
" |-- struct_col: struct (nullable = true)\n"
" | |-- struct_inner_col: string (nullable = true)\n"
)
relation = SparkRelation.create(
schema='default_schema',
Expand All @@ -518,7 +535,7 @@ def test_parse_columns_from_information_with_table_type_and_delta_provider(self)
config = self._get_target_http(self.project_cfg)
columns = SparkAdapter(config).parse_columns_from_information(
relation)
self.assertEqual(len(columns), 3)
self.assertEqual(len(columns), 4)
self.assertEqual(columns[0].to_column_dict(omit_none=False), {
'table_database': None,
'table_schema': relation.schema,
Expand All @@ -538,6 +555,25 @@ def test_parse_columns_from_information_with_table_type_and_delta_provider(self)
'stats:bytes:value': 123456789,
})

self.assertEqual(columns[3].to_column_dict(omit_none=False), {
'table_database': None,
'table_schema': relation.schema,
'table_name': relation.name,
'table_type': rel_type,
'table_owner': 'root',
'column': 'struct_col',
'column_index': 3,
'dtype': 'struct',
'numeric_scale': None,
'numeric_precision': None,
'char_size': None,

'stats:bytes:description': '',
'stats:bytes:include': True,
'stats:bytes:label': 'bytes',
'stats:bytes:value': 123456789,
})

def test_parse_columns_from_information_with_view_type(self):
self.maxDiff = None
rel_type = SparkRelation.get_relation_type.View
Expand Down Expand Up @@ -571,6 +607,8 @@ def test_parse_columns_from_information_with_view_type(self):
" |-- col1: decimal(22,0) (nullable = true)\n"
" |-- col2: string (nullable = true)\n"
" |-- dt: date (nullable = true)\n"
" |-- struct_col: struct (nullable = true)\n"
" | |-- struct_inner_col: string (nullable = true)\n"
)
relation = SparkRelation.create(
schema='default_schema',
Expand All @@ -582,7 +620,7 @@ def test_parse_columns_from_information_with_view_type(self):
config = self._get_target_http(self.project_cfg)
columns = SparkAdapter(config).parse_columns_from_information(
relation)
self.assertEqual(len(columns), 3)
self.assertEqual(len(columns), 4)
self.assertEqual(columns[1].to_column_dict(omit_none=False), {
'table_database': None,
'table_schema': relation.schema,
Expand All @@ -597,6 +635,20 @@ def test_parse_columns_from_information_with_view_type(self):
'char_size': None
})

self.assertEqual(columns[3].to_column_dict(omit_none=False), {
'table_database': None,
'table_schema': relation.schema,
'table_name': relation.name,
'table_type': rel_type,
'table_owner': 'root',
'column': 'struct_col',
'column_index': 3,
'dtype': 'struct',
'numeric_scale': None,
'numeric_precision': None,
'char_size': None
})

def test_parse_columns_from_information_with_table_type_and_parquet_provider(self):
self.maxDiff = None
rel_type = SparkRelation.get_relation_type.Table
Expand All @@ -619,6 +671,8 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel
" |-- col1: decimal(22,0) (nullable = true)\n"
" |-- col2: string (nullable = true)\n"
" |-- dt: date (nullable = true)\n"
" |-- struct_col: struct (nullable = true)\n"
" | |-- struct_inner_col: string (nullable = true)\n"
)
relation = SparkRelation.create(
schema='default_schema',
Expand All @@ -630,7 +684,7 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel
config = self._get_target_http(self.project_cfg)
columns = SparkAdapter(config).parse_columns_from_information(
relation)
self.assertEqual(len(columns), 3)
self.assertEqual(len(columns), 4)
self.assertEqual(columns[2].to_column_dict(omit_none=False), {
'table_database': None,
'table_schema': relation.schema,
Expand All @@ -655,3 +709,27 @@ def test_parse_columns_from_information_with_table_type_and_parquet_provider(sel
'stats:rows:value': 12345678
})

self.assertEqual(columns[3].to_column_dict(omit_none=False), {
'table_database': None,
'table_schema': relation.schema,
'table_name': relation.name,
'table_type': rel_type,
'table_owner': 'root',
'column': 'struct_col',
'column_index': 3,
'dtype': 'struct',
'numeric_scale': None,
'numeric_precision': None,
'char_size': None,

'stats:bytes:description': '',
'stats:bytes:include': True,
'stats:bytes:label': 'bytes',
'stats:bytes:value': 1234567890,

'stats:rows:description': '',
'stats:rows:include': True,
'stats:rows:label': 'rows',
'stats:rows:value': 12345678
})