From e19c9274f51b6d43a3ee91a5fc70d0128acd62e1 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Mon, 27 Jul 2020 20:45:25 -0700 Subject: [PATCH 1/5] Handle ROW data stored as string --- superset/db_engine_specs/presto.py | 9 ++-- tests/db_engine_specs/presto_tests.py | 59 ++++++++++++++++++++++++++- 2 files changed, 64 insertions(+), 4 deletions(-) diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index ebf8ec4833260..1b739bdc2e2ce 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -22,7 +22,7 @@ from contextlib import closing from datetime import datetime from distutils.version import StrictVersion -from typing import Any, cast, Dict, List, Optional, Tuple, TYPE_CHECKING +from typing import Any, cast, Dict, List, Optional, Tuple, TYPE_CHECKING, Union from urllib import parse import pandas as pd @@ -653,12 +653,15 @@ def expand_data( # pylint: disable=too-many-locals # expand columns; we append them to the left so they are added # immediately after the parent expanded = get_children(column) - to_process.extendleft((column, level) for column in expanded) + to_process.extendleft((column, level) for column in expanded[::-1]) expanded_columns.extend(expanded) # expand row objects into new columns for row in data: - for value, col in zip(row.get(name) or [], expanded): + row_data: Union[str, List] = row.get(name) or [] + if isinstance(row_data, str): + row[name] = row_data = json.loads(row_data) + for value, col in zip(row_data, expanded): row[col["name"]] = value data = [ diff --git a/tests/db_engine_specs/presto_tests.py b/tests/db_engine_specs/presto_tests.py index b57ab01b18e96..9d1d384615275 100644 --- a/tests/db_engine_specs/presto_tests.py +++ b/tests/db_engine_specs/presto_tests.py @@ -214,9 +214,9 @@ def test_presto_expand_data_with_complex_row_columns(self): "name": "row_column", "type": "ROW(NESTED_OBJ1 VARCHAR, NESTED_ROW ROW(NESTED_OBJ2 VARCHAR))", }, + {"name": "row_column.nested_obj1", "type": "VARCHAR"}, {"name": "row_column.nested_row", "type": "ROW(NESTED_OBJ2 VARCHAR)"}, {"name": "row_column.nested_row.nested_obj2", "type": "VARCHAR"}, - {"name": "row_column.nested_obj1", "type": "VARCHAR"}, ] expected_data = [ { @@ -433,3 +433,60 @@ def test_query_cost_formatter(self): } ] self.assertEqual(formatted_cost, expected) + + @mock.patch.dict( + "superset.extensions.feature_flag_manager._feature_flags", + {"PRESTO_EXPAND_DATA": True}, + clear=True, + ) + def test_presto_expand_data_array(self): + cols = [ + {"name": "event_id", "type": "VARCHAR", "is_date": False}, + {"name": "timestamp", "type": "BIGINT", "is_date": False}, + { + "name": "user", + "type": "ROW(ID BIGINT, FIRST_NAME VARCHAR, LAST_NAME VARCHAR)", + "is_date": False, + }, + ] + data = [ + { + "event_id": "abcdef01-2345-6789-abcd-ef0123456789", + "timestamp": "1595895506219", + "user": '[1, "JOHN", "DOE"]', + } + ] + actual_cols, actual_data, actual_expanded_cols = PrestoEngineSpec.expand_data( + cols, data + ) + expected_cols = [ + {"name": "event_id", "type": "VARCHAR", "is_date": False}, + {"name": "timestamp", "type": "BIGINT", "is_date": False}, + { + "name": "user", + "type": "ROW(ID BIGINT, FIRST_NAME VARCHAR, LAST_NAME VARCHAR)", + "is_date": False, + }, + {"name": "user.id", "type": "BIGINT"}, + {"name": "user.first_name", "type": "VARCHAR"}, + {"name": "user.last_name", "type": "VARCHAR"}, + ] + expected_data = [ + { + "event_id": "abcdef01-2345-6789-abcd-ef0123456789", + "timestamp": "1595895506219", + "user": [1, "JOHN", "DOE"], + "user.id": 1, + "user.first_name": "JOHN", + "user.last_name": "DOE", + } + ] + expected_expanded_cols = [ + {"name": "user.id", "type": "BIGINT"}, + {"name": "user.first_name", "type": "VARCHAR"}, + {"name": "user.last_name", "type": "VARCHAR"}, + ] + + self.assertEqual(actual_cols, expected_cols) + self.assertEqual(actual_data, expected_data) + self.assertEqual(actual_expanded_cols, expected_expanded_cols) From b8432e40524be690d038c4ac1e186d73982018db Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Tue, 28 Jul 2020 12:48:56 -0700 Subject: [PATCH 2/5] Use destringify --- superset/db_engine_specs/presto.py | 13 ++++++++----- superset/result_set.py | 4 ++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index 1b739bdc2e2ce..42f2d2dc79c40 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -40,6 +40,7 @@ from superset.exceptions import SupersetTemplateException from superset.models.sql_lab import Query from superset.models.sql_types.presto_sql_types import type_map as presto_type_map +from superset.result_set import destringify from superset.sql_parse import ParsedQuery from superset.utils import core as utils @@ -626,7 +627,9 @@ def expand_data( # pylint: disable=too-many-locals i = 0 while i < len(data): row = data[i] - values = row.get(name) + values: Union[str, List] = row.get(name) + if isinstance(values, str): + row[name] = values = destringify(values) if values: # how many extra rows we need to unnest the data? extra_rows = len(values) - 1 @@ -658,10 +661,10 @@ def expand_data( # pylint: disable=too-many-locals # expand row objects into new columns for row in data: - row_data: Union[str, List] = row.get(name) or [] - if isinstance(row_data, str): - row[name] = row_data = json.loads(row_data) - for value, col in zip(row_data, expanded): + values: Union[str, List] = row.get(name) or [] + if isinstance(values, str): + row[name] = values = destringify(values) + for value, col in zip(values, expanded): row[col["name"]] = value data = [ diff --git a/superset/result_set.py b/superset/result_set.py index 38ca74d4d2ceb..d0a325b148ed8 100644 --- a/superset/result_set.py +++ b/superset/result_set.py @@ -67,6 +67,10 @@ def stringify_values(array: np.ndarray) -> np.ndarray: return vstringify(array) +def destringify(obj: str) -> Any: + return json.loads(obj) + + class SupersetResultSet: def __init__( # pylint: disable=too-many-locals,too-many-branches self, From ee0556fa786cf5a62bc7724d749b843c739ddbfa Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Tue, 28 Jul 2020 12:55:47 -0700 Subject: [PATCH 3/5] Fix mypy --- superset/db_engine_specs/presto.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index 42f2d2dc79c40..b44144903026f 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -617,6 +617,7 @@ def expand_data( # pylint: disable=too-many-locals current_array_level = level name = column["name"] + values: Optional[Union[str, List[Any]]] if column["type"].startswith("ARRAY("): # keep processing array children; we append to the right so that @@ -627,7 +628,7 @@ def expand_data( # pylint: disable=too-many-locals i = 0 while i < len(data): row = data[i] - values: Union[str, List] = row.get(name) + values = row.get(name) if isinstance(values, str): row[name] = values = destringify(values) if values: @@ -661,7 +662,7 @@ def expand_data( # pylint: disable=too-many-locals # expand row objects into new columns for row in data: - values: Union[str, List] = row.get(name) or [] + values = row.get(name) or [] if isinstance(values, str): row[name] = values = destringify(values) for value, col in zip(values, expanded): From 77bd207ecd76d066c32a65f64c6cd02d266e53a5 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Tue, 28 Jul 2020 13:35:45 -0700 Subject: [PATCH 4/5] Fix mypy with cast --- superset/db_engine_specs/presto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index b44144903026f..78f80d7be72ba 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -664,7 +664,7 @@ def expand_data( # pylint: disable=too-many-locals for row in data: values = row.get(name) or [] if isinstance(values, str): - row[name] = values = destringify(values) + row[name] = values = cast(List[Any], destringify(values)) for value, col in zip(values, expanded): row[col["name"]] = value From 63bf7377cc5e749767b51b48c725f1cb4d4245a4 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Tue, 28 Jul 2020 15:44:15 -0700 Subject: [PATCH 5/5] Bypass pylint --- superset/db_engine_specs/presto.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/superset/db_engine_specs/presto.py b/superset/db_engine_specs/presto.py index 78f80d7be72ba..fc74eb2cccc17 100644 --- a/superset/db_engine_specs/presto.py +++ b/superset/db_engine_specs/presto.py @@ -569,7 +569,7 @@ def get_all_datasource_names( return datasource_names @classmethod - def expand_data( # pylint: disable=too-many-locals + def expand_data( # pylint: disable=too-many-locals,too-many-branches cls, columns: List[Dict[Any, Any]], data: List[Dict[Any, Any]] ) -> Tuple[List[Dict[Any, Any]], List[Dict[Any, Any]], List[Dict[Any, Any]]]: """