From 570e7c07454befacf1a399d6249aa6b2e19c6552 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Thu, 17 Jun 2021 15:14:32 -0700 Subject: [PATCH 1/3] feat: Jinja2 macro for querying datasets --- superset/connectors/base/models.py | 2 +- superset/jinja_context.py | 34 +++++++++ tests/unit_tests/jinja_context_test.py | 101 ++++++++++++++++++++++++- 3 files changed, 135 insertions(+), 2 deletions(-) diff --git a/superset/connectors/base/models.py b/superset/connectors/base/models.py index 73b841ac1687a..7809dab4ae9c5 100644 --- a/superset/connectors/base/models.py +++ b/superset/connectors/base/models.py @@ -449,7 +449,7 @@ def external_metadata(self) -> List[Dict[str, str]]: def get_query_str(self, query_obj: QueryObjectDict) -> str: """Returns a query as a string - This is used to be displayed to the user so that she/he can + This is used to be displayed to the user so that they can understand what is taking place behind the scene""" raise NotImplementedError() diff --git a/superset/jinja_context.py b/superset/jinja_context.py index e365b9a708ddb..f2ad5378f00fb 100644 --- a/superset/jinja_context.py +++ b/superset/jinja_context.py @@ -38,6 +38,7 @@ from sqlalchemy.types import String from typing_extensions import TypedDict +from superset.datasets.commands.exceptions import DatasetNotFoundError from superset.exceptions import SupersetTemplateException from superset.extensions import feature_flag_manager from superset.utils.core import convert_legacy_filters_into_adhoc, merge_extra_filters @@ -490,6 +491,7 @@ def set_context(self, **kwargs: Any) -> None: "cache_key_wrapper": partial(safe_proxy, extra_cache.cache_key_wrapper), "filter_values": partial(safe_proxy, extra_cache.filter_values), "get_filters": partial(safe_proxy, extra_cache.get_filters), + "dataset": partial(safe_proxy, dataset_macro), } ) @@ -602,3 +604,35 @@ def get_template_processor( else: template_processor = NoOpTemplateProcessor return template_processor(database=database, table=table, query=query, **kwargs) + + +def dataset_macro( + dataset_id: int, + include_metrics: bool = False, + groupby: Optional[List[str]] = None, +) -> str: + """ + Given a dataset ID, return the SQL that represents it. + + The generated SQL includes all columns (including computed) by default. Optionally + the user can also request metrics to be included, and columns to group by. + """ + # pylint: disable=import-outside-toplevel + from superset.datasets.dao import DatasetDAO + + dataset = DatasetDAO.find_by_id(dataset_id) + if not dataset: + raise DatasetNotFoundError(f"Dataset {dataset_id} not found!") + + columns = [column.column_name for column in dataset.columns] + metrics = [metric.metric_name for metric in dataset.metrics] + query_obj = { + "is_timeseries": False, + "filter": [], + "metrics": metrics if include_metrics else None, + "columns": columns, + "groupby": groupby, + } + sqla_query = dataset.get_query_str_extended(query_obj) + sql = sqla_query.sql + return f"({sql}) AS dataset_{dataset_id}" diff --git a/tests/unit_tests/jinja_context_test.py b/tests/unit_tests/jinja_context_test.py index 1f88f4f1a99c8..58791d0d6ccfd 100644 --- a/tests/unit_tests/jinja_context_test.py +++ b/tests/unit_tests/jinja_context_test.py @@ -14,8 +14,15 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name, unused-argument -from superset.jinja_context import where_in +import json + +import pytest +from pytest_mock import MockFixture + +from superset.datasets.commands.exceptions import DatasetNotFoundError +from superset.jinja_context import dataset_macro, where_in def test_where_in() -> None: @@ -25,3 +32,95 @@ def test_where_in() -> None: assert where_in([1, "b", 3]) == "(1, 'b', 3)" assert where_in([1, "b", 3], '"') == '(1, "b", 3)' assert where_in(["O'Malley's"]) == "('O''Malley''s')" + + +def test_dataset_macro(mocker: MockFixture, app_context: None) -> None: + """ + Test the ``dataset_macro`` macro. + """ + # pylint: disable=import-outside-toplevel + from superset.connectors.sqla.models import SqlaTable, SqlMetric, TableColumn + from superset.models.core import Database + + columns = [ + TableColumn(column_name="ds", is_dttm=1, type="TIMESTAMP"), + TableColumn(column_name="num_boys", type="INTEGER"), + TableColumn(column_name="revenue", type="INTEGER"), + TableColumn(column_name="expenses", type="INTEGER"), + TableColumn( + column_name="profit", type="INTEGER", expression="revenue-expenses" + ), + ] + metrics = [ + SqlMetric(metric_name="cnt", expression="COUNT(*)"), + ] + + dataset = SqlaTable( + table_name="old_dataset", + columns=columns, + metrics=metrics, + main_dttm_col="ds", + default_endpoint="https://www.youtube.com/watch?v=dQw4w9WgXcQ", # not used + database=Database(database_name="my_database", sqlalchemy_uri="sqlite://"), + offset=-8, + description="This is the description", + is_featured=1, + cache_timeout=3600, + schema="my_schema", + sql=None, + params=json.dumps( + { + "remote_id": 64, + "database_name": "examples", + "import_time": 1606677834, + } + ), + perm=None, + filter_select_enabled=1, + fetch_values_predicate="foo IN (1, 2)", + is_sqllab_view=0, # no longer used? + template_params=json.dumps({"answer": "42"}), + schema_perm=None, + extra=json.dumps({"warning_markdown": "*WARNING*"}), + ) + DatasetDAO = mocker.patch("superset.datasets.dao.DatasetDAO") + DatasetDAO.find_by_id.return_value = dataset + + assert ( + dataset_macro(1) + == """(SELECT ds AS ds, + num_boys AS num_boys, + revenue AS revenue, + expenses AS expenses, + revenue-expenses AS profit +FROM my_schema.old_dataset) AS dataset_1""" + ) + + assert ( + dataset_macro(1, include_metrics=True) + == """(SELECT ds AS ds, + num_boys AS num_boys, + revenue AS revenue, + expenses AS expenses, + revenue-expenses AS profit, + COUNT(*) AS cnt +FROM my_schema.old_dataset +GROUP BY ds, + num_boys, + revenue, + expenses, + revenue-expenses) AS dataset_1""" + ) + + assert ( + dataset_macro(1, include_metrics=True, groupby=["ds"]) + == """(SELECT ds AS ds, + COUNT(*) AS cnt +FROM my_schema.old_dataset +GROUP BY ds) AS dataset_1""" + ) + + DatasetDAO.find_by_id.return_value = None + with pytest.raises(DatasetNotFoundError) as excinfo: + dataset_macro(1) + assert str(excinfo.value) == "Dataset 1 not found!" From 2fff8e8107fdad30c282acebfc7f6efc3a9744e2 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Tue, 17 May 2022 16:52:03 -0700 Subject: [PATCH 2/3] Add docs --- docs/docs/installation/sql-templating.mdx | 24 +++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/docs/docs/installation/sql-templating.mdx b/docs/docs/installation/sql-templating.mdx index 8908d39f0280e..4135403f80770 100644 --- a/docs/docs/installation/sql-templating.mdx +++ b/docs/docs/installation/sql-templating.mdx @@ -273,3 +273,27 @@ Here's a concrete example: superiors order by lineage, level ``` + +**Datasets** + +It's possible to query physical and virtual datasets using the `dataset` macro. This is useful if you've defined computed columns and metrics on your datasets, and want to reuse the definition in adhoc SQL Lab queries. + +To use the macro, first you need to find the ID of the dataset. This can be done by going to the view showing all the datasets, hovering over the dataset you're interested in, and looking at its URL. For example, if the URL for a dataset is https://superset.example.org/superset/explore/table/42/ its ID is 42. + +Once you have the ID you can query it as if it were a table: + +``` +SELECT * FROM {{ dataset(42) }} LIMIT 10 +``` + +IF you want to select the metric definitions as well, in addition to the columns, you need to pass an additional keyword argument: + +``` +SELECT * FROM {{ dataset(42, include_metrics=True) }} LIMIT 10 +``` + +Since metrics are aggregations, the resulting SQL expression will be grouped by all non-metric columns. You can specify a subset of columns to group by instead: + +``` +SELECT * FROM {{ dataset(42, include_metrics=True, groupby=["ds", "category"]) }} LIMIT 10 +``` From ed43b79932ba8db605c29a41f5e67d664a668466 Mon Sep 17 00:00:00 2001 From: Beto Dealmeida Date: Wed, 1 Jun 2022 13:37:35 -0700 Subject: [PATCH 3/3] Address comments --- docs/docs/installation/sql-templating.mdx | 4 ++-- superset/jinja_context.py | 5 ++--- tests/unit_tests/jinja_context_test.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/docs/docs/installation/sql-templating.mdx b/docs/docs/installation/sql-templating.mdx index 4135403f80770..905f99cb9f5de 100644 --- a/docs/docs/installation/sql-templating.mdx +++ b/docs/docs/installation/sql-templating.mdx @@ -286,7 +286,7 @@ Once you have the ID you can query it as if it were a table: SELECT * FROM {{ dataset(42) }} LIMIT 10 ``` -IF you want to select the metric definitions as well, in addition to the columns, you need to pass an additional keyword argument: +If you want to select the metric definitions as well, in addition to the columns, you need to pass an additional keyword argument: ``` SELECT * FROM {{ dataset(42, include_metrics=True) }} LIMIT 10 @@ -295,5 +295,5 @@ SELECT * FROM {{ dataset(42, include_metrics=True) }} LIMIT 10 Since metrics are aggregations, the resulting SQL expression will be grouped by all non-metric columns. You can specify a subset of columns to group by instead: ``` -SELECT * FROM {{ dataset(42, include_metrics=True, groupby=["ds", "category"]) }} LIMIT 10 +SELECT * FROM {{ dataset(42, include_metrics=True, columns=["ds", "category"]) }} LIMIT 10 ``` diff --git a/superset/jinja_context.py b/superset/jinja_context.py index f2ad5378f00fb..42f6809c7402f 100644 --- a/superset/jinja_context.py +++ b/superset/jinja_context.py @@ -609,7 +609,7 @@ def get_template_processor( def dataset_macro( dataset_id: int, include_metrics: bool = False, - groupby: Optional[List[str]] = None, + columns: Optional[List[str]] = None, ) -> str: """ Given a dataset ID, return the SQL that represents it. @@ -624,14 +624,13 @@ def dataset_macro( if not dataset: raise DatasetNotFoundError(f"Dataset {dataset_id} not found!") - columns = [column.column_name for column in dataset.columns] + columns = columns or [column.column_name for column in dataset.columns] metrics = [metric.metric_name for metric in dataset.metrics] query_obj = { "is_timeseries": False, "filter": [], "metrics": metrics if include_metrics else None, "columns": columns, - "groupby": groupby, } sqla_query = dataset.get_query_str_extended(query_obj) sql = sqla_query.sql diff --git a/tests/unit_tests/jinja_context_test.py b/tests/unit_tests/jinja_context_test.py index 58791d0d6ccfd..75c49f0977bf6 100644 --- a/tests/unit_tests/jinja_context_test.py +++ b/tests/unit_tests/jinja_context_test.py @@ -113,7 +113,7 @@ def test_dataset_macro(mocker: MockFixture, app_context: None) -> None: ) assert ( - dataset_macro(1, include_metrics=True, groupby=["ds"]) + dataset_macro(1, include_metrics=True, columns=["ds"]) == """(SELECT ds AS ds, COUNT(*) AS cnt FROM my_schema.old_dataset