diff --git a/metadata-ingestion/source_docs/lookml.md b/metadata-ingestion/source_docs/lookml.md
index d971c39e2d76da..ddb99659c66b51 100644
--- a/metadata-ingestion/source_docs/lookml.md
+++ b/metadata-ingestion/source_docs/lookml.md
@@ -53,11 +53,13 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
 | `view_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
 | `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
 | `parse_table_names_from_sql` | | `False` | See note below. |
+| `sql_parser` | | `datahub.utilities.sql_parser.DefaultSQLParser` | See note below. |
 
-Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the
-views depends on. As these SQL's can be complicated, and the package doesn't official support all the SQL dialects that
-Looker supports, the result might not be correct. This parsing is disabled by default, but can be enabled by setting
-`parse_table_names_from_sql: True`.
+Note! The integration can use an SQL parser to try to parse the tables the views depend on. This parsing is disabled by default,
+but can be enabled by setting `parse_table_names_from_sql: True`. The default parser is based on the [`sql-metadata`](https://pypi.org/project/sql-metadata/) package.
+As this package doesn't officially support all the SQL dialects that Looker supports, the result might not be correct. You can, however, implement a
+custom parser and use it by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser`
+and must be made available to DataHub by, for example, installing it. The configuration then needs to be set to `module_name.ClassName` of the parser.
 
## Compatibility
 
diff --git a/metadata-ingestion/src/datahub/ingestion/source/lookml.py b/metadata-ingestion/src/datahub/ingestion/source/lookml.py
index 3ca86df6cb964b..7e1dd832bde369 100644
--- a/metadata-ingestion/src/datahub/ingestion/source/lookml.py
+++ b/metadata-ingestion/src/datahub/ingestion/source/lookml.py
@@ -1,4 +1,5 @@
 import glob
+import importlib
 import itertools
 import logging
 import pathlib
@@ -8,15 +9,16 @@
 from dataclasses import field as dataclass_field
 from dataclasses import replace
 from enum import Enum
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
+from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type
 
 import pydantic
 
+from datahub.utilities.sql_parser import SQLParser
+
 if sys.version_info >= (3, 7):
     import lkml
 else:
     raise ModuleNotFoundError("The lookml plugin requires Python 3.7 or newer.")
-from sql_metadata import Parser as SQLParser
 
 import datahub.emitter.mce_builder as builder
 from datahub.configuration import ConfigModel
@@ -66,6 +68,7 @@ class LookMLSourceConfig(ConfigModel):
     view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
     env: str = builder.DEFAULT_ENV
     parse_table_names_from_sql: bool = False
+    sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser"
 
 
 @dataclass
@@ -252,8 +255,32 @@ class LookerView:
     fields: List[ViewField]
 
     @classmethod
-    def _get_sql_table_names(cls, sql: str) -> List[str]:
-        sql_table_names: List[str] = SQLParser(sql).tables
+    def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
+        """Resolve a dotted `module_name.ClassName` path to an SQLParser subclass.
+
+        Raises ValueError if the path contains no `.` or if the resolved
+        attribute is not a class derived from SQLParser. Errors raised while
+        importing the module or looking up the class propagate unchanged.
+        """
+        # Validate explicitly; an assert would be stripped under `python -O`.
+        if "." not in sql_parser_path:
+            raise ValueError(
+                "sql_parser must be of the form module_name.ClassName; "
+                f"got {sql_parser_path!r}"
+            )
+        module_name, cls_name = sql_parser_path.rsplit(".", 1)
+        parser_cls = getattr(importlib.import_module(module_name), cls_name)
+        # issubclass() raises TypeError for non-class objects, so check that first.
+        if not (isinstance(parser_cls, type) and issubclass(parser_cls, SQLParser)):
+            raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")
+
+        return parser_cls
+
+    @classmethod
+    def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
+        parser_cls = cls._import_sql_parser_cls(sql_parser_path)
+
+        sql_table_names: List[str] = parser_cls(sql).get_tables()
 
         # Remove quotes from table names
         sql_table_names = [t.replace('"', "") for t in sql_table_names]
@@ -290,6 +317,7 @@ def from_looker_dict(
         looker_viewfile_loader: LookerViewFileLoader,
         reporter: LookMLSourceReport,
         parse_table_names_from_sql: bool = False,
+        sql_parser_path: str = "datahub.utilities.sql_parser.DefaultSQLParser",
     ) -> Optional["LookerView"]:
         view_name = looker_view["name"]
         logger.debug(f"Handling view {view_name}")
@@ -330,7 +358,9 @@ def from_looker_dict(
             sql_table_names = []
             if parse_table_names_from_sql and "sql" in derived_table:
                 # Get the list of tables in the query
-                sql_table_names = cls._get_sql_table_names(derived_table["sql"])
+                sql_table_names = cls._get_sql_table_names(
+                    derived_table["sql"], sql_parser_path
+                )
 
             return LookerView(
                 absolute_file_path=looker_viewfile.absolute_file_path,
@@ -686,6 +716,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]:
                                 viewfile_loader,
                                 self.reporter,
                                 self.source_config.parse_table_names_from_sql,
+                                self.source_config.sql_parser,
                             )
                         except Exception as e:
                             self.reporter.report_warning(
diff --git a/metadata-ingestion/src/datahub/utilities/sql_parser.py b/metadata-ingestion/src/datahub/utilities/sql_parser.py
new file mode 100644
index 00000000000000..a046c31f314acb
--- /dev/null
+++ b/metadata-ingestion/src/datahub/utilities/sql_parser.py
@@ -0,0 +1,35 @@
+from abc import ABCMeta, abstractmethod
+from typing import List
+
+try:
+    from sql_metadata import Parser as MetadataSQLParser
+except ImportError:
+    # sql-metadata is an optional dependency; DefaultSQLParser reports its
+    # absence when instantiated instead of failing later with a NameError.
+    MetadataSQLParser = None  # type: ignore
+
+
+class SQLParser(metaclass=ABCMeta):
+    """Interface for parsers that extract table names from an SQL query."""
+
+    def __init__(self, sql_query: str) -> None:
+        self._sql_query = sql_query
+
+    @abstractmethod
+    def get_tables(self) -> List[str]:
+        """Return the names of the tables referenced by the query."""
+
+
+class DefaultSQLParser(SQLParser):
+    """SQLParser implementation backed by the sql-metadata package."""
+
+    def __init__(self, sql_query: str) -> None:
+        super().__init__(sql_query)
+        if MetadataSQLParser is None:
+            raise ImportError(
+                "DefaultSQLParser requires the sql-metadata package to be installed"
+            )
+        self._parser = MetadataSQLParser(sql_query)
+
+    def get_tables(self) -> List[str]:
+        return self._parser.tables
diff --git a/metadata-ingestion/tests/unit/test_utilities.py b/metadata-ingestion/tests/unit/test_utilities.py
index cb1261715ff6b7..919c45c4b7518f 100644
--- a/metadata-ingestion/tests/unit/test_utilities.py
+++ b/metadata-ingestion/tests/unit/test_utilities.py
@@ -1,5 +1,10 @@
+import sys
+
+import pytest
+
 from datahub.utilities.delayed_iter import delayed_iter
 from datahub.utilities.groupby import groupby_unsorted
+from datahub.utilities.sql_parser import DefaultSQLParser
 
 
 def test_delayed_iter():
@@ -44,3 +49,15 @@ def test_groupby_unsorted():
         ("B", ["B"]),
         ("C", ["C", "C"]),
     ]
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(
+    sys.version_info < (3, 7), reason="The LookML source requires Python 3.7+"
+)
+def test_default_sql_parser():
+    sql_query = "SELECT foo.a, foo.b, bar.c FROM foo JOIN bar ON (foo.a == bar.b);"
+
+    tables_list = DefaultSQLParser(sql_query).get_tables()
+
+    assert tables_list == ["foo", "bar"]