Skip to content

Commit

Permalink
feat(ingestion): bring your own SQL parser (datahub-project#3110)
Browse files Browse the repository at this point in the history
  • Loading branch information
frsann authored and Rahul Jain committed Aug 31, 2021
1 parent 04c43ed commit e84e9f1
Show file tree
Hide file tree
Showing 4 changed files with 74 additions and 9 deletions.
10 changes: 6 additions & 4 deletions metadata-ingestion/source_docs/lookml.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,13 @@ Note that a `.` is used to denote nested fields in the YAML recipe.
| `view_pattern.ignoreCase` | | `True` | Whether to ignore case sensitivity during pattern matching. |
| `env` | | `"PROD"` | Environment to use in namespace when constructing URNs. |
| `parse_table_names_from_sql` | | `False` | See note below. |
| `sql_parser` | | `datahub.utilities.sql_parser.DefaultSQLParser` | See note below. |

Note! The integration can use [`sql-metadata`](https://pypi.org/project/sql-metadata/) to try to parse the tables the
views depend on. As these SQL queries can be complicated, and the package doesn't officially support all the SQL dialects that
Looker supports, the result might not be correct. This parsing is disabled by default, but can be enabled by setting
`parse_table_names_from_sql: True`.
Note! The integration can use an SQL parser to try to parse the tables the views depend on. This parsing is disabled by default,
but can be enabled by setting `parse_table_names_from_sql: True`. The default parser is based on the [`sql-metadata`](https://pypi.org/project/sql-metadata/) package.
As this package doesn't officially support all the SQL dialects that Looker supports, the result might not be correct. You can, however, implement a
custom parser and use it by setting the `sql_parser` configuration value. A custom SQL parser must inherit from `datahub.utilities.sql_parser.SQLParser`
and must be made available to DataHub, for example by installing it. The configuration value then needs to be set to the `module_name.ClassName` of the parser.

## Compatibility

Expand Down
32 changes: 27 additions & 5 deletions metadata-ingestion/src/datahub/ingestion/source/lookml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import glob
import importlib
import itertools
import logging
import pathlib
Expand All @@ -8,15 +9,16 @@
from dataclasses import field as dataclass_field
from dataclasses import replace
from enum import Enum
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type

import pydantic

from datahub.utilities.sql_parser import SQLParser

if sys.version_info >= (3, 7):
import lkml
else:
raise ModuleNotFoundError("The lookml plugin requires Python 3.7 or newer.")
from sql_metadata import Parser as SQLParser

import datahub.emitter.mce_builder as builder
from datahub.configuration import ConfigModel
Expand Down Expand Up @@ -66,6 +68,7 @@ class LookMLSourceConfig(ConfigModel):
view_pattern: AllowDenyPattern = AllowDenyPattern.allow_all()
env: str = builder.DEFAULT_ENV
parse_table_names_from_sql: bool = False
sql_parser: str = "datahub.utilities.sql_parser.DefaultSQLParser"


@dataclass
Expand Down Expand Up @@ -252,8 +255,23 @@ class LookerView:
fields: List[ViewField]

@classmethod
def _get_sql_table_names(cls, sql: str) -> List[str]:
sql_table_names: List[str] = SQLParser(sql).tables
def _import_sql_parser_cls(cls, sql_parser_path: str) -> Type[SQLParser]:
    """Resolve a dotted ``module_name.ClassName`` path to a SQL parser class.

    Args:
        sql_parser_path: Fully qualified name of the parser class. Everything
            before the last ``.`` is imported as a module; the remainder is
            looked up as an attribute of that module.

    Returns:
        The parser class itself (not an instance).

    Raises:
        ValueError: If the path contains no ``.``, or if the resolved
            attribute is not a class derived from ``SQLParser``.
        ImportError: Propagated from ``importlib.import_module`` when the
            module cannot be imported.
        AttributeError: Propagated when the module has no such attribute.
    """
    # Raise instead of `assert`: this validates user-supplied configuration
    # and must keep working when Python runs with -O (asserts are stripped).
    if "." not in sql_parser_path:
        raise ValueError("sql_parser-path must contain a .")
    module_name, cls_name = sql_parser_path.rsplit(".", 1)

    # The search path helps diagnose a custom parser that cannot be imported,
    # but it is debugging detail, so keep it out of INFO-level output.
    logger.debug("sys.path: %s", sys.path)

    parser_cls = getattr(importlib.import_module(module_name), cls_name)
    # issubclass() raises TypeError for non-class objects; check for a class
    # first so that every "wrong kind of object" case yields the same error.
    if not (isinstance(parser_cls, type) and issubclass(parser_cls, SQLParser)):
        raise ValueError(f"must be derived from {SQLParser}; got {parser_cls}")

    return parser_cls

@classmethod
def _get_sql_table_names(cls, sql: str, sql_parser_path: str) -> List[str]:
parser_cls = cls._import_sql_parser_cls(sql_parser_path)

sql_table_names: List[str] = parser_cls(sql).get_tables()

# Remove quotes from table names
sql_table_names = [t.replace('"', "") for t in sql_table_names]
Expand Down Expand Up @@ -290,6 +308,7 @@ def from_looker_dict(
looker_viewfile_loader: LookerViewFileLoader,
reporter: LookMLSourceReport,
parse_table_names_from_sql: bool = False,
sql_parser_path: str = "datahub.utilities.sql_parser.DefaultSQLParser",
) -> Optional["LookerView"]:
view_name = looker_view["name"]
logger.debug(f"Handling view {view_name}")
Expand Down Expand Up @@ -330,7 +349,9 @@ def from_looker_dict(
sql_table_names = []
if parse_table_names_from_sql and "sql" in derived_table:
# Get the list of tables in the query
sql_table_names = cls._get_sql_table_names(derived_table["sql"])
sql_table_names = cls._get_sql_table_names(
derived_table["sql"], sql_parser_path
)

return LookerView(
absolute_file_path=looker_viewfile.absolute_file_path,
Expand Down Expand Up @@ -686,6 +707,7 @@ def get_workunits(self) -> Iterable[MetadataWorkUnit]:
viewfile_loader,
self.reporter,
self.source_config.parse_table_names_from_sql,
self.source_config.sql_parser,
)
except Exception as e:
self.reporter.report_warning(
Expand Down
24 changes: 24 additions & 0 deletions metadata-ingestion/src/datahub/utilities/sql_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from abc import ABCMeta, abstractmethod
from typing import List

try:
from sql_metadata import Parser as MetadataSQLParser
except ImportError:
pass


class SQLParser(metaclass=ABCMeta):
    """Abstract base class for parsers that report the tables of a SQL query.

    Concrete parsers must implement ``get_tables``; the base class cannot be
    instantiated directly.
    """

    def __init__(self, sql_query: str) -> None:
        """Store the raw query text on the instance as ``_sql_query``."""
        self._sql_query = sql_query

    @abstractmethod
    def get_tables(self) -> List[str]:
        """Return the table names for the query; subclasses must override."""


class DefaultSQLParser(SQLParser):
    """SQL parser that delegates to the ``sql-metadata`` package's ``Parser``."""

    def __init__(self, sql_query: str) -> None:
        """Build the underlying ``sql-metadata`` parser for ``sql_query``.

        Args:
            sql_query: The SQL text to analyse.

        Raises:
            ImportError: If the optional ``sql-metadata`` package is not
                installed.
        """
        # Run the base initialiser so `_sql_query` is set, as it is for every
        # other SQLParser subclass.
        super().__init__(sql_query)

        # The module-level import of sql_metadata swallows ImportError, which
        # leaves MetadataSQLParser unbound. Turn the resulting NameError into
        # an error that names the missing dependency.
        try:
            parser_factory = MetadataSQLParser
        except NameError as e:
            raise ImportError(
                "DefaultSQLParser requires the 'sql-metadata' package, "
                "which could not be imported"
            ) from e

        self._parser = parser_factory(sql_query)

    def get_tables(self) -> List[str]:
        """Return the underlying parser's ``tables`` attribute."""
        return self._parser.tables
17 changes: 17 additions & 0 deletions metadata-ingestion/tests/unit/test_utilities.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,10 @@
import sys

import pytest

from datahub.utilities.delayed_iter import delayed_iter
from datahub.utilities.groupby import groupby_unsorted
from datahub.utilities.sql_parser import DefaultSQLParser


def test_delayed_iter():
Expand Down Expand Up @@ -44,3 +49,15 @@ def test_groupby_unsorted():
("B", ["B"]),
("C", ["C", "C"]),
]


@pytest.mark.integration
@pytest.mark.skipif(
    sys.version_info < (3, 7), reason="The LookML source requires Python 3.7+"
)
def test_default_sql_parser():
    """DefaultSQLParser lists both tables of a simple two-table join."""
    parser = DefaultSQLParser(
        "SELECT foo.a, foo.b, bar.c FROM foo JOIN bar ON (foo.a == bar.b);"
    )
    expected_tables = ["foo", "bar"]

    assert parser.get_tables() == expected_tables

0 comments on commit e84e9f1

Please sign in to comment.