Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(ingestion/lookml): support looker -- if comments #11113

Merged
merged 13 commits into from
Aug 16, 2024
1 change: 1 addition & 0 deletions metadata-ingestion/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@
*sqlglot_lib,
"GitPython>2",
"python-liquid",
"deepmerge>=1.1.1"
}

bigquery_common = {
Expand Down
Original file line number Diff line number Diff line change
@@ -1 +1,9 @@
# String keys and literal values used by the Looker/LookML ingestion code.
IMPORTED_PROJECTS = "imported_projects"
# View-level key holding the raw table reference, followed by the key under
# which the transformed copy is stored (the raw value is left untouched).
SQL_TABLE_NAME = "sql_table_name"
DATAHUB_TRANSFORMED_SQL_TABLE_NAME = "datahub_transformed_sql_table_name"
# Derived-table section of a view, the raw sql inside it, and the key under
# which the transformed copy of that sql is stored.
DERIVED_TABLE = "derived_table"
SQL = "sql"
DATAHUB_TRANSFORMED_SQL = "datahub_transformed_sql"
# Looker environment names; compared against the configured
# `looker_environment` when evaluating `-- if <env> --` comments.
prod = "prod"
dev = "dev"
# Key holding a view's name.
NAME = "name"
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
import logging
import pathlib
from dataclasses import replace
from typing import Any, Dict, Optional
from typing import Dict, Optional

from datahub.ingestion.source.looker.lkml_patched import load_lkml
from datahub.ingestion.source.looker.looker_config import LookerConnectionDefinition
from datahub.ingestion.source.looker.looker_dataclasses import LookerViewFile
from datahub.ingestion.source.looker.looker_template_language import (
resolve_liquid_variable_in_view_dict,
process_lookml_template_language,
)
from datahub.ingestion.source.looker.lookml_config import (
_EXPLORE_FILE_EXTENSION,
_VIEW_FILE_EXTENSION,
LookMLSourceConfig,
LookMLSourceReport,
)

Expand All @@ -29,13 +30,13 @@ def __init__(
root_project_name: Optional[str],
base_projects_folder: Dict[str, pathlib.Path],
reporter: LookMLSourceReport,
liquid_variable: Dict[Any, Any],
source_config: LookMLSourceConfig,
) -> None:
self.viewfile_cache: Dict[str, Optional[LookerViewFile]] = {}
self._root_project_name = root_project_name
self._base_projects_folder = base_projects_folder
self.reporter = reporter
self.liquid_variable = liquid_variable
self.source_config = source_config

def _load_viewfile(
self, project_name: str, path: str, reporter: LookMLSourceReport
Expand Down Expand Up @@ -73,9 +74,9 @@ def _load_viewfile(

parsed = load_lkml(path)

resolve_liquid_variable_in_view_dict(
raw_view=parsed,
liquid_variable=self.liquid_variable,
process_lookml_template_language(
view_lkml_file_dict=parsed,
source_config=self.source_config,
)

looker_viewfile = LookerViewFile.from_looker_dict(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,30 @@
import logging
import re
from typing import Any, ClassVar, Dict, Set
from abc import ABC, abstractmethod
from typing import Any, ClassVar, Dict, List, Optional, Set

from deepmerge import always_merger
from liquid import Undefined
from liquid.exceptions import LiquidSyntaxError

from datahub.ingestion.source.looker.looker_constant import (
DATAHUB_TRANSFORMED_SQL,
DATAHUB_TRANSFORMED_SQL_TABLE_NAME,
DERIVED_TABLE,
NAME,
SQL,
SQL_TABLE_NAME,
dev,
prod,
)
from datahub.ingestion.source.looker.looker_liquid_tag import (
CustomTagException,
create_template,
)
from datahub.ingestion.source.looker.lookml_config import DERIVED_VIEW_PATTERN
from datahub.ingestion.source.looker.lookml_config import (
DERIVED_VIEW_PATTERN,
LookMLSourceConfig,
)

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -92,52 +107,195 @@ def resolve_liquid_variable(text: str, liquid_variable: Dict[Any, Any]) -> str:
return text


def _drop_derived_view_pattern(value: str) -> str:
# Drop ${ and }
return re.sub(DERIVED_VIEW_PATTERN, r"\1", value)
class LookMLViewTransformer(ABC):
source_config: LookMLSourceConfig

def __init__(self, source_config: LookMLSourceConfig):
self.source_config = source_config

def _complete_incomplete_sql(raw_view: dict, sql: str) -> str:
def transform(self, view: dict) -> dict:
value_to_transform: Optional[str] = None

# Looker supports sql fragments that omit the SELECT and FROM parts of the query
# Add those in if we detect that it is missing
sql_query: str = sql
if SQL_TABLE_NAME in view:
# Give precedence to already processed transformed sql_table_name to apply more transformation
value_to_transform = view.get(
DATAHUB_TRANSFORMED_SQL_TABLE_NAME, view[SQL_TABLE_NAME]
)

if not re.search(r"SELECT\s", sql_query, flags=re.I):
# add a SELECT clause at the beginning
sql_query = f"SELECT {sql}"
if DERIVED_TABLE in view and SQL in view[DERIVED_TABLE]:
# Give precedence to already processed transformed view.derived.sql to apply more transformation
value_to_transform = view[DERIVED_TABLE].get(
DATAHUB_TRANSFORMED_SQL, view[DERIVED_TABLE][SQL]
)

if not re.search(r"FROM\s", sql_query, flags=re.I):
# add a FROM clause at the end
sql_query = f"{sql_query} FROM {raw_view['name']}"
if value_to_transform is None:
return {}

return _drop_derived_view_pattern(sql_query)
logger.debug(f"value to transform = {value_to_transform}")

transformed_value: str = self._apply_transformation(
value=value_to_transform, view=view
)

def resolve_liquid_variable_in_view_dict(
raw_view: dict, liquid_variable: Dict[Any, Any]
) -> None:
if "views" not in raw_view:
return
logger.debug(f"transformed value = {transformed_value}")

for view in raw_view["views"]:
if "sql_table_name" in view:
view["datahub_transformed_sql_table_name"] = resolve_liquid_variable(
text=view["sql_table_name"],
liquid_variable=liquid_variable,
) # keeping original sql_table_name as is to avoid any visualization issue later
if SQL_TABLE_NAME in view and value_to_transform:
return {DATAHUB_TRANSFORMED_SQL_TABLE_NAME: transformed_value}

if DERIVED_TABLE in view and SQL in view[DERIVED_TABLE] and value_to_transform:
return {DERIVED_TABLE: {DATAHUB_TRANSFORMED_SQL: transformed_value}}

return {}

@abstractmethod
def _apply_transformation(self, value: str, view: dict) -> str:
pass


class LiquidVariableTransformer(LookMLViewTransformer):
    """
    Substitute liquid variables in the view's sql / sql_table_name with the
    values configured in ``source_config.liquid_variable``.
    """

    def _apply_transformation(self, value: str, view: dict) -> str:
        # The view itself is not needed; only the configured variables are.
        configured_variables = self.source_config.liquid_variable
        return resolve_liquid_variable(
            text=value, liquid_variable=configured_variables
        )


class IncompleteSqlTransformer(LookMLViewTransformer):
    """
    A lookml view may contain only a fragment of sql, however for lineage
    generation we need a complete sql statement.
    IncompleteSqlTransformer completes the sql of a view's derived table.

    Values that do not come from ``derived_table.sql`` (i.e. a plain
    ``sql_table_name``) are returned unchanged: they are table references,
    not sql fragments, and must not be wrapped in SELECT ... FROM.
    """

    def _apply_transformation(self, value: str, view: dict) -> str:
        """
        Return ``value`` with missing SELECT / FROM clauses added.

        :param value: the sql (or table name) currently being transformed.
        :param view: the view dictionary the value belongs to.
        :return: the completed sql for derived-table views, otherwise
            ``value`` as is.
        """
        if DERIVED_TABLE not in view or SQL not in view[DERIVED_TABLE]:
            # The base class also hands sql_table_name values to every
            # transformer; such a value is not sql, so leave it alone.
            return value

        # Looker supports sql fragments that omit the SELECT and FROM parts of the query
        # Add those in if we detect that it is missing
        sql_query: str = value

        if not re.search(r"SELECT\s", sql_query, flags=re.I):
            # add a SELECT clause at the beginning
            sql_query = f"SELECT {sql_query}"

        if not re.search(r"FROM\s", sql_query, flags=re.I):
            # add a FROM clause at the end
            sql_query = f"{sql_query} FROM {view[NAME]}"

        return sql_query


class DropDerivedViewPatternTransformer(LookMLViewTransformer):
    """
    Strip the ``${`` ... ``}`` wrapper from the transformed values, i.e. from
    ``datahub_transformed_sql_table_name`` and
    ``derived_table.datahub_transformed_sql``.

    Example: ${employee_income_source.SQL_TABLE_NAME} becomes
    employee_income_source.SQL_TABLE_NAME
    """

    def _apply_transformation(self, value: str, view: dict) -> str:
        # Keep only the first capture group of every DERIVED_VIEW_PATTERN match.
        derived_view_matcher = re.compile(DERIVED_VIEW_PATTERN)
        return derived_view_matcher.sub(r"\1", value)

view["datahub_transformed_sql_table_name"] = _drop_derived_view_pattern(
value=view["datahub_transformed_sql_table_name"]
)

if "derived_table" in view and "sql" in view["derived_table"]:
# In sql we don't need to remove the extra spaces as sql parser takes care of extra spaces and \n
# while generating URN from sql
view["derived_table"]["datahub_transformed_sql"] = resolve_liquid_variable(
text=view["derived_table"]["sql"], liquid_variable=liquid_variable
) # keeping original sql as is, so that on UI sql will be shown same is it is visible on looker portal
class LookMlIfCommentTransformer(LookMLViewTransformer):
"""
Evaluate the looker -- if -- comments.
"""

view["derived_table"]["datahub_transformed_sql"] = _complete_incomplete_sql(
raw_view=view, sql=view["derived_table"]["datahub_transformed_sql"]
evaluate_to_true_regx: str
remove_if_comment_line_regx: str

def __init__(self, source_config: LookMLSourceConfig):
    """
    Build the two regular expressions used to evaluate ``-- if <env> --``
    comments for the configured ``looker_environment``.
    """
    super().__init__(source_config=source_config)

    active_environment = self.source_config.looker_environment
    if active_environment.lower() == prod:
        inactive_environment = dev
    else:
        inactive_environment = prod

    # Marker of the configured environment: only the marker is stripped,
    # the sql that follows it is kept.
    self.evaluate_to_true_regx = r"-- if {} --".format(active_environment)

    # Marker of the other environment: the marker and everything after it
    # up to the next newline, the next "-- if" or the end of input is dropped.
    self.remove_if_comment_line_regx = r"-- if {} --.*?(?=\n|-- if|$)".format(
        inactive_environment
    )

def _apply_regx(self, value: str) -> str:
    """
    Evaluate the ``-- if <env> --`` comments found in ``value``.

    Text guarded by the non-configured environment is removed first; then
    the marker of the configured environment is stripped while the rest of
    its line is kept.
    """
    drop_inactive = re.compile(
        self.remove_if_comment_line_regx, re.IGNORECASE | re.DOTALL
    )
    strip_active_marker = re.compile(self.evaluate_to_true_regx, re.IGNORECASE)

    without_inactive: str = drop_inactive.sub("", value)
    return strip_active_marker.sub("", without_inactive)

def _apply_transformation(self, value: str, view: dict) -> str:
    """Evaluate the looker if-comments in ``value``; ``view`` is not used."""
    evaluated: str = self._apply_regx(value)
    return evaluated


class TransformedLookMlView:
    """
    Applies a list of transformers, in order, to one view dictionary and
    remembers the merged result.
    """

    transformers: List[LookMLViewTransformer]
    view_dict: dict
    transformed_dict: dict

    def __init__(
        self,
        transformers: List[LookMLViewTransformer],
        view_dict: dict,
    ):
        self.view_dict = view_dict
        self.transformers = transformers
        # Stays empty until view() is called for the first time.
        self.transformed_dict = {}

    def view(self) -> dict:
        """
        Return the view with every transformer's output merged into it.

        A non-empty result from an earlier call is returned as is.
        """
        if not self.transformed_dict:
            # Start from a shallow copy of the view's top-level keys.
            self.transformed_dict = dict(self.view_dict)

            logger.debug(f"Processing view {self.view_dict[NAME]}")

            for step in self.transformers:
                logger.debug(f"Applying transformer {step.__class__.__name__}")

                # Each transformer sees the output of the previous ones, so
                # later steps refine the already transformed values.
                step_output = step.transform(self.transformed_dict)
                self.transformed_dict = always_merger.merge(
                    self.transformed_dict, step_output
                )

        return self.transformed_dict


def process_lookml_template_language(
source_config: LookMLSourceConfig,
view_lkml_file_dict: dict,
) -> None:
if "views" not in view_lkml_file_dict:
return

transformers: List[LookMLViewTransformer] = [
LookMlIfCommentTransformer(
source_config=source_config
), # First evaluate the -- if -- comments. Looker does the same
LiquidVariableTransformer(
source_config=source_config
), # Now resolve liquid variables
DropDerivedViewPatternTransformer(
source_config=source_config
), # Remove any ${} symbol
IncompleteSqlTransformer(
source_config=source_config
), # complete any incomplete sql
]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

interesting - so we're pushing some of our transformation logic into the jinja / lkml parse layer?


transformed_views: List[dict] = []

for view in view_lkml_file_dict["views"]:
transformed_views.append(
TransformedLookMlView(transformers=transformers, view_dict=view).view()
)

view_lkml_file_dict["views"] = transformed_views
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import logging
from dataclasses import dataclass, field as dataclass_field
from datetime import timedelta
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Literal, Optional, Union

import pydantic
from pydantic import root_validator, validator
Expand Down Expand Up @@ -174,6 +174,13 @@ class LookMLSourceConfig(
"view.sql_table_name. Defaults to an empty dictionary.",
)

looker_environment: Literal["prod", "dev"] = Field(
"prod",
description="A looker prod or dev environment. "
"It helps to evaluate looker if comments i.e. -- if prod --. "
"All if comments are evaluated to true for configured looker_environment value",
)

@validator("connection_to_platform_map", pre=True)
def convert_string_to_connection_def(cls, conn_map):
# Previous version of config supported strings in connection map. This upconverts strings to ConnectionMap
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,7 @@ def get_internal_workunits(self) -> Iterable[MetadataWorkUnit]: # noqa: C901
self.source_config.project_name,
self.base_projects_folder,
self.reporter,
self.source_config.liquid_variable,
self.source_config,
)

# Some views can be mentioned by multiple 'include' statements and can be included via different connections.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ include: "employee_total_income.view.lkml"
include: "top_10_employee_income_source.view.lkml"
include: "employee_tax_report.view.lkml"
include: "employee_salary_rating.view.lkml"
include: "environment_activity_logs.view.lkml"
include: "employee_income_source_as_per_env.view.lkml"
include: "rent_as_employee_income_source.view.lkml"

explore: activity_logs {
Expand All @@ -26,5 +28,11 @@ explore: employee_tax_report {
explore: employee_salary_rating {
}

explore: environment_activity_logs {
}

explore: employee_income_source_as_per_env {
}

explore: rent_as_employee_income_source {
}
Loading
Loading