From cc8e8fa543f0f8e7c54cef890c73dd57b11ad40b Mon Sep 17 00:00:00 2001 From: Vladislavs Gaidass Date: Sat, 23 Apr 2022 17:37:23 +0200 Subject: [PATCH 1/3] fix(bigquery): exclude job audit logs with errors --- .../src/datahub/ingestion/source/sql/bigquery.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py index b0d06631a1d501..772f59538687c6 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py @@ -173,6 +173,9 @@ def bigquery_audit_metadata_query_template( """ Receives a dataset (with project specified) and returns a query template that is used to query exported AuditLogs containing protoPayloads of type BigQueryAuditMetadata. + Include only those that: + - have been completed (jobStatus.jobState = "DONE") + - do not contain errors (jobStatus.errorResults is none) :param dataset: the dataset to query against in the form of $PROJECT.$DATASET :param use_date_sharded_tables: whether to read from date sharded audit log tables or time partitioned audit log tables @@ -213,6 +216,7 @@ def bigquery_audit_metadata_query_template( AND timestamp < "{end_time}" AND protopayload_auditlog.serviceName="bigquery.googleapis.com" AND JSON_EXTRACT_SCALAR(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.jobState") = "DONE" + AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobStatus.errorResults") IS NULL AND JSON_EXTRACT(protopayload_auditlog.metadataJson, "$.jobChange.job.jobConfig.queryConfig") IS NOT NULL; """ From 4e873943b1785a3ce9f69f4e31d53ca298efd452 Mon Sep 17 00:00:00 2001 From: Vladislavs Gaidass Date: Sat, 23 Apr 2022 17:40:39 +0200 Subject: [PATCH 2/3] fix(bigquery): add handling of sql keywords 'from', 'with', 'admin', 'hour' --- .../utilities/sql_lineage_parser_impl.py | 96 ++++++--- .../tests/unit/test_utilities.py | 182 +++++++++++++++++- 2 files changed, 250 insertions(+), 28 deletions(-) diff --git a/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py b/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py index 15463b4d56ae36..c5961cfa8cd63f 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py +++ b/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py @@ -5,6 +5,7 @@ from typing import Dict, List, Set from sqllineage.core.holders import Column, SQLLineageHolder +from sqllineage.exceptions import SQLLineageException try: import sqlparse @@ -20,8 +21,10 @@ class SqlLineageSQLParserImpl: _DATE_SWAP_TOKEN = "__d_a_t_e" + _HOUR_SWAP_TOKEN = "__h_o_u_r" _TIMESTAMP_SWAP_TOKEN = "__t_i_m_e_s_t_a_m_p" _DATA_SWAP_TOKEN = "__d_a_t_a" + _ADMIN_SWAP_TOKEN = "__a_d_m_i_n" _MYVIEW_SQL_TABLE_NAME_TOKEN = "__my_view__.__sql_table_name__" _MYVIEW_LOOKER_TOKEN = "my_view.SQL_TABLE_NAME" @@ -32,18 +35,59 @@ def __init__(self, sql_query: str) -> None: if "lateral flatten" in sql_query: sql_query = sql_query[: sql_query.find("lateral flatten")] + # BigQuery can use # as a comment symbol and that may break the parsing + # if the comment starts without a space sign after # + # and the comment is written inside SQL DDL statement (e.g., after CREATE...AS) + sql_query = re.sub(r"#([^ ])", r"# \1", sql_query, flags=re.IGNORECASE) + + # Wrap calls to field '.from' in ` so it would not be taken as a reserved keyword + sql_query = re.sub(r"(\w*\.from)", r"`\1`", sql_query, flags=re.IGNORECASE) + + # Apply sqlparser formatting to get rid of comments and reindent keywords + # which should remove some potential inconsistencies in parsing output + sql_query = sqlparse.format( + sql_query.strip(), + reindent_aligned=True, + strip_comments=True, + ) + + # SqlLineageParser does not allow table/view names not being wrapped in quotes or backticks + # Add ` signs before and after supposed object name that comes right after reserved word FROM + # note 1: this excludes date/time/datetime extract functions like EXTRACT DATE FROM... + # note 2: this includes adding ` signs to CTE aliases + sql_query = re.sub( + r"(? None: # Replace lookml templates with the variable otherwise sqlparse can't parse ${ sql_query = re.sub(r"(\${)(.+)(})", r"\2", sql_query) if sql_query != original_sql_query: - logger.debug(f"rewrote original query {original_sql_query} as {sql_query}") + logger.debug(f"Rewrote original query {original_sql_query} as {sql_query}") self._sql = sql_query - self._stmt = [ - s - for s in sqlparse.parse( - # first apply sqlparser formatting just to get rid of comments, which cause - # inconsistencies in parsing output - sqlparse.format( - self._sql.strip(), - strip_comments=True, - use_space_around_operators=True, - ), - ) - if s.token_first(skip_cm=True) - ] + try: + self._stmt = [ + s + for s in sqlparse.parse( + sqlparse.format( + self._sql.strip(), + use_space_around_operators=True, + ), + ) + if s.token_first(skip_cm=True) + ] - with unittest.mock.patch( - "sqllineage.core.handlers.source.SourceHandler.end_of_query_cleanup", - datahub.utilities.sqllineage_patch.end_of_query_cleanup_patch, - ): with unittest.mock.patch( - "sqllineage.core.holders.SubQueryLineageHolder.add_column_lineage", - datahub.utilities.sqllineage_patch.add_column_lineage_patch, + "sqllineage.core.handlers.source.SourceHandler.end_of_query_cleanup", + datahub.utilities.sqllineage_patch.end_of_query_cleanup_patch, ): - self._stmt_holders = [ - LineageAnalyzer().analyze(stmt) for stmt in self._stmt - ] - self._sql_holder = SQLLineageHolder.of(*self._stmt_holders) + with unittest.mock.patch( + "sqllineage.core.holders.SubQueryLineageHolder.add_column_lineage", + datahub.utilities.sqllineage_patch.add_column_lineage_patch, + ): + self._stmt_holders = [ + LineageAnalyzer().analyze(stmt) for stmt in self._stmt + ] + self._sql_holder = SQLLineageHolder.of(*self._stmt_holders) + except SQLLineageException as e: + logger.error(f"SQL lineage analyzer error '{e}' for query: '{self._sql}") def get_tables(self) -> List[str]: result: List[str] = list() diff --git a/metadata-ingestion/tests/unit/test_utilities.py b/metadata-ingestion/tests/unit/test_utilities.py index 491e81d3ac57d4..4c15bec9cfa3e0 100644 --- a/metadata-ingestion/tests/unit/test_utilities.py +++ b/metadata-ingestion/tests/unit/test_utilities.py @@ -391,10 +391,188 @@ def test_with_keyword_data(): SELECT *, 'foo' AS bar - FROM `project.example_dataset.another_example_table_vw` + FROM `project.example_dataset.example_table` ) SELECT * FROM data """ ) - assert parser.get_tables() == ["project.example_dataset.another_example_table_vw"] + assert parser.get_tables() == ["project.example_dataset.example_table"] + + +def test_with_keyword_admin(): + parser = SqlLineageSQLParser( + sql_query=""" + WITH admin AS ( + SELECT * + FROM `project.example_dataset.example_table` + ) + SELECT * FROM admin + """ + ) + + assert parser.get_tables() == ["project.example_dataset.example_table"] + + +def test_sqllineage_sql_parser_create_or_replace_table_name_not_wrapped_in_backticks(): + parser = SqlLineageSQLParser( + sql_query=""" + CREATE OR REPLACE TABLE project.dataset.test_table AS + WITH cte AS ( + SELECT + *, + EXTRACT(MICROSECOND FROM time_field) AS col_1, + EXTRACT(MILLISECOND FROM time_field) AS col_2, + EXTRACT(SECOND FROM time_field) AS col_3, + EXTRACT(MINUTE FROM time_field) AS col_4, + EXTRACT(HOUR FROM time_field) AS col_5, + EXTRACT(DAYOFWEEK FROM time_field) AS col_6, + EXTRACT(DAY FROM time_field) AS col_7, + EXTRACT(DAYOFYEAR FROM time_field) AS col_8, + EXTRACT(WEEK FROM time_field) AS col_9, + EXTRACT(WEEK FROM time_field) AS col_10, + EXTRACT(ISOWEEK FROM time_field) AS col_11, + EXTRACT(MONTH FROM time_field) AS col_12, + EXTRACT(QUARTER FROM time_field) AS col_13, + EXTRACT(YEAR FROM time_field) AS col_14, + EXTRACT(ISOYEAR FROM time_field) AS col_15, + EXTRACT(DATE FROM time_field) AS col_16, + EXTRACT(TIME FROM time_field) AS col_17 + FROM project.dataset.src_table_a + ) + SELECT * FROM cte + UNION + SELECT * + FROM project.dataset.src_table_b + UNION + SELECT * FROM `project.dataset.src_table_c` + """ + ) + + assert parser.get_tables() == [ + "project.dataset.src_table_a", + "project.dataset.src_table_b", + "project.dataset.src_table_c", + ] + + +def test_sqllineage_sql_parser_create_or_replace_view_name_not_wrapped_in_backticks(): + parser = SqlLineageSQLParser( + sql_query=""" + CREATE OR REPLACE VIEW project.dataset.test_view AS + WITH cte AS ( + SELECT + *, + 'foo' AS bar + FROM project.dataset.src_table_a + ) + SELECT * FROM cte + UNION + SELECT * + FROM project.dataset.src_table_b + UNION + SELECT * FROM `project.dataset.src_table_c` + """ + ) + + assert parser.get_tables() == [ + "project.dataset.src_table_a", + "project.dataset.src_table_b", + "project.dataset.src_table_c", + ] + + +def test_sqllineage_sql_parser_create_table_name_not_wrapped_in_backticks(): + parser = SqlLineageSQLParser( + sql_query=""" + CREATE TABLE project.dataset.test_table AS + WITH cte AS ( + SELECT + *, + 'foo' AS bar + FROM project.dataset.src_table_a + ) + SELECT * FROM cte + UNION + SELECT * + FROM project.dataset.src_table_b + UNION + SELECT * FROM `project.dataset.src_table_c` + """ + ) + + assert parser.get_tables() == [ + "project.dataset.src_table_a", + "project.dataset.src_table_b", + "project.dataset.src_table_c", + ] + + +def test_sqllineage_sql_parser_create_view_name_not_wrapped_in_backticks(): + parser = SqlLineageSQLParser( + sql_query=""" + CREATE VIEW project.dataset.test_view AS + WITH cte AS ( + SELECT + *, + 'foo' AS bar + FROM project.dataset.src_table_a + ) + SELECT * FROM cte + UNION + SELECT * + FROM project.dataset.src_table_b + UNION + SELECT * FROM `project.dataset.src_table_c` + """ + ) + + assert parser.get_tables() == [ + "project.dataset.src_table_a", + "project.dataset.src_table_b", + "project.dataset.src_table_c", + ] + + +def test_sqllineage_sql_parser_from_as_column_name_is_escaped(): + parser = SqlLineageSQLParser( + sql_query=""" + CREATE TABLE project.dataset.test_table AS + SELECT x.from AS col + FROM project.dataset.src_table AS x + """ + ) + + assert parser.get_tables() == [ + "project.dataset.src_table", + ] + + +def test_sqllineage_sql_parser_cte_alias_near_with_keyword_is_escaped(): + parser = SqlLineageSQLParser( + sql_query=""" + create or replace view `project.dataset.test_view` as + with map as ( + select col_1 + col_2 + from `project.dataset.src_table_a` a + join `project.dataset.src_table_b` b + on a.col_1 = b.col_2 + ), + cte_2 as ( + select * from `project.dataset.src_table_c` + ) + select * from map + union all + select * from cte_2 + """ + ) + + # results would (somehow) also contain "cte_2" which should be considered a subquery rather than a table + assert set( + [ + "project.dataset.src_table_a", + "project.dataset.src_table_b", + "project.dataset.src_table_c", + ] + ).issubset(parser.get_tables()) From 89816e32668d101e2d2c257504e0d654b0949e9a Mon Sep 17 00:00:00 2001 From: Vladislavs Gaidass Date: Fri, 29 Apr 2022 12:31:03 +0200 Subject: [PATCH 3/3] fix(bigquery): isolate bigquery-relevant sql parser to its own class --- .../datahub/ingestion/source/sql/bigquery.py | 4 +- .../datahub/utilities/bigquery_sql_parser.py | 80 ++++++ .../utilities/sql_lineage_parser_impl.py | 43 +-- .../tests/unit/test_bigquery_sql_lineage.py | 119 +++++++++ .../tests/unit/test_bigquery_sql_parser.py | 245 ++++++++++++++++++ .../tests/unit/test_utilities.py | 213 --------------- 6 files changed, 449 insertions(+), 255 deletions(-) create mode 100644 metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py create mode 100644 metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py create mode 100644 metadata-ingestion/tests/unit/test_bigquery_sql_parser.py diff --git a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py index 772f59538687c6..ce13cadebdc91b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py +++ b/metadata-ingestion/src/datahub/ingestion/source/sql/bigquery.py @@ -54,7 +54,7 @@ UpstreamClass, UpstreamLineageClass, ) -from datahub.utilities.sql_parser import DefaultSQLParser +from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser logger = logging.getLogger(__name__) @@ -550,7 +550,7 @@ def _create_lineage_map(self, entries: Iterable[QueryEvent]) -> Dict[str, Set[st # If there is a view being referenced then bigquery sends both the view as well as underlying table # in the references. There is no distinction between direct/base objects accessed. So doing sql parsing # to ensure we only use direct objects accessed for lineage - parser = DefaultSQLParser(e.query) + parser = BigQuerySQLParser(e.query) referenced_objs = set( map(lambda x: x.split(".")[-1], parser.get_tables()) ) diff --git a/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py b/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py new file mode 100644 index 00000000000000..b6245977c5b22d --- /dev/null +++ b/metadata-ingestion/src/datahub/utilities/bigquery_sql_parser.py @@ -0,0 +1,80 @@ +import re +from typing import List + +import sqlparse + +from datahub.utilities.sql_parser import SqlLineageSQLParser, SQLParser + + +class BigQuerySQLParser(SQLParser): + parser: SQLParser + + def __init__(self, sql_query: str) -> None: + super().__init__(sql_query) + + self._parsed_sql_query = self.parse_sql_query(sql_query) + self.parser = SqlLineageSQLParser(self._parsed_sql_query) + + def parse_sql_query(self, sql_query: str) -> str: + sql_query = BigQuerySQLParser._parse_bigquery_comment_sign(sql_query) + sql_query = BigQuerySQLParser._escape_keyword_from_as_field_name(sql_query) + sql_query = BigQuerySQLParser._escape_cte_name_after_keyword_with(sql_query) + + sql_query = sqlparse.format( + sql_query.strip(), + reindent_aligned=True, + strip_comments=True, + ) + + sql_query = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( + sql_query + ) + sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from(sql_query) + + return sql_query + + @staticmethod + def _parse_bigquery_comment_sign(sql_query: str) -> str: + return re.sub(r"#(.*)", r"-- \1", sql_query, flags=re.IGNORECASE) + + @staticmethod + def _escape_keyword_from_as_field_name(sql_query: str) -> str: + return re.sub(r"(\w*\.from)", r"`\1`", sql_query, flags=re.IGNORECASE) + + @staticmethod + def _escape_cte_name_after_keyword_with(sql_query: str) -> str: + """ + Escape the first cte name in case it is one of reserved words + """ + return re.sub(r"(with\s)([^`\s()]+)", r"\1`\2`", sql_query, flags=re.IGNORECASE) + + @staticmethod + def _escape_table_or_view_name_at_create_statement(sql_query: str) -> str: + """ + Reason: in case table name contains hyphens which breaks sqllineage later on + """ + return re.sub( + r"(create.*\s)(table\s|view\s)([^`\s()]+)(?=\sas)", + r"\1\2`\3`", + sql_query, + flags=re.IGNORECASE, + ) + + @staticmethod + def _escape_object_name_after_keyword_from(sql_query: str) -> str: + """ + Reason: in case table name contains hyphens which breaks sqllineage later on + Note: ignore cases of having keyword FROM as part of datetime function EXTRACT + """ + return re.sub( + r"(? List[str]: + return self.parser.get_tables() + + def get_columns(self) -> List[str]: + return self.parser.get_columns() diff --git a/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py b/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py index c5961cfa8cd63f..412c8fcf465ef2 100644 --- a/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py +++ b/metadata-ingestion/src/datahub/utilities/sql_lineage_parser_impl.py @@ -35,46 +35,6 @@ def __init__(self, sql_query: str) -> None: if "lateral flatten" in sql_query: sql_query = sql_query[: sql_query.find("lateral flatten")] - # BigQuery can use # as a comment symbol and that may break the parsing - # if the comment starts without a space sign after # - # and the comment is written inside SQL DDL statement (e.g., after CREATE...AS) - sql_query = re.sub(r"#([^ ])", r"# \1", sql_query, flags=re.IGNORECASE) - - # Wrap calls to field '.from' in ` so it would not be taken as a reserved keyword - sql_query = re.sub(r"(\w*\.from)", r"`\1`", sql_query, flags=re.IGNORECASE) - - # Apply sqlparser formatting to get rid of comments and reindent keywords - # which should remove some potential inconsistencies in parsing output - sql_query = sqlparse.format( - sql_query.strip(), - reindent_aligned=True, - strip_comments=True, - ) - - # SqlLineageParser does not allow table/view names not being wrapped in quotes or backticks - # Add ` signs before and after supposed object name that comes right after reserved word FROM - # note 1: this excludes date/time/datetime extract functions like EXTRACT DATE FROM... - # note 2: this includes adding ` signs to CTE aliases - sql_query = re.sub( - r"(? None: self._stmt = [ s for s in sqlparse.parse( + # first apply sqlparser formatting just to get rid of comments, which cause + # inconsistencies in parsing output sqlparse.format( self._sql.strip(), + strip_comments=True, use_space_around_operators=True, ), ) diff --git a/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py b/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py new file mode 100644 index 00000000000000..c2c6d6bd7c8680 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_bigquery_sql_lineage.py @@ -0,0 +1,119 @@ +from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser + + +def test_bigquery_sql_lineage_hash_as_comment_sign_is_accepted(): + parser = BigQuerySQLParser( + sql_query=""" +/* +HERE IS A STANDARD COMMENT BLOCK +THIS WILL NOT BREAK sqllineage +*/ +CREATE OR REPLACE TABLE `project.dataset.trg_tbl`AS +#This, comment will not break sqllineage +SELECT foo +-- this comment will not break sqllineage either +# this comment will not break sqllineage either +FROM `project.dataset.src_tbl` + """ + ) + + assert parser.get_tables() == ["project.dataset.src_tbl"] + + +def test_bigquery_sql_lineage_keyword_data_is_accepted(): + parser = BigQuerySQLParser( + sql_query=""" + WITH data AS ( + SELECT + *, + 'foo' AS bar + FROM `project.example_dataset.example_table` + ) + SELECT * FROM data + """ + ) + + assert parser.get_tables() == ["project.example_dataset.example_table"] + + +def test_bigquery_sql_lineage_keyword_admin_is_accepted(): + parser = BigQuerySQLParser( + sql_query=""" + WITH admin AS ( + SELECT * + FROM `project.example_dataset.example_table` + ) + SELECT * FROM admin + """ + ) + + assert parser.get_tables() == ["project.example_dataset.example_table"] + + +def test_bigquery_sql_lineage_cte_alias_as_keyword_is_accepted(): + parser = BigQuerySQLParser( + sql_query=""" +CREATE OR REPLACE TABLE `project.dataset.test_table` AS +WITH map AS ( + SELECT a.col_1, + b.col_2 + FROM ( + SELECT DISTINCT * + FROM ( + SELECT col_1 + FROM `project.dataset.source_table_a` + ) + ) a + JOIN `project.dataset.source_table_b` b + ON a.col_1 = b.col_1 + ) +SELECT * + FROM map + """ + ) + + assert parser.get_tables() == [ + "project.dataset.source_table_a", + "project.dataset.source_table_b", + ] + + +def test_bigquery_sql_lineage_create_or_replace_view_name_with_hyphens_is_accepted(): + parser = BigQuerySQLParser( + sql_query=""" + CREATE OR REPLACE VIEW test-project.dataset.test_view AS + SELECT * + FROM project.dataset.src_table_a + UNION + SELECT * FROM `project.dataset.src_table_b` + """ + ) + + assert parser.get_tables() == [ + "project.dataset.src_table_a", + "project.dataset.src_table_b", + ] + + +def test_bigquery_sql_lineage_source_table_name_with_hyphens_is_accepted(): + parser = BigQuerySQLParser( + sql_query=""" + CREATE OR REPLACE VIEW `project.dataset.test_view` AS + SELECT * + FROM test-project.dataset.src_table + """ + ) + + assert parser.get_tables() == ["test-project.dataset.src_table"] + + +def test_bigquery_sql_lineage_from_as_column_name_is_accepted(): + parser = BigQuerySQLParser( + sql_query=""" + CREATE OR REPLACE VIEW `project.dataset.test_view` AS + SELECT x.from AS col + FROM project.dataset.src_table AS x + """ + ) + + assert parser.get_tables() == ["project.dataset.src_table"] diff --git a/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py b/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py new file mode 100644 index 00000000000000..2f0ea07da1f5d7 --- /dev/null +++ b/metadata-ingestion/tests/unit/test_bigquery_sql_parser.py @@ -0,0 +1,245 @@ +from datahub.utilities.bigquery_sql_parser import BigQuerySQLParser + + +def test_bigquery_sql_parser_comments_are_removed(): + parser = BigQuerySQLParser( + sql_query=""" +/* +HERE IS A STANDARD COMMENT BLOCK +THIS WILL NOT BREAK sqllineage +*/ +CREATE OR REPLACE TABLE `project.dataset.test_view` AS +#This, comment will not break sqllineage +SELECT foo +-- this comment will not break sqllineage either +# this comment will not break sqllineage either + FROM `project.dataset.src_table` +""" + ) + + assert ( + parser._parsed_sql_query + == """CREATE OR REPLACE TABLE `project.dataset.test_view` AS SELECT foo + FROM `project.dataset.src_table`""" + ) + + +def test_bigquery_sql_parser_formats_input_sql(): + parser = BigQuerySQLParser( + sql_query=""" +CREATE OR REPLACE TABLE `project.dataset.test_view` AS +SELECT foo FROM `project.dataset.src_table_a` AS a +INNER JOIN `project.dataset.src_table_b` AS b ON a.key_field = b.key_field +""" + ) + + assert ( + parser._parsed_sql_query + == """CREATE OR REPLACE TABLE `project.dataset.test_view` AS SELECT foo + FROM `project.dataset.src_table_a` AS a + INNER JOIN `project.dataset.src_table_b` AS b + ON a.key_field = b.key_field""" + ) + + +def test_bigquery_sql_parser_comment_sign_switched_correctly(): + sql_query = BigQuerySQLParser._parse_bigquery_comment_sign( + """ +#upper comment +SELECT * FROM hello +# lower comment +""" + ) + + assert ( + sql_query + == """ +-- upper comment +SELECT * FROM hello +-- lower comment +""" + ) + + +def test_bigquery_sql_parser_keyword_from_is_escaped_if_used_as_fieldname(): + sql_query = BigQuerySQLParser._escape_keyword_from_as_field_name( + """ +SELECT hello.from AS col FROM hello +""" + ) + + assert ( + sql_query + == """ +SELECT `hello.from` AS col FROM hello +""" + ) + + +def test_bigquery_sql_parser_first_cte_name_is_escaped(): + sql_query = BigQuerySQLParser._escape_cte_name_after_keyword_with( + """ +CREATE OR REPLACE VIEW `test_view` AS +WITH cte_1 AS ( + SELECT * FROM foo +), +cte_2 AS ( + SELECT * FROM bar +) +SELECT * FROM cte_1 UNION ALL +SELECT * FROM cte_2 +""" + ) + + assert ( + sql_query + == """ +CREATE OR REPLACE VIEW `test_view` AS +WITH `cte_1` AS ( + SELECT * FROM foo +), +cte_2 AS ( + SELECT * FROM bar +) +SELECT * FROM cte_1 UNION ALL +SELECT * FROM cte_2 +""" + ) + + +def test_bigquery_sql_parser_table_name_is_escaped_at_create_statement(): + sql_query_create = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( + """ +CREATE TABLE project.dataset.test_table AS +col_1 STRING, +col_2 STRING +""" + ) + + sql_query_create_or_replace = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( + """ +CREATE OR REPLACE TABLE project.dataset.test_table AS +col_1 STRING, +col_2 STRING +""" + ) + + assert ( + sql_query_create + == """ +CREATE TABLE `project.dataset.test_table` AS +col_1 STRING, +col_2 STRING +""" + ) + assert ( + sql_query_create_or_replace + == """ +CREATE OR REPLACE TABLE `project.dataset.test_table` AS +col_1 STRING, +col_2 STRING +""" + ) + + +def test_bigquery_sql_parser_view_name_is_escaped_at_create_statement(): + sql_query_create = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( + """ +CREATE VIEW project.dataset.test_view AS +SELECT * FROM project.dataset.src_table +""" + ) + + sql_query_create_or_replace = BigQuerySQLParser._escape_table_or_view_name_at_create_statement( + """ +CREATE OR REPLACE VIEW project.dataset.test_view AS +SELECT * FROM project.dataset.src_table +""" + ) + + assert ( + sql_query_create + == """ +CREATE VIEW `project.dataset.test_view` AS +SELECT * FROM project.dataset.src_table +""" + ) + assert ( + sql_query_create_or_replace + == """ +CREATE OR REPLACE VIEW `project.dataset.test_view` AS +SELECT * FROM project.dataset.src_table +""" + ) + + +def test_bigquery_sql_parser_object_name_is_escaped_after_keyword_from(): + sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from( + """ +CREATE OR REPLACE VIEW `project.dataset.test_view` AS +SELECT * FROM src-project.dataset.src_table_a UNION ALL +SELECT * FROM project.dataset.src_table_b +""" + ) + + assert ( + sql_query + == """ +CREATE OR REPLACE VIEW `project.dataset.test_view` AS +SELECT * FROM `src-project.dataset.src_table_a` UNION ALL +SELECT * FROM `project.dataset.src_table_b` +""" + ) + + +def test_bigquery_sql_parser_field_name_is_not_escaped_after_keyword_from_in_datetime_functions(): + sql_query = BigQuerySQLParser._escape_object_name_after_keyword_from( + """ +CREATE OR REPLACE VIEW `project.dataset.test_view` AS +SELECT +EXTRACT(MICROSECOND FROM time_field) AS col_1, +EXTRACT(MILLISECOND FROM time_field) AS col_2, +EXTRACT(SECOND FROM time_field) AS col_3, +EXTRACT(MINUTE FROM time_field) AS col_4, +EXTRACT(HOUR FROM time_field) AS col_5, +EXTRACT(DAYOFWEEK FROM time_field) AS col_6, +EXTRACT(DAY FROM time_field) AS col_7, +EXTRACT(DAYOFYEAR FROM time_field) AS col_8, +EXTRACT(WEEK FROM time_field) AS col_9, +EXTRACT(WEEK FROM time_field) AS col_10, +EXTRACT(ISOWEEK FROM time_field) AS col_11, +EXTRACT(MONTH FROM time_field) AS col_12, +EXTRACT(QUARTER FROM time_field) AS col_13, +EXTRACT(YEAR FROM time_field) AS col_14, +EXTRACT(ISOYEAR FROM time_field) AS col_15, +EXTRACT(DATE FROM time_field) AS col_16, +EXTRACT(TIME FROM time_field) AS col_17 +FROM src-project.dataset.src_table_a +""" + ) + + assert ( + sql_query + == """ +CREATE OR REPLACE VIEW `project.dataset.test_view` AS +SELECT +EXTRACT(MICROSECOND FROM time_field) AS col_1, +EXTRACT(MILLISECOND FROM time_field) AS col_2, +EXTRACT(SECOND FROM time_field) AS col_3, +EXTRACT(MINUTE FROM time_field) AS col_4, +EXTRACT(HOUR FROM time_field) AS col_5, +EXTRACT(DAYOFWEEK FROM time_field) AS col_6, +EXTRACT(DAY FROM time_field) AS col_7, +EXTRACT(DAYOFYEAR FROM time_field) AS col_8, +EXTRACT(WEEK FROM time_field) AS col_9, +EXTRACT(WEEK FROM time_field) AS col_10, +EXTRACT(ISOWEEK FROM time_field) AS col_11, +EXTRACT(MONTH FROM time_field) AS col_12, +EXTRACT(QUARTER FROM time_field) AS col_13, +EXTRACT(YEAR FROM time_field) AS col_14, +EXTRACT(ISOYEAR FROM time_field) AS col_15, +EXTRACT(DATE FROM time_field) AS col_16, +EXTRACT(TIME FROM time_field) AS col_17 +FROM `src-project.dataset.src_table_a` +""" + ) diff --git a/metadata-ingestion/tests/unit/test_utilities.py b/metadata-ingestion/tests/unit/test_utilities.py index 4c15bec9cfa3e0..0790fe12981f9a 100644 --- a/metadata-ingestion/tests/unit/test_utilities.py +++ b/metadata-ingestion/tests/unit/test_utilities.py @@ -363,216 +363,3 @@ def test_sqllineage_sql_parser_tables_from_redash_query(): table_list = SqlLineageSQLParser(sql_query).get_tables() table_list.sort() assert table_list == ["order_items", "orders", "staffs"] - - -def test_hash_in_sql_query_with_no_space(): - parser = SqlLineageSQLParser( - sql_query=""" -/* -HERE IS A STANDARD COMMENT BLOCK -THIS WILL NOT BREAK sqllineage -*/ -CREATE OR REPLACE TABLE `foo.bar.trg_tbl`AS -#This, comment will not break sqllineage -SELECT foo --- this comment will not break sqllineage either -# this comment will not break sqllineage either -FROM `foo.bar.src_tbl` - """ - ) - - assert parser.get_tables() == ["foo.bar.src_tbl"] - - -def test_with_keyword_data(): - parser = SqlLineageSQLParser( - sql_query=""" - WITH data AS ( - SELECT - *, - 'foo' AS bar - FROM `project.example_dataset.example_table` - ) - SELECT * FROM data - """ - ) - - assert parser.get_tables() == ["project.example_dataset.example_table"] - - -def test_with_keyword_admin(): - parser = SqlLineageSQLParser( - sql_query=""" - WITH admin AS ( - SELECT * - FROM `project.example_dataset.example_table` - ) - SELECT * FROM admin - """ - ) - - assert parser.get_tables() == ["project.example_dataset.example_table"] - - -def test_sqllineage_sql_parser_create_or_replace_table_name_not_wrapped_in_backticks(): - parser = SqlLineageSQLParser( - sql_query=""" - CREATE OR REPLACE TABLE project.dataset.test_table AS - WITH cte AS ( - SELECT - *, - EXTRACT(MICROSECOND FROM time_field) AS col_1, - EXTRACT(MILLISECOND FROM time_field) AS col_2, - EXTRACT(SECOND FROM time_field) AS col_3, - EXTRACT(MINUTE FROM time_field) AS col_4, - EXTRACT(HOUR FROM time_field) AS col_5, - EXTRACT(DAYOFWEEK FROM time_field) AS col_6, - EXTRACT(DAY FROM time_field) AS col_7, - EXTRACT(DAYOFYEAR FROM time_field) AS col_8, - EXTRACT(WEEK FROM time_field) AS col_9, - EXTRACT(WEEK FROM time_field) AS col_10, - EXTRACT(ISOWEEK FROM time_field) AS col_11, - EXTRACT(MONTH FROM time_field) AS col_12, - EXTRACT(QUARTER FROM time_field) AS col_13, - EXTRACT(YEAR FROM time_field) AS col_14, - EXTRACT(ISOYEAR FROM time_field) AS col_15, - EXTRACT(DATE FROM time_field) AS col_16, - EXTRACT(TIME FROM time_field) AS col_17 - FROM project.dataset.src_table_a - ) - SELECT * FROM cte - UNION - SELECT * - FROM project.dataset.src_table_b - UNION - SELECT * FROM `project.dataset.src_table_c` - """ - ) - - assert parser.get_tables() == [ - "project.dataset.src_table_a", - "project.dataset.src_table_b", - "project.dataset.src_table_c", - ] - - -def test_sqllineage_sql_parser_create_or_replace_view_name_not_wrapped_in_backticks(): - parser = SqlLineageSQLParser( - sql_query=""" - CREATE OR REPLACE VIEW project.dataset.test_view AS - WITH cte AS ( - SELECT - *, - 'foo' AS bar - FROM project.dataset.src_table_a - ) - SELECT * FROM cte - UNION - SELECT * - FROM project.dataset.src_table_b - UNION - SELECT * FROM `project.dataset.src_table_c` - """ - ) - - assert parser.get_tables() == [ - "project.dataset.src_table_a", - "project.dataset.src_table_b", - "project.dataset.src_table_c", - ] - - -def test_sqllineage_sql_parser_create_table_name_not_wrapped_in_backticks(): - parser = SqlLineageSQLParser( - sql_query=""" - CREATE TABLE project.dataset.test_table AS - WITH cte AS ( - SELECT - *, - 'foo' AS bar - FROM project.dataset.src_table_a - ) - SELECT * FROM cte - UNION - SELECT * - FROM project.dataset.src_table_b - UNION - SELECT * FROM `project.dataset.src_table_c` - """ - ) - - assert parser.get_tables() == [ - "project.dataset.src_table_a", - "project.dataset.src_table_b", - "project.dataset.src_table_c", - ] - - -def test_sqllineage_sql_parser_create_view_name_not_wrapped_in_backticks(): - parser = SqlLineageSQLParser( - sql_query=""" - CREATE VIEW project.dataset.test_view AS - WITH cte AS ( - SELECT - *, - 'foo' AS bar - FROM project.dataset.src_table_a - ) - SELECT * FROM cte - UNION - SELECT * - FROM project.dataset.src_table_b - UNION - SELECT * FROM `project.dataset.src_table_c` - """ - ) - - assert parser.get_tables() == [ - "project.dataset.src_table_a", - "project.dataset.src_table_b", - "project.dataset.src_table_c", - ] - - -def test_sqllineage_sql_parser_from_as_column_name_is_escaped(): - parser = SqlLineageSQLParser( - sql_query=""" - CREATE TABLE project.dataset.test_table AS - SELECT x.from AS col - FROM project.dataset.src_table AS x - """ - ) - - assert parser.get_tables() == [ - "project.dataset.src_table", - ] - - -def test_sqllineage_sql_parser_cte_alias_near_with_keyword_is_escaped(): - parser = SqlLineageSQLParser( - sql_query=""" - create or replace view `project.dataset.test_view` as - with map as ( - select col_1 - col_2 - from `project.dataset.src_table_a` a - join `project.dataset.src_table_b` b - on a.col_1 = b.col_2 - ), - cte_2 as ( - select * from `project.dataset.src_table_c` - ) - select * from map - union all - select * from cte_2 - """ - ) - - # results would (somehow) also contain "cte_2" which should be considered a subquery rather than a table - assert set( - [ - "project.dataset.src_table_a", - "project.dataset.src_table_b", - "project.dataset.src_table_c", - ] - ).issubset(parser.get_tables())