From f8028be49cc093b9091c4502fd5b34d65265896f Mon Sep 17 00:00:00 2001 From: Ethan Cartwright Date: Mon, 15 Jul 2024 11:09:56 -0400 Subject: [PATCH 1/4] add flag --- .../ingestion/source/snowflake/snowflake_config.py | 5 +++++ .../ingestion/source/snowflake/snowflake_schema.py | 8 ++++++-- .../ingestion/source/snowflake/snowflake_schema_gen.py | 4 +++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py index 588187e8e11c2..6bf60a1503eda 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_config.py @@ -90,6 +90,11 @@ class SnowflakeV2Config( description="If enabled, populates the snowflake usage statistics. Requires appropriate grants given to the role.", ) + include_view_definitions: bool = Field( + default=True, + description="If enabled, populates the ingested views' definitions.", + ) + include_technical_schema: bool = Field( default=True, description="If enabled, populates the snowflake technical schema and descriptions.", diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index 4bc684a22514c..bf6605e317bad 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -329,7 +329,9 @@ def get_tables_for_schema( return tables @serialized_lru_cache(maxsize=1) - def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]]: + def get_views_for_database( + self, db_name: str, include_view_definitions: bool + ) -> Dict[str, List[SnowflakeView]]: page_limit = SHOW_VIEWS_MAX_PAGE_SIZE views: Dict[str, List[SnowflakeView]] = {} @@ -362,7 +364,9 @@ def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]] created=view["created_on"], # last_altered=table["last_altered"], comment=view["comment"], - view_definition=view["text"], + view_definition=( + view["text"] if include_view_definitions else None + ), last_altered=view["created_on"], materialized=( view.get("is_materialized", "false").lower() == "true" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index b6f16cd671b8d..de365360cee7f 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -1009,7 +1009,9 @@ def get_tables_for_schema( def get_views_for_schema( self, schema_name: str, db_name: str ) -> List[SnowflakeView]: - views = self.data_dictionary.get_views_for_database(db_name) + views = self.data_dictionary.get_views_for_database( + db_name, self.config.include_view_definitions + ) # Some schema may not have any table return views.get(schema_name, []) From 70b1b11bc301ea9cdc01e79d5f8ab7334b4f2edf Mon Sep 17 00:00:00 2001 From: Ethan Cartwright Date: Mon, 15 Jul 2024 14:48:23 -0400 Subject: [PATCH 2/4] include/exclude at generation time rather than fetch time --- .../source/snowflake/snowflake_schema.py | 14 +++++------- .../source/snowflake/snowflake_schema_gen.py | 22 ++++++++++--------- 2 files changed, 17 insertions(+), 19 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index 8bde620880c7d..a38e87b512a49 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -132,9 +132,9 @@ def __init__(self) -> None: ) # self._table_tags[][][] = list of tags applied to table - self._table_tags: Dict[ - str, Dict[str, Dict[str, List[SnowflakeTag]]] - ] = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = ( + defaultdict(lambda: defaultdict(lambda: defaultdict(list))) + ) # self._column_tags[][][][] = list of tags applied to column self._column_tags: Dict[ @@ -320,9 +320,7 @@ def get_tables_for_schema( return tables @serialized_lru_cache(maxsize=1) - def get_views_for_database( - self, db_name: str, include_view_definitions: bool - ) -> Dict[str, List[SnowflakeView]]: + def get_views_for_database(self, db_name: str) -> Dict[str, List[SnowflakeView]]: page_limit = SHOW_VIEWS_MAX_PAGE_SIZE views: Dict[str, List[SnowflakeView]] = {} @@ -355,9 +353,7 @@ def get_views_for_database( created=view["created_on"], # last_altered=table["last_altered"], comment=view["comment"], - view_definition=( - view["text"] if include_view_definitions else None - ), + view_definition=view["text"], last_altered=view["created_on"], materialized=( view.get("is_materialized", "false").lower() == "true" diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 69467b0e98c09..67f0a1635fa76 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -171,9 +171,9 @@ def __init__( config, self.data_dictionary, self.report ) self.profiler: Optional[SnowflakeProfiler] = profiler - self.snowsight_url_builder: Optional[ - SnowsightUrlBuilder - ] = snowsight_url_builder + self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = ( + snowsight_url_builder + ) # These are populated as side-effects of get_workunits_internal. self.databases: List[SnowflakeDatabase] = [] @@ -228,9 +228,9 @@ def get_databases(self) -> Optional[List[SnowflakeDatabase]]: ) return None else: - ischema_databases: List[ - SnowflakeDatabase - ] = self.get_databases_from_ischema(databases) + ischema_databases: List[SnowflakeDatabase] = ( + self.get_databases_from_ischema(databases) + ) if len(ischema_databases) == 0: self.report_error( @@ -753,7 +753,11 @@ def gen_dataset_workunits( view_properties_aspect = ViewProperties( materialized=table.materialized, viewLanguage="SQL", - viewLogic=table.view_definition, + viewLogic=( + table.view_definition + if self.config.include_view_definitions + else None + ), ) yield MetadataChangeProposalWrapper( @@ -1017,9 +1021,7 @@ def get_tables_for_schema( def get_views_for_schema( self, schema_name: str, db_name: str ) -> List[SnowflakeView]: - views = self.data_dictionary.get_views_for_database( - db_name, self.config.include_view_definitions - ) + views = self.data_dictionary.get_views_for_database(db_name) # Some schema may not have any table return views.get(schema_name, []) From cf95fe47c49eb773eec91f0623bcfe03fc013b5e Mon Sep 17 00:00:00 2001 From: Ethan Cartwright Date: Mon, 15 Jul 2024 16:21:45 -0400 Subject: [PATCH 3/4] fix type issue and linting --- .../ingestion/source/snowflake/snowflake_schema.py | 6 +++--- .../source/snowflake/snowflake_schema_gen.py | 12 ++++++------ .../pegasus/com/linkedin/dataset/ViewProperties.pdl | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py index a38e87b512a49..ce8f20d23aa6b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema.py @@ -132,9 +132,9 @@ def __init__(self) -> None: ) # self._table_tags[][][] = list of tags applied to table - self._table_tags: Dict[str, Dict[str, Dict[str, List[SnowflakeTag]]]] = ( - defaultdict(lambda: defaultdict(lambda: defaultdict(list))) - ) + self._table_tags: Dict[ + str, Dict[str, Dict[str, List[SnowflakeTag]]] + ] = defaultdict(lambda: defaultdict(lambda: defaultdict(list))) # self._column_tags[][][][] = list of tags applied to column self._column_tags: Dict[ diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 67f0a1635fa76..0719dd54eca69 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -171,9 +171,9 @@ def __init__( config, self.data_dictionary, self.report ) self.profiler: Optional[SnowflakeProfiler] = profiler - self.snowsight_url_builder: Optional[SnowsightUrlBuilder] = ( - snowsight_url_builder - ) + self.snowsight_url_builder: Optional[ + SnowsightUrlBuilder + ] = snowsight_url_builder # These are populated as side-effects of get_workunits_internal. self.databases: List[SnowflakeDatabase] = [] @@ -228,9 +228,9 @@ def get_databases(self) -> Optional[List[SnowflakeDatabase]]: ) return None else: - ischema_databases: List[SnowflakeDatabase] = ( - self.get_databases_from_ischema(databases) - ) + ischema_databases: List[ + SnowflakeDatabase + ] = self.get_databases_from_ischema(databases) if len(ischema_databases) == 0: self.report_error( diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl index b1d6a979e9a0f..5c47bcf4b93f8 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl @@ -23,7 +23,7 @@ record ViewProperties { /** * The view logic */ - viewLogic: string + viewLogic: optional string /** * The formatted view logic. This is particularly used for SQL sources, where the SQL From d17ccb1f515d0828c8c5f83369fdccbc823f7544 Mon Sep 17 00:00:00 2001 From: Ethan Cartwright Date: Mon, 15 Jul 2024 17:08:29 -0400 Subject: [PATCH 4/4] use empty str instead of None for easy bwd compat --- .../datahub/ingestion/source/snowflake/snowflake_schema_gen.py | 2 +- .../src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py index 0719dd54eca69..dcc18635de32c 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/snowflake/snowflake_schema_gen.py @@ -756,7 +756,7 @@ def gen_dataset_workunits( viewLogic=( table.view_definition if self.config.include_view_definitions - else None + else "" ), ) diff --git a/metadata-models/src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl b/metadata-models/src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl index 5c47bcf4b93f8..b1d6a979e9a0f 100644 --- a/metadata-models/src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl +++ b/metadata-models/src/main/pegasus/com/linkedin/dataset/ViewProperties.pdl @@ -23,7 +23,7 @@ record ViewProperties { /** * The view logic */ - viewLogic: optional string + viewLogic: string /** * The formatted view logic. This is particularly used for SQL sources, where the SQL