From 1e334af89cfbe318474103347623e141e591684c Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull
Date: Wed, 29 Dec 2021 17:33:40 +0100
Subject: [PATCH] Fix linting (#1958)

---
 .pylintrc                                     |   6 +-
 .../metadata/ingestion/source/sql_source.py   | 284 +++++++++++-------
 2 files changed, 175 insertions(+), 115 deletions(-)

diff --git a/.pylintrc b/.pylintrc
index 93c953b43640..2e99493ce45b 100644
--- a/.pylintrc
+++ b/.pylintrc
@@ -1,6 +1,10 @@
 [BASIC]
 # W1203: logging-fstring-interpolation - f-string brings better readability and unifies style
-disable=W1203
+# W1202: logging-format-interpolation - lazy formatting in logging functions
+disable=W1203,W1202
+docstring-min-length=20
+max-args=7
+max-attributes=12
 
 [MASTER]
 fail-under=6.0
diff --git a/ingestion/src/metadata/ingestion/source/sql_source.py b/ingestion/src/metadata/ingestion/source/sql_source.py
index 236a8fc6bffe..184004b44e1c 100644
--- a/ingestion/src/metadata/ingestion/source/sql_source.py
+++ b/ingestion/src/metadata/ingestion/source/sql_source.py
@@ -8,6 +8,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+Generic source to build SQL connectors.
+"""
 import json
 import logging
 import re
@@ -16,7 +19,7 @@
 from abc import abstractmethod
 from dataclasses import dataclass, field
 from datetime import datetime
-from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Type
+from typing import Dict, Iterable, List, Optional, Tuple
 from urllib.parse import quote_plus
 
 from pydantic import SecretStr
@@ -54,20 +57,22 @@
 
 @dataclass
 class SQLSourceStatus(SourceStatus):
+    """
+    Reports the source status after ingestion
+    """
+
     success: List[str] = field(default_factory=list)
     failures: List[str] = field(default_factory=list)
     warnings: List[str] = field(default_factory=list)
     filtered: List[str] = field(default_factory=list)
 
-    def scanned(self, table_name: str) -> None:
-        self.success.append(table_name)
-        logger.info("Table Scanned: {}".format(table_name))
+    def scanned(self, record: str) -> None:
+        self.success.append(record)
+        logger.info(f"Table Scanned: {record}")
 
-    def filter(
-        self, table_name: str, err: str, dataset_name: str = None, col_type: str = None
-    ) -> None:
-        self.filtered.append(table_name)
-        logger.warning("Dropped Table {} due to {}".format(table_name, err))
+    def filter(self, record: str, err: str) -> None:
+        self.filtered.append(record)
+        logger.warning(f"Dropped Table {record} due to {err}")
 
 
 def build_sql_source_connection_url(
@@ -76,8 +81,11 @@ def build_sql_source_connection_url(
     username: Optional[str] = None,
     password: Optional[SecretStr] = None,
     database: Optional[str] = None,
-    options: dict = {},
+    options: Optional[dict] = None,
 ) -> str:
+    """
+    Helper function to prepare the db URL
+    """
     url = f"{scheme}://"
 
     if username is not None:
@@ -100,6 +108,12 @@
 
 
 class SQLConnectionConfig(ConfigModel):
+    """
+    Config class containing all supported
+    configurations for an SQL source, including
+    data profiling and DBT generated information.
+ """ + username: Optional[str] = None password: Optional[SecretStr] = None host_port: str @@ -144,7 +158,8 @@ def _get_table_description(schema: str, table: str, inspector: Inspector) -> str description = None try: table_info: dict = inspector.get_table_comment(table, schema) - except Exception as err: + # Catch any exception without breaking the ingestion + except Exception as err: # pylint: disable=broad-except logger.error(f"Table Description Error : {err}") else: description = table_info["text"] @@ -152,6 +167,12 @@ def _get_table_description(schema: str, table: str, inspector: Inspector) -> str class SQLSource(Source[OMetaDatabaseAndTable]): + """ + Source Connector implementation to extract + Database & Table information and convert it + to OpenMetadata Entities + """ + def __init__( self, config: SQLConnectionConfig, @@ -174,26 +195,38 @@ def __init__( self.data_profiler = None self.data_models = {} if self.config.dbt_catalog_file is not None: - self.dbt_catalog = json.load(open(self.config.dbt_catalog_file, "r")) + with open(self.config.dbt_catalog_file, "r", encoding="utf-8") as catalog: + self.dbt_catalog = json.load(catalog) if self.config.dbt_manifest_file is not None: - self.dbt_manifest = json.load(open(self.config.dbt_manifest_file, "r")) + with open(self.config.dbt_manifest_file, "r", encoding="utf-8") as manifest: + self.dbt_manifest = json.load(manifest) - def _instantiate_profiler(self): + def _instantiate_profiler(self) -> bool: + """ + If the profiler is configured, load it and run. + + Return True if the profiling ran correctly + """ try: if self.config.data_profiler_enabled: if self.data_profiler is None: + # pylint: disable=import-outside-toplevel from metadata.profiler.dataprofiler import DataProfiler + # pylint: enable=import-outside-toplevel + self.data_profiler = DataProfiler( status=self.status, connection_str=self.connection_string ) return True - return False - except Exception: + # Catch any errors during profiling init and continue ingestion + except Exception as exc: # pylint: disable=broad-except logger.error( + f"Error loading profiler {exc}" "DataProfiler configuration is enabled. 
                "pip install 'openmetadata-ingestion[data-profiler]'"
            )
+        return False
 
     def prepare(self):
         self._parse_data_model()
@@ -204,15 +237,15 @@ def create(
     ):
         pass
 
-    def type_of_column_name(self, sa_type, table_name: str, column_name: str):
-        return sa_type
-
-    def standardize_schema_table_names(
-        self, schema: str, table: str
-    ) -> Tuple[str, str]:
+    @staticmethod
+    def standardize_schema_table_names(schema: str, table: str) -> Tuple[str, str]:
         return schema, table
 
-    def fetch_sample_data(self, schema: str, table: str):
+    def fetch_sample_data(self, schema: str, table: str) -> Optional[TableData]:
+        """
+        Get some sample data from the source to be added
+        to the Table Entities
+        """
         try:
             query = self.config.query.format(schema, table)
             logger.info(query)
@@ -221,14 +254,14 @@
             for col in results.keys():
                 cols.append(col.replace(".", "_DOT_"))
             rows = []
-            for r in results:
-                row = list(r)
+            for res in results:
+                row = list(res)
                 rows.append(row)
             return TableData(columns=cols, rows=rows)
-        except Exception as err:
-            logger.error(
-                "Failed to generate sample data for {} - {}".format(table, err)
-            )
+        # Catch any errors and continue the ingestion
+        except Exception as err:  # pylint: disable=broad-except
+            logger.error(f"Failed to generate sample data for {table} - {err}")
+            return None
 
     def next_record(self) -> Iterable[OMetaDatabaseAndTable]:
         inspector = inspect(self.engine)
@@ -236,7 +269,7 @@
             if not self.sql_config.schema_filter_pattern.included(schema):
                 self.status.filter(schema, "Schema pattern not allowed")
                 continue
-            logger.debug("total tables {}".format(inspector.get_table_names(schema)))
+            logger.debug(f"Total tables {inspector.get_table_names(schema)}")
             if self.config.include_tables:
                 yield from self.fetch_tables(inspector, schema)
             if self.config.include_views:
@@ -245,6 +278,10 @@
     def fetch_tables(
         self, inspector: Inspector, schema: str
     ) -> Iterable[OMetaDatabaseAndTable]:
+        """
+        Scrape an SQL schema and prepare Database and Table
+        OpenMetadata Entities
+        """
         for table_name in inspector.get_table_names(schema):
             try:
                 schema, table_name = self.standardize_schema_table_names(
                 )
                 if not self.sql_config.table_filter_pattern.included(table_name):
                     self.status.filter(
-                        "{}.{}".format(self.config.get_service_name(), table_name),
+                        f"{self.config.get_service_name()}.{table_name}",
                         "Table pattern not allowed",
                     )
                     continue
-                self.status.scanned(
-                    "{}.{}".format(self.config.get_service_name(), table_name)
-                )
+                self.status.scanned(f"{self.config.get_service_name()}.{table_name}")
 
                 description = _get_table_description(schema, table_name, inspector)
                 fqn = f"{self.config.service_name}.{self.config.database}.{schema}.{table_name}"
@@ -275,7 +310,8 @@
                 if self.sql_config.generate_sample_data:
                     table_data = self.fetch_sample_data(schema, table_name)
                     table_entity.sampleData = table_data
-                except Exception as err:
+                # Catch any errors during the ingestion and continue
+                except Exception as err:  # pylint: disable=broad-except
                     logger.error(repr(err))
                     logger.error(err)
 
@@ -291,16 +327,19 @@
                     table=table_entity, database=self._get_database(schema)
                 )
                 yield table_and_db
-            except Exception as err:
+            # Catch any errors during the ingestion and continue
+            except Exception as err:  # pylint: disable=broad-except
                 logger.error(err)
-                self.status.warnings.append(
-                    "{}.{}".format(self.config.service_name, table_name)
-                )
+                self.status.warnings.append(f"{self.config.service_name}.{table_name}")
                 continue
 
     def fetch_views(
         self, inspector: Inspector, schema: str
     ) -> Iterable[OMetaDatabaseAndTable]:
+        """
+        Get all views in the SQL schema and prepare
+        Database & Table OpenMetadata Entities
+        """
         for view_name in inspector.get_view_names(schema):
             try:
                 if self.config.scheme == "bigquery":
@@ -309,12 +348,11 @@
                     )
                 if not self.sql_config.table_filter_pattern.included(view_name):
                     self.status.filter(
-                        "{}.{}".format(self.config.get_service_name(), view_name),
+                        f"{self.config.get_service_name()}.{view_name}",
                         "View pattern not allowed",
                     )
                     continue
                 try:
-
                     if self.config.scheme == "bigquery":
                         view_definition = inspector.get_view_definition(
                             f"{self.config.project_id}.{schema}.{view_name}"
                         )
                     else:
                         view_definition = inspector.get_view_definition(
                             view_name, schema
                         )
                 except NotImplementedError:
                     view_definition = ""
 
-                description = _get_table_description(schema, view_name, inspector)
-                table_columns = self._get_columns(schema, view_name, inspector)
-                view_name = view_name.replace(".", "_DOT_")
-                fqn = f"{self.config.service_name}.{self.config.database}.{schema}.{view_name}"
                 table = Table(
                     id=uuid.uuid4(),
-                    name=view_name,
+                    name=view_name.replace(".", "_DOT_"),
                     tableType="View",
-                    description=description if description is not None else " ",
-                    fullyQualifiedName=fqn,
-                    columns=table_columns,
+                    description=_get_table_description(schema, view_name, inspector)
+                    or "",
+                    # This will be generated in the backend!! #1673
+                    fullyQualifiedName=view_name,
+                    columns=self._get_columns(schema, view_name, inspector),
                     viewDefinition=view_definition,
                 )
                 if self.sql_config.generate_sample_data:
@@ -350,31 +386,34 @@
                     table=table, database=self._get_database(schema)
                 )
                 yield table_and_db
-            except Exception as err:
+            # Catch any errors and continue the ingestion
+            except Exception as err:  # pylint: disable=broad-except
                 logger.error(err)
-                self.status.warnings.append(
-                    "{}.{}".format(self.config.service_name, view_name)
-                )
+                self.status.warnings.append(f"{self.config.service_name}.{view_name}")
                 continue
 
     def _parse_data_model(self):
-
+        """
+        Get all the DBT information and feed it to the Table Entity
+        """
         if self.config.dbt_manifest_file and self.config.dbt_catalog_file:
             logger.info("Parsing Data Models")
-            manifest_nodes = self.dbt_manifest["nodes"]
-            manifest_sources = self.dbt_manifest["sources"]
-            manifest_entities = {**manifest_nodes, **manifest_sources}
-            catalog_nodes = self.dbt_catalog["nodes"]
-            catalog_sources = self.dbt_catalog["sources"]
-            catalog_entities = {**catalog_nodes, **catalog_sources}
+            manifest_entities = {
+                **self.dbt_manifest["nodes"],
+                **self.dbt_manifest["sources"],
+            }
+            catalog_entities = {
+                **self.dbt_catalog["nodes"],
+                **self.dbt_catalog["sources"],
+            }
             for key, mnode in manifest_entities.items():
                 name = mnode["alias"] if "alias" in mnode.keys() else mnode["name"]
                 cnode = catalog_entities.get(key)
-                if cnode is not None:
-                    columns = self._parse_data_model_columns(name, mnode, cnode)
-                else:
-                    columns = []
+                columns = (
+                    self._parse_data_model_columns(name, mnode, cnode) if cnode else []
+                )
+
                 if mnode["resource_type"] == "test":
                     continue
                 upstream_nodes = self._parse_data_model_upstream(mnode)
                 model_name = (
                     mnode["alias"] if "alias" in mnode.keys() else mnode["name"]
                 )
                 model_name = model_name.replace(".", "_DOT_")
-                description = mnode.get("description", "")
                 schema = mnode["schema"]
path = f"{mnode['root_path']}/{mnode['original_file_path']}" raw_sql = mnode.get("raw_sql", "") model = DataModel( modelType=ModelType.DBT, - description=description, - path=path, + description=mnode.get("description", ""), + path=f"{mnode['root_path']}/{mnode['original_file_path']}", rawSql=raw_sql, sql=mnode.get("compiled_sql", raw_sql), columns=columns, @@ -403,12 +440,14 @@ def _parse_data_model_upstream(self, mnode): if "depends_on" in mnode and "nodes" in mnode["depends_on"]: for node in mnode["depends_on"]["nodes"]: try: - node_type, database, table = node.split(".", 2) + _, database, table = node.split(".", 2) table = table.replace(".", "_DOT_") table_fqn = f"{self.config.service_name}.{database}.{table}" upstream_nodes.append(table_fqn) - except Exception: - logger.error(f"Failed to parse the node {node} to capture lineage") + except Exception as err: # pylint: disable=broad-except + logger.error( + f"Failed to parse the node {node} to capture lineage {err}" + ) continue return upstream_nodes @@ -443,8 +482,8 @@ def _parse_data_model_columns( ordinalPosition=ccolumn["index"], ) columns.append(col) - except Exception as e: - logger.error(f"Failed to parse column type due to {e}") + except Exception as err: # pylint: disable=broad-except + logger.error(f"Failed to parse column type due to {err}") return columns @@ -454,33 +493,58 @@ def _get_database(self, schema: str) -> Database: service=EntityReference(id=self.service.id, type=self.config.service_type), ) - def parse_raw_data_type(self, raw_data_type): - return raw_data_type + @staticmethod + def _get_column_constraints( + column, pk_columns, unique_columns + ) -> Optional[Constraint]: + """ + Prepare column constraints for the Table Entity + """ + constraint = None + + if column["nullable"]: + constraint = Constraint.NULL + elif not column["nullable"]: + constraint = Constraint.NOT_NULL + + if column["name"] in pk_columns: + constraint = Constraint.PRIMARY_KEY + elif column["name"] in unique_columns: + constraint = Constraint.UNIQUE + + return constraint def _get_columns( self, schema: str, table: str, inspector: Inspector - ) -> List[Column]: + ) -> Optional[List[Column]]: + """ + Get columns types and constraints information + """ + + # Get inspector information: pk_constraints = inspector.get_pk_constraint(table, schema) + try: + unique_constraints = inspector.get_unique_constraints(table, schema) + except NotImplementedError: + logger.warning("Cannot obtain constraints - NotImplementedError") + unique_constraints = [] + pk_columns = ( pk_constraints["column_constraints"] if len(pk_constraints) > 0 and "column_constraints" in pk_constraints.keys() else {} ) - unique_constraints = [] - try: - unique_constraints = inspector.get_unique_constraints(table, schema) - except NotImplementedError: - pass - unique_columns = [] - for constraint in unique_constraints: - if "column_names" in constraint.keys(): - unique_columns = constraint["column_names"] + + unique_columns = [ + constraint["column_names"] + for constraint in unique_constraints + if "column_names" in constraint.keys() + ] + dataset_name = f"{schema}.{table}" - columns = inspector.get_columns(table, schema) table_columns = [] - row_order = 1 try: - for column in columns: + for row_order, column in enumerate(inspector.get_columns(table, schema)): if "." in column["name"]: logger.info(f"Found '.' 
                     column["name"] = column["name"].replace(".", "_DOT_")
@@ -489,9 +553,6 @@
                 col_data_length = None
                 arr_data_type = None
                 if "raw_data_type" in column and column["raw_data_type"] is not None:
-                    column["raw_data_type"] = self.parse_raw_data_type(
-                        column["raw_data_type"]
-                    )
                     (
                         col_type,
                         data_type_display,
@@ -514,26 +575,16 @@
                             r"(?:\w*)(?:[(]*)(\w*)(?:.*)", str(column["type"])
                         ).groups()
                         data_type_display = column["type"]
-                col_constraint = None
-                if column["nullable"]:
-                    col_constraint = Constraint.NULL
-                elif not column["nullable"]:
-                    col_constraint = Constraint.NOT_NULL
-                if column["name"] in pk_columns:
-                    col_constraint = Constraint.PRIMARY_KEY
-                elif column["name"] in unique_columns:
-                    col_constraint = Constraint.UNIQUE
-                if col_type.upper() in ["CHAR", "VARCHAR", "BINARY", "VARBINARY"]:
+
+                col_constraint = self._get_column_constraints(
+                    column, pk_columns, unique_columns
+                )
+
+                if col_type.upper() in {"CHAR", "VARCHAR", "BINARY", "VARBINARY"}:
                     col_data_length = column["type"].length
                     if col_data_length is None:
                         col_data_length = 1
                 try:
-                    if col_type == "NULL":
-                        col_type = self.type_of_column_name(
-                            col_type,
-                            column_name=column["name"],
-                            table_name=dataset_name,
-                        )
                     if col_type == "NULL":
                         col_type = "VARCHAR"
                         data_type_display = "varchar"
@@ -544,27 +595,32 @@
                         name=column["name"],
                         description=column.get("comment", None),
                         dataType=col_type,
-                        dataTypeDisplay="{}({})".format(col_type, col_data_length)
+                        dataTypeDisplay=f"{col_type}({col_data_length})"
                         if data_type_display is None
                         else f"{data_type_display}",
                         dataLength=col_data_length,
                         constraint=col_constraint,
-                        ordinalPosition=row_order,
-                        children=children if children is not None else None,
+                        ordinalPosition=row_order + 1,  # enumerate starts at 0
+                        children=children,
                         arrayDataType=arr_data_type,
                     )
-                except Exception as err:
+                except Exception as err:  # pylint: disable=broad-except
                     logger.error(traceback.format_exc())
                     logger.error(traceback.print_exc())
                     logger.error(f"{err} : {column}")
                     continue
                 table_columns.append(om_column)
-                row_order = row_order + 1
             return table_columns
-        except Exception as err:
-            logger.error("{}: {} {}".format(repr(err), table, err))
+        except Exception as err:  # pylint: disable=broad-except
+            logger.error(f"{repr(err)}: {table} {err}")
+            return None
 
     def run_data_profiler(self, table: str, schema: str) -> TableProfile:
+        """
+        Run the profiler for a table in a schema.
+
+        Prepare specific namings for different sources, e.g. bigquery
+        """
         dataset_name = f"{schema}.{table}"
         self.status.scanned(f"profile of {dataset_name}")
         logger.info(