From 6c3fbf7a16b5f1cfa399c7dd958576ddb7c870e2 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Fri, 18 Oct 2024 11:09:04 +0200 Subject: [PATCH 01/18] ref(profiler): use di for system profile - use source classes that can be overridden in system profiles - use a manifest class instead of factory to specify which class to resolve for connectors - example usage can be seen in redshift and snowflake --- .../source/database/databricks/manifest.py | 9 + .../source/database/redshift/manifest.py | 6 + .../database/redshift/profiler/profiler.py | 10 ++ .../database/redshift/profiler/system.py | 146 ++++++++------- .../source/database/snowflake/manifest.py | 6 + .../database/snowflake/profiler/profiler.py | 33 ++++ .../database/snowflake/profiler/system.py | 166 +++++++++++++++--- .../profiler/interface/profiler_interface.py | 5 +- .../interface/profiler_interface_factory.py | 3 + .../sqlalchemy/profiler_interface.py | 28 +-- .../src/metadata/profiler/metrics/core.py | 5 +- .../profiler/metrics/system/system.py | 81 ++++++++- .../profiler/source/base/profiler_source.py | 67 ++++--- .../metadata/profiler/source/metadata_ext.py | 41 +++-- ingestion/src/metadata/utils/importer.py | 47 ++++- ingestion/src/metadata/utils/lru_cache.py | 69 +++++--- ingestion/src/metadata/utils/manifest.py | 51 ++++++ ingestion/src/metadata/utils/test_suite.py | 84 +++++++++ ingestion/tests/cli_e2e/base/test_cli_db.py | 4 + ingestion/tests/cli_e2e/test_cli_redshift.py | 35 +++- ingestion/tests/cli_e2e/test_cli_snowflake.py | 7 +- 21 files changed, 725 insertions(+), 178 deletions(-) create mode 100644 ingestion/src/metadata/ingestion/source/database/databricks/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/redshift/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py create mode 100644 ingestion/src/metadata/ingestion/source/database/snowflake/manifest.py create mode 100644 
ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py create mode 100644 ingestion/src/metadata/utils/manifest.py create mode 100644 ingestion/src/metadata/utils/test_suite.py diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py b/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py new file mode 100644 index 000000000000..dd1b85ee5c10 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.databricks.profiler.profiler import ( + DatabricksProfiler, +) +from metadata.ingestion.source.database.redshift.profiler.profiler import ( + RedshiftProfiler, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +DatabricksManifest = BaseManifest(profler_class=get_class_path(DatabricksProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/manifest.py b/ingestion/src/metadata/ingestion/source/database/redshift/manifest.py new file mode 100644 index 000000000000..6983e32d6386 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/redshift/manifest.py @@ -0,0 +1,6 @@ +from metadata.ingestion.source.database.redshift.profiler.profiler import ( + RedshiftProfiler, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +RedshiftManifest = BaseManifest(profler_class=get_class_path(RedshiftProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py new file mode 100644 index 000000000000..f7181cb7060f --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py @@ -0,0 +1,10 @@ +from metadata.ingestion.source.database.redshift.profiler.system import ( + RedshiftSystemMetricsSource, +) +from metadata.profiler.interface.sqlalchemy.profiler_interface import ( + SQAProfilerInterface, +) + + 
+class RedshiftProfiler(SQAProfilerInterface): + system_metrics_source_class = RedshiftSystemMetricsSource diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py index 8ea963e8ed4c..22fd06edfe5a 100644 --- a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py +++ b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py @@ -1,55 +1,72 @@ -from typing import Dict, List +from typing import List -from pydantic import TypeAdapter -from sqlalchemy.orm import DeclarativeMeta, Session +from sqlalchemy.orm import DeclarativeMeta from metadata.generated.schema.entity.data.table import SystemProfile from metadata.ingestion.source.database.redshift.queries import ( STL_QUERY, - get_metric_result, get_query_results, ) from metadata.profiler.metrics.system.dml_operation import DatabaseDMLOperations from metadata.profiler.metrics.system.system import ( - SYSTEM_QUERY_RESULT_CACHE, - get_system_metrics_for_dialect, + BaseSystemMetricsSource, + SQASessionProvider, + CacheProvider, ) -from metadata.profiler.orm.registry import Dialects from metadata.utils.logger import profiler_logger -from metadata.utils.profiler_utils import get_value_from_cache, set_cache +from metadata.utils.profiler_utils import QueryResult +from metadata.utils.time_utils import datetime_to_timestamp logger = profiler_logger() -@get_system_metrics_for_dialect.register(Dialects.Redshift) -def _( - dialect: str, - session: Session, - table: DeclarativeMeta, - *args, - **kwargs, -) -> List[SystemProfile]: - """List all the DML operations for reshifts tables +class RedshiftSystemMetricsSource( + SQASessionProvider, BaseSystemMetricsSource, CacheProvider +): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) - Args: - dialect (str): redshift - session (Session): session object - table (DeclarativeMeta): orm table + def get_inserts( + 
self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + queries = self.get_or_update_cache( + f"{database}.{schema}", + self._get_insert_queries, + database=database, + schema=schema, + ) + return get_metric_result(queries, table) - Returns: - List[Dict]: - """ - logger.debug(f"Fetching system metrics for {dialect}") - database = session.get_bind().url.database - schema = table.__table_args__["schema"] # type: ignore + def get_kwargs(self, table: DeclarativeMeta, *args, **kwargs): + return { + "table": table.__table__.name, + "database": self.get_session().get_bind().url.database, + "schema": table.__table__.schema, + } - metric_results: List[Dict] = [] + def get_deletes( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + queries = self.get_or_update_cache( + f"{database}.{schema}", + self._get_delete_queries, + database=database, + schema=schema, + ) + return get_metric_result(queries, table) + + def get_updates( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + queries = self.get_or_update_cache( + f"{database}.{schema}", + self._get_update_queries, + database=database, + schema=schema, + ) + return get_metric_result(queries, table) - # get inserts ddl queries - inserts = get_value_from_cache( - SYSTEM_QUERY_RESULT_CACHE, f"{Dialects.Redshift}.{database}.{schema}.inserts" - ) - if not inserts: + def _get_insert_queries(self, database: str, schema: str) -> List[QueryResult]: insert_query = STL_QUERY.format( alias="si", join_type="LEFT", @@ -57,23 +74,13 @@ def _( database=database, schema=schema, ) - inserts = get_query_results( - session, + return get_query_results( + super().get_session(), insert_query, DatabaseDMLOperations.INSERT.value, ) - set_cache( - SYSTEM_QUERY_RESULT_CACHE, - f"{Dialects.Redshift}.{database}.{schema}.inserts", - inserts, - ) - metric_results.extend(get_metric_result(inserts, table.__tablename__)) - # get deletes ddl queries - deletes = get_value_from_cache( - 
SYSTEM_QUERY_RESULT_CACHE, f"{Dialects.Redshift}.{database}.{schema}.deletes" - ) - if not deletes: + def _get_delete_queries(self, database: str, schema: str) -> List[QueryResult]: delete_query = STL_QUERY.format( alias="sd", join_type="RIGHT", @@ -81,23 +88,13 @@ def _( database=database, schema=schema, ) - deletes = get_query_results( - session, + return get_query_results( + super().get_session(), delete_query, DatabaseDMLOperations.DELETE.value, ) - set_cache( - SYSTEM_QUERY_RESULT_CACHE, - f"{Dialects.Redshift}.{database}.{schema}.deletes", - deletes, - ) - metric_results.extend(get_metric_result(deletes, table.__tablename__)) # type: ignore - # get updates ddl queries - updates = get_value_from_cache( - SYSTEM_QUERY_RESULT_CACHE, f"{Dialects.Redshift}.{database}.{schema}.updates" - ) - if not updates: + def _get_update_queries(self, database: str, schema: str) -> List[QueryResult]: update_query = STL_QUERY.format( alias="si", join_type="INNER", @@ -105,16 +102,29 @@ def _( database=database, schema=schema, ) - updates = get_query_results( - session, + return get_query_results( + super().get_session(), update_query, DatabaseDMLOperations.UPDATE.value, ) - set_cache( - SYSTEM_QUERY_RESULT_CACHE, - f"{Dialects.Redshift}.{database}.{schema}.updates", - updates, - ) - metric_results.extend(get_metric_result(updates, table.__tablename__)) # type: ignore - return TypeAdapter(List[SystemProfile]).validate_python(metric_results) + +def get_metric_result(ddls: List[QueryResult], table_name: str) -> List: + """Given query results, retur the metric result + + Args: + ddls (List[QueryResult]): list of query results + table_name (str): table name + + Returns: + List: + """ + return [ + { + "timestamp": datetime_to_timestamp(ddl.start_time, milliseconds=True), + "operation": ddl.query_type, + "rowsAffected": ddl.rows, + } + for ddl in ddls + if ddl.table_name == table_name + ] diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/manifest.py 
b/ingestion/src/metadata/ingestion/source/database/snowflake/manifest.py new file mode 100644 index 000000000000..d6109be9ae01 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/manifest.py @@ -0,0 +1,6 @@ +from metadata.ingestion.source.database.snowflake.profiler.profiler import ( + SnowflakeProfiler, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +SnowflakeManifest = BaseManifest(profler_class=get_class_path(SnowflakeProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py new file mode 100644 index 000000000000..fa7438c2511a --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py @@ -0,0 +1,33 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Interfaces with database for all database engine +supporting sqlalchemy abstraction layer +""" +from metadata.ingestion.source.database.snowflake.profiler.system import ( + SnowflakeSystemMetricsSource, +) +from metadata.profiler.interface.sqlalchemy.snowflake.profiler_interface import ( + SnowflakeProfilerInterface, +) +from metadata.utils.logger import profiler_interface_registry_logger + +logger = profiler_interface_registry_logger() + + +class SnowflakeProfiler(SnowflakeProfilerInterface): + """ + Interface to interact with registry supporting + sqlalchemy. 
+ """ + + system_metrics_source_class = SnowflakeSystemMetricsSource diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py index b7d398d79eac..26c247eed9e6 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py @@ -1,17 +1,29 @@ import re import traceback -from typing import List, Optional, Tuple +from typing import List, Optional, Tuple, Any, TypeVar, Callable import sqlalchemy.orm -from sqlalchemy.orm import DeclarativeMeta, Session +from pydantic import TypeAdapter +from sqlalchemy.orm import DeclarativeMeta +from metadata.generated.schema.entity.data.table import SystemProfile, DmlOperationType from metadata.ingestion.source.database.snowflake.models import ( SnowflakeQueryLogEntry, SnowflakeQueryResult, ) +from metadata.profiler.metrics.system.dml_operation import ( + DatabaseDMLOperations, +) +from metadata.profiler.metrics.system.system import ( + BaseSystemMetricsSource, + SQASessionProvider, + CacheProvider, +) from metadata.utils.logger import profiler_logger from metadata.utils.lru_cache import LRU_CACHE_SIZE, LRUCache -from metadata.utils.profiler_utils import get_identifiers_from_string +from metadata.utils.profiler_utils import get_identifiers_from_string, QueryResult +from metadata.utils.time_utils import datetime_to_timestamp +import hashlib PUBLIC_SCHEMA = "PUBLIC" logger = profiler_logger() @@ -23,6 +35,16 @@ IDENTIFIER_PATTERN = r"(IDENTIFIER\(\')([\w._\"]+)(\'\))" +def sha256_hash(text: str) -> str: + """Return the SHA256 hash of the text""" + + return hashlib.sha256(text.encode()).hexdigest() + + +cache = LRUCache(LRU_CACHE_SIZE) + + +@cache.wrap(key_func=lambda query: sha256_hash(query.strip())) def _parse_query(query: str) -> Optional[str]: """Parse snowflake queries to extract the identifiers""" match = 
re.match(QUERY_PATTERN, query, re.IGNORECASE) @@ -48,7 +70,7 @@ def _parse_query(query: str) -> Optional[str]: class SnowflakeTableResovler: def __init__(self, session: sqlalchemy.orm.Session): - self._cache = LRUCache(LRU_CACHE_SIZE) + self._cache = LRUCache[bool](LRU_CACHE_SIZE) self.session = session def show_tables(self, db, schema, table): @@ -241,20 +263,124 @@ def get_snowflake_system_queries( return None -def build_snowflake_query_results( - session: Session, - table: DeclarativeMeta, -) -> List[SnowflakeQueryResult]: - """List and parse snowflake DML query results""" - query_results = [] - resolver = SnowflakeTableResovler( - session=session, - ) - for row in SnowflakeQueryLogEntry.get_for_table(session, table.__tablename__): - result = get_snowflake_system_queries( - query_log_entry=row, - resolver=resolver, +class SnowflakeSystemMetricsSource( + SQASessionProvider, BaseSystemMetricsSource, CacheProvider[SnowflakeQueryLogEntry] +): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.resolver = SnowflakeTableResovler( + session=super().get_session(), ) - if result: - query_results.append(result) - return query_results + + def get_kwargs(self, table: DeclarativeMeta, *args, **kwargs): + return { + "table": table.__table__.name, + "database": self.get_session().get_bind().url.database, + "schema": table.__table__.schema, + } + + def get_inserts( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + + return self.get_system_profile( + database, + schema, + table, + list( + self.get_queries_by_operation( + table, + [ + DatabaseDMLOperations.INSERT.value, + DatabaseDMLOperations.MERGE.value, + ], + ) + ), + "rows_inserted", + DmlOperationType.INSERT, + ) + + def get_deletes( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + return self.get_system_profile( + database, + schema, + table, + list( + self.get_queries_by_operation( + table, + [ + DatabaseDMLOperations.DELETE.value, + ], + 
) + ), + "rows_deleted", + DmlOperationType.DELETE, + ) + + @staticmethod + def get_system_profile( + db: str, + schema: str, + table: str, + query_results: List[SnowflakeQueryResult], + rows_affected_field: str, + operation: DmlOperationType, + ) -> List[SystemProfile]: + if not SnowflakeQueryResult.model_fields.get(rows_affected_field): + raise ValueError( + f"rows_affected_field [{rows_affected_field}] is not a valid field in SnowflakeQueryResult." + ) + return TypeAdapter(List[SystemProfile]).validate_python( + [ + { + "timestamp": datetime_to_timestamp(q.start_time, milliseconds=True), + "operation": operation, + "rowsAffected": getattr(q, rows_affected_field), + } + for q in query_results + if getattr(q, rows_affected_field) > 0 + and q.database_name == db + and q.schema_name == schema + and q.table_name == table + ] + ) + + def get_updates( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + return self.get_system_profile( + database, + schema, + table, + list( + self.get_queries_by_operation( + table, + [ + DatabaseDMLOperations.UPDATE.value, + DatabaseDMLOperations.MERGE.value, + ], + ) + ), + "rows_updated", + DmlOperationType.UPDATE, + ) + + def get_queries_by_operation(self, table: str, operations: List[str]): + yield from ( + query for query in self.get_queries(table) if query.query_type in operations + ) + + def get_queries(self, table: str) -> List[SnowflakeQueryResult]: + queries = self.get_or_update_cache( + table, + SnowflakeQueryLogEntry.get_for_table, + session=super().get_session(), + tablename=table, + ) + results = [get_snowflake_system_queries( + query_log_entry=row, + resolver=self.resolver, + ) for row in queries] + return [result for result in results if result is not None] diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface.py b/ingestion/src/metadata/profiler/interface/profiler_interface.py index 721a16d5a120..ef7e7a88d7c1 100644 --- 
a/ingestion/src/metadata/profiler/interface/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface.py @@ -15,8 +15,9 @@ """ from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, List, Optional, Union, Type +from metadata.profiler.metrics.system.system import System from sqlalchemy import Column from metadata.generated.schema.entity.data.database import ( @@ -460,7 +461,7 @@ def _compute_window_metrics( @abstractmethod def _compute_system_metrics( self, - metrics: Metrics, + metrics: Type[System], runner, *args, **kwargs, diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py index dc443e047161..5b456ddfc4ca 100644 --- a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py @@ -57,6 +57,9 @@ class ProfilerInterfaceFactory(Factory): def create(self, interface_type: str, *args, **kwargs): """Create interface object based on interface type""" + + + interface_class_path = profiler_class_mapping.get( interface_type, profiler_class_mapping[DatabaseConnection.__name__] ) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index 68ef06957677..e451ea793567 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -21,13 +21,17 @@ import traceback from collections import defaultdict from datetime import datetime -from typing import Any, Dict, List, Optional +from typing import Any, Dict, List, Optional, Type from sqlalchemy import Column, inspect, text from sqlalchemy.exc import DBAPIError, ProgrammingError, ResourceClosedError from sqlalchemy.orm import 
scoped_session -from metadata.generated.schema.entity.data.table import CustomMetricProfile, TableData +from metadata.generated.schema.entity.data.table import ( + CustomMetricProfile, + TableData, + SystemProfile, +) from metadata.generated.schema.tests.customMetric import CustomMetric from metadata.ingestion.connections.session import create_and_bind_thread_safe_session from metadata.mixins.sqalchemy.sqa_mixin import SQAInterfaceMixin @@ -38,6 +42,7 @@ from metadata.profiler.metrics.static.mean import Mean from metadata.profiler.metrics.static.stddev import StdDev from metadata.profiler.metrics.static.sum import Sum +from metadata.profiler.metrics.system.system import System, BaseSystemMetricsSource from metadata.profiler.orm.functions.table_metric_computer import TableMetricComputer from metadata.profiler.orm.registry import Dialects from metadata.profiler.processor.metric_filter import MetricFilter @@ -71,6 +76,8 @@ class SQAProfilerInterface(ProfilerInterface, SQAInterfaceMixin): # pylint: disable=too-many-arguments + system_metrics_source_class = BaseSystemMetricsSource + def __init__( self, service_connection_config, @@ -105,6 +112,9 @@ def __init__( self._table = self._convert_table_to_orm_object(sqa_metadata) self.create_session() + self.system_metrics_computer = self.system_metrics_source_class( + session=self.session + ) def create_session(self): self.session_factory = self._session_factory() @@ -363,12 +373,11 @@ def _compute_custom_metrics( def _compute_system_metrics( self, - metrics: Metrics, + metrics: Type[System], runner: QueryRunner, - session, *args, **kwargs, - ): + ) -> List[SystemProfile]: """Get system metric for tables Args: @@ -379,13 +388,8 @@ def _compute_system_metrics( Returns: dictionnary of results """ - try: - rows = metrics().sql(session, conn_config=self.service_connection_config) - return rows - except Exception as exc: - msg = f"Error trying to compute profile for {runner.table.__tablename__}: {exc}" - 
handle_query_exception(msg, exc, session) - return None + logger.debug(f"Computing system metrics for {runner.table.__tablename__}") + return self.system_metrics_computer.get_system_metrics(runner.table) def _create_thread_safe_sampler( self, diff --git a/ingestion/src/metadata/profiler/metrics/core.py b/ingestion/src/metadata/profiler/metrics/core.py index 70e387a7daeb..23f00264aedc 100644 --- a/ingestion/src/metadata/profiler/metrics/core.py +++ b/ingestion/src/metadata/profiler/metrics/core.py @@ -18,7 +18,7 @@ from abc import ABC, abstractmethod from enum import Enum from functools import wraps -from typing import Any, Callable, Dict, Optional, Tuple, TypeVar +from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, TYPE_CHECKING from sqlalchemy import Column from sqlalchemy.orm import DeclarativeMeta, Session @@ -26,6 +26,9 @@ from metadata.generated.schema.entity.data.table import Table from metadata.profiler.adaptors.nosql_adaptor import NoSQLAdaptor +if TYPE_CHECKING: + pass + # When creating complex metrics, use inherit_cache = CACHE CACHE = True diff --git a/ingestion/src/metadata/profiler/metrics/system/system.py b/ingestion/src/metadata/profiler/metrics/system/system.py index 47524c430a2d..05892232312e 100644 --- a/ingestion/src/metadata/profiler/metrics/system/system.py +++ b/ingestion/src/metadata/profiler/metrics/system/system.py @@ -14,8 +14,9 @@ """ import traceback +from abc import ABC, abstractmethod from collections import defaultdict -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Callable, Generic, TypeVar from pydantic import TypeAdapter from sqlalchemy import text @@ -26,9 +27,6 @@ from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( BigQueryConnection, ) -from metadata.ingestion.source.database.snowflake.profiler.system import ( - build_snowflake_query_results, -) from metadata.profiler.metrics.core import SystemMetric from 
metadata.profiler.metrics.system.dml_operation import ( DML_OPERATION_MAP, @@ -43,6 +41,7 @@ from metadata.utils.dispatch import valuedispatch from metadata.utils.helpers import deep_size_of_dict from metadata.utils.logger import profiler_logger +from metadata.utils.lru_cache import LRUCache, LRU_CACHE_SIZE from metadata.utils.profiler_utils import get_value_from_cache, set_cache from metadata.utils.time_utils import datetime_to_timestamp @@ -58,6 +57,80 @@ def recursive_dic(): SYSTEM_QUERY_RESULT_CACHE = recursive_dic() +T = TypeVar("T") + + +class CacheProvider(ABC, Generic[T]): + def __init__(self): + self.cache = LRUCache[T](LRU_CACHE_SIZE) + + def get_or_update_cache( + self, + cache_path: str, + get_queries_fn: Callable[..., List[T]], + *args, + **kwargs, + ): + if cache_path in self.cache: + return self.cache.get(cache_path) + result = get_queries_fn(*args, **kwargs) + self.cache.put(cache_path, result) + return result + + +class BaseSystemMetricsSource: + def __init__(self, *args, **kwargs): + kwargs.pop("session", None) + if len(args) > 0: + logger.warning("Received unexpected arguments: %s", args) + if len(kwargs) > 0: + logger.warning("Received unexpected keyword arguments: %s", kwargs) + super().__init__() + + def get_system_metrics(self, *args, **kwargs) -> List[SystemProfile]: + """Return system metrics for a given table. Actual passed object can be a variety of types based + on the underlying infrastructure. 
For example, in the case of SQLalchemy, it can be a Table object + and in the case of Mongo, it can be a collection object.""" + kwargs = self.get_kwargs(*args, **kwargs) + return ( + self.get_inserts(**kwargs) + + self.get_deletes(**kwargs) + + self.get_updates(**kwargs) + ) + + def get_kwargs(self, *args, **kwargs): + return {} + + def get_inserts( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + """Get insert queries""" + return [] + + def get_deletes( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + """Get delete queries""" + return [] + + def get_updates( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + """Get update queries""" + return [] + + +class SQASessionProvider: + def __init__(self, *args, **kwargs): + self.session = kwargs.pop("session") + super().__init__(*args, **kwargs) + + def get_session(self): + return self.session + + def get_database(self) -> str: + return self.session.get_bind().url.database + @valuedispatch def get_system_metrics_for_dialect( diff --git a/ingestion/src/metadata/profiler/source/base/profiler_source.py b/ingestion/src/metadata/profiler/source/base/profiler_source.py index 3d3d19fd3e9e..7fbb77cb2618 100644 --- a/ingestion/src/metadata/profiler/source/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/base/profiler_source.py @@ -13,9 +13,11 @@ Base source for the profiler used to instantiate a profiler runner with its interface """ +import traceback from copy import deepcopy -from typing import List, Optional, Tuple, cast +from typing import List, Optional, Tuple, cast, Type +from docker.models.services import Service from sqlalchemy import MetaData from metadata.generated.schema.configuration.profilerConfiguration import ( @@ -31,6 +33,7 @@ DatabaseConnection, DatabaseService, ) +from metadata.generated.schema.entity.services.serviceType import ServiceType from 
metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( DatabaseServiceProfilerPipeline, ) @@ -40,13 +43,26 @@ from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.profiler.api.models import ProfilerProcessorConfig, TableConfig from metadata.profiler.interface.profiler_interface import ProfilerInterface +from metadata.profiler.interface.sqlalchemy.profiler_interface import ( + SQAProfilerInterface, +) from metadata.profiler.metrics.registry import Metrics from metadata.profiler.processor.core import Profiler from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics from metadata.profiler.source.profiler_source_interface import ProfilerSourceInterface +from metadata.utils.importer import ( + import_source_class, + DynamicImportException, + import_profiler_class, + import_from_module, +) +from metadata.utils.logger import profiler_logger +from metadata.utils.manifest import BaseManifest, get_class_path NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,) +logger = profiler_logger() + class ProfilerSource(ProfilerSourceInterface): """ @@ -61,15 +77,14 @@ def __init__( global_profiler_configuration: ProfilerConfiguration, ): self.service_conn_config = self._copy_service_config(config, database) - self.source_config = config.source.sourceConfig.config - self.source_config = cast( - DatabaseServiceProfilerPipeline, self.source_config - ) # satisfy type checker + self.source_config = DatabaseServiceProfilerPipeline.model_validate( + config.source.sourceConfig.config + ) self.profiler_config = ProfilerProcessorConfig.model_validate( config.processor.model_dump().get("config") ) self.ometa_client = ometa_client - self.profiler_interface_type: str = self._get_profiler_interface_type(config) + self.profiler_interface_type: str = config.source.type.lower() self.sqa_metadata = self._set_sqa_metadata() self._interface = None self.global_profiler_configuration = global_profiler_configuration @@ -92,18 
+107,6 @@ def _set_sqa_metadata(self): return MetaData() return None - def _get_profiler_interface_type(self, config) -> str: - """_summary_ - - Args: - config (_type_): profiler config - Returns: - str: - """ - if isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): - return self.service_conn_config.__class__.__name__ - return config.source.serviceConnection.root.config.__class__.__name__ - @staticmethod def get_config_for_table(entity: Table, profiler_config) -> Optional[TableConfig]: """Get config for a specific entity @@ -196,12 +199,15 @@ def create_profiler_interface( db_service: Optional[DatabaseService], ) -> ProfilerInterface: """Create sqlalchemy profiler interface""" - from metadata.profiler.interface.profiler_interface_factory import ( # pylint: disable=import-outside-toplevel - profiler_interface_factory, + profiler_class = self.import_profiler_class( + ServiceType.Database, source_type=self.profiler_interface_type ) - - profiler_interface: ProfilerInterface = profiler_interface_factory.create( - self.profiler_interface_type, + logger.debug( + "Using profiler class: %s from %s", + profiler_class, + profiler_class.__module__, + ) + profiler_interface: ProfilerInterface = profiler_class.create( entity, schema_entity, database_entity, @@ -217,6 +223,21 @@ def create_profiler_interface( self.interface = profiler_interface return self.interface + def import_profiler_class( + self, service_type: ServiceType, source_type: str + ) -> Type[ProfilerInterface]: + try: + class_path = BaseManifest.get_for_source( + service_type, source_type + ).profler_class + if class_path is None: + class_path = get_class_path(SQAProfilerInterface) + return cast(Type[ProfilerInterface], import_from_module(class_path)) + except DynamicImportException as e: + logger.debug(traceback.format_exc()) + logger.debug(f"Failed to import profiler for source '{source_type}'") + return SQAProfilerInterface + def _get_context_entities( self, entity: Table ) -> 
Tuple[DatabaseSchema, Database, DatabaseService]: diff --git a/ingestion/src/metadata/profiler/source/metadata_ext.py b/ingestion/src/metadata/profiler/source/metadata_ext.py index 9c00c11d57b0..363a0bdea4f2 100644 --- a/ingestion/src/metadata/profiler/source/metadata_ext.py +++ b/ingestion/src/metadata/profiler/source/metadata_ext.py @@ -21,7 +21,7 @@ """ import traceback from copy import deepcopy -from typing import Iterable, cast +from typing import Iterable, cast, Type from sqlalchemy.inspection import inspect @@ -38,18 +38,27 @@ from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, ) +from metadata.generated.schema.tests.testSuite import ServiceType from metadata.ingestion.api.models import Either from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.profiler.interface.sqlalchemy.profiler_interface import ( + SQAProfilerInterface, +) from metadata.profiler.source.metadata import ( OpenMetadataSource, ProfilerSourceAndEntity, ) -from metadata.profiler.source.profiler_source_factory import profiler_source_factory from metadata.utils import fqn from metadata.utils.class_helper import get_service_type_from_source_type from metadata.utils.filters import filter_by_database, filter_by_schema, filter_by_table -from metadata.utils.importer import import_source_class +from metadata.utils.importer import ( + import_source_class, + import_profiler_class, + ProfilerInterface, + import_from_module, +) from metadata.utils.logger import profiler_logger +from metadata.utils.manifest import BaseManifest, get_class_path from metadata.utils.ssl_manager import get_ssl_connection logger = profiler_logger() @@ -145,16 +154,18 @@ def _iter(self, *_, **__) -> Iterable[Either[ProfilerSourceAndEntity]]: ) continue - profiler_source = profiler_source_factory.create( - self.config.source.type.lower(), - self.config, - database_entity, - self.metadata, - global_profiler_config, - ) + class_path = BaseManifest.get_for_source( + 
ServiceType.Database, + source_type=self.config.source.type.lower(), + ).profler_class + if class_path is None: + class_path = get_class_path(SQAProfilerInterface) yield Either( right=ProfilerSourceAndEntity( - profiler_source=profiler_source, + profiler_source=cast( + Type[ProfilerInterface], + import_from_module(class_path), + ), entity=table_entity, ) ) @@ -206,9 +217,11 @@ def get_database_names(self) -> Iterable[str]: ) if filter_by_database( self.source_config.databaseFilterPattern, - database_fqn - if self.source_config.useFqnForFiltering - else database, + ( + database_fqn + if self.source_config.useFqnForFiltering + else database + ), ): self.status.filter(database, "Database pattern not allowed") continue diff --git a/ingestion/src/metadata/utils/importer.py b/ingestion/src/metadata/utils/importer.py index 12c03eb2ecef..03226638bdee 100644 --- a/ingestion/src/metadata/utils/importer.py +++ b/ingestion/src/metadata/utils/importer.py @@ -15,7 +15,7 @@ import sys import traceback from enum import Enum -from typing import Any, Callable, Optional, Type, TypeVar +from typing import Any, Callable, Optional, Type, TypeVar, TYPE_CHECKING from pydantic import BaseModel @@ -23,15 +23,25 @@ from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( OpenMetadataConnection, ) +from metadata.generated.schema.entity.services.databaseService import ( + DatabaseService, + DatabaseConnection, +) from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.generated.schema.metadataIngestion.workflow import Sink as WorkflowSink from metadata.ingestion.api.steps import BulkSink, Processor, Sink, Source, Stage +from metadata.profiler.metrics.system.system import BaseSystemMetricsSource from metadata.utils.class_helper import get_service_type_from_source_type from metadata.utils.client_version import get_client_version from metadata.utils.constants import CUSTOM_CONNECTOR_PREFIX from metadata.utils.logger 
import utils_logger from metadata.utils.singleton import Singleton +if TYPE_CHECKING: + from metadata.profiler.interface.profiler_interface import ProfilerInterface +else: + ProfilerInterface = Any + logger = utils_logger() T = TypeVar("T") @@ -131,7 +141,7 @@ def import_from_module(key: str) -> Type[Any]: try: obj = getattr(importlib.import_module(module_name), obj_name) return obj - except Exception as err: + except ModuleNotFoundError as err: logger.debug(traceback.format_exc()) raise DynamicImportException(module=module_name, key=obj_name, cause=err) @@ -152,6 +162,24 @@ def import_source_class( ) +def import_profiler_class( + source_connection_type: DatabaseConnection, + from_: str = "profiler", + service_type: ServiceType = ServiceType.Database, +) -> Type[ProfilerInterface]: + """ + Import the profiler class for a source connection by default will be found in the metadata + """ + return import_from_module( + "metadata.ingestion.source.{}.{}.{}.profiler.{}Profiler".format( + service_type.name.lower(), + source_connection_type.config.type.value.lower(), + from_, + source_connection_type.config.type.value, + ) + ) + + def import_processor_class( processor_type: str, from_: str = "ingestion" ) -> Type[Processor]: @@ -302,3 +330,18 @@ def import_side_effects(self, *modules): def import_side_effects(*modules): SideEffectsLoader().import_side_effects(*modules) + + +def import_system_metrics_computer(db_service: DatabaseService): + """ + Import the system metrics profile class + """ + try: + return import_from_module( + "metadata.ingestion.source.database.{}.profiler.system.SystemMetricsComputer".format( + db_service.type + ) + ) + except DynamicImportException as err: + logger.debug("Could not import system metrics computer: %s", err) + return BaseSystemMetricsSource diff --git a/ingestion/src/metadata/utils/lru_cache.py b/ingestion/src/metadata/utils/lru_cache.py index bcbb78eb9434..54712e78f51e 100644 --- a/ingestion/src/metadata/utils/lru_cache.py +++ 
LRU_CACHE_SIZE = 4096

T = TypeVar("T")


class LRUCache(Generic[T]):
    """Thread-safe Least Recently Used cache.

    All public operations take the internal lock, so the cache can be shared
    between worker threads (e.g. concurrent profiler runs).
    """

    def __init__(self, capacity: int) -> None:
        """
        Args:
            capacity: maximum number of entries held before the least
                recently used entry is evicted.
        """
        self._cache: "OrderedDict[str, T]" = OrderedDict()
        self.capacity = capacity
        self.lock = threading.Lock()

    def clear(self) -> None:
        """Remove every entry from the cache."""
        with self.lock:
            self._cache = OrderedDict()

    def get(self, key) -> T:
        """Return the value associated to `key`, updating the cache usage.

        Raises:
            KeyError: if `key` doesn't exist in the cache.
        """
        with self.lock:
            # move_to_end raises KeyError for a missing key, which is the
            # documented miss behavior.
            self._cache.move_to_end(key)
            return self._cache[key]

    def put(self, key: str, value: T) -> None:
        """Assign `value` to `key`, overwriting it if it already exists and
        marking it as most recently used. If the cache grows above capacity,
        pops the least recently used element.
        """
        with self.lock:
            self._cache[key] = value
            self._cache.move_to_end(key)
            if len(self._cache) > self.capacity:
                self._cache.popitem(last=False)

    def __contains__(self, key) -> bool:
        with self.lock:
            if key not in self._cache:
                return False
            self._cache.move_to_end(key)
            return True

    def __len__(self) -> int:
        with self.lock:
            return len(self._cache)

    def wrap(self, key_func: Callable[..., str]):
        """Decorator factory that memoizes a function's results in this cache.

        Args:
            key_func: maps the wrapped function's arguments to a cache key.
        """

        def wrapper(func: Callable[..., T]):
            def wrapped(*args, **kwargs) -> T:
                key = key_func(*args, **kwargs)
                # EAFP: `key in self` followed by `self.get(key)` is racy —
                # another thread may evict the entry between the two calls.
                # Attempt the lookup directly and fall back on a miss.
                try:
                    return self.get(key)
                except KeyError:
                    value = func(*args, **kwargs)
                    self.put(key, value)
                    return value

            return wrapped

        return wrapper
class BaseManifest(BaseModel):
    """Base manifest storing class information for a source as import strings.

    Strings (rather than class objects) are used because:
    1. manifests can be defined in json/yaml and deserialized into this class.
    2. the class is imported lazily when needed, avoiding dependency issues.
    3. circular imports are avoided.
    4. the implementation can be hot-swapped without changing the manifest
       (example: for testing).
    """

    # NOTE(review): field name keeps the original misspelling "profler_class"
    # because every per-connector manifest instantiates it by keyword; renaming
    # to "profiler_class" must be done in a coordinated change across connectors.
    profler_class: str

    @classmethod
    def get_for_source(
        cls, service_type: ServiceType, source_type: str, from_: str = "ingestion"
    ) -> "BaseManifest":
        """Resolve and import the `<Source>Manifest` object for a source type.

        Args:
            service_type: service category (e.g. ServiceType.Database)
            source_type: connector type (e.g. "redshift")
            from_: top-level package holding the source modules

        Returns:
            The validated BaseManifest instance declared in the connector's
            `manifest` module.

        Raises:
            DynamicImportException: if no manifest module exists for the source.
        """
        return cls.model_validate(
            import_from_module(
                f"metadata.{from_}.source.{service_type.name.lower()}."
                f"{get_module_dir(source_type)}.manifest."
                f"{get_class_name_root(source_type)}Manifest"
            )
        )


def get_class_path(module) -> str:
    """Return the fully qualified dotted import path of a class or function."""
    return module.__module__ + "." + module.__name__
+ +""" +Helper module for test suite functions +""" + +from __future__ import annotations + +from datetime import datetime +from typing import Callable, List, Optional + +from metadata.generated.schema.tests.basic import ( + TestCaseResult, + TestCaseStatus, + TestResultValue, +) +from metadata.generated.schema.tests.testCase import TestCaseParameterValue + + +def get_test_case_param_value( + test_case_param_vals: list[TestCaseParameterValue], + name: str, + type_, + default=None, + pre_processor: Optional[Callable] = None, +): + """Give a column and a type return the value with the appropriate type casting for the + test case definition. + + Args: + test_case: the test case + type_ (Union[float, int, str]): type for the value + name (str): column name + default (_type_, optional): Default value to return if column is not found + pre_processor: pre processor function/type to use against the value before casting to type_ + """ + value = next( + (param.value for param in test_case_param_vals if param.name == name), None + ) + + if not value: + return default + + if not pre_processor: + return type_(value) + + pre_processed_value = pre_processor(value) + return type_(pre_processed_value) + + +def build_test_case_result( + execution_datetime: datetime, + status: TestCaseStatus, + result: str, + test_result_value: List[TestResultValue], + sample_data: Optional[str] = None, +) -> TestCaseResult: + """create a test case result object + + Args: + execution_datetime (datetime): execution datetime of the test + status (TestCaseStatus): failed, succeed, aborted + result (str): message to display + testResultValue (List[TestResultValue]): values for the test result + + Returns: + TestCaseResult: + """ + return TestCaseResult( + timestamp=execution_datetime, + testCaseStatus=status, + result=result, + testResultValue=test_result_value, + sampleData=sample_data, + ) diff --git a/ingestion/tests/cli_e2e/base/test_cli_db.py b/ingestion/tests/cli_e2e/base/test_cli_db.py index 
332bc852e0be..809e1823190e 100644 --- a/ingestion/tests/cli_e2e/base/test_cli_db.py +++ b/ingestion/tests/cli_e2e/base/test_cli_db.py @@ -72,6 +72,7 @@ def test_create_table_with_profiler(self) -> None: result = self.run_command("profile") sink_status, source_status = self.retrieve_statuses(result) self.assert_for_table_with_profiler(source_status, sink_status) + self.system_profile_assertions() @pytest.mark.order(3) def test_delete_table_is_marked_as_deleted(self) -> None: @@ -416,3 +417,6 @@ def get_expected_test_case_results(self) -> List[TestCaseResult]: def assert_status_for_data_quality(self, source_status, sink_status): pass + + def system_profile_assertions(self): + pass diff --git a/ingestion/tests/cli_e2e/test_cli_redshift.py b/ingestion/tests/cli_e2e/test_cli_redshift.py index 405dfe3e9d24..6d94d3286b39 100644 --- a/ingestion/tests/cli_e2e/test_cli_redshift.py +++ b/ingestion/tests/cli_e2e/test_cli_redshift.py @@ -12,9 +12,12 @@ """ Redshift E2E tests """ - +from datetime import datetime from typing import List +from _openmetadata_testutils.pydantic.test_utils import assert_equal_pydantic_objects +from metadata.generated.schema.entity.data.table import SystemProfile, DmlOperationType +from metadata.generated.schema.type.basic import Timestamp from metadata.ingestion.api.status import Status from .common.test_cli_db import CliCommonDB @@ -232,3 +235,33 @@ def update_queries() -> List[str]: UPDATE e2e_cli_tests.dbt_jaffle.persons SET full_name = 'Bruce Wayne' WHERE person_id = 3 """, ] + + def system_profile_assertions(self): + cases = [ + ( + "e2e_redshift.e2e_cli_tests.dbt_jaffle.persons", + [ + SystemProfile( + timestamp=Timestamp(root=0), + operation=DmlOperationType.INSERT, + rowsAffected=6, + ) + ], + ) + ] + for table_fqn, expected_profile in cases: + actual_profiles = self.openmetadata.get_profile_data( + table_fqn, + start_ts=int((datetime.now().timestamp() - 600) * 1000), + end_ts=int(datetime.now().timestamp() * 1000), + 
profile_type=SystemProfile, + ).entities + actual_profiles = sorted(actual_profiles, key=lambda x: x.timestamp.root) + actual_profiles = actual_profiles[-len(expected_profile) :] + actual_profiles = [ + p.copy(update={"timestamp": Timestamp(root=0)}) for p in actual_profiles + ] + try: + assert_equal_pydantic_objects(expected_profile, actual_profiles) + except AssertionError as e: + raise AssertionError(f"Table: {table_fqn}") from e diff --git a/ingestion/tests/cli_e2e/test_cli_snowflake.py b/ingestion/tests/cli_e2e/test_cli_snowflake.py index 70eb1547dccf..9898bdb8e307 100644 --- a/ingestion/tests/cli_e2e/test_cli_snowflake.py +++ b/ingestion/tests/cli_e2e/test_cli_snowflake.py @@ -167,11 +167,10 @@ def test_create_table_with_profiler(self) -> None: result = self.run_command("profile") sink_status, source_status = self.retrieve_statuses(result) self.assert_for_table_with_profiler(source_status, sink_status) - self.custom_profiler_assertions() + self.system_profile_assertions() @staticmethod - def expected_tables() -> int: - return 7 + def expected_tables() -> int: return 7 def inserted_rows_count(self) -> int: return len( @@ -233,7 +232,7 @@ def update_queries() -> List[str]: """, ] - def custom_profiler_assertions(self): + def system_profile_assertions(self): cases = [ ( "e2e_snowflake.E2E_DB.E2E_TEST.E2E_TABLE", From 6ccbf2ec3eec7f16a5b8891da7b4f7ae7807cdd8 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Fri, 18 Oct 2024 11:53:33 +0200 Subject: [PATCH 02/18] - added manifests for all custom profilers - used super() dependency injection in order for system metrics source - formatting --- .../source/database/bigquery/manifest.py | 6 ++ .../source/database/databricks/manifest.py | 11 ++- .../source/database/datalake/manifest.py | 6 ++ .../ingestion/source/database/db2/manifest.py | 6 ++ .../source/database/dynamodb/manifest.py | 4 ++ .../source/database/mariadb/manifest.py | 6 ++ .../source/database/mongodb/manifest.py | 4 ++ .../database/redshift/profiler/profiler.py 
| 4 +- .../database/redshift/profiler/system.py | 11 ++- .../source/database/singlestore/manifest.py | 4 ++ .../database/snowflake/profiler/profiler.py | 2 +- .../database/snowflake/profiler/system.py | 29 ++++---- .../source/database/trino/manifest.py | 6 ++ .../source/database/unitycatalog/manifest.py | 8 +++ .../profiler/interface/profiler_interface.py | 4 +- .../interface/profiler_interface_factory.py | 4 +- .../sqlalchemy/profiler_interface.py | 8 +-- .../src/metadata/profiler/metrics/core.py | 2 +- .../profiler/metrics/system/system.py | 56 ++++++++------- .../profiler/source/base/profiler_source.py | 31 ++------ .../metadata/profiler/source/metadata_ext.py | 37 +++++----- ingestion/src/metadata/utils/importer.py | 10 +-- ingestion/src/metadata/utils/lru_cache.py | 6 +- ingestion/src/metadata/utils/manifest.py | 70 +++++++++++++------ ingestion/tests/cli_e2e/test_cli_redshift.py | 2 +- ingestion/tests/cli_e2e/test_cli_snowflake.py | 3 +- 26 files changed, 203 insertions(+), 137 deletions(-) create mode 100644 ingestion/src/metadata/ingestion/source/database/bigquery/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/datalake/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/db2/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/dynamodb/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/mariadb/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/mongodb/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/singlestore/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/trino/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/unitycatalog/manifest.py diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/manifest.py b/ingestion/src/metadata/ingestion/source/database/bigquery/manifest.py new file mode 100644 index 
000000000000..66ca67dfb501 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/manifest.py @@ -0,0 +1,6 @@ +from metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface import ( + BigQueryProfilerInterface, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +BigqueryManifest = BaseManifest(profler_class=get_class_path(BigQueryProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py b/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py index dd1b85ee5c10..bc745191d1fa 100644 --- a/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py +++ b/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py @@ -1,9 +1,8 @@ -from metadata.ingestion.source.database.databricks.profiler.profiler import ( - DatabricksProfiler, -) -from metadata.ingestion.source.database.redshift.profiler.profiler import ( - RedshiftProfiler, +from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( + DatabricksProfilerInterface, ) from metadata.utils.manifest import BaseManifest, get_class_path -RedshiftManifest = BaseManifest(profler_class=get_class_path(DatabricksProfiler)) +DatabricksManifest = BaseManifest( + profler_class=get_class_path(DatabricksProfilerInterface) +) diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/manifest.py b/ingestion/src/metadata/ingestion/source/database/datalake/manifest.py new file mode 100644 index 000000000000..a1b4e76faf74 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/datalake/manifest.py @@ -0,0 +1,6 @@ +from metadata.profiler.interface.pandas.profiler_interface import ( + PandasProfilerInterface, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +DatalakeManifest = BaseManifest(profler_class=get_class_path(PandasProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/db2/manifest.py 
b/ingestion/src/metadata/ingestion/source/database/db2/manifest.py new file mode 100644 index 000000000000..46e57bc7e006 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/db2/manifest.py @@ -0,0 +1,6 @@ +from metadata.profiler.interface.sqlalchemy.db2.profiler_interface import ( + DB2ProfilerInterface, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +Db2Manifest = BaseManifest(profler_class=get_class_path(DB2ProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/dynamodb/manifest.py b/ingestion/src/metadata/ingestion/source/database/dynamodb/manifest.py new file mode 100644 index 000000000000..1d4980701e8f --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/dynamodb/manifest.py @@ -0,0 +1,4 @@ +from ingestion.tests.integration.profiler.test_nosql_profiler import NoSQLProfiler +from metadata.utils.manifest import BaseManifest, get_class_path + +DyanmodbManifest = BaseManifest(profler_class=get_class_path(NoSQLProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/mariadb/manifest.py b/ingestion/src/metadata/ingestion/source/database/mariadb/manifest.py new file mode 100644 index 000000000000..d1ad846fe3b2 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/mariadb/manifest.py @@ -0,0 +1,6 @@ +from metadata.profiler.interface.sqlalchemy.mariadb.profiler_interface import ( + MariaDBProfilerInterface, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +MariadbManifest = BaseManifest(profler_class=get_class_path(MariaDBProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/mongodb/manifest.py b/ingestion/src/metadata/ingestion/source/database/mongodb/manifest.py new file mode 100644 index 000000000000..f590cde0e28f --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/mongodb/manifest.py @@ -0,0 +1,4 @@ +from ingestion.tests.integration.profiler.test_nosql_profiler import NoSQLProfiler 
+from metadata.utils.manifest import BaseManifest, get_class_path + +MongodbManifest = BaseManifest(profler_class=get_class_path(NoSQLProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py index f7181cb7060f..1b678967d605 100644 --- a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py +++ b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py @@ -1,5 +1,5 @@ from metadata.ingestion.source.database.redshift.profiler.system import ( - RedshiftSystemMetricsSource, + RedshiftSystemMetricsComputer, ) from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, @@ -7,4 +7,4 @@ class RedshiftProfiler(SQAProfilerInterface): - system_metrics_source_class = RedshiftSystemMetricsSource + system_metrics_computer_class = RedshiftSystemMetricsComputer diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py index 22fd06edfe5a..6069a6ee10f9 100644 --- a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py +++ b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py @@ -9,9 +9,10 @@ ) from metadata.profiler.metrics.system.dml_operation import DatabaseDMLOperations from metadata.profiler.metrics.system.system import ( - BaseSystemMetricsSource, - SQASessionProvider, CacheProvider, + EmptySystemMetricsSource, + SQASessionProvider, + SystemMetricsComputer, ) from metadata.utils.logger import profiler_logger from metadata.utils.profiler_utils import QueryResult @@ -21,7 +22,7 @@ class RedshiftSystemMetricsSource( - SQASessionProvider, BaseSystemMetricsSource, CacheProvider + SQASessionProvider, EmptySystemMetricsSource, CacheProvider ): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -128,3 
+129,7 @@ def get_metric_result(ddls: List[QueryResult], table_name: str) -> List: for ddl in ddls if ddl.table_name == table_name ] + + +class RedshiftSystemMetricsComputer(SystemMetricsComputer, RedshiftSystemMetricsSource): + pass diff --git a/ingestion/src/metadata/ingestion/source/database/singlestore/manifest.py b/ingestion/src/metadata/ingestion/source/database/singlestore/manifest.py new file mode 100644 index 000000000000..f35cfcf652aa --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/singlestore/manifest.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.singlestore.metadata import SinglestoreSource +from metadata.utils.manifest import BaseManifest, get_class_path + +SinglestoreManifest = BaseManifest(profler_class=get_class_path(SinglestoreSource)) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py index fa7438c2511a..fc1dd245f1dc 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py @@ -30,4 +30,4 @@ class SnowflakeProfiler(SnowflakeProfilerInterface): sqlalchemy. 
""" - system_metrics_source_class = SnowflakeSystemMetricsSource + system_metrics_computer_class = SnowflakeSystemMetricsSource diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py index 26c247eed9e6..95555eb04fd0 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py @@ -1,29 +1,27 @@ +import hashlib import re import traceback -from typing import List, Optional, Tuple, Any, TypeVar, Callable +from typing import List, Optional, Tuple import sqlalchemy.orm from pydantic import TypeAdapter from sqlalchemy.orm import DeclarativeMeta -from metadata.generated.schema.entity.data.table import SystemProfile, DmlOperationType +from metadata.generated.schema.entity.data.table import DmlOperationType, SystemProfile from metadata.ingestion.source.database.snowflake.models import ( SnowflakeQueryLogEntry, SnowflakeQueryResult, ) -from metadata.profiler.metrics.system.dml_operation import ( - DatabaseDMLOperations, -) +from metadata.profiler.metrics.system.dml_operation import DatabaseDMLOperations from metadata.profiler.metrics.system.system import ( - BaseSystemMetricsSource, - SQASessionProvider, CacheProvider, + EmptySystemMetricsSource, + SQASessionProvider, ) from metadata.utils.logger import profiler_logger from metadata.utils.lru_cache import LRU_CACHE_SIZE, LRUCache -from metadata.utils.profiler_utils import get_identifiers_from_string, QueryResult +from metadata.utils.profiler_utils import get_identifiers_from_string from metadata.utils.time_utils import datetime_to_timestamp -import hashlib PUBLIC_SCHEMA = "PUBLIC" logger = profiler_logger() @@ -264,7 +262,7 @@ def get_snowflake_system_queries( class SnowflakeSystemMetricsSource( - SQASessionProvider, BaseSystemMetricsSource, CacheProvider[SnowflakeQueryLogEntry] + SQASessionProvider, 
EmptySystemMetricsSource, CacheProvider[SnowflakeQueryLogEntry] ): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -379,8 +377,11 @@ def get_queries(self, table: str) -> List[SnowflakeQueryResult]: session=super().get_session(), tablename=table, ) - results = [get_snowflake_system_queries( - query_log_entry=row, - resolver=self.resolver, - ) for row in queries] + results = [ + get_snowflake_system_queries( + query_log_entry=row, + resolver=self.resolver, + ) + for row in queries + ] return [result for result in results if result is not None] diff --git a/ingestion/src/metadata/ingestion/source/database/trino/manifest.py b/ingestion/src/metadata/ingestion/source/database/trino/manifest.py new file mode 100644 index 000000000000..88eb9499f6c2 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/trino/manifest.py @@ -0,0 +1,6 @@ +from metadata.profiler.interface.sqlalchemy.trino.profiler_interface import ( + TrinoProfilerInterface, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +TrinoManifest = BaseManifest(profler_class=get_class_path(TrinoProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/manifest.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/manifest.py new file mode 100644 index 000000000000..aea1e93c9823 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/manifest.py @@ -0,0 +1,8 @@ +from metadata.profiler.interface.sqlalchemy.unity_catalog.profiler_interface import ( + UnityCatalogProfilerInterface, +) +from metadata.utils.manifest import BaseManifest, get_class_path + +UnitycatalogManifest = BaseManifest( + profler_class=get_class_path(UnityCatalogProfilerInterface) +) diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface.py b/ingestion/src/metadata/profiler/interface/profiler_interface.py index ef7e7a88d7c1..884ceac4c69c 100644 --- 
a/ingestion/src/metadata/profiler/interface/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface.py @@ -15,9 +15,8 @@ """ from abc import ABC, abstractmethod -from typing import Any, Dict, List, Optional, Union, Type +from typing import Any, Dict, List, Optional, Type, Union -from metadata.profiler.metrics.system.system import System from sqlalchemy import Column from metadata.generated.schema.entity.data.database import ( @@ -61,6 +60,7 @@ ) from metadata.profiler.metrics.core import MetricTypes from metadata.profiler.metrics.registry import Metrics +from metadata.profiler.metrics.system.system import System from metadata.profiler.processor.runner import QueryRunner from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT from metadata.utils.partition import get_partition_details diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py index 5b456ddfc4ca..be4ce89d5629 100644 --- a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py @@ -57,9 +57,7 @@ class ProfilerInterfaceFactory(Factory): def create(self, interface_type: str, *args, **kwargs): """Create interface object based on interface type""" - - - + interface_class_path = profiler_class_mapping.get( interface_type, profiler_class_mapping[DatabaseConnection.__name__] ) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index e451ea793567..2cfc3751bf12 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -29,8 +29,8 @@ from metadata.generated.schema.entity.data.table import ( CustomMetricProfile, - TableData, SystemProfile, + TableData, ) from 
metadata.generated.schema.tests.customMetric import CustomMetric from metadata.ingestion.connections.session import create_and_bind_thread_safe_session @@ -42,7 +42,7 @@ from metadata.profiler.metrics.static.mean import Mean from metadata.profiler.metrics.static.stddev import StdDev from metadata.profiler.metrics.static.sum import Sum -from metadata.profiler.metrics.system.system import System, BaseSystemMetricsSource +from metadata.profiler.metrics.system.system import System, SystemMetricsComputer from metadata.profiler.orm.functions.table_metric_computer import TableMetricComputer from metadata.profiler.orm.registry import Dialects from metadata.profiler.processor.metric_filter import MetricFilter @@ -76,7 +76,7 @@ class SQAProfilerInterface(ProfilerInterface, SQAInterfaceMixin): # pylint: disable=too-many-arguments - system_metrics_source_class = BaseSystemMetricsSource + system_metrics_computer_class = SystemMetricsComputer def __init__( self, @@ -112,7 +112,7 @@ def __init__( self._table = self._convert_table_to_orm_object(sqa_metadata) self.create_session() - self.system_metrics_computer = self.system_metrics_source_class( + self.system_metrics_computer = self.system_metrics_computer_class( session=self.session ) diff --git a/ingestion/src/metadata/profiler/metrics/core.py b/ingestion/src/metadata/profiler/metrics/core.py index 23f00264aedc..9acba468dc7d 100644 --- a/ingestion/src/metadata/profiler/metrics/core.py +++ b/ingestion/src/metadata/profiler/metrics/core.py @@ -18,7 +18,7 @@ from abc import ABC, abstractmethod from enum import Enum from functools import wraps -from typing import Any, Callable, Dict, Optional, Tuple, TypeVar, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, TypeVar from sqlalchemy import Column from sqlalchemy.orm import DeclarativeMeta, Session diff --git a/ingestion/src/metadata/profiler/metrics/system/system.py b/ingestion/src/metadata/profiler/metrics/system/system.py index 
05892232312e..526c0b8de602 100644 --- a/ingestion/src/metadata/profiler/metrics/system/system.py +++ b/ingestion/src/metadata/profiler/metrics/system/system.py @@ -14,9 +14,9 @@ """ import traceback -from abc import ABC, abstractmethod +from abc import ABC from collections import defaultdict -from typing import Dict, List, Optional, Callable, Generic, TypeVar +from typing import Callable, Dict, Generic, List, Optional, TypeVar from pydantic import TypeAdapter from sqlalchemy import text @@ -41,7 +41,7 @@ from metadata.utils.dispatch import valuedispatch from metadata.utils.helpers import deep_size_of_dict from metadata.utils.logger import profiler_logger -from metadata.utils.lru_cache import LRUCache, LRU_CACHE_SIZE +from metadata.utils.lru_cache import LRU_CACHE_SIZE, LRUCache from metadata.utils.profiler_utils import get_value_from_cache, set_cache from metadata.utils.time_utils import datetime_to_timestamp @@ -78,7 +78,10 @@ def get_or_update_cache( return result -class BaseSystemMetricsSource: +class EmptySystemMetricsSource: + """Empty system metrics source that can be used as a default. Just returns an empty list of system metrics + for any resource.""" + def __init__(self, *args, **kwargs): kwargs.pop("session", None) if len(args) > 0: @@ -87,38 +90,37 @@ def __init__(self, *args, **kwargs): logger.warning("Received unexpected keyword arguments: %s", kwargs) super().__init__() - def get_system_metrics(self, *args, **kwargs) -> List[SystemProfile]: - """Return system metrics for a given table. Actual passed object can be a variety of types based - on the underlying infrastructure. 
For example, in the case of SQLalchemy, it can be a Table object - and in the case of Mongo, it can be a collection object.""" - kwargs = self.get_kwargs(*args, **kwargs) - return ( - self.get_inserts(**kwargs) - + self.get_deletes(**kwargs) - + self.get_updates(**kwargs) - ) - - def get_kwargs(self, *args, **kwargs): - return {} - - def get_inserts( - self, database: str, schema: str, table: str - ) -> List[SystemProfile]: + def get_inserts(self, *args, **kwargs) -> List[SystemProfile]: """Get insert queries""" return [] - def get_deletes( - self, database: str, schema: str, table: str - ) -> List[SystemProfile]: + def get_deletes(self, *args, **kwargs) -> List[SystemProfile]: """Get delete queries""" return [] - def get_updates( - self, database: str, schema: str, table: str - ) -> List[SystemProfile]: + def get_updates(self, *args, **kwargs) -> List[SystemProfile]: """Get update queries""" return [] + def get_kwargs(self, *args, **kwargs): + return {} + + +class SystemMetricsComputer(EmptySystemMetricsSource): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_system_metrics(self, *args, **kwargs) -> List[SystemProfile]: + """Return system metrics for a given table. Actual passed object can be a variety of types based + on the underlying infrastructure. 
For example, in the case of SQLalchemy, it can be a Table object + and in the case of Mongo, it can be a collection object.""" + kwargs = super().get_kwargs(*args, **kwargs) + return ( + super().get_inserts(**kwargs) + + super().get_deletes(**kwargs) + + super().get_updates(**kwargs) + ) + class SQASessionProvider: def __init__(self, *args, **kwargs): diff --git a/ingestion/src/metadata/profiler/source/base/profiler_source.py b/ingestion/src/metadata/profiler/source/base/profiler_source.py index 7fbb77cb2618..41d7db9fb98c 100644 --- a/ingestion/src/metadata/profiler/source/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/base/profiler_source.py @@ -13,11 +13,9 @@ Base source for the profiler used to instantiate a profiler runner with its interface """ -import traceback from copy import deepcopy -from typing import List, Optional, Tuple, cast, Type +from typing import List, Optional, Tuple, Type, cast -from docker.models.services import Service from sqlalchemy import MetaData from metadata.generated.schema.configuration.profilerConfiguration import ( @@ -43,21 +41,13 @@ from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.profiler.api.models import ProfilerProcessorConfig, TableConfig from metadata.profiler.interface.profiler_interface import ProfilerInterface -from metadata.profiler.interface.sqlalchemy.profiler_interface import ( - SQAProfilerInterface, -) from metadata.profiler.metrics.registry import Metrics from metadata.profiler.processor.core import Profiler from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics from metadata.profiler.source.profiler_source_interface import ProfilerSourceInterface -from metadata.utils.importer import ( - import_source_class, - DynamicImportException, - import_profiler_class, - import_from_module, -) +from metadata.utils.importer import import_from_module from metadata.utils.logger import profiler_logger -from metadata.utils.manifest import BaseManifest, 
get_class_path +from metadata.utils.manifest import BaseManifest NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,) @@ -226,17 +216,10 @@ def create_profiler_interface( def import_profiler_class( self, service_type: ServiceType, source_type: str ) -> Type[ProfilerInterface]: - try: - class_path = BaseManifest.get_for_source( - service_type, source_type - ).profler_class - if class_path is None: - class_path = get_class_path(SQAProfilerInterface) - return cast(Type[ProfilerInterface], import_from_module(class_path)) - except DynamicImportException as e: - logger.debug(traceback.format_exc()) - logger.debug(f"Failed to import profiler for source '{source_type}'") - return SQAProfilerInterface + class_path = BaseManifest.get_for_source( + service_type, source_type + ).profler_class + return cast(Type[ProfilerInterface], import_from_module(class_path)) def _get_context_entities( self, entity: Table diff --git a/ingestion/src/metadata/profiler/source/metadata_ext.py b/ingestion/src/metadata/profiler/source/metadata_ext.py index 363a0bdea4f2..891236c90ff6 100644 --- a/ingestion/src/metadata/profiler/source/metadata_ext.py +++ b/ingestion/src/metadata/profiler/source/metadata_ext.py @@ -21,7 +21,7 @@ """ import traceback from copy import deepcopy -from typing import Iterable, cast, Type +from typing import Iterable, Type, cast from sqlalchemy.inspection import inspect @@ -29,6 +29,7 @@ from metadata.generated.schema.entity.services.ingestionPipelines.status import ( StackTraceError, ) +from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.generated.schema.metadataIngestion.databaseServiceMetadataPipeline import ( DatabaseServiceMetadataPipeline, ) @@ -38,12 +39,8 @@ from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, ) -from metadata.generated.schema.tests.testSuite import ServiceType from metadata.ingestion.api.models import Either from metadata.ingestion.ometa.ometa_api import 
OpenMetadata -from metadata.profiler.interface.sqlalchemy.profiler_interface import ( - SQAProfilerInterface, -) from metadata.profiler.source.metadata import ( OpenMetadataSource, ProfilerSourceAndEntity, @@ -52,13 +49,12 @@ from metadata.utils.class_helper import get_service_type_from_source_type from metadata.utils.filters import filter_by_database, filter_by_schema, filter_by_table from metadata.utils.importer import ( - import_source_class, - import_profiler_class, ProfilerInterface, import_from_module, + import_source_class, ) from metadata.utils.logger import profiler_logger -from metadata.utils.manifest import BaseManifest, get_class_path +from metadata.utils.manifest import BaseManifest from metadata.utils.ssl_manager import get_ssl_connection logger = profiler_logger() @@ -154,18 +150,15 @@ def _iter(self, *_, **__) -> Iterable[Either[ProfilerSourceAndEntity]]: ) continue - class_path = BaseManifest.get_for_source( - ServiceType.Database, - source_type=self.config.source.type.lower(), - ).profler_class - if class_path is None: - class_path = get_class_path(SQAProfilerInterface) + profiler_source = self.import_profiler_interface()( + self.config, + database_entity, + self.metadata, + global_profiler_config, + ) yield Either( right=ProfilerSourceAndEntity( - profiler_source=cast( - Type[ProfilerInterface], - import_from_module(class_path), - ), + profiler_source=profiler_source, entity=table_entity, ) ) @@ -185,6 +178,14 @@ def get_table_names(self, schema_name: str) -> Iterable[str]: continue yield table_name + def import_profiler_interface(self) -> Type[ProfilerInterface]: + class_path = BaseManifest.get_for_source( + ServiceType.Database, + source_type=self.config.source.type.lower(), + ).profler_class + profiler_source_class = import_from_module(class_path) + return profiler_source_class + def get_schema_names(self) -> Iterable[str]: if self.service_connection.__dict__.get("databaseSchema"): yield self.service_connection.databaseSchema diff --git 
a/ingestion/src/metadata/utils/importer.py b/ingestion/src/metadata/utils/importer.py index 03226638bdee..13d22b235f8d 100644 --- a/ingestion/src/metadata/utils/importer.py +++ b/ingestion/src/metadata/utils/importer.py @@ -15,7 +15,7 @@ import sys import traceback from enum import Enum -from typing import Any, Callable, Optional, Type, TypeVar, TYPE_CHECKING +from typing import TYPE_CHECKING, Any, Callable, Optional, Type, TypeVar from pydantic import BaseModel @@ -24,13 +24,13 @@ OpenMetadataConnection, ) from metadata.generated.schema.entity.services.databaseService import ( - DatabaseService, DatabaseConnection, + DatabaseService, ) from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.generated.schema.metadataIngestion.workflow import Sink as WorkflowSink from metadata.ingestion.api.steps import BulkSink, Processor, Sink, Source, Stage -from metadata.profiler.metrics.system.system import BaseSystemMetricsSource +from metadata.profiler.metrics.system.system import EmptySystemMetricsSource from metadata.utils.class_helper import get_service_type_from_source_type from metadata.utils.client_version import get_client_version from metadata.utils.constants import CUSTOM_CONNECTOR_PREFIX @@ -136,7 +136,7 @@ def import_from_module(key: str) -> Type[Any]: """ Dynamically import an object from a module path """ - + logger.debug("Importing: %s", key) module_name, obj_name = key.rsplit(MODULE_SEPARATOR, 1) try: obj = getattr(importlib.import_module(module_name), obj_name) @@ -344,4 +344,4 @@ def import_system_metrics_computer(db_service: DatabaseService): ) except DynamicImportException as err: logger.debug("Could not import system metrics computer: %s", err) - return BaseSystemMetricsSource + return EmptySystemMetricsSource diff --git a/ingestion/src/metadata/utils/lru_cache.py b/ingestion/src/metadata/utils/lru_cache.py index 54712e78f51e..3ddb3f23a64b 100644 --- a/ingestion/src/metadata/utils/lru_cache.py +++ 
b/ingestion/src/metadata/utils/lru_cache.py @@ -13,10 +13,9 @@ LRU cache """ -from collections import OrderedDict -from typing import TypeVar, Generic, Callable import threading - +from collections import OrderedDict +from typing import Callable, Generic, TypeVar LRU_CACHE_SIZE = 4096 @@ -59,7 +58,6 @@ def __len__(self) -> int: return len(self._cache) def wrap(self, key_func: Callable[..., str]): - def wrapper(func: Callable[..., T]): def wrapped(*args, **kwargs) -> T: key = key_func(*args, **kwargs) diff --git a/ingestion/src/metadata/utils/manifest.py b/ingestion/src/metadata/utils/manifest.py index 2948632eec09..d6261ec909f5 100644 --- a/ingestion/src/metadata/utils/manifest.py +++ b/ingestion/src/metadata/utils/manifest.py @@ -1,12 +1,17 @@ -from typing import cast +""" +Manifests are used to store class information +""" -from metadata.generated.schema.tests.testSuite import ServiceType +from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.ingestion.models.custom_pydantic import BaseModel +from metadata.profiler.interface.sqlalchemy.profiler_interface import ( + SQAProfilerInterface, +) from metadata.utils.importer import ( - import_from_module, - get_module_dir, - get_source_module_name, + DynamicImportException, get_class_name_root, + get_module_dir, + import_from_module, ) @@ -17,15 +22,13 @@ class BaseManifest(BaseModel): 2. We can dynamically import the class when needed and avoid dependency issues. 3. We avoid circular imports. 4. We can hot-swap the class implementation without changing the manifest (example: for testing). - + + # TODO: naming? - Dyanmic factory? - Different name the class name? - - - - # TODO - - add a "default" manifest that will provide "sensible" default like SQAProfilerInterface or based on entity type / service type - + + # TODO: functionality + - is this expected to be a extended? 
""" profler_class: str @@ -34,18 +37,43 @@ class BaseManifest(BaseModel): def get_for_source( cls, service_type: ServiceType, source_type: str, from_: str = "ingestion" ) -> "BaseManifest": - return cls.model_validate( - import_from_module( - "metadata.{}.source.{}.{}.{}.{}Manifest".format( - from_, - service_type.name.lower(), - get_module_dir(source_type), - "manifest", - get_class_name_root(source_type), + """Retrieves the manifest for a given source type. If it does not exist will attempt to retrieve + a default manifest for the service type. + + Args: + service_type (ServiceType): The service type. + source_type (str): The source type. + from_ (str, optional): The module to import from. Defaults to "ingestion". + + Returns: + BaseManifest: The manifest for the source type. + """ + try: + return cls.model_validate( + import_from_module( + "metadata.{}.source.{}.{}.{}.{}Manifest".format( # pylint: disable=C0209 + from_, + service_type.name.lower(), + get_module_dir(source_type), + "manifest", + get_class_name_root(source_type), + ) ) ) - ) + except DynamicImportException: + try: + return DEFAULT_MANIFEST_MAP[service_type] + except KeyError: + raise RuntimeError(f"No manifest found for source type: {source_type}") def get_class_path(module): return module.__module__ + "." 
+ module.__name__ + + +DefaultDatabaseManifest = BaseManifest( + profler_class=get_class_path(SQAProfilerInterface) +) + + +DEFAULT_MANIFEST_MAP = {ServiceType.Database: DefaultDatabaseManifest} diff --git a/ingestion/tests/cli_e2e/test_cli_redshift.py b/ingestion/tests/cli_e2e/test_cli_redshift.py index 6d94d3286b39..0c96b3c4af85 100644 --- a/ingestion/tests/cli_e2e/test_cli_redshift.py +++ b/ingestion/tests/cli_e2e/test_cli_redshift.py @@ -16,7 +16,7 @@ from typing import List from _openmetadata_testutils.pydantic.test_utils import assert_equal_pydantic_objects -from metadata.generated.schema.entity.data.table import SystemProfile, DmlOperationType +from metadata.generated.schema.entity.data.table import DmlOperationType, SystemProfile from metadata.generated.schema.type.basic import Timestamp from metadata.ingestion.api.status import Status diff --git a/ingestion/tests/cli_e2e/test_cli_snowflake.py b/ingestion/tests/cli_e2e/test_cli_snowflake.py index 9898bdb8e307..cb483ab2c4df 100644 --- a/ingestion/tests/cli_e2e/test_cli_snowflake.py +++ b/ingestion/tests/cli_e2e/test_cli_snowflake.py @@ -170,7 +170,8 @@ def test_create_table_with_profiler(self) -> None: self.system_profile_assertions() @staticmethod - def expected_tables() -> int: return 7 + def expected_tables() -> int: + return 7 def inserted_rows_count(self) -> int: return len( From 146025c2192afdf69280d24b3929facb2238cf81 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Fri, 18 Oct 2024 15:14:44 +0200 Subject: [PATCH 03/18] - implement spec for all source types - added docs for the new specification - added some pylint ignores in the importer module --- .../ingestion/source/api/rest/service_spec.py | 4 + .../metadata/ingestion/source/connections.py | 7 +- .../dashboard/domodashboard/service_spec.py | 6 + .../dashboard/lightdash/service_spec.py | 4 + .../source/dashboard/looker/service_spec.py | 4 + .../source/dashboard/metabase/service_spec.py | 4 + .../source/dashboard/mode/service_spec.py | 4 + 
.../source/dashboard/mstr/service_spec.py | 4 + .../source/dashboard/powerbi/service_spec.py | 4 + .../dashboard/qlikcloud/service_spec.py | 4 + .../dashboard/qliksense/service_spec.py | 4 + .../dashboard/quicksight/service_spec.py | 4 + .../source/dashboard/redash/service_spec.py | 4 + .../source/dashboard/sigma/service_spec.py | 4 + .../source/dashboard/superset/service_spec.py | 4 + .../source/dashboard/tableau/service_spec.py | 4 + .../source/database/athena/service_spec.py | 4 + .../source/database/azuresql/service_spec.py | 4 + .../source/database/bigquery/manifest.py | 6 - .../source/database/bigquery/service_spec.py | 9 ++ .../source/database/bigtable/service_spec.py | 4 + .../database/clickhouse/service_spec.py | 4 + .../source/database/couchbase/service_spec.py | 4 + .../source/database/databricks/manifest.py | 8 -- .../database/databricks/service_spec.py | 9 ++ .../source/database/datalake/manifest.py | 6 - .../source/database/datalake/service_spec.py | 9 ++ .../ingestion/source/database/db2/manifest.py | 6 - .../source/database/db2/service_spec.py | 9 ++ .../source/database/dbt/service_spec.py | 4 + .../source/database/deltalake/service_spec.py | 4 + .../database/domodatabase/service_spec.py | 4 + .../source/database/doris/service_spec.py | 4 + .../source/database/druid/service_spec.py | 4 + .../source/database/dynamodb/manifest.py | 4 - .../source/database/dynamodb/service_spec.py | 7 ++ .../source/database/glue/service_spec.py | 4 + .../source/database/greenplum/service_spec.py | 4 + .../source/database/hive/service_spec.py | 4 + .../source/database/iceberg/service_spec.py | 4 + .../source/database/impala/service_spec.py | 4 + .../source/database/mariadb/manifest.py | 6 - .../source/database/mariadb/service_spec.py | 4 + .../source/database/mongodb/manifest.py | 4 - .../source/database/mongodb/service_spec.py | 7 ++ .../source/database/mssql/service_spec.py | 4 + .../source/database/mysql/service_spec.py | 4 + 
.../source/database/oracle/service_spec.py | 4 + .../source/database/pinotdb/service_spec.py | 4 + .../source/database/postgres/service_spec.py | 4 + .../source/database/presto/service_spec.py | 4 + .../source/database/redshift/manifest.py | 6 - .../source/database/redshift/service_spec.py | 9 ++ .../database/salesforce/service_spec.py | 4 + .../source/database/saperp/service_spec.py | 4 + .../source/database/saphana/service_spec.py | 4 + .../source/database/sas/service_spec.py | 4 + .../source/database/singlestore/manifest.py | 4 - .../database/singlestore/service_spec.py | 9 ++ .../source/database/snowflake/manifest.py | 6 - .../source/database/snowflake/service_spec.py | 9 ++ .../source/database/sqlite/service_spec.py | 4 + .../source/database/teradata/service_spec.py | 4 + .../source/database/trino/manifest.py | 6 - .../source/database/trino/service_spec.py | 9 ++ .../source/database/unitycatalog/manifest.py | 8 -- .../database/unitycatalog/service_spec.py | 10 ++ .../source/database/vertica/service_spec.py | 6 + .../source/messaging/kafka/service_spec.py | 4 + .../source/messaging/kinesis/service_spec.py | 4 + .../source/messaging/redpanda/service_spec.py | 4 + .../metadata/alationsink/service_spec.py | 4 + .../source/metadata/amundsen/service_spec.py | 4 + .../source/metadata/atlas/service_spec.py | 4 + .../source/mlmodel/mlflow/service_spec.py | 4 + .../source/mlmodel/sagemaker/service_spec.py | 4 + .../source/pipeline/airbyte/service_spec.py | 4 + .../source/pipeline/airflow/service_spec.py | 4 + .../source/pipeline/dagster/service_spec.py | 4 + .../databrickspipeline/service_spec.py | 6 + .../source/pipeline/dbtcloud/service_spec.py | 4 + .../pipeline/domopipeline/service_spec.py | 4 + .../source/pipeline/fivetran/service_spec.py | 4 + .../source/pipeline/flink/service_spec.py | 4 + .../pipeline/gluepipeline/service_spec.py | 4 + .../pipeline/kafkaconnect/service_spec.py | 4 + .../source/pipeline/nifi/service_spec.py | 4 + 
.../pipeline/openlineage/service_spec.py | 4 + .../source/pipeline/spline/service_spec.py | 4 + .../search/elasticsearch/service_spec.py | 4 + .../source/storage/gcs/service_spec.py | 4 + .../source/storage/s3/service_spec.py | 4 + .../interface/profiler_interface_factory.py | 108 ------------------ .../profiler/source/base/profiler_source.py | 11 +- .../metadata/profiler/source/metadata_ext.py | 6 +- ingestion/src/metadata/utils/importer.py | 60 +++------- ingestion/src/metadata/utils/logger.py | 15 +++ ingestion/src/metadata/utils/manifest.py | 79 ------------- .../metadata/utils/service_spec/__init__.py | 5 + .../metadata/utils/service_spec/default.py | 14 +++ .../utils/service_spec/service_spec.py | 77 +++++++++++++ .../develop-ingestion-code.md | 6 + .../releases/releases/index.md | 9 ++ 103 files changed, 527 insertions(+), 314 deletions(-) create mode 100644 ingestion/src/metadata/ingestion/source/api/rest/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/domodashboard/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/lightdash/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/looker/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/metabase/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/mode/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/mstr/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/powerbi/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/qlikcloud/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/qliksense/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/quicksight/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/redash/service_spec.py create mode 100644 
ingestion/src/metadata/ingestion/source/dashboard/sigma/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/superset/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/dashboard/tableau/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/athena/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/bigquery/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/bigtable/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/couchbase/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/databricks/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/datalake/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/db2/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/db2/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/dbt/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/deltalake/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/domodatabase/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/doris/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/druid/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/dynamodb/manifest.py 
create mode 100644 ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/glue/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/greenplum/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/hive/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/iceberg/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/impala/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/mariadb/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/mariadb/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/mongodb/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/mysql/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/pinotdb/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/presto/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/redshift/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/salesforce/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/saperp/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py create mode 100644 
ingestion/src/metadata/ingestion/source/database/sas/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/singlestore/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/singlestore/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/snowflake/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/sqlite/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/teradata/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/trino/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/trino/service_spec.py delete mode 100644 ingestion/src/metadata/ingestion/source/database/unitycatalog/manifest.py create mode 100644 ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/messaging/kafka/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/messaging/kinesis/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/messaging/redpanda/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/metadata/alationsink/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/metadata/amundsen/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/metadata/atlas/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/mlmodel/mlflow/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/mlmodel/sagemaker/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/airbyte/service_spec.py create mode 100644 
ingestion/src/metadata/ingestion/source/pipeline/airflow/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/dagster/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/databrickspipeline/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/dbtcloud/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/domopipeline/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/fivetran/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/flink/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/gluepipeline/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/kafkaconnect/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/nifi/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/openlineage/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/pipeline/spline/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/search/elasticsearch/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/storage/gcs/service_spec.py create mode 100644 ingestion/src/metadata/ingestion/source/storage/s3/service_spec.py delete mode 100644 ingestion/src/metadata/profiler/interface/profiler_interface_factory.py delete mode 100644 ingestion/src/metadata/utils/manifest.py create mode 100644 ingestion/src/metadata/utils/service_spec/__init__.py create mode 100644 ingestion/src/metadata/utils/service_spec/default.py create mode 100644 ingestion/src/metadata/utils/service_spec/service_spec.py diff --git a/ingestion/src/metadata/ingestion/source/api/rest/service_spec.py b/ingestion/src/metadata/ingestion/source/api/rest/service_spec.py new file mode 100644 index 000000000000..63f4439254d3 --- /dev/null +++ 
b/ingestion/src/metadata/ingestion/source/api/rest/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.api.rest.metadata import RestSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=RestSource) diff --git a/ingestion/src/metadata/ingestion/source/connections.py b/ingestion/src/metadata/ingestion/source/connections.py index d7f58234adc7..42eba3dd57de 100644 --- a/ingestion/src/metadata/ingestion/source/connections.py +++ b/ingestion/src/metadata/ingestion/source/connections.py @@ -19,8 +19,13 @@ from pydantic import BaseModel from sqlalchemy.engine import Engine -from metadata.utils.importer import import_connection_fn +# isort: off +# logger is a root library and needs to be imported first avoid circular imports from metadata.utils.logger import cli_logger +from metadata.utils.importer import import_connection_fn + +# isort: on + logger = cli_logger() diff --git a/ingestion/src/metadata/ingestion/source/dashboard/domodashboard/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/domodashboard/service_spec.py new file mode 100644 index 000000000000..005964bb2e0f --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/domodashboard/service_spec.py @@ -0,0 +1,6 @@ +from metadata.ingestion.source.dashboard.domodashboard.metadata import ( + DomodashboardSource, +) +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=DomodashboardSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/lightdash/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/lightdash/service_spec.py new file mode 100644 index 000000000000..f9e645eae315 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/lightdash/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.lightdash.metadata import LightdashSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = 
BaseSpec(metadata_source_class=LightdashSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/looker/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/looker/service_spec.py new file mode 100644 index 000000000000..660d08142402 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/looker/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.looker.metadata import LookerSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=LookerSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/metabase/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/metabase/service_spec.py new file mode 100644 index 000000000000..830fdd8eb208 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/metabase/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.metabase.metadata import MetabaseSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=MetabaseSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/mode/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/mode/service_spec.py new file mode 100644 index 000000000000..fb39c9435e75 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/mode/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.mode.metadata import ModeSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=ModeSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/mstr/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/mstr/service_spec.py new file mode 100644 index 000000000000..f2ea7e03df8a --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/mstr/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.mstr.metadata import MstrSource +from 
metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=MstrSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/powerbi/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/service_spec.py new file mode 100644 index 000000000000..dd6c2f4053be --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/powerbi/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.powerbi.metadata import PowerbiSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=PowerbiSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/qlikcloud/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/qlikcloud/service_spec.py new file mode 100644 index 000000000000..e59a7c3c4e24 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/qlikcloud/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.qlikcloud.metadata import QlikcloudSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=QlikcloudSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/qliksense/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/qliksense/service_spec.py new file mode 100644 index 000000000000..bedf3c5ca269 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/qliksense/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.qliksense.metadata import QliksenseSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=QliksenseSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/quicksight/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/quicksight/service_spec.py new file mode 100644 index 000000000000..c8549572623e --- /dev/null +++ 
b/ingestion/src/metadata/ingestion/source/dashboard/quicksight/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.quicksight.metadata import QuicksightSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=QuicksightSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/redash/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/redash/service_spec.py new file mode 100644 index 000000000000..c5557d7732b0 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/redash/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.redash.metadata import RedashSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=RedashSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/sigma/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/sigma/service_spec.py new file mode 100644 index 000000000000..ddec090a90be --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/sigma/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.sigma.metadata import SigmaSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=SigmaSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/superset/service_spec.py b/ingestion/src/metadata/ingestion/source/dashboard/superset/service_spec.py new file mode 100644 index 000000000000..c01d75ba1707 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/superset/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.superset.metadata import SupersetSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=SupersetSource) diff --git a/ingestion/src/metadata/ingestion/source/dashboard/tableau/service_spec.py 
b/ingestion/src/metadata/ingestion/source/dashboard/tableau/service_spec.py new file mode 100644 index 000000000000..5743360ec108 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/dashboard/tableau/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.dashboard.tableau.metadata import TableauSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=TableauSource) diff --git a/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py b/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py new file mode 100644 index 000000000000..64d59936af31 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.athena.metadata import AthenaSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=AthenaSource) diff --git a/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py new file mode 100644 index 000000000000..305ce57fa3f6 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.azuresql.metadata import AzuresqlSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=AzuresqlSource) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/manifest.py b/ingestion/src/metadata/ingestion/source/database/bigquery/manifest.py deleted file mode 100644 index 66ca67dfb501..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/manifest.py +++ /dev/null @@ -1,6 +0,0 @@ -from metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface import ( - BigQueryProfilerInterface, -) -from metadata.utils.manifest import BaseManifest, get_class_path 
- -BigqueryManifest = BaseManifest(profler_class=get_class_path(BigQueryProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py b/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py new file mode 100644 index 000000000000..fb0db5a64edb --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.bigquery.metadata import BigquerySource +from metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface import ( + BigQueryProfilerInterface, +) +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=BigquerySource, profiler_class=BigQueryProfilerInterface +) diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/service_spec.py b/ingestion/src/metadata/ingestion/source/database/bigtable/service_spec.py new file mode 100644 index 000000000000..f002a9315462 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/bigtable/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.bigtable.metadata import BigtableSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=BigtableSource) diff --git a/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py b/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py new file mode 100644 index 000000000000..f43fc6ed8224 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.clickhouse.metadata import ClickhouseSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=ClickhouseSource) diff --git a/ingestion/src/metadata/ingestion/source/database/couchbase/service_spec.py 
b/ingestion/src/metadata/ingestion/source/database/couchbase/service_spec.py new file mode 100644 index 000000000000..2f6bbb602341 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/couchbase/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.couchbase.metadata import CouchbaseSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=CouchbaseSource) diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py b/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py deleted file mode 100644 index bc745191d1fa..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/databricks/manifest.py +++ /dev/null @@ -1,8 +0,0 @@ -from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( - DatabricksProfilerInterface, -) -from metadata.utils.manifest import BaseManifest, get_class_path - -DatabricksManifest = BaseManifest( - profler_class=get_class_path(DatabricksProfilerInterface) -) diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py b/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py new file mode 100644 index 000000000000..463d6508f959 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.databricks.metadata import DatabricksSource +from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( + DatabricksProfilerInterface, +) +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=DatabricksSource, profiler_class=DatabricksProfilerInterface +) diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/manifest.py b/ingestion/src/metadata/ingestion/source/database/datalake/manifest.py deleted file mode 100644 index a1b4e76faf74..000000000000 
--- a/ingestion/src/metadata/ingestion/source/database/datalake/manifest.py +++ /dev/null @@ -1,6 +0,0 @@ -from metadata.profiler.interface.pandas.profiler_interface import ( - PandasProfilerInterface, -) -from metadata.utils.manifest import BaseManifest, get_class_path - -DatalakeManifest = BaseManifest(profler_class=get_class_path(PandasProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py new file mode 100644 index 000000000000..234af15692f1 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.datalake.metadata import DatalakeSource +from metadata.profiler.interface.pandas.profiler_interface import ( + PandasProfilerInterface, +) +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=DatalakeSource, profiler_class=PandasProfilerInterface +) diff --git a/ingestion/src/metadata/ingestion/source/database/db2/manifest.py b/ingestion/src/metadata/ingestion/source/database/db2/manifest.py deleted file mode 100644 index 46e57bc7e006..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/db2/manifest.py +++ /dev/null @@ -1,6 +0,0 @@ -from metadata.profiler.interface.sqlalchemy.db2.profiler_interface import ( - DB2ProfilerInterface, -) -from metadata.utils.manifest import BaseManifest, get_class_path - -Db2Manifest = BaseManifest(profler_class=get_class_path(DB2ProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/db2/service_spec.py b/ingestion/src/metadata/ingestion/source/database/db2/service_spec.py new file mode 100644 index 000000000000..c2d904523d45 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/db2/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.db2.metadata import Db2Source +from 
metadata.profiler.interface.sqlalchemy.db2.profiler_interface import ( + DB2ProfilerInterface, +) +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=Db2Source, profiler_class=DB2ProfilerInterface +) diff --git a/ingestion/src/metadata/ingestion/source/database/dbt/service_spec.py b/ingestion/src/metadata/ingestion/source/database/dbt/service_spec.py new file mode 100644 index 000000000000..6126e67432a4 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/dbt/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.dbt.metadata import DbtSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=DbtSource) diff --git a/ingestion/src/metadata/ingestion/source/database/deltalake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/deltalake/service_spec.py new file mode 100644 index 000000000000..ba8886b56f94 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/deltalake/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.deltalake.metadata import DeltalakeSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DeltalakeSource) diff --git a/ingestion/src/metadata/ingestion/source/database/domodatabase/service_spec.py b/ingestion/src/metadata/ingestion/source/database/domodatabase/service_spec.py new file mode 100644 index 000000000000..3192386039ae --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/domodatabase/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.domodatabase.metadata import DomodatabaseSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DomodatabaseSource) diff --git a/ingestion/src/metadata/ingestion/source/database/doris/service_spec.py 
b/ingestion/src/metadata/ingestion/source/database/doris/service_spec.py new file mode 100644 index 000000000000..4e7fda236030 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/doris/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.doris.metadata import DorisSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DorisSource) diff --git a/ingestion/src/metadata/ingestion/source/database/druid/service_spec.py b/ingestion/src/metadata/ingestion/source/database/druid/service_spec.py new file mode 100644 index 000000000000..e3958b254173 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/druid/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.druid.metadata import DruidSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DruidSource) diff --git a/ingestion/src/metadata/ingestion/source/database/dynamodb/manifest.py b/ingestion/src/metadata/ingestion/source/database/dynamodb/manifest.py deleted file mode 100644 index 1d4980701e8f..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/dynamodb/manifest.py +++ /dev/null @@ -1,4 +0,0 @@ -from ingestion.tests.integration.profiler.test_nosql_profiler import NoSQLProfiler -from metadata.utils.manifest import BaseManifest, get_class_path - -DyanmodbManifest = BaseManifest(profler_class=get_class_path(NoSQLProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py new file mode 100644 index 000000000000..3ed0bd0de58c --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py @@ -0,0 +1,7 @@ +from metadata.ingestion.source.database.dynamodb.metadata import DynamodbSource +from metadata.profiler.interface.nosql.profiler_interface import 
NoSQLProfilerInterface +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=DynamodbSource, profiler_class=NoSQLProfilerInterface +) diff --git a/ingestion/src/metadata/ingestion/source/database/glue/service_spec.py b/ingestion/src/metadata/ingestion/source/database/glue/service_spec.py new file mode 100644 index 000000000000..44dcad3d1fa0 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/glue/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.glue.metadata import GlueSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=GlueSource) diff --git a/ingestion/src/metadata/ingestion/source/database/greenplum/service_spec.py b/ingestion/src/metadata/ingestion/source/database/greenplum/service_spec.py new file mode 100644 index 000000000000..43c79633f404 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/greenplum/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.greenplum.metadata import GreenplumSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=GreenplumSource) diff --git a/ingestion/src/metadata/ingestion/source/database/hive/service_spec.py b/ingestion/src/metadata/ingestion/source/database/hive/service_spec.py new file mode 100644 index 000000000000..bd80439fd136 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/hive/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.hive.metadata import HiveSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=HiveSource) diff --git a/ingestion/src/metadata/ingestion/source/database/iceberg/service_spec.py b/ingestion/src/metadata/ingestion/source/database/iceberg/service_spec.py new file mode 100644 index 000000000000..5c31342ece0d --- 
/dev/null +++ b/ingestion/src/metadata/ingestion/source/database/iceberg/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.iceberg.metadata import IcebergSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=IcebergSource) diff --git a/ingestion/src/metadata/ingestion/source/database/impala/service_spec.py b/ingestion/src/metadata/ingestion/source/database/impala/service_spec.py new file mode 100644 index 000000000000..2c81447e9e9f --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/impala/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.impala.metadata import ImpalaSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=ImpalaSource) diff --git a/ingestion/src/metadata/ingestion/source/database/mariadb/manifest.py b/ingestion/src/metadata/ingestion/source/database/mariadb/manifest.py deleted file mode 100644 index d1ad846fe3b2..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/mariadb/manifest.py +++ /dev/null @@ -1,6 +0,0 @@ -from metadata.profiler.interface.sqlalchemy.mariadb.profiler_interface import ( - MariaDBProfilerInterface, -) -from metadata.utils.manifest import BaseManifest, get_class_path - -MariadbManifest = BaseManifest(profler_class=get_class_path(MariaDBProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/mariadb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mariadb/service_spec.py new file mode 100644 index 000000000000..692963cdf9db --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/mariadb/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.mariadb.metadata import MariadbSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=MariadbSource) diff --git 
a/ingestion/src/metadata/ingestion/source/database/mongodb/manifest.py b/ingestion/src/metadata/ingestion/source/database/mongodb/manifest.py deleted file mode 100644 index f590cde0e28f..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/mongodb/manifest.py +++ /dev/null @@ -1,4 +0,0 @@ -from ingestion.tests.integration.profiler.test_nosql_profiler import NoSQLProfiler -from metadata.utils.manifest import BaseManifest, get_class_path - -MongodbManifest = BaseManifest(profler_class=get_class_path(NoSQLProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py new file mode 100644 index 000000000000..e60602540de9 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py @@ -0,0 +1,7 @@ +from metadata.ingestion.source.database.mongodb.metadata import MongodbSource +from metadata.profiler.interface.nosql.profiler_interface import NoSQLProfilerInterface +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=MongodbSource, profiler_class=NoSQLProfilerInterface +) diff --git a/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py new file mode 100644 index 000000000000..fbad24d21143 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.mssql.metadata import MssqlSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=MssqlSource) diff --git a/ingestion/src/metadata/ingestion/source/database/mysql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mysql/service_spec.py new file mode 100644 index 000000000000..2d24a115e98c --- /dev/null +++ 
b/ingestion/src/metadata/ingestion/source/database/mysql/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.mysql.metadata import MysqlSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=MysqlSource) diff --git a/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py b/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py new file mode 100644 index 000000000000..69ee6f788f4e --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.oracle.metadata import OracleSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=OracleSource) diff --git a/ingestion/src/metadata/ingestion/source/database/pinotdb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/pinotdb/service_spec.py new file mode 100644 index 000000000000..1fe227ff0543 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/pinotdb/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.pinotdb.metadata import PinotdbSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=PinotdbSource) diff --git a/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py b/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py new file mode 100644 index 000000000000..c0cdb957fe3f --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.postgres.metadata import PostgresSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=PostgresSource) diff --git a/ingestion/src/metadata/ingestion/source/database/presto/service_spec.py 
b/ingestion/src/metadata/ingestion/source/database/presto/service_spec.py new file mode 100644 index 000000000000..eb515c7bade4 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/presto/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.presto.metadata import PrestoSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=PrestoSource) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/manifest.py b/ingestion/src/metadata/ingestion/source/database/redshift/manifest.py deleted file mode 100644 index 6983e32d6386..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/redshift/manifest.py +++ /dev/null @@ -1,6 +0,0 @@ -from metadata.ingestion.source.database.redshift.profiler.profiler import ( - RedshiftProfiler, -) -from metadata.utils.manifest import BaseManifest, get_class_path - -RedshiftManifest = BaseManifest(profler_class=get_class_path(RedshiftProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py b/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py new file mode 100644 index 000000000000..37afb59af167 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.redshift.metadata import RedshiftSource +from metadata.ingestion.source.database.redshift.profiler.profiler import ( + RedshiftProfiler, +) +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=RedshiftSource, profiler_class=RedshiftProfiler +) diff --git a/ingestion/src/metadata/ingestion/source/database/salesforce/service_spec.py b/ingestion/src/metadata/ingestion/source/database/salesforce/service_spec.py new file mode 100644 index 000000000000..28505f1a152f --- /dev/null +++ 
b/ingestion/src/metadata/ingestion/source/database/salesforce/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.salesforce.metadata import SalesforceSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=SalesforceSource) diff --git a/ingestion/src/metadata/ingestion/source/database/saperp/service_spec.py b/ingestion/src/metadata/ingestion/source/database/saperp/service_spec.py new file mode 100644 index 000000000000..c0e3a0a27457 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/saperp/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.saperp.metadata import SaperpSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=SaperpSource) diff --git a/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py b/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py new file mode 100644 index 000000000000..4149a9512455 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.saphana.metadata import SaphanaSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=SaphanaSource) diff --git a/ingestion/src/metadata/ingestion/source/database/sas/service_spec.py b/ingestion/src/metadata/ingestion/source/database/sas/service_spec.py new file mode 100644 index 000000000000..81b49e86e9e3 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/sas/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.sas.metadata import SasSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=SasSource) diff --git a/ingestion/src/metadata/ingestion/source/database/singlestore/manifest.py b/ingestion/src/metadata/ingestion/source/database/singlestore/manifest.py deleted 
file mode 100644 index f35cfcf652aa..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/singlestore/manifest.py +++ /dev/null @@ -1,4 +0,0 @@ -from metadata.ingestion.source.database.singlestore.metadata import SinglestoreSource -from metadata.utils.manifest import BaseManifest, get_class_path - -SinglestoreManifest = BaseManifest(profler_class=get_class_path(SinglestoreSource)) diff --git a/ingestion/src/metadata/ingestion/source/database/singlestore/service_spec.py b/ingestion/src/metadata/ingestion/source/database/singlestore/service_spec.py new file mode 100644 index 000000000000..87690636e251 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/singlestore/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.singlestore.metadata import SinglestoreSource +from metadata.profiler.interface.sqlalchemy.single_store.profiler_interface import ( + SingleStoreProfilerInterface, +) +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=SinglestoreSource, profiler_class=SingleStoreProfilerInterface +) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/manifest.py b/ingestion/src/metadata/ingestion/source/database/snowflake/manifest.py deleted file mode 100644 index d6109be9ae01..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/manifest.py +++ /dev/null @@ -1,6 +0,0 @@ -from metadata.ingestion.source.database.snowflake.profiler.profiler import ( - SnowflakeProfiler, -) -from metadata.utils.manifest import BaseManifest, get_class_path - -SnowflakeManifest = BaseManifest(profler_class=get_class_path(SnowflakeProfiler)) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py new file mode 100644 index 000000000000..bf6a88c980c7 --- /dev/null +++ 
b/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.snowflake.metadata import SnowflakeSource +from metadata.ingestion.source.database.snowflake.profiler.profiler import ( + SnowflakeProfiler, +) +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=SnowflakeSource, profiler_class=SnowflakeProfiler +) diff --git a/ingestion/src/metadata/ingestion/source/database/sqlite/service_spec.py b/ingestion/src/metadata/ingestion/source/database/sqlite/service_spec.py new file mode 100644 index 000000000000..36223f4dac18 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/sqlite/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.sqlite.metadata import SqliteSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=SqliteSource) diff --git a/ingestion/src/metadata/ingestion/source/database/teradata/service_spec.py b/ingestion/src/metadata/ingestion/source/database/teradata/service_spec.py new file mode 100644 index 000000000000..95d699d740ba --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/teradata/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.database.teradata.metadata import TeradataSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=TeradataSource) diff --git a/ingestion/src/metadata/ingestion/source/database/trino/manifest.py b/ingestion/src/metadata/ingestion/source/database/trino/manifest.py deleted file mode 100644 index 88eb9499f6c2..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/trino/manifest.py +++ /dev/null @@ -1,6 +0,0 @@ -from metadata.profiler.interface.sqlalchemy.trino.profiler_interface import ( - TrinoProfilerInterface, -) -from metadata.utils.manifest import BaseManifest, 
get_class_path - -TrinoManifest = BaseManifest(profler_class=get_class_path(TrinoProfilerInterface)) diff --git a/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py b/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py new file mode 100644 index 000000000000..622b2d9e822b --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.trino.metadata import TrinoSource +from metadata.profiler.interface.sqlalchemy.trino.profiler_interface import ( + TrinoProfilerInterface, +) +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=TrinoSource, profiler_class=TrinoProfilerInterface +) diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/manifest.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/manifest.py deleted file mode 100644 index aea1e93c9823..000000000000 --- a/ingestion/src/metadata/ingestion/source/database/unitycatalog/manifest.py +++ /dev/null @@ -1,8 +0,0 @@ -from metadata.profiler.interface.sqlalchemy.unity_catalog.profiler_interface import ( - UnityCatalogProfilerInterface, -) -from metadata.utils.manifest import BaseManifest, get_class_path - -UnitycatalogManifest = BaseManifest( - profler_class=get_class_path(UnityCatalogProfilerInterface) -) diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py new file mode 100644 index 000000000000..87ca5e99f939 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py @@ -0,0 +1,10 @@ +from metadata.ingestion.source.database.unitycatalog.metadata import UnitycatalogSource +from metadata.profiler.interface.sqlalchemy.unity_catalog.profiler_interface import ( + UnityCatalogProfilerInterface, +) +from metadata.utils.manifest import 
DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=UnitycatalogSource, + profiler_class=UnityCatalogProfilerInterface, +) diff --git a/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py b/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py new file mode 100644 index 000000000000..207c909e0ff9 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py @@ -0,0 +1,6 @@ +from metadata.ingestion.source.database.vertica.metadata import VerticaSource +from metadata.utils.manifest import DefaultDatabaseSpec + +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=VerticaSource, profiler_class=None +) diff --git a/ingestion/src/metadata/ingestion/source/messaging/kafka/service_spec.py b/ingestion/src/metadata/ingestion/source/messaging/kafka/service_spec.py new file mode 100644 index 000000000000..3d789a62db53 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/messaging/kafka/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.messaging.kafka.metadata import KafkaSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=KafkaSource) diff --git a/ingestion/src/metadata/ingestion/source/messaging/kinesis/service_spec.py b/ingestion/src/metadata/ingestion/source/messaging/kinesis/service_spec.py new file mode 100644 index 000000000000..feb043ff99c1 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/messaging/kinesis/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.messaging.kinesis.metadata import KinesisSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=KinesisSource) diff --git a/ingestion/src/metadata/ingestion/source/messaging/redpanda/service_spec.py b/ingestion/src/metadata/ingestion/source/messaging/redpanda/service_spec.py new file mode 100644 index 000000000000..8b76e817ab49 --- /dev/null +++ 
b/ingestion/src/metadata/ingestion/source/messaging/redpanda/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.messaging.redpanda.metadata import RedpandaSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=RedpandaSource) diff --git a/ingestion/src/metadata/ingestion/source/metadata/alationsink/service_spec.py b/ingestion/src/metadata/ingestion/source/metadata/alationsink/service_spec.py new file mode 100644 index 000000000000..d539d57cefe0 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/metadata/alationsink/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.metadata.alationsink.metadata import AlationsinkSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=AlationsinkSource) diff --git a/ingestion/src/metadata/ingestion/source/metadata/amundsen/service_spec.py b/ingestion/src/metadata/ingestion/source/metadata/amundsen/service_spec.py new file mode 100644 index 000000000000..efeb8bb2c598 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/metadata/amundsen/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.metadata.amundsen.metadata import AmundsenSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=AmundsenSource) diff --git a/ingestion/src/metadata/ingestion/source/metadata/atlas/service_spec.py b/ingestion/src/metadata/ingestion/source/metadata/atlas/service_spec.py new file mode 100644 index 000000000000..9e946f558e8a --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/metadata/atlas/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.metadata.atlas.metadata import AtlasSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=AtlasSource) diff --git a/ingestion/src/metadata/ingestion/source/mlmodel/mlflow/service_spec.py 
b/ingestion/src/metadata/ingestion/source/mlmodel/mlflow/service_spec.py new file mode 100644 index 000000000000..902f09311c39 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/mlmodel/mlflow/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.mlmodel.mlflow.metadata import MlflowSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=MlflowSource) diff --git a/ingestion/src/metadata/ingestion/source/mlmodel/sagemaker/service_spec.py b/ingestion/src/metadata/ingestion/source/mlmodel/sagemaker/service_spec.py new file mode 100644 index 000000000000..484de9795a8c --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/mlmodel/sagemaker/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.mlmodel.sagemaker.metadata import SagemakerSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=SagemakerSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/airbyte/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/airbyte/service_spec.py new file mode 100644 index 000000000000..4e46824ac397 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/airbyte/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.airbyte.metadata import AirbyteSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=AirbyteSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/airflow/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/airflow/service_spec.py new file mode 100644 index 000000000000..a02e30d76306 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/airflow/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.airflow.metadata import AirflowSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=AirflowSource) diff --git 
a/ingestion/src/metadata/ingestion/source/pipeline/dagster/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/dagster/service_spec.py new file mode 100644 index 000000000000..9c8a9e5176fa --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/dagster/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.dagster.metadata import DagsterSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=DagsterSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/databrickspipeline/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/databrickspipeline/service_spec.py new file mode 100644 index 000000000000..a76b65df78aa --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/databrickspipeline/service_spec.py @@ -0,0 +1,6 @@ +from metadata.ingestion.source.pipeline.databrickspipeline.metadata import ( + DatabrickspipelineSource, +) +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=DatabrickspipelineSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/dbtcloud/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/dbtcloud/service_spec.py new file mode 100644 index 000000000000..34f69c2db0a2 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/dbtcloud/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.dbtcloud.metadata import DbtcloudSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=DbtcloudSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/domopipeline/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/domopipeline/service_spec.py new file mode 100644 index 000000000000..e720a39abc29 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/domopipeline/service_spec.py @@ -0,0 +1,4 @@ +from 
metadata.ingestion.source.pipeline.domopipeline.metadata import DomopipelineSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=DomopipelineSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/fivetran/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/fivetran/service_spec.py new file mode 100644 index 000000000000..34dd83a75f3e --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/fivetran/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.fivetran.metadata import FivetranSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=FivetranSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/flink/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/flink/service_spec.py new file mode 100644 index 000000000000..4a773ef9a729 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/flink/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.flink.metadata import FlinkSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=FlinkSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/gluepipeline/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/gluepipeline/service_spec.py new file mode 100644 index 000000000000..b0c465f4c226 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/gluepipeline/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.gluepipeline.metadata import GluepipelineSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=GluepipelineSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/kafkaconnect/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/kafkaconnect/service_spec.py new file mode 100644 index 000000000000..a9a2556ac975 
--- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/kafkaconnect/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.kafkaconnect.metadata import KafkaconnectSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=KafkaconnectSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/nifi/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/nifi/service_spec.py new file mode 100644 index 000000000000..531935883e22 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/nifi/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.nifi.metadata import NifiSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=NifiSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/openlineage/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/openlineage/service_spec.py new file mode 100644 index 000000000000..a556398938fa --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/openlineage/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.openlineage.metadata import OpenlineageSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=OpenlineageSource) diff --git a/ingestion/src/metadata/ingestion/source/pipeline/spline/service_spec.py b/ingestion/src/metadata/ingestion/source/pipeline/spline/service_spec.py new file mode 100644 index 000000000000..20527834826a --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/pipeline/spline/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.pipeline.spline.metadata import SplineSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=SplineSource) diff --git a/ingestion/src/metadata/ingestion/source/search/elasticsearch/service_spec.py 
b/ingestion/src/metadata/ingestion/source/search/elasticsearch/service_spec.py new file mode 100644 index 000000000000..fa2da637fc2c --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/search/elasticsearch/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.search.elasticsearch.metadata import ElasticsearchSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=ElasticsearchSource) diff --git a/ingestion/src/metadata/ingestion/source/storage/gcs/service_spec.py b/ingestion/src/metadata/ingestion/source/storage/gcs/service_spec.py new file mode 100644 index 000000000000..73df4a9f6200 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/storage/gcs/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.storage.gcs.metadata import GcsSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=GcsSource) diff --git a/ingestion/src/metadata/ingestion/source/storage/s3/service_spec.py b/ingestion/src/metadata/ingestion/source/storage/s3/service_spec.py new file mode 100644 index 000000000000..6a3a31e96ac2 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/storage/s3/service_spec.py @@ -0,0 +1,4 @@ +from metadata.ingestion.source.storage.s3.metadata import S3Source +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec(metadata_source_class=S3Source) diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py b/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py deleted file mode 100644 index be4ce89d5629..000000000000 --- a/ingestion/src/metadata/profiler/interface/profiler_interface_factory.py +++ /dev/null @@ -1,108 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Factory class for creating profiler interface objects -""" - -import importlib -from typing import Dict, cast - -from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( - BigQueryConnection, -) -from metadata.generated.schema.entity.services.connections.database.databricksConnection import ( - DatabricksConnection, -) -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.db2Connection import ( - Db2Connection, -) -from metadata.generated.schema.entity.services.connections.database.dynamoDBConnection import ( - DynamoDBConnection, -) -from metadata.generated.schema.entity.services.connections.database.mariaDBConnection import ( - MariaDBConnection, -) -from metadata.generated.schema.entity.services.connections.database.mongoDBConnection import ( - MongoDBConnection, -) -from metadata.generated.schema.entity.services.connections.database.singleStoreConnection import ( - SingleStoreConnection, -) -from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( - SnowflakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.trinoConnection import ( - TrinoConnection, -) -from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import ( - UnityCatalogConnection, -) -from metadata.generated.schema.entity.services.databaseService import DatabaseConnection -from metadata.profiler.factory 
import Factory -from metadata.profiler.interface.profiler_interface import ProfilerInterface - - -class ProfilerInterfaceFactory(Factory): - def create(self, interface_type: str, *args, **kwargs): - """Create interface object based on interface type""" - - interface_class_path = profiler_class_mapping.get( - interface_type, profiler_class_mapping[DatabaseConnection.__name__] - ) - try: - module_path, class_name = interface_class_path.rsplit(".", 1) - module = importlib.import_module(module_path) - profiler_class = getattr(module, class_name) - except (ImportError, AttributeError) as e: - raise ImportError(f"Error importing {class_name} from {module_path}: {e}") - profiler_class = cast(ProfilerInterface, profiler_class) - return profiler_class.create(*args, **kwargs) - - -profiler_interface_factory = ProfilerInterfaceFactory() - -BASE_PROFILER_PATH = "metadata.profiler.interface" -SQLALCHEMY_PROFILER_PATH = f"{BASE_PROFILER_PATH}.sqlalchemy" -NOSQL_PROFILER_PATH = ( - f"{BASE_PROFILER_PATH}.nosql.profiler_interface.NoSQLProfilerInterface" -) -PANDAS_PROFILER_PATH = ( - f"{BASE_PROFILER_PATH}.pandas.profiler_interface.PandasProfilerInterface" -) - -# Configuration for dynamic imports -profiler_class_mapping: Dict[str, str] = { - DatabaseConnection.__name__: SQLALCHEMY_PROFILER_PATH - + ".profiler_interface.SQAProfilerInterface", - BigQueryConnection.__name__: SQLALCHEMY_PROFILER_PATH - + ".bigquery.profiler_interface.BigQueryProfilerInterface", - SingleStoreConnection.__name__: SQLALCHEMY_PROFILER_PATH - + ".single_store.profiler_interface.SingleStoreProfilerInterface", - DatalakeConnection.__name__: PANDAS_PROFILER_PATH, - MariaDBConnection.__name__: SQLALCHEMY_PROFILER_PATH - + ".mariadb.profiler_interface.MariaDBProfilerInterface", - SnowflakeConnection.__name__: SQLALCHEMY_PROFILER_PATH - + ".snowflake.profiler_interface.SnowflakeProfilerInterface", - TrinoConnection.__name__: SQLALCHEMY_PROFILER_PATH - + ".trino.profiler_interface.TrinoProfilerInterface", - 
UnityCatalogConnection.__name__: SQLALCHEMY_PROFILER_PATH - + ".unity_catalog.profiler_interface.UnityCatalogProfilerInterface", - DatabricksConnection.__name__: SQLALCHEMY_PROFILER_PATH - + ".databricks.profiler_interface.DatabricksProfilerInterface", - Db2Connection.__name__: SQLALCHEMY_PROFILER_PATH - + ".db2.profiler_interface.DB2ProfilerInterface", - MongoDBConnection.__name__: NOSQL_PROFILER_PATH, - DynamoDBConnection.__name__: NOSQL_PROFILER_PATH, -} diff --git a/ingestion/src/metadata/profiler/source/base/profiler_source.py b/ingestion/src/metadata/profiler/source/base/profiler_source.py index 41d7db9fb98c..b84aa994f2ae 100644 --- a/ingestion/src/metadata/profiler/source/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/base/profiler_source.py @@ -47,7 +47,7 @@ from metadata.profiler.source.profiler_source_interface import ProfilerSourceInterface from metadata.utils.importer import import_from_module from metadata.utils.logger import profiler_logger -from metadata.utils.manifest import BaseManifest +from metadata.utils.service_spec.service_spec import BaseSpec NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,) @@ -192,11 +192,6 @@ def create_profiler_interface( profiler_class = self.import_profiler_class( ServiceType.Database, source_type=self.profiler_interface_type ) - logger.debug( - "Using profiler class: %s from %s", - profiler_class, - profiler_class.__module__, - ) profiler_interface: ProfilerInterface = profiler_class.create( entity, schema_entity, @@ -216,9 +211,7 @@ def create_profiler_interface( def import_profiler_class( self, service_type: ServiceType, source_type: str ) -> Type[ProfilerInterface]: - class_path = BaseManifest.get_for_source( - service_type, source_type - ).profler_class + class_path = BaseSpec.get_for_source(service_type, source_type).profiler_class return cast(Type[ProfilerInterface], import_from_module(class_path)) def _get_context_entities( diff --git 
a/ingestion/src/metadata/profiler/source/metadata_ext.py b/ingestion/src/metadata/profiler/source/metadata_ext.py index 891236c90ff6..1e6ef70897b5 100644 --- a/ingestion/src/metadata/profiler/source/metadata_ext.py +++ b/ingestion/src/metadata/profiler/source/metadata_ext.py @@ -54,7 +54,7 @@ import_source_class, ) from metadata.utils.logger import profiler_logger -from metadata.utils.manifest import BaseManifest +from metadata.utils.service_spec import BaseSpec from metadata.utils.ssl_manager import get_ssl_connection logger = profiler_logger() @@ -179,10 +179,10 @@ def get_table_names(self, schema_name: str) -> Iterable[str]: yield table_name def import_profiler_interface(self) -> Type[ProfilerInterface]: - class_path = BaseManifest.get_for_source( + class_path = BaseSpec.get_for_source( ServiceType.Database, source_type=self.config.source.type.lower(), - ).profler_class + ).profiler_class profiler_source_class = import_from_module(class_path) return profiler_source_class diff --git a/ingestion/src/metadata/utils/importer.py b/ingestion/src/metadata/utils/importer.py index 13d22b235f8d..4ed3754c74d1 100644 --- a/ingestion/src/metadata/utils/importer.py +++ b/ingestion/src/metadata/utils/importer.py @@ -15,7 +15,7 @@ import sys import traceback from enum import Enum -from typing import TYPE_CHECKING, Any, Callable, Optional, Type, TypeVar +from typing import Any, Callable, Optional, Type, TypeVar, cast from pydantic import BaseModel @@ -23,10 +23,7 @@ from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( OpenMetadataConnection, ) -from metadata.generated.schema.entity.services.databaseService import ( - DatabaseConnection, - DatabaseService, -) +from metadata.generated.schema.entity.services.databaseService import DatabaseService from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.generated.schema.metadataIngestion.workflow import Sink as WorkflowSink from 
metadata.ingestion.api.steps import BulkSink, Processor, Sink, Source, Stage @@ -35,13 +32,9 @@ from metadata.utils.client_version import get_client_version from metadata.utils.constants import CUSTOM_CONNECTOR_PREFIX from metadata.utils.logger import utils_logger +from metadata.utils.service_spec import BaseSpec from metadata.utils.singleton import Singleton -if TYPE_CHECKING: - from metadata.profiler.interface.profiler_interface import ProfilerInterface -else: - ProfilerInterface = Any - logger = utils_logger() T = TypeVar("T") @@ -146,37 +139,16 @@ def import_from_module(key: str) -> Type[Any]: raise DynamicImportException(module=module_name, key=obj_name, cause=err) -# module building strings read better with .format instead of f-strings -# pylint: disable=consider-using-f-string def import_source_class( service_type: ServiceType, source_type: str, from_: str = "ingestion" ) -> Type[Source]: - return import_from_module( - "metadata.{}.source.{}.{}.{}.{}Source".format( - from_, - service_type.name.lower(), - get_module_dir(source_type), - get_source_module_name(source_type), - get_class_name_root(source_type), - ) - ) - - -def import_profiler_class( - source_connection_type: DatabaseConnection, - from_: str = "profiler", - service_type: ServiceType = ServiceType.Database, -) -> Type[ProfilerInterface]: - """ - Import the profiler class for a source connection by default will be found in the metadata - """ - return import_from_module( - "metadata.ingestion.source.{}.{}.{}.profiler.{}Profiler".format( - service_type.name.lower(), - source_connection_type.config.type.value.lower(), - from_, - source_connection_type.config.type.value, - ) + return cast( + Type[Source], + import_from_module( + BaseSpec.get_for_source( + service_type, source_type, from_ + ).metadata_source_class + ), ) @@ -184,7 +156,7 @@ def import_processor_class( processor_type: str, from_: str = "ingestion" ) -> Type[Processor]: return import_from_module( - 
"metadata.{}.processor.{}.{}Processor".format( + "metadata.{}.processor.{}.{}Processor".format( # pylint: disable=consider-using-f-string from_, get_module_name(processor_type), get_class_name_root(processor_type), @@ -194,7 +166,7 @@ def import_processor_class( def import_stage_class(stage_type: str, from_: str = "ingestion") -> Type[Stage]: return import_from_module( - "metadata.{}.stage.{}.{}Stage".format( + "metadata.{}.stage.{}.{}Stage".format( # pylint: disable=consider-using-f-string from_, get_module_name(stage_type), get_class_name_root(stage_type), @@ -204,7 +176,7 @@ def import_stage_class(stage_type: str, from_: str = "ingestion") -> Type[Stage] def import_sink_class(sink_type: str, from_: str = "ingestion") -> Type[Sink]: return import_from_module( - "metadata.{}.sink.{}.{}Sink".format( + "metadata.{}.sink.{}.{}Sink".format( # pylint: disable=consider-using-f-string from_, get_module_name(sink_type), get_class_name_root(sink_type), @@ -216,7 +188,7 @@ def import_bulk_sink_type( bulk_sink_type: str, from_: str = "ingestion" ) -> Type[BulkSink]: return import_from_module( - "metadata.{}.bulksink.{}.{}BulkSink".format( + "metadata.{}.bulksink.{}.{}BulkSink".format( # pylint: disable=consider-using-f-string from_, get_module_name(bulk_sink_type), get_class_name_root(bulk_sink_type), @@ -301,7 +273,7 @@ def import_test_case_class( test_definition[0].upper() + test_definition[1:] ) # change test names to camel case return import_from_module( - "metadata.data_quality.validations.{}.{}.{}.{}Validator".format( + "metadata.data_quality.validations.{}.{}.{}.{}Validator".format( # pylint: disable=consider-using-f-string test_type.lower(), runner_type, test_definition, @@ -338,7 +310,7 @@ def import_system_metrics_computer(db_service: DatabaseService): """ try: return import_from_module( - "metadata.ingestion.source.database.{}.profiler.system.SystemMetricsComputer".format( + "metadata.ingestion.source.database.{}.profiler.system.SystemMetricsComputer".format( # 
pylint: disable=consider-using-f-string db_service.type ) ) diff --git a/ingestion/src/metadata/utils/logger.py b/ingestion/src/metadata/utils/logger.py index c33eb3b96d19..6fdd4c75c712 100644 --- a/ingestion/src/metadata/utils/logger.py +++ b/ingestion/src/metadata/utils/logger.py @@ -10,6 +10,21 @@ # limitations under the License. """ Module centralising logger configs + +This library might produce circular imports when using together with metadata.utils in these cases use the following +notation to avoid circular imports: + +```` +import third_pary_lib + +# isort: off - logger is a root library and this is how to avoid circular imports +from metadata.utils.logger import cli_logger +from metadata.utils.... import ... + +# isort: on + +``` + """ import logging diff --git a/ingestion/src/metadata/utils/manifest.py b/ingestion/src/metadata/utils/manifest.py deleted file mode 100644 index d6261ec909f5..000000000000 --- a/ingestion/src/metadata/utils/manifest.py +++ /dev/null @@ -1,79 +0,0 @@ -""" -Manifests are used to store class information -""" - -from metadata.generated.schema.entity.services.serviceType import ServiceType -from metadata.ingestion.models.custom_pydantic import BaseModel -from metadata.profiler.interface.sqlalchemy.profiler_interface import ( - SQAProfilerInterface, -) -from metadata.utils.importer import ( - DynamicImportException, - get_class_name_root, - get_module_dir, - import_from_module, -) - - -class BaseManifest(BaseModel): - """Base manifest for storing class information for a source. We use strings to store the class information - for these reasons: - 1. manifests can be defined using json/yaml and deserialized into this class. - 2. We can dynamically import the class when needed and avoid dependency issues. - 3. We avoid circular imports. - 4. We can hot-swap the class implementation without changing the manifest (example: for testing). - - # TODO: naming? - - Dyanmic factory? - - Different name the class name? 
- - # TODO: functionality - - is this expected to be a extended? - """ - - profler_class: str - - @classmethod - def get_for_source( - cls, service_type: ServiceType, source_type: str, from_: str = "ingestion" - ) -> "BaseManifest": - """Retrieves the manifest for a given source type. If it does not exist will attempt to retrieve - a default manifest for the service type. - - Args: - service_type (ServiceType): The service type. - source_type (str): The source type. - from_ (str, optional): The module to import from. Defaults to "ingestion". - - Returns: - BaseManifest: The manifest for the source type. - """ - try: - return cls.model_validate( - import_from_module( - "metadata.{}.source.{}.{}.{}.{}Manifest".format( # pylint: disable=C0209 - from_, - service_type.name.lower(), - get_module_dir(source_type), - "manifest", - get_class_name_root(source_type), - ) - ) - ) - except DynamicImportException: - try: - return DEFAULT_MANIFEST_MAP[service_type] - except KeyError: - raise RuntimeError(f"No manifest found for source type: {source_type}") - - -def get_class_path(module): - return module.__module__ + "." 
+ module.__name__ - - -DefaultDatabaseManifest = BaseManifest( - profler_class=get_class_path(SQAProfilerInterface) -) - - -DEFAULT_MANIFEST_MAP = {ServiceType.Database: DefaultDatabaseManifest} diff --git a/ingestion/src/metadata/utils/service_spec/__init__.py b/ingestion/src/metadata/utils/service_spec/__init__.py new file mode 100644 index 000000000000..6543fd660cab --- /dev/null +++ b/ingestion/src/metadata/utils/service_spec/__init__.py @@ -0,0 +1,5 @@ +"""Module for the OpenMetadat Ingestion Service Specification (ServiceSpec)""" + +from metadata.utils.service_spec.service_spec import BaseSpec + +__all__ = ["BaseSpec"] diff --git a/ingestion/src/metadata/utils/service_spec/default.py b/ingestion/src/metadata/utils/service_spec/default.py new file mode 100644 index 000000000000..ef103f264d9c --- /dev/null +++ b/ingestion/src/metadata/utils/service_spec/default.py @@ -0,0 +1,14 @@ +""" +Default service specs for services. +""" + +from typing import Optional + +from metadata.profiler.interface.sqlalchemy.profiler_interface import ( + SQAProfilerInterface, +) +from metadata.utils.service_spec.service_spec import BaseSpec + + +class DefaultDatabaseSpec(BaseSpec): + profiler_class: Optional[str] = SQAProfilerInterface diff --git a/ingestion/src/metadata/utils/service_spec/service_spec.py b/ingestion/src/metadata/utils/service_spec/service_spec.py new file mode 100644 index 000000000000..9943ef972b19 --- /dev/null +++ b/ingestion/src/metadata/utils/service_spec/service_spec.py @@ -0,0 +1,77 @@ +""" +Manifests are used to store class information +""" + +from typing import Optional + +from pydantic import model_validator + +from metadata.generated.schema.entity.services.serviceType import ServiceType +from metadata.ingestion.models.custom_pydantic import BaseModel +from metadata.utils.importer import get_module_dir, import_from_module + + +class BaseSpec(BaseModel): + """ + # The OpenMetadata Ingestion Service Specification (Spec) + + This is the API for defining a 
service in OpenMetadata; it needs to be in the classpath of the connector in + the form: + + metadata.ingestion.source.{service_type}.{service_name}.service_spec.ServiceSpec + + Example for postgres: + + metadata.ingestion.source.database.postgres.service_spec.ServiceSpec + + You can supply either strings with the full classpath or concrete classes that will be converted to strings. + + The use of strings for the values gives us a few advantages: + 1. manifests can be defined using json/yaml and deserialized into this class. + 2. We can dynamically import the class when needed and avoid dependency issues. + 3. We avoid circular imports. + 4. We can hot-swap the class implementation without changing the manifest (example: for testing). + """ + + profiler_class: Optional[str] + metadata_source_class: str + + @model_validator(mode="before") + @classmethod + def transform_fields(cls, values): + """This allows us to pass in the class directly instead of the string representation of the class. The + validator will convert the class to a string representation of the class.""" + for field in list(cls.model_fields.keys()): + if isinstance(values.get(field), type): + values[field] = get_class_path(values[field]) + return values + + @classmethod + def get_for_source( + cls, service_type: ServiceType, source_type: str, from_: str = "ingestion" + ) -> "BaseSpec": + """Retrieves the manifest for a given source type. If it does not exist will attempt to retrieve + a default manifest for the service type. + + Args: + service_type (ServiceType): The service type. + source_type (str): The source type. + from_ (str, optional): The module to import from. Defaults to "ingestion". + + Returns: + BaseSpec: The manifest for the source type.
+ """ + return cls.model_validate( + import_from_module( + "metadata.{}.source.{}.{}.{}.ServiceSpec".format( # pylint: disable=C0209 + from_, + service_type.name.lower(), + get_module_dir(source_type), + "service_spec", + ) + ) + ) + + +def get_class_path(module): + return module.__module__ + "." + module.__name__ diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/developers/contribute/developing-a-new-connector/develop-ingestion-code.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/developers/contribute/developing-a-new-connector/develop-ingestion-code.md index d08ab34f29e1..dc7cbedc551f 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/developers/contribute/developing-a-new-connector/develop-ingestion-code.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/developers/contribute/developing-a-new-connector/develop-ingestion-code.md @@ -26,6 +26,12 @@ From the Service Topology you can understand what methods you need to implement: Can be found in [`ingestion/src/metadata/ingestion/source/database/database_service.py`](https://github.com/open-metadata/OpenMetadata/blob/main/ingestion/src/metadata/ingestion/source/database/database_service.py) +{%inlineCallout icon="description" bold="OpenMetadata 1.6.0 or later" href="/deployment"%} +Starting from 1.6.0 the OpenMetadata Ingestion Framewotk is using a ServiceSpec specificaiton +in order to define the entrypoints for the ingestion process. +{%/inlineCallout%} + + ```python class DatabaseServiceTopology(ServiceTopology): """ diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/releases/releases/index.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/releases/releases/index.md index 3c64ceacf62c..b999d8a07782 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/releases/releases/index.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/releases/releases/index.md @@ -14,6 +14,15 @@ version. 
To see what's coming in next releases, please check our [Roadmap](/rele {% partial file="/v1.5/releases/latest.md" /%} + +# 1.6.0 + +## Breaking Changes + +- The ingestion Framework now uses the OpenMetadata Ingestion Service Specification (OMISS) to specify +entrypoints to ingestion operations. [Click here](./todo-need-link) for more info. + + # 1.5.5 Release {% note noteType="Tip" %} From 92e53cedc75c22c58d11d84d7f49af2f2dd61139 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Mon, 21 Oct 2024 12:17:39 +0200 Subject: [PATCH 04/18] remove TYPE_CHECKING in core.py --- ingestion/src/metadata/profiler/metrics/core.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/ingestion/src/metadata/profiler/metrics/core.py b/ingestion/src/metadata/profiler/metrics/core.py index 9acba468dc7d..70e387a7daeb 100644 --- a/ingestion/src/metadata/profiler/metrics/core.py +++ b/ingestion/src/metadata/profiler/metrics/core.py @@ -18,7 +18,7 @@ from abc import ABC, abstractmethod from enum import Enum from functools import wraps -from typing import TYPE_CHECKING, Any, Callable, Dict, Optional, Tuple, TypeVar +from typing import Any, Callable, Dict, Optional, Tuple, TypeVar from sqlalchemy import Column from sqlalchemy.orm import DeclarativeMeta, Session @@ -26,9 +26,6 @@ from metadata.generated.schema.entity.data.table import Table from metadata.profiler.adaptors.nosql_adaptor import NoSQLAdaptor -if TYPE_CHECKING: - pass - # When creating complex metrics, use inherit_cache = CACHE CACHE = True From 8d4f1d4885342b5b97230d1c510b8f7105bc21f7 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Mon, 21 Oct 2024 19:02:15 +0200 Subject: [PATCH 05/18] - deleted valuedispatch function - deleted get_system_metrics_by_dialect - implemented BigQueryProfiler with a system metrics source - moved import_source_class to BaseSpec --- .../data_quality/source/test_suite.py | 2 +- .../metadata/ingestion/source/connections.py | 7 +- .../source/database/athena/service_spec.py | 2 +-
.../source/database/azuresql/service_spec.py | 2 +- .../bigquery/incremental_table_processor.py | 2 +- .../database/bigquery/profiler/__init__.py | 0 .../database/bigquery/profiler/profiler.py | 26 ++ .../database/bigquery/profiler/system.py | 161 ++++++++++++ .../source/database/bigquery/queries.py | 74 ++++++ .../source/database/bigquery/service_spec.py | 8 +- .../source/database/bigtable/models.py | 2 +- .../source/database/bigtable/service_spec.py | 2 +- .../database/clickhouse/service_spec.py | 2 +- .../source/database/couchbase/service_spec.py | 2 +- .../database/databricks/service_spec.py | 2 +- .../source/database/datalake/service_spec.py | 2 +- .../source/database/db2/service_spec.py | 2 +- .../source/database/deltalake/service_spec.py | 2 +- .../database/domodatabase/service_spec.py | 2 +- .../source/database/doris/service_spec.py | 2 +- .../source/database/druid/service_spec.py | 2 +- .../source/database/dynamodb/service_spec.py | 2 +- .../source/database/glue/service_spec.py | 2 +- .../source/database/greenplum/service_spec.py | 2 +- .../source/database/hive/service_spec.py | 2 +- .../source/database/iceberg/service_spec.py | 2 +- .../source/database/impala/service_spec.py | 2 +- .../source/database/mariadb/service_spec.py | 2 +- .../source/database/mongodb/service_spec.py | 2 +- .../source/database/mssql/service_spec.py | 2 +- .../source/database/mysql/service_spec.py | 2 +- .../source/database/oracle/service_spec.py | 2 +- .../source/database/pinotdb/service_spec.py | 2 +- .../source/database/postgres/service_spec.py | 2 +- .../source/database/presto/service_spec.py | 2 +- .../source/database/redshift/service_spec.py | 2 +- .../source/database/saphana/service_spec.py | 2 +- .../database/singlestore/service_spec.py | 2 +- .../database/snowflake/profiler/system.py | 3 +- .../source/database/snowflake/service_spec.py | 2 +- .../source/database/sqlite/service_spec.py | 2 +- .../source/database/teradata/service_spec.py | 2 +- 
.../source/database/trino/service_spec.py | 2 +- .../database/unitycatalog/service_spec.py | 2 +- .../source/database/vertica/service_spec.py | 2 +- .../metrics/system/queries/bigquery.py | 54 ---- .../profiler/metrics/system/system.py | 241 +----------------- .../metadata/profiler/source/metadata_ext.py | 10 +- ingestion/src/metadata/utils/dispatch.py | 54 +--- ingestion/src/metadata/utils/importer.py | 18 +- ingestion/src/metadata/utils/logger.py | 15 -- .../utils/service_spec/service_spec.py | 16 +- ingestion/src/metadata/workflow/ingestion.py | 2 +- 53 files changed, 332 insertions(+), 433 deletions(-) create mode 100644 ingestion/src/metadata/ingestion/source/database/bigquery/profiler/__init__.py create mode 100644 ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py create mode 100644 ingestion/src/metadata/ingestion/source/database/bigquery/profiler/system.py delete mode 100644 ingestion/src/metadata/profiler/metrics/system/queries/bigquery.py diff --git a/ingestion/src/metadata/data_quality/source/test_suite.py b/ingestion/src/metadata/data_quality/source/test_suite.py index 08ebdec500d3..a2fb2cfae2ee 100644 --- a/ingestion/src/metadata/data_quality/source/test_suite.py +++ b/ingestion/src/metadata/data_quality/source/test_suite.py @@ -38,8 +38,8 @@ from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.utils import fqn from metadata.utils.constants import CUSTOM_CONNECTOR_PREFIX -from metadata.utils.importer import import_source_class from metadata.utils.logger import test_suite_logger +from metadata.utils.service_spec.service_spec import import_source_class logger = test_suite_logger() diff --git a/ingestion/src/metadata/ingestion/source/connections.py b/ingestion/src/metadata/ingestion/source/connections.py index 42eba3dd57de..d7f58234adc7 100644 --- a/ingestion/src/metadata/ingestion/source/connections.py +++ b/ingestion/src/metadata/ingestion/source/connections.py @@ -19,13 +19,8 @@ from pydantic import 
BaseModel from sqlalchemy.engine import Engine -# isort: off -# logger is a root library and needs to be imported first avoid circular imports -from metadata.utils.logger import cli_logger from metadata.utils.importer import import_connection_fn - -# isort: on - +from metadata.utils.logger import cli_logger logger = cli_logger() diff --git a/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py b/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py index 64d59936af31..f8f34bcfd3d8 100644 --- a/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.athena.metadata import AthenaSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=AthenaSource) diff --git a/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py index 305ce57fa3f6..ca0f6c7f9836 100644 --- a/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.azuresql.metadata import AzuresqlSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=AzuresqlSource) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/incremental_table_processor.py b/ingestion/src/metadata/ingestion/source/database/bigquery/incremental_table_processor.py index 58530670a880..706057efe48e 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/incremental_table_processor.py +++ 
b/ingestion/src/metadata/ingestion/source/database/bigquery/incremental_table_processor.py @@ -66,7 +66,7 @@ def set_changed_tables_map( for entry in entries: table_name = entry.payload.get("resourceName", "").split("/")[-1] - timestamp = entry.timestamp + timestamp = entry.start_time deleted = self._is_table_deleted(entry) if table_name not in table_map: diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/__init__.py b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py new file mode 100644 index 000000000000..cd2f6ebe9510 --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py @@ -0,0 +1,26 @@ +from typing import List, Type + +from metadata.generated.schema.entity.data.table import SystemProfile +from metadata.ingestion.source.database.bigquery.profiler.system import ( + BigQuerySystemMetricsComputer, +) +from metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface import ( + BigQueryProfilerInterface, +) +from metadata.profiler.metrics.system.system import System +from metadata.profiler.processor.runner import QueryRunner + + +class BigQueryProfiler(BigQueryProfilerInterface): + system_metrics_computer_class = BigQuerySystemMetricsComputer + + def _compute_system_metrics( + self, + metrics: Type[System], + runner: QueryRunner, + *args, + **kwargs, + ) -> List[SystemProfile]: + return self.system_metrics_computer.get_system_metrics( + runner.table, self.service_connection_config + ) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/system.py b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/system.py new file mode 100644 index 000000000000..7aa6d787be82 --- /dev/null +++ 
b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/system.py @@ -0,0 +1,161 @@ +from typing import List + +from pydantic import TypeAdapter +from sqlalchemy.orm import DeclarativeMeta + +from metadata.generated.schema.entity.data.table import DmlOperationType, SystemProfile +from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( + BigQueryConnection, +) +from metadata.ingestion.source.database.bigquery.queries import BigQueryQueryResult +from metadata.profiler.metrics.system.dml_operation import DatabaseDMLOperations +from metadata.profiler.metrics.system.system import ( + CacheProvider, + EmptySystemMetricsSource, + SQASessionProvider, + SystemMetricsComputer, +) +from metadata.utils.logger import profiler_logger +from metadata.utils.time_utils import datetime_to_timestamp + +logger = profiler_logger() + + +class BigQuerySystemMetricsSource( + SQASessionProvider, EmptySystemMetricsSource, CacheProvider +): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def get_kwargs( + self, + table: DeclarativeMeta, + service_connection: BigQueryConnection, + *args, + **kwargs, + ): + return { + "table": table.__table__.name, + "dataset_id": table.__table_args__["schema"], + "project_id": super().get_session().get_bind().url.host, + "usage_location": service_connection.usageLocation, + } + + def get_deletes( + self, table: str, project_id: str, usage_location: str, dataset_id: str + ) -> List[SystemProfile]: + return self.get_system_profile( + project_id, + dataset_id, + table, + list( + self.get_queries_by_operation( + usage_location, + project_id, + dataset_id, + [ + DatabaseDMLOperations.DELETE, + ], + ) + ), + "deleted_row_count", + DmlOperationType.DELETE, + ) + + def get_updates( + self, table: str, project_id: str, usage_location: str, dataset_id: str + ) -> List[SystemProfile]: + return self.get_system_profile( + project_id, + dataset_id, + table, + self.get_queries_by_operation( + 
usage_location, + project_id, + dataset_id, + [ + DatabaseDMLOperations.UPDATE, + DatabaseDMLOperations.MERGE, + ], + ), + "updated_row_count", + DmlOperationType.UPDATE, + ) + + def get_inserts( + self, table: str, project_id: str, usage_location: str, dataset_id: str + ) -> List[SystemProfile]: + return self.get_system_profile( + project_id, + dataset_id, + table, + self.get_queries_by_operation( + usage_location, + project_id, + dataset_id, + [ + DatabaseDMLOperations.INSERT, + DatabaseDMLOperations.MERGE, + ], + ), + "inserted_row_count", + DmlOperationType.INSERT, + ) + + def get_queries_by_operation( + self, + usage_location: str, + project_id: str, + dataset_id: str, + operations: List[DatabaseDMLOperations], + ) -> List[BigQueryQueryResult]: + ops = {op.value for op in operations} + yield from ( + query + for query in self.get_queries(usage_location, project_id, dataset_id) + if query.statement_type in ops + ) + + def get_queries( + self, usage_location: str, project_id: str, dataset_id: str + ) -> List[BigQueryQueryResult]: + return self.get_or_update_cache( + f"{project_id}.{dataset_id}", + BigQueryQueryResult.get_for_table, + session=super().get_session(), + usage_location=usage_location, + project_id=project_id, + dataset_id=dataset_id, + ) + + @staticmethod + def get_system_profile( + project_id: str, + dataset_id: str, + table: str, + query_results: List[BigQueryQueryResult], + rows_affected_field: str, + operation: DmlOperationType, + ) -> List[SystemProfile]: + if not BigQueryQueryResult.model_fields.get(rows_affected_field): + raise ValueError( + f"rows_affected_field [{rows_affected_field}] is not a valid field in BigQueryQueryResult." 
+ ) + return TypeAdapter(List[SystemProfile]).validate_python( + [ + { + "timestamp": datetime_to_timestamp(q.start_time, milliseconds=True), + "operation": operation, + "rowsAffected": getattr(q, rows_affected_field), + } + for q in query_results + if getattr(q, rows_affected_field) > 0 + and q.project_id == project_id + and q.dataset_id == dataset_id + and q.table_name == table + ] + ) + + +class BigQuerySystemMetricsComputer(SystemMetricsComputer, BigQuerySystemMetricsSource): + pass diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/queries.py b/ingestion/src/metadata/ingestion/source/database/bigquery/queries.py index 07a50b538c89..e4809727e483 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/queries.py +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/queries.py @@ -13,6 +13,14 @@ """ import textwrap +from datetime import datetime +from typing import List, Optional + +from pydantic import BaseModel, TypeAdapter +from sqlalchemy import text +from sqlalchemy.orm import Session + +from metadata.profiler.metrics.system.dml_operation import DatabaseDMLOperations BIGQUERY_STATEMENT = textwrap.dedent( """ @@ -172,3 +180,69 @@ AND resource.labels.dataset_id = "{dataset}" AND timestamp >= "{start_date}" """ + + +class BigQueryQueryResult(BaseModel): + project_id: str + dataset_id: str + table_name: str + inserted_row_count: Optional[int] = None + deleted_row_count: Optional[int] = None + updated_row_count: Optional[int] = None + start_time: datetime + statement_type: str + + @staticmethod + def get_for_table( + session: Session, + usage_location: str, + dataset_id: str, + project_id: str, + ): + rows = session.execute( + text( + JOBS.format( + usage_location=usage_location, + dataset_id=dataset_id, + project_id=project_id, + insert=DatabaseDMLOperations.INSERT.value, + update=DatabaseDMLOperations.UPDATE.value, + delete=DatabaseDMLOperations.DELETE.value, + merge=DatabaseDMLOperations.MERGE.value, + ) + ) + ) + + 
return TypeAdapter(List[BigQueryQueryResult]).validate_python(map(dict, rows)) + + +DML_STAT_TO_DML_STATEMENT_MAPPING = { + "inserted_row_count": DatabaseDMLOperations.INSERT.value, + "deleted_row_count": DatabaseDMLOperations.DELETE.value, + "updated_row_count": DatabaseDMLOperations.UPDATE.value, +} + +JOBS = """ + SELECT + statement_type, + start_time, + destination_table.project_id as project_id, + destination_table.dataset_id as dataset_id, + destination_table.table_id as table_name, + dml_statistics.inserted_row_count as inserted_row_count, + dml_statistics.deleted_row_count as deleted_row_count, + dml_statistics.updated_row_count as updated_row_count + FROM + `region-{usage_location}`.INFORMATION_SCHEMA.JOBS + WHERE + DATE(creation_time) >= CURRENT_DATE() - 1 AND + destination_table.dataset_id = '{dataset_id}' AND + destination_table.project_id = '{project_id}' AND + statement_type IN ( + '{insert}', + '{update}', + '{delete}', + '{merge}' + ) + ORDER BY creation_time DESC; +""" diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py b/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py index fb0db5a64edb..2b8f412b11da 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py @@ -1,9 +1,9 @@ from metadata.ingestion.source.database.bigquery.metadata import BigquerySource -from metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface import ( - BigQueryProfilerInterface, +from metadata.ingestion.source.database.bigquery.profiler.profiler import ( + BigQueryProfiler, ) -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=BigquerySource, profiler_class=BigQueryProfilerInterface + metadata_source_class=BigquerySource, profiler_class=BigQueryProfiler ) diff --git 
a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py index 146cc658b00b..91ab588975a4 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py +++ b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py @@ -43,7 +43,7 @@ def from_partial_row(cls, row: PartialRowData): cells.setdefault(column_family, {}) for column, cell in cf_cells.items(): cells[column_family][column] = Cell( - values=[Value(timestamp=c.timestamp, value=c.value) for c in cell] + values=[Value(timestamp=c.start_time, value=c.value) for c in cell] ) return cls(cells=cells, row_key=row.row_key) diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/service_spec.py b/ingestion/src/metadata/ingestion/source/database/bigtable/service_spec.py index f002a9315462..08eb68c1b7d0 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigtable/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/bigtable/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.bigtable.metadata import BigtableSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=BigtableSource) diff --git a/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py b/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py index f43fc6ed8224..68e75d8fb319 100644 --- a/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.clickhouse.metadata import ClickhouseSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = 
DefaultDatabaseSpec(metadata_source_class=ClickhouseSource) diff --git a/ingestion/src/metadata/ingestion/source/database/couchbase/service_spec.py b/ingestion/src/metadata/ingestion/source/database/couchbase/service_spec.py index 2f6bbb602341..8a396949b58d 100644 --- a/ingestion/src/metadata/ingestion/source/database/couchbase/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/couchbase/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.couchbase.metadata import CouchbaseSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=CouchbaseSource) diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py b/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py index 463d6508f959..00a9d9e2f79b 100644 --- a/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py @@ -2,7 +2,7 @@ from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( DatabricksProfilerInterface, ) -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=DatabricksSource, profiler_class=DatabricksProfilerInterface diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py index 234af15692f1..bbd36b6f312c 100644 --- a/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py @@ -2,7 +2,7 @@ from metadata.profiler.interface.pandas.profiler_interface import ( PandasProfilerInterface, ) -from metadata.utils.manifest import DefaultDatabaseSpec 
+from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=DatalakeSource, profiler_class=PandasProfilerInterface diff --git a/ingestion/src/metadata/ingestion/source/database/db2/service_spec.py b/ingestion/src/metadata/ingestion/source/database/db2/service_spec.py index c2d904523d45..e5ec7fdd3ed7 100644 --- a/ingestion/src/metadata/ingestion/source/database/db2/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/db2/service_spec.py @@ -2,7 +2,7 @@ from metadata.profiler.interface.sqlalchemy.db2.profiler_interface import ( DB2ProfilerInterface, ) -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=Db2Source, profiler_class=DB2ProfilerInterface diff --git a/ingestion/src/metadata/ingestion/source/database/deltalake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/deltalake/service_spec.py index ba8886b56f94..83eaa31c628b 100644 --- a/ingestion/src/metadata/ingestion/source/database/deltalake/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/deltalake/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.deltalake.metadata import DeltalakeSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DeltalakeSource) diff --git a/ingestion/src/metadata/ingestion/source/database/domodatabase/service_spec.py b/ingestion/src/metadata/ingestion/source/database/domodatabase/service_spec.py index 3192386039ae..4f10d286a381 100644 --- a/ingestion/src/metadata/ingestion/source/database/domodatabase/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/domodatabase/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.domodatabase.metadata import 
DomodatabaseSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DomodatabaseSource) diff --git a/ingestion/src/metadata/ingestion/source/database/doris/service_spec.py b/ingestion/src/metadata/ingestion/source/database/doris/service_spec.py index 4e7fda236030..84937aca34ea 100644 --- a/ingestion/src/metadata/ingestion/source/database/doris/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/doris/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.doris.metadata import DorisSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DorisSource) diff --git a/ingestion/src/metadata/ingestion/source/database/druid/service_spec.py b/ingestion/src/metadata/ingestion/source/database/druid/service_spec.py index e3958b254173..e83fe9a41a07 100644 --- a/ingestion/src/metadata/ingestion/source/database/druid/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/druid/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.druid.metadata import DruidSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DruidSource) diff --git a/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py index 3ed0bd0de58c..5c5555707dae 100644 --- a/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py @@ -1,6 +1,6 @@ from metadata.ingestion.source.database.dynamodb.metadata import DynamodbSource from 
metadata.profiler.interface.nosql.profiler_interface import NoSQLProfilerInterface -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=DynamodbSource, profiler_class=NoSQLProfilerInterface diff --git a/ingestion/src/metadata/ingestion/source/database/glue/service_spec.py b/ingestion/src/metadata/ingestion/source/database/glue/service_spec.py index 44dcad3d1fa0..79e029904ed8 100644 --- a/ingestion/src/metadata/ingestion/source/database/glue/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/glue/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.glue.metadata import GlueSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=GlueSource) diff --git a/ingestion/src/metadata/ingestion/source/database/greenplum/service_spec.py b/ingestion/src/metadata/ingestion/source/database/greenplum/service_spec.py index 43c79633f404..9fe3ac14cfa1 100644 --- a/ingestion/src/metadata/ingestion/source/database/greenplum/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/greenplum/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.greenplum.metadata import GreenplumSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=GreenplumSource) diff --git a/ingestion/src/metadata/ingestion/source/database/hive/service_spec.py b/ingestion/src/metadata/ingestion/source/database/hive/service_spec.py index bd80439fd136..e303ccfab773 100644 --- a/ingestion/src/metadata/ingestion/source/database/hive/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/hive/service_spec.py @@ -1,4 +1,4 @@ from 
metadata.ingestion.source.database.hive.metadata import HiveSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=HiveSource) diff --git a/ingestion/src/metadata/ingestion/source/database/iceberg/service_spec.py b/ingestion/src/metadata/ingestion/source/database/iceberg/service_spec.py index 5c31342ece0d..db888fd6e326 100644 --- a/ingestion/src/metadata/ingestion/source/database/iceberg/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/iceberg/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.iceberg.metadata import IcebergSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=IcebergSource) diff --git a/ingestion/src/metadata/ingestion/source/database/impala/service_spec.py b/ingestion/src/metadata/ingestion/source/database/impala/service_spec.py index 2c81447e9e9f..51e302a88e42 100644 --- a/ingestion/src/metadata/ingestion/source/database/impala/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/impala/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.impala.metadata import ImpalaSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=ImpalaSource) diff --git a/ingestion/src/metadata/ingestion/source/database/mariadb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mariadb/service_spec.py index 692963cdf9db..40f55327c905 100644 --- a/ingestion/src/metadata/ingestion/source/database/mariadb/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/mariadb/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.mariadb.metadata import 
MariadbSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=MariadbSource) diff --git a/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py index e60602540de9..b3feafb4a665 100644 --- a/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py @@ -1,6 +1,6 @@ from metadata.ingestion.source.database.mongodb.metadata import MongodbSource from metadata.profiler.interface.nosql.profiler_interface import NoSQLProfilerInterface -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=MongodbSource, profiler_class=NoSQLProfilerInterface diff --git a/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py index fbad24d21143..c371f3ae5de4 100644 --- a/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.mssql.metadata import MssqlSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=MssqlSource) diff --git a/ingestion/src/metadata/ingestion/source/database/mysql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mysql/service_spec.py index 2d24a115e98c..0cf927395161 100644 --- a/ingestion/src/metadata/ingestion/source/database/mysql/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/mysql/service_spec.py @@ -1,4 +1,4 @@ from 
metadata.ingestion.source.database.mysql.metadata import MysqlSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=MysqlSource) diff --git a/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py b/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py index 69ee6f788f4e..ffc2bb80aaff 100644 --- a/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.oracle.metadata import OracleSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=OracleSource) diff --git a/ingestion/src/metadata/ingestion/source/database/pinotdb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/pinotdb/service_spec.py index 1fe227ff0543..ffc4b1885a17 100644 --- a/ingestion/src/metadata/ingestion/source/database/pinotdb/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/pinotdb/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.pinotdb.metadata import PinotdbSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=PinotdbSource) diff --git a/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py b/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py index c0cdb957fe3f..3e2744277c02 100644 --- a/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.postgres.metadata 
import PostgresSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=PostgresSource) diff --git a/ingestion/src/metadata/ingestion/source/database/presto/service_spec.py b/ingestion/src/metadata/ingestion/source/database/presto/service_spec.py index eb515c7bade4..c88f3f69db55 100644 --- a/ingestion/src/metadata/ingestion/source/database/presto/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/presto/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.presto.metadata import PrestoSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=PrestoSource) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py b/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py index 37afb59af167..7d1fbd2cafda 100644 --- a/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py @@ -2,7 +2,7 @@ from metadata.ingestion.source.database.redshift.profiler.profiler import ( RedshiftProfiler, ) -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=RedshiftSource, profiler_class=RedshiftProfiler diff --git a/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py b/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py index 4149a9512455..73f116521433 100644 --- a/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py @@ -1,4 +1,4 @@ from 
metadata.ingestion.source.database.saphana.metadata import SaphanaSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=SaphanaSource) diff --git a/ingestion/src/metadata/ingestion/source/database/singlestore/service_spec.py b/ingestion/src/metadata/ingestion/source/database/singlestore/service_spec.py index 87690636e251..3175e998f128 100644 --- a/ingestion/src/metadata/ingestion/source/database/singlestore/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/singlestore/service_spec.py @@ -2,7 +2,7 @@ from metadata.profiler.interface.sqlalchemy.single_store.profiler_interface import ( SingleStoreProfilerInterface, ) -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=SinglestoreSource, profiler_class=SingleStoreProfilerInterface diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py index 95555eb04fd0..274204006b47 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py @@ -280,7 +280,6 @@ def get_kwargs(self, table: DeclarativeMeta, *args, **kwargs): def get_inserts( self, database: str, schema: str, table: str ) -> List[SystemProfile]: - return self.get_system_profile( database, schema, @@ -345,7 +344,7 @@ def get_system_profile( ] ) - def get_update_queries( + def get_updates( self, database: str, schema: str, table: str ) -> List[SystemProfile]: return self.get_system_profile( diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py index 
bf6a88c980c7..ef4f3d680bff 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py @@ -2,7 +2,7 @@ from metadata.ingestion.source.database.snowflake.profiler.profiler import ( SnowflakeProfiler, ) -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=SnowflakeSource, profiler_class=SnowflakeProfiler diff --git a/ingestion/src/metadata/ingestion/source/database/sqlite/service_spec.py b/ingestion/src/metadata/ingestion/source/database/sqlite/service_spec.py index 36223f4dac18..6ffbe21a16ce 100644 --- a/ingestion/src/metadata/ingestion/source/database/sqlite/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/sqlite/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.sqlite.metadata import SqliteSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=SqliteSource) diff --git a/ingestion/src/metadata/ingestion/source/database/teradata/service_spec.py b/ingestion/src/metadata/ingestion/source/database/teradata/service_spec.py index 95d699d740ba..4fe31877cdf7 100644 --- a/ingestion/src/metadata/ingestion/source/database/teradata/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/teradata/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.teradata.metadata import TeradataSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec(metadata_source_class=TeradataSource) diff --git a/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py b/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py 
index 622b2d9e822b..7909416957de 100644 --- a/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py @@ -2,7 +2,7 @@ from metadata.profiler.interface.sqlalchemy.trino.profiler_interface import ( TrinoProfilerInterface, ) -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=TrinoSource, profiler_class=TrinoProfilerInterface diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py index 87ca5e99f939..4b6a5868c7ad 100644 --- a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py @@ -2,7 +2,7 @@ from metadata.profiler.interface.sqlalchemy.unity_catalog.profiler_interface import ( UnityCatalogProfilerInterface, ) -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=UnitycatalogSource, diff --git a/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py b/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py index 207c909e0ff9..df520fcb74f7 100644 --- a/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py @@ -1,5 +1,5 @@ from metadata.ingestion.source.database.vertica.metadata import VerticaSource -from metadata.utils.manifest import DefaultDatabaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=VerticaSource, profiler_class=None diff --git 
a/ingestion/src/metadata/profiler/metrics/system/queries/bigquery.py b/ingestion/src/metadata/profiler/metrics/system/queries/bigquery.py deleted file mode 100644 index f54853604bcd..000000000000 --- a/ingestion/src/metadata/profiler/metrics/system/queries/bigquery.py +++ /dev/null @@ -1,54 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Bigquery System Metric Queries -""" -from datetime import datetime - -from pydantic import BaseModel - -from metadata.profiler.metrics.system.dml_operation import DatabaseDMLOperations - - -class BigQueryQueryResult(BaseModel): - table_name: dict - timestamp: datetime - query_type: str - dml_statistics: dict - - -DML_STAT_TO_DML_STATEMENT_MAPPING = { - "inserted_row_count": DatabaseDMLOperations.INSERT.value, - "deleted_row_count": DatabaseDMLOperations.DELETE.value, - "updated_row_count": DatabaseDMLOperations.UPDATE.value, -} - -JOBS = """ - SELECT - statement_type, - start_time, - destination_table, - dml_statistics - FROM - `region-{usage_location}`.INFORMATION_SCHEMA.JOBS - WHERE - DATE(creation_time) >= CURRENT_DATE() - 1 AND - destination_table.dataset_id = '{dataset_id}' AND - destination_table.project_id = '{project_id}' AND - statement_type IN ( - '{insert}', - '{update}', - '{delete}', - '{merge}' - ) - ORDER BY creation_time DESC; -""" diff --git a/ingestion/src/metadata/profiler/metrics/system/system.py b/ingestion/src/metadata/profiler/metrics/system/system.py index 
526c0b8de602..e2f30d07e5db 100644 --- a/ingestion/src/metadata/profiler/metrics/system/system.py +++ b/ingestion/src/metadata/profiler/metrics/system/system.py @@ -13,37 +13,18 @@ System Metric """ -import traceback from abc import ABC from collections import defaultdict -from typing import Callable, Dict, Generic, List, Optional, TypeVar +from typing import Callable, Generic, List, TypeVar -from pydantic import TypeAdapter -from sqlalchemy import text -from sqlalchemy.orm import DeclarativeMeta, Session +from sqlalchemy.orm import Session from metadata.generated.schema.configuration.profilerConfiguration import MetricType from metadata.generated.schema.entity.data.table import SystemProfile -from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( - BigQueryConnection, -) from metadata.profiler.metrics.core import SystemMetric -from metadata.profiler.metrics.system.dml_operation import ( - DML_OPERATION_MAP, - DatabaseDMLOperations, -) -from metadata.profiler.metrics.system.queries.bigquery import ( - DML_STAT_TO_DML_STATEMENT_MAPPING, - JOBS, - BigQueryQueryResult, -) -from metadata.profiler.orm.registry import Dialects -from metadata.utils.dispatch import valuedispatch from metadata.utils.helpers import deep_size_of_dict from metadata.utils.logger import profiler_logger from metadata.utils.lru_cache import LRU_CACHE_SIZE, LRUCache -from metadata.utils.profiler_utils import get_value_from_cache, set_cache -from metadata.utils.time_utils import datetime_to_timestamp logger = profiler_logger() @@ -134,208 +115,6 @@ def get_database(self) -> str: return self.session.get_bind().url.database -@valuedispatch -def get_system_metrics_for_dialect( - dialect: str, - session: Session, - table: DeclarativeMeta, - *args, - **kwargs, -) -> Optional[List[SystemProfile]]: - """_summary_ - - Args: - dialect (str): database API dialect - session (Session): session object - - Returns: - Optional[Dict]: For BigQuery, Snowflake, Redshift returns 
- { - timestamp: , - operationType: - rowsAffected: , - } else returns None - """ - logger.debug(f"System metrics not support for {dialect}. Skipping processing.") - - -@get_system_metrics_for_dialect.register(Dialects.BigQuery) -def _( - dialect: str, - session: Session, - table: DeclarativeMeta, - conn_config: BigQueryConnection, - *args, - **kwargs, -) -> List[SystemProfile]: - """Compute system metrics for bigquery - - Args: - dialect (str): bigquery - session (Session): session Object - table (DeclarativeMeta): orm table - - Returns: - List[Dict]: - """ - logger.debug(f"Fetching system metrics for {dialect}") - - project_id = session.get_bind().url.host - dataset_id = table.__table_args__["schema"] # type: ignore - - metric_results: List[Dict] = [] - - jobs = get_value_from_cache( - SYSTEM_QUERY_RESULT_CACHE, f"{Dialects.BigQuery}.{project_id}.{dataset_id}.jobs" - ) - - if not jobs: - cursor_jobs = session.execute( - text( - JOBS.format( - usage_location=conn_config.usageLocation, - dataset_id=dataset_id, - project_id=project_id, - insert=DatabaseDMLOperations.INSERT.value, - update=DatabaseDMLOperations.UPDATE.value, - delete=DatabaseDMLOperations.DELETE.value, - merge=DatabaseDMLOperations.MERGE.value, - ) - ) - ) - jobs = [ - BigQueryQueryResult( - query_type=row.statement_type, - timestamp=row.start_time, - table_name=row.destination_table, - dml_statistics=row.dml_statistics, - ) - for row in cursor_jobs - ] - set_cache( - SYSTEM_QUERY_RESULT_CACHE, - f"{Dialects.BigQuery}.{project_id}.{dataset_id}.jobs", - jobs, - ) - - for job in jobs: - if job.table_name.get("table_id") == table.__tablename__: # type: ignore - rows_affected = None - try: - if job.query_type == DatabaseDMLOperations.INSERT.value: - rows_affected = job.dml_statistics.get("inserted_row_count") - if job.query_type == DatabaseDMLOperations.DELETE.value: - rows_affected = job.dml_statistics.get("deleted_row_count") - if job.query_type == DatabaseDMLOperations.UPDATE.value: - rows_affected = 
job.dml_statistics.get("updated_row_count") - except AttributeError: - logger.debug(traceback.format_exc()) - rows_affected = None - - if job.query_type == DatabaseDMLOperations.MERGE.value: - for indx, key in enumerate(job.dml_statistics): - if job.dml_statistics[key] != 0: - metric_results.append( - { - # Merge statement can include multiple DML operations - # We are padding timestamps by 0,1,2 millisesond to avoid - # duplicate timestamps - "timestamp": int(job.timestamp.timestamp() * 1000) - + indx, - "operation": DML_STAT_TO_DML_STATEMENT_MAPPING.get(key), - "rowsAffected": job.dml_statistics[key], - } - ) - continue - - metric_results.append( - { - "timestamp": int(job.timestamp.timestamp() * 1000), - "operation": job.query_type, - "rowsAffected": rows_affected, - } - ) - - return TypeAdapter(List[SystemProfile]).validate_python(metric_results) - - -@get_system_metrics_for_dialect.register(Dialects.Snowflake) -def _( - dialect: str, - session: Session, - table: DeclarativeMeta, - *args, - **kwargs, -) -> Optional[List[Dict]]: - """Fetch system metrics for Snowflake. query_history will return maximum 10K rows in one request. - We'll be fetching all the queries ran for the past 24 hours and filtered on specific query types - (INSERTS, MERGE, DELETE, UPDATE). - - :waring: Unlike redshift and bigquery results are not cached as we'll be looking - at DDL for each table - - To get the number of rows affected we'll use the specific query ID. 
- - Args: - dialect (str): dialect - session (Session): session object - - Returns: - Dict: system metric - """ - logger.debug(f"Fetching system metrics for {dialect}") - - metric_results: List[Dict] = [] - - query_results = build_snowflake_query_results( - session=session, - table=table, - ) - - for query_result in query_results: - rows_affected = None - if query_result.query_type == DatabaseDMLOperations.INSERT.value: - rows_affected = query_result.rows_inserted - if query_result.query_type == DatabaseDMLOperations.DELETE.value: - rows_affected = query_result.rows_deleted - if query_result.query_type == DatabaseDMLOperations.UPDATE.value: - rows_affected = query_result.rows_updated - if query_result.query_type == DatabaseDMLOperations.MERGE.value: - if query_result.rows_inserted: - metric_results.append( - { - "timestamp": datetime_to_timestamp( - query_result.start_time, milliseconds=True - ), - "operation": DatabaseDMLOperations.INSERT.value, - "rowsAffected": query_result.rows_inserted, - } - ) - if query_result.rows_updated: - metric_results.append( - { - "timestamp": datetime_to_timestamp( - query_result.start_time, milliseconds=True - ), - "operation": DatabaseDMLOperations.UPDATE.value, - "rowsAffected": query_result.rows_updated, - } - ) - continue - - metric_results.append( - { - "timestamp": datetime_to_timestamp( - query_result.start_time, milliseconds=True - ), - "operation": DML_OPERATION_MAP.get(query_result.query_type), - "rowsAffected": rows_affected, - } - ) - - return TypeAdapter(List[SystemProfile]).validate_python(metric_results) - - class System(SystemMetric): """System metric class to fetch: 1. 
freshness @@ -384,18 +163,6 @@ def _validate_attrs(self, attr_list: List[str]) -> None: ) def sql(self, session: Session, **kwargs): - """Implements the SQL logic to fetch system data""" - self._validate_attrs(["table", "ometa_client", "db_service"]) - - conn_config = kwargs.get("conn_config") - - system_metrics = get_system_metrics_for_dialect( - session.get_bind().dialect.name, - session=session, - table=self.table, # pylint: disable=no-member - conn_config=conn_config, - ometa_client=self.ometa_client, # pylint: disable=no-member - db_service=self.db_service, # pylint: disable=no-member + raise NotImplementedError( + "SQL method is not implemented for System metric. Use SystemMetricsComputer.get_system_metrics instead" ) - self._manage_cache() - return system_metrics diff --git a/ingestion/src/metadata/profiler/source/metadata_ext.py b/ingestion/src/metadata/profiler/source/metadata_ext.py index 1e6ef70897b5..922539ebddd0 100644 --- a/ingestion/src/metadata/profiler/source/metadata_ext.py +++ b/ingestion/src/metadata/profiler/source/metadata_ext.py @@ -41,6 +41,7 @@ ) from metadata.ingestion.api.models import Either from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.profiler.interface.profiler_interface import ProfilerInterface from metadata.profiler.source.metadata import ( OpenMetadataSource, ProfilerSourceAndEntity, @@ -48,13 +49,10 @@ from metadata.utils import fqn from metadata.utils.class_helper import get_service_type_from_source_type from metadata.utils.filters import filter_by_database, filter_by_schema, filter_by_table -from metadata.utils.importer import ( - ProfilerInterface, - import_from_module, - import_source_class, -) +from metadata.utils.importer import import_from_module from metadata.utils.logger import profiler_logger from metadata.utils.service_spec import BaseSpec +from metadata.utils.service_spec.service_spec import import_source_class from metadata.utils.ssl_manager import get_ssl_connection logger = 
profiler_logger() @@ -184,7 +182,7 @@ def import_profiler_interface(self) -> Type[ProfilerInterface]: source_type=self.config.source.type.lower(), ).profiler_class profiler_source_class = import_from_module(class_path) - return profiler_source_class + return cast(Type[ProfilerInterface], profiler_source_class) def get_schema_names(self) -> Iterable[str]: if self.service_connection.__dict__.get("databaseSchema"): diff --git a/ingestion/src/metadata/utils/dispatch.py b/ingestion/src/metadata/utils/dispatch.py index ef802121a2b7..8795ef2e1a8a 100644 --- a/ingestion/src/metadata/utils/dispatch.py +++ b/ingestion/src/metadata/utils/dispatch.py @@ -14,9 +14,7 @@ """ from collections import namedtuple -from functools import update_wrapper -from types import MappingProxyType -from typing import Any, Callable, Type, TypeVar +from typing import Type, TypeVar from pydantic import BaseModel @@ -56,53 +54,3 @@ def inner(fn): Register = namedtuple("Register", ["add", "registry"]) return Register(add, registry) - - -def valuedispatch(func) -> Callable: - """Value dispatch for methods and functions - - Args: - func (_type_): function to run - - Returns: - Callable: wrapper - """ - - registry = {} - - def _is_valid_dispatch(value): - return isinstance(value, str) - - def dispatch(value: str) -> Callable: - try: - impl = registry[value] - except KeyError: - impl = registry[object] - return impl - - def register(value, func=None) -> Callable: - if _is_valid_dispatch(value): - if func is None: - return lambda f: register(value, f) - else: - raise TypeError( - "Invalid first argument to reigister()." f"{value} is not a string." 
- ) - - registry[value] = func - return func - - def wrapper(*args, **kwargs) -> Any: - if not args: - raise TypeError(f"{func_name} requires at least 1 argument") - if isinstance(args[0], (str, bytes)): - return dispatch(str(args[0]))(*args, **kwargs) - return dispatch(args[1])(*args, **kwargs) - - func_name = getattr(func, "__name__", "method value dispatch") - registry[object] = func - wrapper.register = register - wrapper.dispatch = dispatch - wrapper.registry = MappingProxyType(registry) # making registry read only - update_wrapper(wrapper, func) - return wrapper diff --git a/ingestion/src/metadata/utils/importer.py b/ingestion/src/metadata/utils/importer.py index 4ed3754c74d1..4ea145ec2295 100644 --- a/ingestion/src/metadata/utils/importer.py +++ b/ingestion/src/metadata/utils/importer.py @@ -15,7 +15,7 @@ import sys import traceback from enum import Enum -from typing import Any, Callable, Optional, Type, TypeVar, cast +from typing import Any, Callable, Optional, Type, TypeVar from pydantic import BaseModel @@ -26,13 +26,12 @@ from metadata.generated.schema.entity.services.databaseService import DatabaseService from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.generated.schema.metadataIngestion.workflow import Sink as WorkflowSink -from metadata.ingestion.api.steps import BulkSink, Processor, Sink, Source, Stage +from metadata.ingestion.api.steps import BulkSink, Processor, Sink, Stage from metadata.profiler.metrics.system.system import EmptySystemMetricsSource from metadata.utils.class_helper import get_service_type_from_source_type from metadata.utils.client_version import get_client_version from metadata.utils.constants import CUSTOM_CONNECTOR_PREFIX from metadata.utils.logger import utils_logger -from metadata.utils.service_spec import BaseSpec from metadata.utils.singleton import Singleton logger = utils_logger() @@ -139,19 +138,6 @@ def import_from_module(key: str) -> Type[Any]: raise 
DynamicImportException(module=module_name, key=obj_name, cause=err) -def import_source_class( - service_type: ServiceType, source_type: str, from_: str = "ingestion" -) -> Type[Source]: - return cast( - Type[Source], - import_from_module( - BaseSpec.get_for_source( - service_type, source_type, from_ - ).metadata_source_class - ), - ) - - def import_processor_class( processor_type: str, from_: str = "ingestion" ) -> Type[Processor]: diff --git a/ingestion/src/metadata/utils/logger.py b/ingestion/src/metadata/utils/logger.py index 6fdd4c75c712..c33eb3b96d19 100644 --- a/ingestion/src/metadata/utils/logger.py +++ b/ingestion/src/metadata/utils/logger.py @@ -10,21 +10,6 @@ # limitations under the License. """ Module centralising logger configs - -This library might produce circular imports when using together with metadata.utils in these cases use the following -notation to avoid circular imports: - -```` -import third_pary_lib - -# isort: off - logger is a root library and this is how to avoid circular imports -from metadata.utils.logger import cli_logger -from metadata.utils.... import ... - -# isort: on - -``` - """ import logging diff --git a/ingestion/src/metadata/utils/service_spec/service_spec.py b/ingestion/src/metadata/utils/service_spec/service_spec.py index 9943ef972b19..6b70bfe27d6c 100644 --- a/ingestion/src/metadata/utils/service_spec/service_spec.py +++ b/ingestion/src/metadata/utils/service_spec/service_spec.py @@ -2,11 +2,12 @@ Manifests are used to store class information """ -from typing import Optional +from typing import Optional, Type, cast from pydantic import model_validator from metadata.generated.schema.entity.services.serviceType import ServiceType +from metadata.ingestion.api.steps import Source from metadata.ingestion.models.custom_pydantic import BaseModel from metadata.utils.importer import get_module_dir, import_from_module @@ -75,3 +76,16 @@ def get_for_source( def get_class_path(module): return module.__module__ + "." 
+ module.__name__ + + +def import_source_class( + service_type: ServiceType, source_type: str, from_: str = "ingestion" +) -> Type[Source]: + return cast( + Type[Source], + import_from_module( + BaseSpec.get_for_source( + service_type, source_type, from_ + ).metadata_source_class + ), + ) diff --git a/ingestion/src/metadata/workflow/ingestion.py b/ingestion/src/metadata/workflow/ingestion.py index 1e28f5013178..1133304a8e7d 100644 --- a/ingestion/src/metadata/workflow/ingestion.py +++ b/ingestion/src/metadata/workflow/ingestion.py @@ -48,9 +48,9 @@ DynamicImportException, MissingPluginException, import_from_module, - import_source_class, ) from metadata.utils.logger import ingestion_logger +from metadata.utils.service_spec.service_spec import import_source_class from metadata.workflow.base import BaseWorkflow, InvalidWorkflowJSONException logger = ingestion_logger() From 59a69974e71a021dfc72d8d779cdb984f10fc00f Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 07:57:24 +0200 Subject: [PATCH 06/18] - removed tests related to the profiler factory --- .../unit/profiler/test_profiler_interface.py | 84 ------------------- .../test_profiler_interface_factory.py | 43 ---------- 2 files changed, 127 deletions(-) delete mode 100644 ingestion/tests/unit/profiler/test_profiler_interface_factory.py diff --git a/ingestion/tests/unit/profiler/test_profiler_interface.py b/ingestion/tests/unit/profiler/test_profiler_interface.py index 2c3026da0b86..4392914f021a 100644 --- a/ingestion/tests/unit/profiler/test_profiler_interface.py +++ b/ingestion/tests/unit/profiler/test_profiler_interface.py @@ -30,31 +30,6 @@ DataStorageConfig, SampleDataStorageConfig, ) -from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( - BigQueryConnection, -) -from metadata.generated.schema.entity.services.connections.database.databricksConnection import ( - DatabricksConnection, -) -from 
metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.mariaDBConnection import ( - MariaDBConnection, -) -from metadata.generated.schema.entity.services.connections.database.singleStoreConnection import ( - SingleStoreConnection, -) -from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( - SnowflakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.trinoConnection import ( - TrinoConnection, -) -from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import ( - UnityCatalogConnection, -) -from metadata.generated.schema.entity.services.databaseService import DatabaseConnection from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( DatabaseServiceProfilerPipeline, ) @@ -65,37 +40,7 @@ ProfileSampleConfig, TableConfig, ) -from metadata.profiler.interface.pandas.profiler_interface import ( - PandasProfilerInterface, -) from metadata.profiler.interface.profiler_interface import ProfilerInterface -from metadata.profiler.interface.profiler_interface_factory import ( - ProfilerInterfaceFactory, -) -from metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface import ( - BigQueryProfilerInterface, -) -from metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( - DatabricksProfilerInterface, -) -from metadata.profiler.interface.sqlalchemy.mariadb.profiler_interface import ( - MariaDBProfilerInterface, -) -from metadata.profiler.interface.sqlalchemy.profiler_interface import ( - SQAProfilerInterface, -) -from metadata.profiler.interface.sqlalchemy.single_store.profiler_interface import ( - SingleStoreProfilerInterface, -) -from metadata.profiler.interface.sqlalchemy.snowflake.profiler_interface import ( - SnowflakeProfilerInterface, -) -from 
metadata.profiler.interface.sqlalchemy.trino.profiler_interface import ( - TrinoProfilerInterface, -) -from metadata.profiler.interface.sqlalchemy.unity_catalog.profiler_interface import ( - UnityCatalogProfilerInterface, -) class ProfilerInterfaceTest(TestCase): @@ -359,32 +304,3 @@ def test_table_config_casting(self): schema_config, table_fqn="demo" ), ) - - def test_register_many(self): - # Initialize factory - factory = ProfilerInterfaceFactory() - - # Define profiles dictionary - profiles = { - DatabaseConnection.__name__: SQAProfilerInterface, - BigQueryConnection.__name__: BigQueryProfilerInterface, - SingleStoreConnection.__name__: SingleStoreProfilerInterface, - DatalakeConnection.__name__: PandasProfilerInterface, - SnowflakeConnection.__name__: SnowflakeProfilerInterface, - TrinoConnection.__name__: TrinoProfilerInterface, - UnityCatalogConnection.__name__: UnityCatalogProfilerInterface, - DatabricksConnection.__name__: DatabricksProfilerInterface, - MariaDBConnection.__name__: MariaDBProfilerInterface, - } - - # Register profiles - factory.register_many(profiles) - - # Assert all expected interfaces are registered - expected_interfaces = set(profiles.keys()) - actual_interfaces = set(factory._interface_type.keys()) - assert expected_interfaces == actual_interfaces - - # Assert profiler classes match registered interfaces - for interface_type, interface_class in profiles.items(): - assert factory._interface_type[interface_type] == interface_class diff --git a/ingestion/tests/unit/profiler/test_profiler_interface_factory.py b/ingestion/tests/unit/profiler/test_profiler_interface_factory.py deleted file mode 100644 index 38a6263b0984..000000000000 --- a/ingestion/tests/unit/profiler/test_profiler_interface_factory.py +++ /dev/null @@ -1,43 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Factory class for creating profiler interface objects -""" - -from typing import Dict -from unittest import TestCase - -from metadata.profiler.interface.profiler_interface_factory import ( - profiler_class_mapping, -) - - -class TestProfilerClassMapping(TestCase): - def setUp(self): - self.expected_mapping: Dict[str, str] = { - "DatabaseConnection": "metadata.profiler.interface.sqlalchemy.profiler_interface.SQAProfilerInterface", - "BigQueryConnection": "metadata.profiler.interface.sqlalchemy.bigquery.profiler_interface.BigQueryProfilerInterface", - "SingleStoreConnection": "metadata.profiler.interface.sqlalchemy.single_store.profiler_interface.SingleStoreProfilerInterface", - "DatalakeConnection": "metadata.profiler.interface.pandas.profiler_interface.PandasProfilerInterface", - "MariaDBConnection": "metadata.profiler.interface.sqlalchemy.mariadb.profiler_interface.MariaDBProfilerInterface", - "SnowflakeConnection": "metadata.profiler.interface.sqlalchemy.snowflake.profiler_interface.SnowflakeProfilerInterface", - "TrinoConnection": "metadata.profiler.interface.sqlalchemy.trino.profiler_interface.TrinoProfilerInterface", - "UnityCatalogConnection": "metadata.profiler.interface.sqlalchemy.unity_catalog.profiler_interface.UnityCatalogProfilerInterface", - "DatabricksConnection": "metadata.profiler.interface.sqlalchemy.databricks.profiler_interface.DatabricksProfilerInterface", - "Db2Connection": "metadata.profiler.interface.sqlalchemy.db2.profiler_interface.DB2ProfilerInterface", - "MongoDBConnection": 
"metadata.profiler.interface.nosql.profiler_interface.NoSQLProfilerInterface", - "DynamoDBConnection": "metadata.profiler.interface.nosql.profiler_interface.NoSQLProfilerInterface", - } - - def test_profiler_class_mapping(self): - self.assertEqual(len(profiler_class_mapping), len(self.expected_mapping)) - self.assertEqual(profiler_class_mapping, self.expected_mapping) From 6476f9f97c3271f5e73074c56ef9e49233ee4eb8 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 07:58:26 +0200 Subject: [PATCH 07/18] - reverted start_time - removed DML_STAT_TO_DML_STATEMENT_MAPPING - removed unused logger --- .../source/database/bigquery/incremental_table_processor.py | 2 +- .../metadata/ingestion/source/database/bigquery/queries.py | 6 ------ .../metadata/ingestion/source/database/bigtable/models.py | 2 +- .../source/database/snowflake/profiler/profiler.py | 2 -- 4 files changed, 2 insertions(+), 10 deletions(-) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/incremental_table_processor.py b/ingestion/src/metadata/ingestion/source/database/bigquery/incremental_table_processor.py index 706057efe48e..58530670a880 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/incremental_table_processor.py +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/incremental_table_processor.py @@ -66,7 +66,7 @@ def set_changed_tables_map( for entry in entries: table_name = entry.payload.get("resourceName", "").split("/")[-1] - timestamp = entry.start_time + timestamp = entry.timestamp deleted = self._is_table_deleted(entry) if table_name not in table_map: diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/queries.py b/ingestion/src/metadata/ingestion/source/database/bigquery/queries.py index e4809727e483..8886c9b4afac 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/queries.py +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/queries.py @@ -216,12 +216,6 @@ def get_for_table( 
return TypeAdapter(List[BigQueryQueryResult]).validate_python(map(dict, rows)) -DML_STAT_TO_DML_STATEMENT_MAPPING = { - "inserted_row_count": DatabaseDMLOperations.INSERT.value, - "deleted_row_count": DatabaseDMLOperations.DELETE.value, - "updated_row_count": DatabaseDMLOperations.UPDATE.value, -} - JOBS = """ SELECT statement_type, diff --git a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py index 91ab588975a4..146cc658b00b 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigtable/models.py +++ b/ingestion/src/metadata/ingestion/source/database/bigtable/models.py @@ -43,7 +43,7 @@ def from_partial_row(cls, row: PartialRowData): cells.setdefault(column_family, {}) for column, cell in cf_cells.items(): cells[column_family][column] = Cell( - values=[Value(timestamp=c.start_time, value=c.value) for c in cell] + values=[Value(timestamp=c.timestamp, value=c.value) for c in cell] ) return cls(cells=cells, row_key=row.row_key) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py index fc1dd245f1dc..77718e7bbbee 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py @@ -21,8 +21,6 @@ ) from metadata.utils.logger import profiler_interface_registry_logger -logger = profiler_interface_registry_logger() - class SnowflakeProfiler(SnowflakeProfilerInterface): """ From 04fcc27f4489d96334bb48ea39a6ceffb4f615e9 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 08:57:40 +0200 Subject: [PATCH 08/18] - reverted start_time - removed DML_STAT_TO_DML_STATEMENT_MAPPING - removed unused logger --- .../database/snowflake/profiler/profiler.py | 1 - .../database/snowflake/profiler/system.py | 53 ++++++------ 
.../source/database/vertica/service_spec.py | 4 +- ingestion/src/metadata/utils/lru_cache.py | 48 +++++++++++ ingestion/src/metadata/utils/test_suite.py | 84 ------------------- 5 files changed, 77 insertions(+), 113 deletions(-) delete mode 100644 ingestion/src/metadata/utils/test_suite.py diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py index 77718e7bbbee..4c53ab5e01e9 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py @@ -19,7 +19,6 @@ from metadata.profiler.interface.sqlalchemy.snowflake.profiler_interface import ( SnowflakeProfilerInterface, ) -from metadata.utils.logger import profiler_interface_registry_logger class SnowflakeProfiler(SnowflakeProfilerInterface): diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py index 274204006b47..fa92017db4db 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/system.py @@ -288,8 +288,8 @@ def get_inserts( self.get_queries_by_operation( table, [ - DatabaseDMLOperations.INSERT.value, - DatabaseDMLOperations.MERGE.value, + DatabaseDMLOperations.INSERT, + DatabaseDMLOperations.MERGE, ], ) ), @@ -297,6 +297,26 @@ def get_inserts( DmlOperationType.INSERT, ) + def get_updates( + self, database: str, schema: str, table: str + ) -> List[SystemProfile]: + return self.get_system_profile( + database, + schema, + table, + list( + self.get_queries_by_operation( + table, + [ + DatabaseDMLOperations.UPDATE, + DatabaseDMLOperations.MERGE, + ], + ) + ), + "rows_updated", + DmlOperationType.UPDATE, + ) + def get_deletes( self, database: str, schema: str, table: 
str ) -> List[SystemProfile]: @@ -308,7 +328,7 @@ def get_deletes( self.get_queries_by_operation( table, [ - DatabaseDMLOperations.DELETE.value, + DatabaseDMLOperations.DELETE, ], ) ), @@ -344,29 +364,12 @@ def get_system_profile( ] ) - def get_updates( - self, database: str, schema: str, table: str - ) -> List[SystemProfile]: - return self.get_system_profile( - database, - schema, - table, - list( - self.get_queries_by_operation( - table, - [ - DatabaseDMLOperations.UPDATE.value, - DatabaseDMLOperations.MERGE.value, - ], - ) - ), - "rows_updated", - DmlOperationType.UPDATE, - ) - - def get_queries_by_operation(self, table: str, operations: List[str]): + def get_queries_by_operation( + self, table: str, operations: List[DatabaseDMLOperations] + ): + ops = [op.value for op in operations] yield from ( - query for query in self.get_queries(table) if query.query_type in operations + query for query in self.get_queries(table) if query.query_type in ops ) def get_queries(self, table: str) -> List[SnowflakeQueryResult]: diff --git a/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py b/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py index df520fcb74f7..1b3f260ffd4a 100644 --- a/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py @@ -1,6 +1,4 @@ from metadata.ingestion.source.database.vertica.metadata import VerticaSource from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=VerticaSource, profiler_class=None -) +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=VerticaSource) diff --git a/ingestion/src/metadata/utils/lru_cache.py b/ingestion/src/metadata/utils/lru_cache.py index 3ddb3f23a64b..c24db6111a4b 100644 --- a/ingestion/src/metadata/utils/lru_cache.py +++ b/ingestion/src/metadata/utils/lru_cache.py @@ -35,11 +35,32 @@ def clear(self): 
self._cache = OrderedDict() def get(self, key) -> T: + """ + Returns the value associated to `key` if it exists, + updating the cache usage. + Raises `KeyError` if `key doesn't exist in the cache. + + Args: + key: The key to get the value for + + Returns: + The value associated to `key` + """ with self.lock: self._cache.move_to_end(key) return self._cache[key] def put(self, key: str, value: T) -> None: + """ + Assigns `value` to `key`, overwriting `key` if it already exists + in the cache and updating the cache usage. + If the size of the cache grows above capacity, pops the least used + element. + + Args: + key: The key to assign the value to + value: The value to assign to the key + """ with self.lock: self._cache[key] = value self._cache.move_to_end(key) @@ -58,6 +79,33 @@ def __len__(self) -> int: return len(self._cache) def wrap(self, key_func: Callable[..., str]): + """Decorator to cache the result of a function based on its arguments. + + Example: + ```python + import time + from metadata.utils.lru_cache import LRUCache + cache = LRUCache(4096) + + @cache.wrap(lambda x, y: f"{x}-{y}") + def add(x, y): + time.sleep(1) + return x + y + start1 = time.time() + add(1, 2) # This will be cached and take 1 second + print('took', time.time() - start1, 'seconds') + start2 = time.time() + add(1, 2) # This will return the cached value and take no time + print('took', time.time() - start2, 'seconds') + ``` + Args: + key_func: A function that generates a key based on the arguments + of the decorated function. + + Returns: + A decorator that caches the result of the decorated function. 
+ """ + def wrapper(func: Callable[..., T]): def wrapped(*args, **kwargs) -> T: key = key_func(*args, **kwargs) diff --git a/ingestion/src/metadata/utils/test_suite.py b/ingestion/src/metadata/utils/test_suite.py deleted file mode 100644 index fd4e90e1b88b..000000000000 --- a/ingestion/src/metadata/utils/test_suite.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Helper module for test suite functions -""" - -from __future__ import annotations - -from datetime import datetime -from typing import Callable, List, Optional - -from metadata.generated.schema.tests.basic import ( - TestCaseResult, - TestCaseStatus, - TestResultValue, -) -from metadata.generated.schema.tests.testCase import TestCaseParameterValue - - -def get_test_case_param_value( - test_case_param_vals: list[TestCaseParameterValue], - name: str, - type_, - default=None, - pre_processor: Optional[Callable] = None, -): - """Give a column and a type return the value with the appropriate type casting for the - test case definition. 
- - Args: - test_case: the test case - type_ (Union[float, int, str]): type for the value - name (str): column name - default (_type_, optional): Default value to return if column is not found - pre_processor: pre processor function/type to use against the value before casting to type_ - """ - value = next( - (param.value for param in test_case_param_vals if param.name == name), None - ) - - if not value: - return default - - if not pre_processor: - return type_(value) - - pre_processed_value = pre_processor(value) - return type_(pre_processed_value) - - -def build_test_case_result( - execution_datetime: datetime, - status: TestCaseStatus, - result: str, - test_result_value: List[TestResultValue], - sample_data: Optional[str] = None, -) -> TestCaseResult: - """create a test case result object - - Args: - execution_datetime (datetime): execution datetime of the test - status (TestCaseStatus): failed, succeed, aborted - result (str): message to display - testResultValue (List[TestResultValue]): values for the test result - - Returns: - TestCaseResult: - """ - return TestCaseResult( - timestamp=execution_datetime, - testCaseStatus=status, - result=result, - testResultValue=test_result_value, - sampleData=sample_data, - ) From 0c2cbec6217a3c54efae3831b17f40b6590fc63e Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 10:16:51 +0200 Subject: [PATCH 09/18] fixed tests --- ingestion/tests/cli_e2e/test_cli_bigquery.py | 34 ++++++++++++++++++++ ingestion/tests/unit/test_importer.py | 24 ++------------ 2 files changed, 36 insertions(+), 22 deletions(-) diff --git a/ingestion/tests/cli_e2e/test_cli_bigquery.py b/ingestion/tests/cli_e2e/test_cli_bigquery.py index 4396b762f066..ab2545fea0b4 100644 --- a/ingestion/tests/cli_e2e/test_cli_bigquery.py +++ b/ingestion/tests/cli_e2e/test_cli_bigquery.py @@ -12,8 +12,12 @@ """ Test Bigquery connector with CLI """ +from datetime import datetime from typing import List +from _openmetadata_testutils.pydantic.test_utils import 
assert_equal_pydantic_objects +from metadata.generated.schema.entity.data.table import SystemProfile, DmlOperationType +from metadata.generated.schema.type.basic import Timestamp from .common.test_cli_db import CliCommonDB from .common_e2e_sqa_mixins import SQACommonMethods @@ -126,3 +130,33 @@ def update_queries() -> List[str]: UPDATE `open-metadata-beta.exclude_me`.orders SET order_name = 'NINTENDO' WHERE id = 2 """, ] + + def system_profile_assertions(self): + cases = [ + ( + "e2e_redshift.e2e_cli_tests.dbt_jaffle.persons", + [ + SystemProfile( + timestamp=Timestamp(root=0), + operation=DmlOperationType.INSERT, + rowsAffected=6, + ) + ], + ) + ] + for table_fqn, expected_profile in cases: + actual_profiles = self.openmetadata.get_profile_data( + table_fqn, + start_ts=int((datetime.now().timestamp() - 600) * 1000), + end_ts=int(datetime.now().timestamp() * 1000), + profile_type=SystemProfile, + ).entities + actual_profiles = sorted(actual_profiles, key=lambda x: x.timestamp.root) + actual_profiles = actual_profiles[-len(expected_profile) :] + actual_profiles = [ + p.copy(update={"timestamp": Timestamp(root=0)}) for p in actual_profiles + ] + try: + assert_equal_pydantic_objects(expected_profile, actual_profiles) + except AssertionError as e: + raise AssertionError(f"Table: {table_fqn}") from e diff --git a/ingestion/tests/unit/test_importer.py b/ingestion/tests/unit/test_importer.py index 522d8aa5a735..39a08df074e8 100644 --- a/ingestion/tests/unit/test_importer.py +++ b/ingestion/tests/unit/test_importer.py @@ -28,9 +28,9 @@ import_from_module, import_processor_class, import_sink_class, - import_source_class, import_stage_class, ) +from metadata.utils.service_spec.service_spec import import_source_class # pylint: disable=import-outside-toplevel @@ -61,12 +61,6 @@ def test_import_class(self) -> None: ) def test_import_source_class(self) -> None: - from metadata.ingestion.source.database.bigquery.lineage import ( - BigqueryLineageSource, - ) - from 
metadata.ingestion.source.database.bigquery.usage import ( - BigqueryUsageSource, - ) from metadata.ingestion.source.database.mysql.metadata import MysqlSource self.assertEqual( @@ -74,20 +68,6 @@ def test_import_source_class(self) -> None: MysqlSource, ) - self.assertEqual( - import_source_class( - service_type=ServiceType.Database, source_type="bigquery-lineage" - ), - BigqueryLineageSource, - ) - - self.assertEqual( - import_source_class( - service_type=ServiceType.Database, source_type="bigquery-usage" - ), - BigqueryUsageSource, - ) - def test_import_processor_class(self) -> None: from metadata.ingestion.processor.query_parser import QueryParserProcessor @@ -126,7 +106,7 @@ def test_import_get_connection(self) -> None: self.assertIsNotNone(get_connection_fn) self.assertRaises( - DynamicImportException, + AttributeError, import_connection_fn, connection=connection, function_name="random", From a32efc8a7e90874f1304520eeafc30823238a19f Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 10:17:13 +0200 Subject: [PATCH 10/18] format --- ingestion/tests/cli_e2e/test_cli_bigquery.py | 3 ++- ingestion/tests/unit/test_importer.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ingestion/tests/cli_e2e/test_cli_bigquery.py b/ingestion/tests/cli_e2e/test_cli_bigquery.py index ab2545fea0b4..7218a549fbf5 100644 --- a/ingestion/tests/cli_e2e/test_cli_bigquery.py +++ b/ingestion/tests/cli_e2e/test_cli_bigquery.py @@ -16,8 +16,9 @@ from typing import List from _openmetadata_testutils.pydantic.test_utils import assert_equal_pydantic_objects -from metadata.generated.schema.entity.data.table import SystemProfile, DmlOperationType +from metadata.generated.schema.entity.data.table import DmlOperationType, SystemProfile from metadata.generated.schema.type.basic import Timestamp + from .common.test_cli_db import CliCommonDB from .common_e2e_sqa_mixins import SQACommonMethods diff --git a/ingestion/tests/unit/test_importer.py 
b/ingestion/tests/unit/test_importer.py index 39a08df074e8..ce0ccb20b945 100644 --- a/ingestion/tests/unit/test_importer.py +++ b/ingestion/tests/unit/test_importer.py @@ -19,7 +19,6 @@ ) from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.utils.importer import ( - DynamicImportException, get_class_name_root, get_module_name, get_source_module_name, From 0e16f3e6df51c205b9fc2e23aed074a2a917d459 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 11:03:47 +0200 Subject: [PATCH 11/18] bigquery system profile e2e tests --- ingestion/tests/cli_e2e/base/test_cli_db.py | 35 ++++++++++++++++-- ingestion/tests/cli_e2e/test_cli_bigquery.py | 37 ++++++------------- ingestion/tests/cli_e2e/test_cli_redshift.py | 24 ++---------- ingestion/tests/cli_e2e/test_cli_snowflake.py | 23 ++---------- 4 files changed, 50 insertions(+), 69 deletions(-) diff --git a/ingestion/tests/cli_e2e/base/test_cli_db.py b/ingestion/tests/cli_e2e/base/test_cli_db.py index 809e1823190e..ec6aa8f07b40 100644 --- a/ingestion/tests/cli_e2e/base/test_cli_db.py +++ b/ingestion/tests/cli_e2e/base/test_cli_db.py @@ -13,7 +13,8 @@ Test database connectors with CLI """ from abc import abstractmethod -from typing import List, Optional +from datetime import datetime +from typing import List, Optional, Tuple from unittest import TestCase import pytest @@ -21,7 +22,7 @@ from _openmetadata_testutils.pydantic.test_utils import assert_equal_pydantic_objects from metadata.data_quality.api.models import TestCaseDefinition -from metadata.generated.schema.entity.data.table import Table +from metadata.generated.schema.entity.data.table import SystemProfile, Table from metadata.generated.schema.tests.basic import TestCaseResult from metadata.generated.schema.tests.testCase import TestCase as OMTestCase from metadata.ingestion.api.status import Status @@ -419,4 +420,32 @@ def assert_status_for_data_quality(self, source_status, sink_status): pass def 
system_profile_assertions(self): - pass + cases = self.get_system_profile_cases() + if not cases: + return + for table_fqn, expected_profile in cases: + actual_profiles = self.openmetadata.get_profile_data( + table_fqn, + start_ts=int((datetime.now().timestamp() - 600) * 1000), + end_ts=int(datetime.now().timestamp() * 1000), + profile_type=SystemProfile, + ).entities + actual_profiles = sorted( + actual_profiles, key=lambda x: x.timestamp.root + ) + actual_profiles = actual_profiles[-len(expected_profile) :] + assert len(expected_profile) == len(actual_profiles) + for expected, actual in zip(expected_profile, actual_profiles): + try: + assert_equal_pydantic_objects( + expected.model_copy(update={"timestamp": actual.timestamp}), + actual, + ) + except AssertionError as e: + raise AssertionError( + f"System metrics profile did not return exepcted results for table: {table_fqn}" + ) from e + + def get_system_profile_cases(self) -> List[Tuple[str, List[SystemProfile]]]: + """Return a list of tuples with the table fqn and the expected system profile""" + return [] diff --git a/ingestion/tests/cli_e2e/test_cli_bigquery.py b/ingestion/tests/cli_e2e/test_cli_bigquery.py index 7218a549fbf5..77d333902982 100644 --- a/ingestion/tests/cli_e2e/test_cli_bigquery.py +++ b/ingestion/tests/cli_e2e/test_cli_bigquery.py @@ -12,10 +12,8 @@ """ Test Bigquery connector with CLI """ -from datetime import datetime from typing import List -from _openmetadata_testutils.pydantic.test_utils import assert_equal_pydantic_objects from metadata.generated.schema.entity.data.table import DmlOperationType, SystemProfile from metadata.generated.schema.type.basic import Timestamp @@ -38,8 +36,8 @@ class BigqueryCliTest(CliCommonDB.TestSuite, SQACommonMethods): """ insert_data_queries: List[str] = [ - "INSERT INTO `open-metadata-beta.exclude_me`.orders (id, order_name) VALUES (1,'XBOX');", - "INSERT INTO `open-metadata-beta.exclude_me`.orders (id, order_name) VALUES (2,'PS');", + "INSERT INTO 
`open-metadata-beta.exclude_me`.orders (id, order_name) VALUES (1,'XBOX'), (2,'PS');", + "UPDATE `open-metadata-beta.exclude_me`.orders SET order_name = 'NINTENDO' WHERE id = 2", ] drop_table_query: str = """ @@ -132,32 +130,21 @@ def update_queries() -> List[str]: """, ] - def system_profile_assertions(self): - cases = [ + def get_system_profile_cases(self) -> List[Tuple[str, List[SystemProfile]]]: + return [ ( - "e2e_redshift.e2e_cli_tests.dbt_jaffle.persons", + "local_bigquery.open-metadata-beta.exclude_me.orders", [ + SystemProfile( + timestamp=Timestamp(root=0), + operation=DmlOperationType.UPDATE, + rowsAffected=1, + ), SystemProfile( timestamp=Timestamp(root=0), operation=DmlOperationType.INSERT, - rowsAffected=6, - ) + rowsAffected=2, + ), ], ) ] - for table_fqn, expected_profile in cases: - actual_profiles = self.openmetadata.get_profile_data( - table_fqn, - start_ts=int((datetime.now().timestamp() - 600) * 1000), - end_ts=int(datetime.now().timestamp() * 1000), - profile_type=SystemProfile, - ).entities - actual_profiles = sorted(actual_profiles, key=lambda x: x.timestamp.root) - actual_profiles = actual_profiles[-len(expected_profile) :] - actual_profiles = [ - p.copy(update={"timestamp": Timestamp(root=0)}) for p in actual_profiles - ] - try: - assert_equal_pydantic_objects(expected_profile, actual_profiles) - except AssertionError as e: - raise AssertionError(f"Table: {table_fqn}") from e diff --git a/ingestion/tests/cli_e2e/test_cli_redshift.py b/ingestion/tests/cli_e2e/test_cli_redshift.py index 0c96b3c4af85..aa11e10f3102 100644 --- a/ingestion/tests/cli_e2e/test_cli_redshift.py +++ b/ingestion/tests/cli_e2e/test_cli_redshift.py @@ -12,10 +12,8 @@ """ Redshift E2E tests """ -from datetime import datetime -from typing import List +from typing import List, Tuple -from _openmetadata_testutils.pydantic.test_utils import assert_equal_pydantic_objects from metadata.generated.schema.entity.data.table import DmlOperationType, SystemProfile from 
metadata.generated.schema.type.basic import Timestamp from metadata.ingestion.api.status import Status @@ -236,8 +234,8 @@ def update_queries() -> List[str]: """, ] - def system_profile_assertions(self): - cases = [ + def get_system_profile_cases(self) -> List[Tuple[str, List[SystemProfile]]]: + return [ ( "e2e_redshift.e2e_cli_tests.dbt_jaffle.persons", [ @@ -249,19 +247,3 @@ def system_profile_assertions(self): ], ) ] - for table_fqn, expected_profile in cases: - actual_profiles = self.openmetadata.get_profile_data( - table_fqn, - start_ts=int((datetime.now().timestamp() - 600) * 1000), - end_ts=int(datetime.now().timestamp() * 1000), - profile_type=SystemProfile, - ).entities - actual_profiles = sorted(actual_profiles, key=lambda x: x.timestamp.root) - actual_profiles = actual_profiles[-len(expected_profile) :] - actual_profiles = [ - p.copy(update={"timestamp": Timestamp(root=0)}) for p in actual_profiles - ] - try: - assert_equal_pydantic_objects(expected_profile, actual_profiles) - except AssertionError as e: - raise AssertionError(f"Table: {table_fqn}") from e diff --git a/ingestion/tests/cli_e2e/test_cli_snowflake.py b/ingestion/tests/cli_e2e/test_cli_snowflake.py index cb483ab2c4df..7258882878dd 100644 --- a/ingestion/tests/cli_e2e/test_cli_snowflake.py +++ b/ingestion/tests/cli_e2e/test_cli_snowflake.py @@ -14,11 +14,10 @@ """ from datetime import datetime from time import sleep -from typing import List +from typing import List, Tuple import pytest -from _openmetadata_testutils.pydantic.test_utils import assert_equal_pydantic_objects from metadata.generated.schema.entity.data.table import DmlOperationType, SystemProfile from metadata.generated.schema.tests.basic import TestCaseResult, TestCaseStatus from metadata.generated.schema.tests.testCase import TestCaseParameterValue @@ -233,8 +232,8 @@ def update_queries() -> List[str]: """, ] - def system_profile_assertions(self): - cases = [ + def get_system_profile_cases(self) -> List[Tuple[str, 
List[SystemProfile]]]: + return [ ( "e2e_snowflake.E2E_DB.E2E_TEST.E2E_TABLE", [ @@ -286,22 +285,6 @@ def system_profile_assertions(self): ], ), ] - for table_fqn, expected_profile in cases: - actual_profiles = self.openmetadata.get_profile_data( - table_fqn, - start_ts=int((datetime.now().timestamp() - 600) * 1000), - end_ts=int(datetime.now().timestamp() * 1000), - profile_type=SystemProfile, - ).entities - actual_profiles = sorted(actual_profiles, key=lambda x: x.timestamp.root) - actual_profiles = actual_profiles[-len(expected_profile) :] - actual_profiles = [ - p.copy(update={"timestamp": Timestamp(root=0)}) for p in actual_profiles - ] - try: - assert_equal_pydantic_objects(expected_profile, actual_profiles) - except AssertionError as e: - raise AssertionError(f"Table: {table_fqn}\n{e}") @classmethod def wait_for_query_log(cls, timeout=600): From 60f14d18b7ad52f33ad825f334cba3de8d49ebc5 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 11:05:09 +0200 Subject: [PATCH 12/18] fixed module docstring --- .../ingestion/source/database/snowflake/profiler/profiler.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py index 4c53ab5e01e9..be5322c37ab9 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py @@ -10,8 +10,7 @@ # limitations under the License. 
""" -Interfaces with database for all database engine -supporting sqlalchemy abstraction layer +Profiler for Snowflake """ from metadata.ingestion.source.database.snowflake.profiler.system import ( SnowflakeSystemMetricsSource, From c2619a9d96caa861d3677fcc9c7693aa46b22b44 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 12:34:47 +0200 Subject: [PATCH 13/18] - removed import_side_effects from redshift. we still use it in postgres for the orm conversion maps. - removed leftover methods --- .../source/database/redshift/metadata.py | 4 ---- ingestion/src/metadata/utils/importer.py | 17 ----------------- 2 files changed, 21 deletions(-) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/metadata.py b/ingestion/src/metadata/ingestion/source/database/redshift/metadata.py index 92267757dd91..270eae010920 100644 --- a/ingestion/src/metadata/ingestion/source/database/redshift/metadata.py +++ b/ingestion/src/metadata/ingestion/source/database/redshift/metadata.py @@ -96,7 +96,6 @@ ) from metadata.utils.filters import filter_by_database from metadata.utils.helpers import get_start_and_end -from metadata.utils.importer import import_side_effects from metadata.utils.logger import ingestion_logger from metadata.utils.sqlalchemy_utils import ( get_all_table_comments, @@ -106,9 +105,6 @@ logger = ingestion_logger() -import_side_effects( - "metadata.ingestion.source.database.redshift.profiler.system", -) STANDARD_TABLE_TYPES = { "r": TableType.Regular, diff --git a/ingestion/src/metadata/utils/importer.py b/ingestion/src/metadata/utils/importer.py index 4ea145ec2295..5fc557f9c965 100644 --- a/ingestion/src/metadata/utils/importer.py +++ b/ingestion/src/metadata/utils/importer.py @@ -23,11 +23,9 @@ from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( OpenMetadataConnection, ) -from metadata.generated.schema.entity.services.databaseService import DatabaseService from 
metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.generated.schema.metadataIngestion.workflow import Sink as WorkflowSink from metadata.ingestion.api.steps import BulkSink, Processor, Sink, Stage -from metadata.profiler.metrics.system.system import EmptySystemMetricsSource from metadata.utils.class_helper import get_service_type_from_source_type from metadata.utils.client_version import get_client_version from metadata.utils.constants import CUSTOM_CONNECTOR_PREFIX @@ -288,18 +286,3 @@ def import_side_effects(self, *modules): def import_side_effects(*modules): SideEffectsLoader().import_side_effects(*modules) - - -def import_system_metrics_computer(db_service: DatabaseService): - """ - Import the system metrics profile class - """ - try: - return import_from_module( - "metadata.ingestion.source.database.{}.profiler.system.SystemMetricsComputer".format( # pylint: disable=consider-using-f-string - db_service.type - ) - ) - except DynamicImportException as err: - logger.debug("Could not import system metrics computer: %s", err) - return EmptySystemMetricsSource From 71234d27c8f8653884742e5cbf7c1b455392671b Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 12:45:22 +0200 Subject: [PATCH 14/18] - tests for BaseSpec - moved get_class_path to importer --- ingestion/src/metadata/utils/importer.py | 4 ++++ .../src/metadata/utils/service_spec/default.py | 3 ++- .../metadata/utils/service_spec/service_spec.py | 8 ++------ ingestion/tests/unit/utils/test_service_spec.py | 16 ++++++++++++++++ 4 files changed, 24 insertions(+), 7 deletions(-) create mode 100644 ingestion/tests/unit/utils/test_service_spec.py diff --git a/ingestion/src/metadata/utils/importer.py b/ingestion/src/metadata/utils/importer.py index 5fc557f9c965..5686c09fdc15 100644 --- a/ingestion/src/metadata/utils/importer.py +++ b/ingestion/src/metadata/utils/importer.py @@ -286,3 +286,7 @@ def import_side_effects(self, *modules): def import_side_effects(*modules): 
SideEffectsLoader().import_side_effects(*modules) + + +def get_class_path(module): + return module.__module__ + "." + module.__name__ diff --git a/ingestion/src/metadata/utils/service_spec/default.py b/ingestion/src/metadata/utils/service_spec/default.py index ef103f264d9c..92558a1409a4 100644 --- a/ingestion/src/metadata/utils/service_spec/default.py +++ b/ingestion/src/metadata/utils/service_spec/default.py @@ -7,8 +7,9 @@ from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) +from metadata.utils.importer import get_class_path from metadata.utils.service_spec.service_spec import BaseSpec class DefaultDatabaseSpec(BaseSpec): - profiler_class: Optional[str] = SQAProfilerInterface + profiler_class: Optional[str] = get_class_path(SQAProfilerInterface) diff --git a/ingestion/src/metadata/utils/service_spec/service_spec.py b/ingestion/src/metadata/utils/service_spec/service_spec.py index 6b70bfe27d6c..aebc3be60a0f 100644 --- a/ingestion/src/metadata/utils/service_spec/service_spec.py +++ b/ingestion/src/metadata/utils/service_spec/service_spec.py @@ -9,7 +9,7 @@ from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.ingestion.api.steps import Source from metadata.ingestion.models.custom_pydantic import BaseModel -from metadata.utils.importer import get_module_dir, import_from_module +from metadata.utils.importer import get_class_path, get_module_dir, import_from_module class BaseSpec(BaseModel): @@ -34,7 +34,7 @@ class BaseSpec(BaseModel): 4. We can hot-swap the class implementation without changing the manifest (example: for testing). """ - profiler_class: Optional[str] + profiler_class: Optional[str] = None metadata_source_class: str @model_validator(mode="before") @@ -74,10 +74,6 @@ def get_for_source( ) -def get_class_path(module): - return module.__module__ + "." 
+ module.__name__ - - def import_source_class( service_type: ServiceType, source_type: str, from_: str = "ingestion" ) -> Type[Source]: diff --git a/ingestion/tests/unit/utils/test_service_spec.py b/ingestion/tests/unit/utils/test_service_spec.py new file mode 100644 index 000000000000..9c0588e7565c --- /dev/null +++ b/ingestion/tests/unit/utils/test_service_spec.py @@ -0,0 +1,16 @@ +from metadata.ingestion.source.database.mysql.metadata import MysqlSource +from metadata.profiler.interface.sqlalchemy.profiler_interface import ( + SQAProfilerInterface, +) +from metadata.utils.importer import get_class_path +from metadata.utils.service_spec import BaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec + + +def test_service_spec(): + spec = BaseSpec(metadata_source_class=MysqlSource) + assert spec.metadata_source_class == get_class_path(MysqlSource) + + spec = DefaultDatabaseSpec(metadata_source_class=MysqlSource) + assert spec.metadata_source_class == get_class_path(MysqlSource) + assert spec.profiler_class == get_class_path(SQAProfilerInterface) From bd9583c4cbc51a99a1a4996924642aae3a103905 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 13:10:12 +0200 Subject: [PATCH 15/18] - moved constructors around to get rid of useless kwargs --- .../source/database/bigquery/profiler/profiler.py | 7 +++++-- .../source/database/redshift/profiler/profiler.py | 4 +++- .../source/database/snowflake/profiler/profiler.py | 10 ++++------ .../interface/sqlalchemy/profiler_interface.py | 12 +++++++----- .../src/metadata/profiler/metrics/system/system.py | 9 +++------ 5 files changed, 22 insertions(+), 20 deletions(-) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py index cd2f6ebe9510..cceeafe3ea6e 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py +++ 
b/ingestion/src/metadata/ingestion/source/database/bigquery/profiler/profiler.py @@ -12,8 +12,6 @@ class BigQueryProfiler(BigQueryProfilerInterface): - system_metrics_computer_class = BigQuerySystemMetricsComputer - def _compute_system_metrics( self, metrics: Type[System], @@ -24,3 +22,8 @@ def _compute_system_metrics( return self.system_metrics_computer.get_system_metrics( runner.table, self.service_connection_config ) + + def initialize_system_metrics_computer( + self, **kwargs + ) -> BigQuerySystemMetricsComputer: + return BigQuerySystemMetricsComputer(session=self.session) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py index 1b678967d605..b64fcd2a11b7 100644 --- a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py +++ b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/profiler.py @@ -4,7 +4,9 @@ from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) +from metadata.profiler.metrics.system.system import SystemMetricsComputer class RedshiftProfiler(SQAProfilerInterface): - system_metrics_computer_class = RedshiftSystemMetricsComputer + def initialize_system_metrics_computer(self, **kwargs) -> SystemMetricsComputer: + return RedshiftSystemMetricsComputer(session=self.session) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py index be5322c37ab9..e028bb2a229c 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/profiler/profiler.py @@ -21,9 +21,7 @@ class SnowflakeProfiler(SnowflakeProfilerInterface): - """ - Interface to interact with registry supporting - sqlalchemy. 
- """ - - system_metrics_computer_class = SnowflakeSystemMetricsSource + def initialize_system_metrics_computer( + self, **kwargs + ) -> SnowflakeSystemMetricsSource: + return SnowflakeSystemMetricsSource(session=self.session) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index 2cfc3751bf12..b0263741d607 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -76,8 +76,6 @@ class SQAProfilerInterface(ProfilerInterface, SQAInterfaceMixin): # pylint: disable=too-many-arguments - system_metrics_computer_class = SystemMetricsComputer - def __init__( self, service_connection_config, @@ -112,9 +110,13 @@ def __init__( self._table = self._convert_table_to_orm_object(sqa_metadata) self.create_session() - self.system_metrics_computer = self.system_metrics_computer_class( - session=self.session - ) + self.system_metrics_computer = self.initialize_system_metrics_computer() + + def initialize_system_metrics_computer(self) -> SystemMetricsComputer: + """Initialize system metrics computer. Override this if you want to use a metric source with + state or other dependencies. + """ + return SystemMetricsComputer() def create_session(self): self.session_factory = self._session_factory() diff --git a/ingestion/src/metadata/profiler/metrics/system/system.py b/ingestion/src/metadata/profiler/metrics/system/system.py index e2f30d07e5db..48b8ebdebbc6 100644 --- a/ingestion/src/metadata/profiler/metrics/system/system.py +++ b/ingestion/src/metadata/profiler/metrics/system/system.py @@ -63,12 +63,9 @@ class EmptySystemMetricsSource: """Empty system metrics source that can be used as a default. 
Just returns an empty list of system metrics for any resource.""" - def __init__(self, *args, **kwargs): - kwargs.pop("session", None) - if len(args) > 0: - logger.warning("Received unexpected arguments: %s", args) - if len(kwargs) > 0: - logger.warning("Received unexpected keyword arguments: %s", kwargs) + def __init__(self, *_, **__): + """This is a 'collaborative' constructor that takes any number of arguments and keyword arguments and + is required for using it in dependency injection.""" super().__init__() def get_inserts(self, *args, **kwargs) -> List[SystemProfile]: From 5dc797ffbc7720c8617cdff88e43db24afed687a Mon Sep 17 00:00:00 2001 From: sushi30 Date: Tue, 22 Oct 2024 15:28:59 +0200 Subject: [PATCH 16/18] - changed test_system_metric --- ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py b/ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py index 940a01174635..9c31d5ffca56 100644 --- a/ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py +++ b/ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py @@ -34,6 +34,7 @@ ) from metadata.profiler.metrics.core import add_props from metadata.profiler.metrics.registry import Metrics +from metadata.profiler.metrics.system.system import SystemMetricsComputer from metadata.profiler.orm.functions.sum import SumFn from metadata.profiler.processor.core import Profiler @@ -928,11 +929,7 @@ def test_sum_function(self): assert res == 61 def test_system_metric(self): - system = add_props(table=User, ometa_client=None, db_service=None)( - Metrics.SYSTEM.value - ) - session = self.sqa_profiler_interface.session - system().sql(session) + assert SystemMetricsComputer().get_system_metrics() == [] def test_table_custom_metric(self): table_entity = Table( From ead59e977aac69b4daa9e5cedc72d966f560e97d Mon Sep 17 00:00:00 2001 From: sushi30 Date: Wed, 23 Oct 2024 09:35:03 +0200 Subject: 
[PATCH 17/18] - added linage and usage to service_spec - fixed postgres native lineage test --- .../source/database/athena/service_spec.py | 8 ++++++- .../source/database/azuresql/service_spec.py | 8 ++++++- .../source/database/bigquery/service_spec.py | 7 +++++- .../database/clickhouse/service_spec.py | 10 +++++++- .../database/databricks/service_spec.py | 9 ++++++- .../source/database/dbt/service_spec.py | 4 ++-- .../source/database/mssql/service_spec.py | 8 ++++++- .../source/database/oracle/service_spec.py | 8 ++++++- .../source/database/postgres/service_spec.py | 8 ++++++- .../source/database/query/service_spec.py | 9 +++++++ .../source/database/redshift/service_spec.py | 7 +++++- .../database/salesforce/service_spec.py | 4 ++-- .../source/database/saperp/service_spec.py | 4 ++-- .../source/database/saphana/service_spec.py | 5 +++- .../source/database/sas/service_spec.py | 4 ++-- .../source/database/snowflake/service_spec.py | 7 +++++- .../source/database/trino/service_spec.py | 7 +++++- .../database/unitycatalog/service_spec.py | 8 +++++++ .../source/database/vertica/service_spec.py | 8 ++++++- .../utils/service_spec/service_spec.py | 24 ++++++++++++++----- .../integration/postgres/test_lineage.py | 8 ++++++- 21 files changed, 137 insertions(+), 28 deletions(-) create mode 100644 ingestion/src/metadata/ingestion/source/database/query/service_spec.py diff --git a/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py b/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py index f8f34bcfd3d8..3a02ba0f8cb8 100644 --- a/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/athena/service_spec.py @@ -1,4 +1,10 @@ +from metadata.ingestion.source.database.athena.lineage import AthenaLineageSource from metadata.ingestion.source.database.athena.metadata import AthenaSource +from metadata.ingestion.source.database.athena.usage import AthenaUsageSource from 
metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec(metadata_source_class=AthenaSource) +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=AthenaSource, + lineage_source_class=AthenaLineageSource, + usage_source_class=AthenaUsageSource, +) diff --git a/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py index ca0f6c7f9836..14072b8618a8 100644 --- a/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py @@ -1,4 +1,10 @@ +from metadata.ingestion.source.database.azuresql.lineage import AzuresqlLineageSource from metadata.ingestion.source.database.azuresql.metadata import AzuresqlSource +from metadata.ingestion.source.database.azuresql.usage import AzuresqlUsageSource from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec(metadata_source_class=AzuresqlSource) +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=AzuresqlSource, + lineage_source_class=AzuresqlLineageSource, + usage_source_class=AzuresqlUsageSource, +) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py b/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py index 2b8f412b11da..bf97171d0982 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py @@ -1,9 +1,14 @@ +from metadata.ingestion.source.database.bigquery.lineage import BigqueryLineageSource from metadata.ingestion.source.database.bigquery.metadata import BigquerySource from metadata.ingestion.source.database.bigquery.profiler.profiler import ( BigQueryProfiler, ) +from metadata.ingestion.source.database.bigquery.usage import BigqueryUsageSource from metadata.utils.service_spec.default 
import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=BigquerySource, profiler_class=BigQueryProfiler + metadata_source_class=BigquerySource, + lineage_source_class=BigqueryLineageSource, + usage_source_class=BigqueryUsageSource, + profiler_class=BigQueryProfiler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py b/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py index 68e75d8fb319..43d14129bf17 100644 --- a/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/clickhouse/service_spec.py @@ -1,4 +1,12 @@ +from metadata.ingestion.source.database.clickhouse.lineage import ( + ClickhouseLineageSource, +) from metadata.ingestion.source.database.clickhouse.metadata import ClickhouseSource +from metadata.ingestion.source.database.clickhouse.usage import ClickhouseUsageSource from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec(metadata_source_class=ClickhouseSource) +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=ClickhouseSource, + lineage_source_class=ClickhouseLineageSource, + usage_source_class=ClickhouseUsageSource, +) diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py b/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py index 00a9d9e2f79b..3bf1978a8a99 100644 --- a/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py @@ -1,9 +1,16 @@ +from metadata.ingestion.source.database.databricks.lineage import ( + DatabricksLineageSource, +) from metadata.ingestion.source.database.databricks.metadata import DatabricksSource +from metadata.ingestion.source.database.databricks.usage import DatabricksUsageSource from 
metadata.profiler.interface.sqlalchemy.databricks.profiler_interface import ( DatabricksProfilerInterface, ) from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=DatabricksSource, profiler_class=DatabricksProfilerInterface + metadata_source_class=DatabricksSource, + lineage_source_class=DatabricksLineageSource, + usage_source_class=DatabricksUsageSource, + profiler_class=DatabricksProfilerInterface, ) diff --git a/ingestion/src/metadata/ingestion/source/database/dbt/service_spec.py b/ingestion/src/metadata/ingestion/source/database/dbt/service_spec.py index 6126e67432a4..40ae953002b1 100644 --- a/ingestion/src/metadata/ingestion/source/database/dbt/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/dbt/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.dbt.metadata import DbtSource -from metadata.utils.service_spec import BaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = BaseSpec(metadata_source_class=DbtSource) +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=DbtSource) diff --git a/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py index c371f3ae5de4..9d7fa7bf1fa3 100644 --- a/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/mssql/service_spec.py @@ -1,4 +1,10 @@ +from metadata.ingestion.source.database.mssql.lineage import MssqlLineageSource from metadata.ingestion.source.database.mssql.metadata import MssqlSource +from metadata.ingestion.source.database.mssql.usage import MssqlUsageSource from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec(metadata_source_class=MssqlSource) +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=MssqlSource, + 
lineage_source_class=MssqlLineageSource, + usage_source_class=MssqlUsageSource, +) diff --git a/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py b/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py index ffc2bb80aaff..3c89f9162117 100644 --- a/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/oracle/service_spec.py @@ -1,4 +1,10 @@ +from metadata.ingestion.source.database.oracle.lineage import OracleLineageSource from metadata.ingestion.source.database.oracle.metadata import OracleSource +from metadata.ingestion.source.database.oracle.usage import OracleUsageSource from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec(metadata_source_class=OracleSource) +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=OracleSource, + lineage_source_class=OracleLineageSource, + usage_source_class=OracleUsageSource, +) diff --git a/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py b/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py index 3e2744277c02..3bea308b164a 100644 --- a/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/postgres/service_spec.py @@ -1,4 +1,10 @@ +from metadata.ingestion.source.database.postgres.lineage import PostgresLineageSource from metadata.ingestion.source.database.postgres.metadata import PostgresSource +from metadata.ingestion.source.database.postgres.usage import PostgresUsageSource from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec(metadata_source_class=PostgresSource) +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=PostgresSource, + lineage_source_class=PostgresLineageSource, + usage_source_class=PostgresUsageSource, +) diff --git 
a/ingestion/src/metadata/ingestion/source/database/query/service_spec.py b/ingestion/src/metadata/ingestion/source/database/query/service_spec.py new file mode 100644 index 000000000000..643f2d7c398b --- /dev/null +++ b/ingestion/src/metadata/ingestion/source/database/query/service_spec.py @@ -0,0 +1,9 @@ +from metadata.ingestion.source.database.query.lineage import QueryLogLineageSource +from metadata.ingestion.source.database.query.usage import QueryLogUsageSource +from metadata.utils.service_spec import BaseSpec + +ServiceSpec = BaseSpec( + metadata_source_class="not.implemented", + lineage_source_class=QueryLogLineageSource, + usage_source_class=QueryLogUsageSource, +) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py b/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py index 7d1fbd2cafda..6f010e9287ef 100644 --- a/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/redshift/service_spec.py @@ -1,9 +1,14 @@ +from metadata.ingestion.source.database.redshift.lineage import RedshiftLineageSource from metadata.ingestion.source.database.redshift.metadata import RedshiftSource from metadata.ingestion.source.database.redshift.profiler.profiler import ( RedshiftProfiler, ) +from metadata.ingestion.source.database.redshift.usage import RedshiftUsageSource from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=RedshiftSource, profiler_class=RedshiftProfiler + metadata_source_class=RedshiftSource, + lineage_source_class=RedshiftLineageSource, + usage_source_class=RedshiftUsageSource, + profiler_class=RedshiftProfiler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/salesforce/service_spec.py b/ingestion/src/metadata/ingestion/source/database/salesforce/service_spec.py index 28505f1a152f..f0fc26d0b05a 100644 --- 
a/ingestion/src/metadata/ingestion/source/database/salesforce/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/salesforce/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.salesforce.metadata import SalesforceSource -from metadata.utils.service_spec import BaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = BaseSpec(metadata_source_class=SalesforceSource) +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=SalesforceSource) diff --git a/ingestion/src/metadata/ingestion/source/database/saperp/service_spec.py b/ingestion/src/metadata/ingestion/source/database/saperp/service_spec.py index c0e3a0a27457..8d63287440c8 100644 --- a/ingestion/src/metadata/ingestion/source/database/saperp/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/saperp/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.saperp.metadata import SaperpSource -from metadata.utils.service_spec import BaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = BaseSpec(metadata_source_class=SaperpSource) +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=SaperpSource) diff --git a/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py b/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py index 73f116521433..733652ff7419 100644 --- a/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/saphana/service_spec.py @@ -1,4 +1,7 @@ +from metadata.ingestion.source.database.saphana.lineage import SaphanaLineageSource from metadata.ingestion.source.database.saphana.metadata import SaphanaSource from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec(metadata_source_class=SaphanaSource) +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=SaphanaSource, 
lineage_source_class=SaphanaLineageSource +) diff --git a/ingestion/src/metadata/ingestion/source/database/sas/service_spec.py b/ingestion/src/metadata/ingestion/source/database/sas/service_spec.py index 81b49e86e9e3..9c6794d842bc 100644 --- a/ingestion/src/metadata/ingestion/source/database/sas/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/sas/service_spec.py @@ -1,4 +1,4 @@ from metadata.ingestion.source.database.sas.metadata import SasSource -from metadata.utils.service_spec import BaseSpec +from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = BaseSpec(metadata_source_class=SasSource) +ServiceSpec = DefaultDatabaseSpec(metadata_source_class=SasSource) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py index ef4f3d680bff..51ffc62ed29e 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py @@ -1,9 +1,14 @@ +from metadata.ingestion.source.database.snowflake.lineage import SnowflakeLineageSource from metadata.ingestion.source.database.snowflake.metadata import SnowflakeSource from metadata.ingestion.source.database.snowflake.profiler.profiler import ( SnowflakeProfiler, ) +from metadata.ingestion.source.database.snowflake.usage import SnowflakeUsageSource from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=SnowflakeSource, profiler_class=SnowflakeProfiler + metadata_source_class=SnowflakeSource, + lineage_source_class=SnowflakeLineageSource, + usage_source_class=SnowflakeUsageSource, + profiler_class=SnowflakeProfiler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py b/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py index 7909416957de..4242ea01e9b3 100644 
--- a/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py @@ -1,9 +1,14 @@ +from metadata.ingestion.source.database.trino.lineage import TrinoLineageSource from metadata.ingestion.source.database.trino.metadata import TrinoSource +from metadata.ingestion.source.database.trino.usage import TrinoUsageSource from metadata.profiler.interface.sqlalchemy.trino.profiler_interface import ( TrinoProfilerInterface, ) from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=TrinoSource, profiler_class=TrinoProfilerInterface + metadata_source_class=TrinoSource, + lineage_source_class=TrinoLineageSource, + usage_source_class=TrinoUsageSource, + profiler_class=TrinoProfilerInterface, ) diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py index 4b6a5868c7ad..676941465306 100644 --- a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py @@ -1,4 +1,10 @@ +from metadata.ingestion.source.database.unitycatalog.lineage import ( + UnitycatalogLineageSource, +) from metadata.ingestion.source.database.unitycatalog.metadata import UnitycatalogSource +from metadata.ingestion.source.database.unitycatalog.usage import ( + UnitycatalogUsageSource, +) from metadata.profiler.interface.sqlalchemy.unity_catalog.profiler_interface import ( UnityCatalogProfilerInterface, ) @@ -6,5 +12,7 @@ ServiceSpec = DefaultDatabaseSpec( metadata_source_class=UnitycatalogSource, + lineage_source_class=UnitycatalogLineageSource, + usage_source_class=UnitycatalogUsageSource, profiler_class=UnityCatalogProfilerInterface, ) diff --git a/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py 
b/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py index 1b3f260ffd4a..1cda23751a20 100644 --- a/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/vertica/service_spec.py @@ -1,4 +1,10 @@ +from metadata.ingestion.source.database.vertica.lineage import VerticaLineageSource from metadata.ingestion.source.database.vertica.metadata import VerticaSource +from metadata.ingestion.source.database.vertica.usage import VerticaUsageSource from metadata.utils.service_spec.default import DefaultDatabaseSpec -ServiceSpec = DefaultDatabaseSpec(metadata_source_class=VerticaSource) +ServiceSpec = DefaultDatabaseSpec( + metadata_source_class=VerticaSource, + lineage_source_class=VerticaLineageSource, + usage_source_class=VerticaUsageSource, +) diff --git a/ingestion/src/metadata/utils/service_spec/service_spec.py b/ingestion/src/metadata/utils/service_spec/service_spec.py index aebc3be60a0f..a401ba0e8154 100644 --- a/ingestion/src/metadata/utils/service_spec/service_spec.py +++ b/ingestion/src/metadata/utils/service_spec/service_spec.py @@ -9,7 +9,15 @@ from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.ingestion.api.steps import Source from metadata.ingestion.models.custom_pydantic import BaseModel -from metadata.utils.importer import get_class_path, get_module_dir, import_from_module +from metadata.utils.importer import ( + TYPE_SEPARATOR, + get_class_path, + get_module_dir, + import_from_module, +) +from metadata.utils.logger import utils_logger + +logger = utils_logger() class BaseSpec(BaseModel): @@ -36,6 +44,8 @@ class BaseSpec(BaseModel): profiler_class: Optional[str] = None metadata_source_class: str + lineage_source_class: Optional[str] = None + usage_source_class: Optional[str] = None @model_validator(mode="before") @classmethod @@ -77,11 +87,13 @@ def get_for_source( def import_source_class( service_type: ServiceType, source_type: 
str, from_: str = "ingestion" ) -> Type[Source]: + source_class_type = source_type.split(TYPE_SEPARATOR)[-1] + if source_class_type in ["usage", "lineage"]: + field = f"{source_class_type}_source_class" + else: + field = "metadata_source_class" + spec = BaseSpec.get_for_source(service_type, source_type, from_) return cast( Type[Source], - import_from_module( - BaseSpec.get_for_source( - service_type, source_type, from_ - ).metadata_source_class - ), + import_from_module(spec.model_dump()[field]), ) diff --git a/ingestion/tests/integration/postgres/test_lineage.py b/ingestion/tests/integration/postgres/test_lineage.py index b6e6609ca342..fd2c9ca68cc3 100644 --- a/ingestion/tests/integration/postgres/test_lineage.py +++ b/ingestion/tests/integration/postgres/test_lineage.py @@ -5,6 +5,9 @@ import pytest from metadata.generated.schema.entity.data.table import Table +from metadata.generated.schema.metadataIngestion.databaseServiceQueryLineagePipeline import ( + DatabaseLineageConfigType, +) from metadata.ingestion.lineage.sql_lineage import search_cache from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.workflow.metadata import MetadataWorkflow @@ -19,7 +22,9 @@ def native_lineage_config(db_service, workflow_config, sink_config): "source": { "type": "postgres-lineage", "serviceName": db_service.fullyQualifiedName.root, - "sourceConfig": {"config": {}}, + "sourceConfig": { + "config": {"type": DatabaseLineageConfigType.DatabaseLineage.value} + }, }, "sink": sink_config, "workflowConfig": workflow_config, @@ -39,6 +44,7 @@ def native_lineage_config(db_service, workflow_config, sink_config): ), ) def test_native_lineage( + patch_passwords_for_db_services, source_config, expected_nodes, run_workflow, From 6d678cb90818dec7f5f032486c95d58645c3d1b9 Mon Sep 17 00:00:00 2001 From: sushi30 Date: Wed, 23 Oct 2024 17:22:19 +0200 Subject: [PATCH 18/18] add comments on collaborative constructors --- .../ingestion/source/database/redshift/profiler/system.py | 1 
+ ingestion/src/metadata/profiler/metrics/system/system.py | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py index 6069a6ee10f9..148178722a6f 100644 --- a/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py +++ b/ingestion/src/metadata/ingestion/source/database/redshift/profiler/system.py @@ -25,6 +25,7 @@ class RedshiftSystemMetricsSource( SQASessionProvider, EmptySystemMetricsSource, CacheProvider ): def __init__(self, *args, **kwargs): + # collaborative constructor that initializes the SQASessionProvider and CacheProvider super().__init__(*args, **kwargs) def get_inserts( diff --git a/ingestion/src/metadata/profiler/metrics/system/system.py b/ingestion/src/metadata/profiler/metrics/system/system.py index 48b8ebdebbc6..7d07fc0c3202 100644 --- a/ingestion/src/metadata/profiler/metrics/system/system.py +++ b/ingestion/src/metadata/profiler/metrics/system/system.py @@ -63,11 +63,6 @@ class EmptySystemMetricsSource: """Empty system metrics source that can be used as a default. Just returns an empty list of system metrics for any resource.""" - def __init__(self, *_, **__): - """This is a 'collaborative' constructor that takes any number of arguments and keyword arguments and - is required for using it in dependency injection.""" - super().__init__() - def get_inserts(self, *args, **kwargs) -> List[SystemProfile]: """Get insert queries""" return [] @@ -86,6 +81,7 @@ def get_kwargs(self, *args, **kwargs): class SystemMetricsComputer(EmptySystemMetricsSource): def __init__(self, *args, **kwargs): + # collaborative constructor that initializes upstream classes super().__init__(*args, **kwargs) def get_system_metrics(self, *args, **kwargs) -> List[SystemProfile]: