From 26a942511d3e923683fc4f860dbf9b797907e66e Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Sun, 10 Nov 2024 09:37:21 +0100 Subject: [PATCH 01/29] skeleton --- ingestion/src/metadata/pii/models.py | 1 + ingestion/src/metadata/pii/processor.py | 4 +- ingestion/src/metadata/profiler/api/models.py | 78 +----- ingestion/src/metadata/profiler/config.py | 34 +++ .../profiler/interface/profiler_interface.py | 235 +--------------- .../src/metadata/profiler/processor/core.py | 47 +--- .../profiler/processor/sample_data_handler.py | 7 +- .../processor/sampler/sqlalchemy/sampler.py | 2 +- .../profiler/source/base/profiler_source.py | 41 ++- ingestion/src/metadata/sampler/config.py | 180 ++++++++++++ ingestion/src/metadata/sampler/models.py | 101 +++++++ .../src/metadata/sampler/nosql/sampler.py | 77 ++++++ .../src/metadata/sampler/pandas/sampler.py | 170 ++++++++++++ .../metadata/{utils => sampler}/partition.py | 0 ingestion/src/metadata/sampler/processor.py | 53 ++++ .../src/metadata/sampler/sampler_factory.py | 100 +++++++ .../src/metadata/sampler/sampler_interface.py | 146 ++++++++++ .../sampler/sqlalchemy/azuresql/sampler.py | 40 +++ .../sampler/sqlalchemy/bigquery/sampler.py | 97 +++++++ .../metadata/sampler/sqlalchemy/sampler.py | 257 ++++++++++++++++++ .../sampler/sqlalchemy/snowflake/sampler.py | 83 ++++++ .../sampler/sqlalchemy/trino/sampler.py | 47 ++++ ingestion/src/metadata/utils/logger.py | 9 + .../metadata/utils/service_spec/default.py | 2 + .../utils/service_spec/service_spec.py | 16 ++ ingestion/src/metadata/workflow/pii.py | 41 +++ .../databaseServicePIIPipeline.json | 88 ++++++ .../schema/metadataIngestion/workflow.json | 3 + 28 files changed, 1595 insertions(+), 364 deletions(-) create mode 100644 ingestion/src/metadata/profiler/config.py create mode 100644 ingestion/src/metadata/sampler/config.py create mode 100644 ingestion/src/metadata/sampler/models.py create mode 100644 ingestion/src/metadata/sampler/nosql/sampler.py create mode 100644 ingestion/src/metadata/sampler/pandas/sampler.py rename ingestion/src/metadata/{utils => sampler}/partition.py (100%) create mode 100644 ingestion/src/metadata/sampler/processor.py create mode 100644 ingestion/src/metadata/sampler/sampler_factory.py create mode 100644 ingestion/src/metadata/sampler/sampler_interface.py create mode 100644 ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py create mode 100644 ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py create mode 100644 ingestion/src/metadata/sampler/sqlalchemy/sampler.py create mode 100644 ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py create mode 100644 ingestion/src/metadata/sampler/sqlalchemy/trino/sampler.py create mode 100644 ingestion/src/metadata/workflow/pii.py create mode 100644 openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServicePIIPipeline.json diff --git a/ingestion/src/metadata/pii/models.py b/ingestion/src/metadata/pii/models.py index 64dbd8a8bb00..a2b23b6e6a12 100644 --- a/ingestion/src/metadata/pii/models.py +++ b/ingestion/src/metadata/pii/models.py @@ -16,6 +16,7 @@ from pydantic import BaseModel + class TagType(Enum): SENSITIVE = "Sensitive" NONSENSITIVE = "NonSensitive" diff --git a/ingestion/src/metadata/pii/processor.py b/ingestion/src/metadata/pii/processor.py index 95ebb90cbe4c..74dd02872e2b 100644 --- a/ingestion/src/metadata/pii/processor.py +++ b/ingestion/src/metadata/pii/processor.py @@ -40,7 +40,7 @@ from metadata.pii.constants import PII from metadata.pii.scanners.column_name_scanner 
import ColumnNameScanner
 from metadata.pii.scanners.ner_scanner import NERScanner
-from metadata.profiler.api.models import ProfilerResponse
+from metadata.profiler.api.models import PIIResponse, ProfilerResponse
 from metadata.utils.logger import profiler_logger
 
 logger = profiler_logger()
@@ -153,7 +153,7 @@ def process_column(
     def _run(
         self,
         record: ProfilerResponse,
-    ) -> Either[ProfilerResponse]:
+    ) -> Either[PIIResponse]:
         """
         Main entrypoint for the scanner.
 
diff --git a/ingestion/src/metadata/profiler/api/models.py b/ingestion/src/metadata/profiler/api/models.py
index afaf05fb42a1..4b1a577fee59 100644
--- a/ingestion/src/metadata/profiler/api/models.py
+++ b/ingestion/src/metadata/profiler/api/models.py
@@ -18,87 +18,24 @@
 
 from typing import List, Optional, Type, Union
 
-from pydantic import ConfigDict, Field
+from pydantic import ConfigDict
 from sqlalchemy import Column
 from sqlalchemy.orm import DeclarativeMeta
-from typing_extensions import Annotated
 
 from metadata.config.common import ConfigModel
 from metadata.generated.schema.api.data.createTableProfile import (
     CreateTableProfileRequest,
 )
 from metadata.generated.schema.entity.data.table import (
-    ColumnProfilerConfig,
-    PartitionProfilerConfig,
-    ProfileSampleType,
-    SamplingMethodType,
     Table,
-    TableData,
-)
-from metadata.generated.schema.entity.services.connections.connectionBasicType import (
-    SampleDataStorageConfig,
 )
 from metadata.generated.schema.tests.customMetric import CustomMetric
-from metadata.generated.schema.type.basic import FullyQualifiedEntityName
-from metadata.ingestion.models.custom_pydantic import BaseModel
-from metadata.ingestion.models.table_metadata import ColumnTag
 from metadata.profiler.metrics.core import Metric, MetricTypes
 from metadata.profiler.processor.models import ProfilerDef
+# SampleConfig supersedes the old ProfileSampleConfig; keep the old name
+# importable from this module until downstream callers are migrated.
+from metadata.sampler.models import (
+    DatabaseAndSchemaConfig,
+    SampleConfig as ProfileSampleConfig,
+    TableConfig,
+)
 from metadata.utils.sqa_like_column import SQALikeColumn
 
 
-class ColumnConfig(ConfigModel):
-    """Column config for profiler"""
-
-    excludeColumns: Optional[List[str]] = None
-    includeColumns: Optional[List[ColumnProfilerConfig]] = None
-
-
-class BaseProfileConfig(ConfigModel):
-    """base profile config"""
-
-    fullyQualifiedName: FullyQualifiedEntityName
-    profileSample: Optional[Union[float, int]] = None
-    profileSampleType: Optional[ProfileSampleType] = None
-    samplingMethodType: Optional[SamplingMethodType] = None
-    sampleDataCount: Optional[int] = 100
-
-
-class TableConfig(BaseProfileConfig):
-    """table profile config"""
-
-    profileQuery: Optional[str] = None
-    partitionConfig: Optional[PartitionProfilerConfig] = None
-    columnConfig: Optional[ColumnConfig] = None
-
-    @classmethod
-    def from_database_and_schema_config(
-        cls, config: "DatabaseAndSchemaConfig", table_fqn: str
-    ):
-        table_config = TableConfig(
-            fullyQualifiedName=table_fqn,
-            profileSample=config.profileSample,
-            profileSampleType=config.profileSampleType,
-            sampleDataCount=config.sampleDataCount,
-            samplingMethodType=config.samplingMethodType,
-        )
-        return table_config
-
-
-class DatabaseAndSchemaConfig(BaseProfileConfig):
-    """schema profile config"""
-
-    sampleDataStorageConfig: Optional[SampleDataStorageConfig] = None
-
-
-class ProfileSampleConfig(ConfigModel):
-    """Profile Sample Config"""
-
-    profile_sample: Optional[Union[float, int]] = None
-    profile_sample_type: Optional[ProfileSampleType] = ProfileSampleType.PERCENTAGE
-    sampling_method_type: Optional[SamplingMethodType] = None
-
-
 class ProfilerProcessorConfig(ConfigModel):
     """
     Defines how we read the processor
information @@ -111,15 +48,6 @@ class ProfilerProcessorConfig(ConfigModel): databaseConfig: Optional[List[DatabaseAndSchemaConfig]] = [] -class SampleData(BaseModel): - """TableData wrapper to handle ephemeral SampleData""" - - data: Annotated[TableData, Field(None, description="Table Sample Data")] - store: Annotated[ - bool, Field(False, description="Is the sample data should be stored or not") - ] - - class ProfilerResponse(ConfigModel): """ ORM Profiler processor response. @@ -130,8 +58,6 @@ class ProfilerResponse(ConfigModel): table: Table profile: CreateTableProfileRequest - sample_data: Optional[SampleData] = None - column_tags: Optional[List[ColumnTag]] = None def __str__(self): """Return the table name being processed""" diff --git a/ingestion/src/metadata/profiler/config.py b/ingestion/src/metadata/profiler/config.py new file mode 100644 index 000000000000..9edc2af5fc96 --- /dev/null +++ b/ingestion/src/metadata/profiler/config.py @@ -0,0 +1,34 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Profiler configuration helpers +""" +from typing import Optional + +from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema, DatabaseSchemaProfilerConfig + +from metadata.generated.schema.entity.data.database import Database, DatabaseProfilerConfig + + +def get_database_profiler_config( + database_entity: Optional[Database], +) -> Optional[DatabaseProfilerConfig]: + if database_entity and database_entity.databaseProfilerConfig: + return database_entity.databaseProfilerConfig + return None + + +def get_schema_profiler_config( + schema_entity: Optional[DatabaseSchema], +) -> Optional[DatabaseSchemaProfilerConfig]: + if schema_entity and schema_entity.databaseSchemaProfilerConfig: + return schema_entity.databaseSchemaProfilerConfig + return None diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface.py b/ingestion/src/metadata/profiler/interface/profiler_interface.py index 884ceac4c69c..22680cf973f6 100644 --- a/ingestion/src/metadata/profiler/interface/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface.py @@ -17,31 +17,26 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Type, Union +from metadata.profiler.config import get_schema_profiler_config, get_database_profiler_config +from metadata.sampler.sampler_interface import SamplerInterface from sqlalchemy import Column from metadata.generated.schema.entity.data.database import ( Database, - DatabaseProfilerConfig, ) from metadata.generated.schema.entity.data.databaseSchema import ( DatabaseSchema, - DatabaseSchemaProfilerConfig, ) from metadata.generated.schema.entity.data.table import ( PartitionProfilerConfig, SystemProfile, Table, - TableData, -) -from metadata.generated.schema.entity.services.connections.connectionBasicType import ( - DataStorageConfig, ) from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) from 
metadata.generated.schema.entity.services.databaseService import ( DatabaseConnection, - DatabaseService, ) from metadata.generated.schema.entity.services.ingestionPipelines.status import ( StackTraceError, @@ -53,8 +48,6 @@ from metadata.ingestion.api.status import Status from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.profiler.api.models import ( - DatabaseAndSchemaConfig, - ProfilerProcessorConfig, ProfileSampleConfig, TableConfig, ) @@ -62,8 +55,6 @@ from metadata.profiler.metrics.registry import Metrics from metadata.profiler.metrics.system.system import System from metadata.profiler.processor.runner import QueryRunner -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT -from metadata.utils.partition import get_partition_details from metadata.utils.ssl_manager import get_ssl_connection @@ -95,20 +86,17 @@ def __init__( service_connection_config: Union[DatabaseConnection, DatalakeConnection], ometa_client: OpenMetadata, entity: Table, - storage_config: DataStorageConfig, profile_sample_config: Optional[ProfileSampleConfig], source_config: DatabaseServiceProfilerPipeline, sample_query: Optional[str], table_partition_config: Optional[PartitionProfilerConfig], thread_count: int = 5, timeout_seconds: int = 43200, - sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, **kwargs, ): """Required attribute for the interface""" self._thread_count = thread_count self.table_entity = entity - self.storage_config = storage_config self.ometa_client = ometa_client self.source_config = source_config self.service_connection_config = service_connection_config @@ -126,7 +114,6 @@ def __init__( table_partition_config if not self.profile_query else None ) self.timeout_seconds = timeout_seconds - self.sample_data_count = sample_data_count self._get_metric_fn = { MetricTypes.Table.value: self._compute_table_metrics, @@ -137,11 +124,6 @@ def __init__( MetricTypes.Custom.value: self._compute_custom_metrics, } - @abstractmethod - def _get_sampler(self): - """Get the sampler""" - raise NotImplementedError - # pylint: disable=too-many-locals @classmethod def create( @@ -149,11 +131,10 @@ def create( entity: Table, database_schema: DatabaseSchema, database: Database, - database_service: DatabaseService, entity_config: Optional[TableConfig], - profiler_config: Optional[ProfilerProcessorConfig], source_config: DatabaseServiceProfilerPipeline, service_connection_config, + sampler: SamplerInterface, ometa_client: Optional[OpenMetadata], **kwargs, ) -> "ProfilerInterface": @@ -176,234 +157,41 @@ def create( """ thread_count = source_config.threadCount timeout_seconds = source_config.timeoutSeconds - database_profiler_config = cls.get_database_profiler_config( + database_profiler_config = get_database_profiler_config( database_entity=database ) - schema_profiler_config = cls.get_schema_profiler_config( + schema_profiler_config = get_schema_profiler_config( schema_entity=database_schema ) - storage_config = cls.get_storage_config_for_table( - entity=entity, - schema_profiler_config=schema_profiler_config, - database_profiler_config=database_profiler_config, - db_service=database_service, - profiler_config=profiler_config, - ) - if not cls.get_profile_query(entity, entity_config): - profile_sample_config = cls.get_profile_sample_config( + if not sampler.get_profile_query(entity, entity_config): + profile_sample_config = sampler.get_profile_sample_config( entity, schema_profiler_config, database_profiler_config, entity_config, source_config, ) - table_partition_config = 
cls.get_partition_details(entity, entity_config) + table_partition_config = sampler.get_partition_details(entity, entity_config) sample_query = None else: - sample_query = cls.get_profile_query(entity, entity_config) + sample_query = sampler.get_profile_query(entity, entity_config) profile_sample_config = None table_partition_config = None - sample_data_count = cls.get_sample_data_count_config( - entity, - schema_profiler_config, - database_profiler_config, - entity_config, - source_config, - ) return cls( service_connection_config=service_connection_config, ometa_client=ometa_client, entity=entity, - storage_config=storage_config, profile_sample_config=profile_sample_config, source_config=source_config, sample_query=sample_query, table_partition_config=table_partition_config, thread_count=thread_count, timeout_seconds=timeout_seconds, - sample_data_count=sample_data_count, **kwargs, ) - @staticmethod - def get_schema_profiler_config( - schema_entity: Optional[DatabaseSchema], - ) -> DatabaseSchemaProfilerConfig: - if schema_entity and schema_entity.databaseSchemaProfilerConfig: - return schema_entity.databaseSchemaProfilerConfig - return None - - @staticmethod - def get_database_profiler_config( - database_entity: Optional[Database], - ) -> DatabaseProfilerConfig: - if database_entity and database_entity.databaseProfilerConfig: - return database_entity.databaseProfilerConfig - return None - - @staticmethod - def _get_sample_storage_config( - config: Union[ - DatabaseSchemaProfilerConfig, - DatabaseProfilerConfig, - DatabaseAndSchemaConfig, - ], - ) -> Optional[DataStorageConfig]: - if ( - config - and config.sampleDataStorageConfig - and config.sampleDataStorageConfig.config - ): - return config.sampleDataStorageConfig.config - return None - - @staticmethod - def get_storage_config_for_table( - entity: Table, - schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], - database_profiler_config: Optional[DatabaseProfilerConfig], - db_service: Optional[DatabaseService], - profiler_config: ProfilerProcessorConfig, - ) -> Optional[DataStorageConfig]: - """Get config for a specific entity - - Args: - entity: table entity - """ - for schema_config in profiler_config.schemaConfig: - if ( - schema_config.fullyQualifiedName.root - == entity.databaseSchema.fullyQualifiedName - and ProfilerInterface._get_sample_storage_config(schema_config) - ): - return ProfilerInterface._get_sample_storage_config(schema_config) - - for database_config in profiler_config.databaseConfig: - if ( - database_config.fullyQualifiedName.root - == entity.database.fullyQualifiedName - and ProfilerInterface._get_sample_storage_config(database_config) - ): - return ProfilerInterface._get_sample_storage_config(database_config) - - if ProfilerInterface._get_sample_storage_config(schema_profiler_config): - return ProfilerInterface._get_sample_storage_config(schema_profiler_config) - - if ProfilerInterface._get_sample_storage_config(database_profiler_config): - return ProfilerInterface._get_sample_storage_config( - database_profiler_config - ) - - try: - return db_service.connection.config.sampleDataStorageConfig.config - except AttributeError: - pass - - return None - - @staticmethod - def get_profile_sample_config( - entity: Table, - schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], - database_profiler_config: Optional[DatabaseProfilerConfig], - entity_config: Optional[Union[TableConfig, DatabaseAndSchemaConfig]], - source_config: DatabaseServiceProfilerPipeline, - ) -> Optional[ProfileSampleConfig]: - 
"""_summary_ - Args: - entity (Table): table entity object - entity_config (Optional[TableConfig]): table config object from yaml/json file - Returns: - Optional[dict]: dict - """ - for config in ( - entity_config, - entity.tableProfilerConfig, - schema_profiler_config, - database_profiler_config, - source_config, - ): - try: - if config and config.profileSample: - return ProfileSampleConfig( - profile_sample=config.profileSample, - profile_sample_type=config.profileSampleType, - sampling_method_type=config.samplingMethodType, - ) - except AttributeError: - pass - - return None - - @staticmethod - def get_sample_data_count_config( - entity: Table, - schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], - database_profiler_config: Optional[DatabaseProfilerConfig], - entity_config: Optional[TableConfig], - source_config: DatabaseServiceProfilerPipeline, - ) -> Optional[int]: - """_summary_ - Args: - entity_config (Optional[TableConfig]): table config object from yaml/json file - source_config DatabaseServiceProfilerPipeline: profiler pipeline details - Returns: - Optional[int]: int - """ - - for config in ( - entity_config, - entity.tableProfilerConfig, - schema_profiler_config, - database_profiler_config, - ): - if config and config.sampleDataCount: - return config.sampleDataCount - - return source_config.sampleDataCount - - @staticmethod - def get_profile_query( - entity: Table, entity_config: Optional[TableConfig] - ) -> Optional[str]: - """get profile query for sampling - - Args: - entity (Table): table entity object - entity_config (Optional[TableConfig]): entity configuration - - Returns: - Optional[str]: - """ - if entity_config: - return entity_config.profileQuery - - if entity.tableProfilerConfig: - return entity.tableProfilerConfig.profileQuery - - return None - - @staticmethod - def get_partition_details( - entity: Table, - entity_config: Optional[TableConfig] = None, - ) -> Optional[PartitionProfilerConfig]: - """_summary_ - - Args: - entity (Table): table entity object - entity_config (Optional[TableConfig]): entity configuration - - Returns: - Optional[PartitionProfilerConfig]: - """ - if entity_config: - return entity_config.partitionConfig - - return get_partition_details(entity) - @property @abstractmethod def table(self): @@ -495,11 +283,6 @@ def get_hybrid_metrics( """run profiler metrics""" raise NotImplementedError - @abstractmethod - def fetch_sample_data(self, table, columns: List[Column]) -> TableData: - """run profiler metrics""" - raise NotImplementedError - @abstractmethod def close(self): """Clean up profiler interface""" diff --git a/ingestion/src/metadata/profiler/processor/core.py b/ingestion/src/metadata/profiler/processor/core.py index 18236a3eeef3..43bf81875a0c 100644 --- a/ingestion/src/metadata/profiler/processor/core.py +++ b/ingestion/src/metadata/profiler/processor/core.py @@ -40,7 +40,7 @@ CustomMetric as CustomMetricEntity, ) from metadata.generated.schema.type.basic import Timestamp -from metadata.profiler.api.models import ProfilerResponse, SampleData, ThreadPoolMetrics +from metadata.profiler.api.models import ProfilerResponse, ThreadPoolMetrics from metadata.profiler.interface.profiler_interface import ProfilerInterface from metadata.profiler.metrics.core import ( ComposedMetric, @@ -54,9 +54,6 @@ from metadata.profiler.orm.functions.table_metric_computer import CREATE_DATETIME from metadata.profiler.orm.registry import NOT_COMPUTE from metadata.profiler.processor.metric_filter import MetricFilter -from 
metadata.profiler.processor.sample_data_handler import upload_sample_data -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT -from metadata.utils.execution_time_tracker import calculate_execution_time from metadata.utils.logger import profiler_logger logger = profiler_logger() @@ -488,15 +485,7 @@ def process(self) -> ProfilerResponse: ) self.compute_metrics() - # We need the sample data for Sample Data or PII Sensitive processing. - # We'll nullify the Sample Data after the PII processing so that it's not stored. - if ( - self.source_config.generateSampleData - or self.source_config.processPiiSensitive - ): - sample_data = self.generate_sample_data() - else: - sample_data = None + profile = self.get_profile() if self.source_config.computeMetrics: @@ -510,38 +499,6 @@ def process(self) -> ProfilerResponse: return table_profile - @calculate_execution_time(store=False) - def generate_sample_data(self) -> Optional[SampleData]: - """Fetch and ingest sample data - - Returns: - TableData: sample data - """ - try: - logger.debug( - "Fetching sample data for " - f"{self.profiler_interface.table_entity.fullyQualifiedName.root}..." # type: ignore - ) - table_data = self.profiler_interface.fetch_sample_data( - self.table, self.columns - ) - upload_sample_data( - data=table_data, profiler_interface=self.profiler_interface - ) - table_data.rows = table_data.rows[ - : min( - SAMPLE_DATA_DEFAULT_COUNT, self.profiler_interface.sample_data_count - ) - ] - return SampleData( - data=table_data, store=self.source_config.generateSampleData - ) - - except Exception as err: - logger.debug(traceback.format_exc()) - logger.warning(f"Error fetching sample data: {err}") - return None - def get_profile(self) -> CreateTableProfileRequest: """ After executing the profiler, get all results diff --git a/ingestion/src/metadata/profiler/processor/sample_data_handler.py b/ingestion/src/metadata/profiler/processor/sample_data_handler.py index 733c66c8e8a7..7cbbaeec177a 100644 --- a/ingestion/src/metadata/profiler/processor/sample_data_handler.py +++ b/ingestion/src/metadata/profiler/processor/sample_data_handler.py @@ -16,6 +16,7 @@ from datetime import datetime from functools import singledispatch from io import BytesIO +from typing import Optional from metadata.clients.aws_client import AWSClient from metadata.generated.schema.entity.data.table import Table, TableData @@ -24,7 +25,6 @@ ) from metadata.generated.schema.security.credentials.awsCredentials import AWSCredentials from metadata.ingestion.models.custom_pydantic import ignore_type_decoder -from metadata.profiler.interface.profiler_interface import ProfilerInterface from metadata.utils.helpers import clean_uri from metadata.utils.logger import profiler_logger @@ -72,14 +72,13 @@ def _get_object_key( return file_name -def upload_sample_data(data: TableData, profiler_interface: ProfilerInterface) -> None: +def upload_sample_data(data: TableData, entity: Table, sample_storage_config: Optional[DataStorageConfig] = None) -> None: """ Upload Sample data to storage config """ import pandas as pd # pylint: disable=import-outside-toplevel try: - sample_storage_config: DataStorageConfig = profiler_interface.storage_config if not sample_storage_config: return # Ignore any decoding error for byte data @@ -92,7 +91,7 @@ def upload_sample_data(data: TableData, profiler_interface: ProfilerInterface) - pq_buffer = BytesIO() df.to_parquet(pq_buffer) object_key = _get_object_key( - table=profiler_interface.table_entity, + table=entity, 
prefix=sample_storage_config.prefix,
             overwrite_data=sample_storage_config.overwriteData,
             file_path_format=sample_storage_config.filePathPattern,
diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
index 3398d90e4b66..8fe02e9b23e6 100644
--- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
+++ b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py
@@ -132,7 +132,7 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData
 
         Args:
             columns (Optional[List]): List of columns to fetch
-        Retunrs:
+        Returns:
             TableData to be added to the Table Entity
         """
         if self._profile_sample_query:
diff --git a/ingestion/src/metadata/profiler/source/base/profiler_source.py b/ingestion/src/metadata/profiler/source/base/profiler_source.py
index b84aa994f2ae..8d3d7b1df64a 100644
--- a/ingestion/src/metadata/profiler/source/base/profiler_source.py
+++ b/ingestion/src/metadata/profiler/source/base/profiler_source.py
@@ -40,14 +40,18 @@
 )
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
 from metadata.profiler.api.models import ProfilerProcessorConfig, TableConfig
+from metadata.profiler.config import get_schema_profiler_config, get_database_profiler_config
 from metadata.profiler.interface.profiler_interface import ProfilerInterface
 from metadata.profiler.metrics.registry import Metrics
 from metadata.profiler.processor.core import Profiler
 from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics
 from metadata.profiler.source.profiler_source_interface import ProfilerSourceInterface
+from metadata.sampler.config import get_profile_query, get_sample_data_count_config
+from metadata.sampler.models import SampleConfig
+from metadata.sampler.sampler_interface import SamplerInterface
 from metadata.utils.importer import import_from_module
 from metadata.utils.logger import profiler_logger
-from metadata.utils.service_spec.service_spec import BaseSpec
+from metadata.utils.service_spec.service_spec import BaseSpec, import_profiler_class, import_sampler_class
 
 NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,)
 
@@ -189,18 +193,41 @@ def create_profiler_interface(
         db_service: Optional[DatabaseService],
     ) -> ProfilerInterface:
         """Create sqlalchemy profiler interface"""
-        profiler_class = self.import_profiler_class(
+        profiler_class = import_profiler_class(
             ServiceType.Database, source_type=self.profiler_interface_type
         )
+        sampler_class = import_sampler_class(
+            ServiceType.Database, source_type=self.profiler_interface_type
+        )
+        sampler_interface: SamplerInterface = sampler_class.create(
+            table=entity,
+            sample_config=SampleConfig(
+                profile_sample=self.source_config.profileSample,
+                profile_sample_type=self.source_config.profileSampleType,
+                sampling_method_type=self.source_config.samplingMethodType,
+            ),
+            profile_sample_query=get_profile_query(entity=entity, entity_config=config),
+            sample_data_count=get_sample_data_count_config(
+                entity=entity,
+                schema_profiler_config=get_schema_profiler_config(
+                    schema_entity=schema_entity
+                ),
+                database_profiler_config=get_database_profiler_config(
+                    database_entity=database_entity
+                ),
+                entity_config=config,
+                source_config=self.source_config,
+            ),
+        )
         profiler_interface: ProfilerInterface = profiler_class.create(
             entity,
             schema_entity,
             database_entity,
-            db_service,
             config,
-            profiler_config,
             self.source_config,
             self.service_conn_config,
-            self.ometa_client,
+            sampler=sampler_interface,
+            ometa_client=self.ometa_client,
             sqa_metadata=self.sqa_metadata,
         )  # type: ignore
@@ -208,12 +235,6 @@ def create_profiler_interface(
         self.interface = profiler_interface
         return self.interface
 
-    def import_profiler_class(
-        self, service_type: ServiceType, source_type: str
-    ) -> Type[ProfilerInterface]:
-        class_path = BaseSpec.get_for_source(service_type, source_type).profiler_class
-        return cast(Type[ProfilerInterface], import_from_module(class_path))
-
     def _get_context_entities(
         self, entity: Table
     ) -> Tuple[DatabaseSchema, Database, DatabaseService]:
diff --git a/ingestion/src/metadata/sampler/config.py b/ingestion/src/metadata/sampler/config.py
new file mode 100644
index 000000000000..767067adf576
--- /dev/null
+++ b/ingestion/src/metadata/sampler/config.py
@@ -0,0 +1,180 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Sampler configuration helpers
+"""
+from typing import Optional, Union
+
+from metadata.generated.schema.entity.data.database import DatabaseProfilerConfig
+from metadata.generated.schema.entity.data.databaseSchema import (
+    DatabaseSchemaProfilerConfig,
+)
+from metadata.generated.schema.entity.data.table import (
+    PartitionProfilerConfig,
+    Table,
+)
+from metadata.generated.schema.entity.services.connections.connectionBasicType import (
+    DataStorageConfig,
+)
+from metadata.generated.schema.entity.services.databaseService import DatabaseService
+from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
+    DatabaseServiceProfilerPipeline,
+)
+from metadata.profiler.api.models import (
+    DatabaseAndSchemaConfig,
+    ProfilerProcessorConfig,
+    ProfileSampleConfig,
+)
+from metadata.sampler import partition
+from metadata.sampler.models import TableConfig
+
+
+def get_sample_storage_config(
+    config: Union[
+        DatabaseSchemaProfilerConfig,
+        DatabaseProfilerConfig,
+        DatabaseAndSchemaConfig,
+    ],
+) -> Optional[DataStorageConfig]:
+    """Get sample storage config"""
+    if (
+        config
+        and config.sampleDataStorageConfig
+        and config.sampleDataStorageConfig.config
+    ):
+        return config.sampleDataStorageConfig.config
+    return None
+
+
+def get_storage_config_for_table(
+    entity: Table,
+    schema_profiler_config: Optional[DatabaseSchemaProfilerConfig],
+    database_profiler_config: Optional[DatabaseProfilerConfig],
+    db_service: Optional[DatabaseService],
+    profiler_config: ProfilerProcessorConfig,
+) -> Optional[DataStorageConfig]:
+    """Get storage config for a specific entity"""
+    for schema_config in profiler_config.schemaConfig:
+        if (
+            schema_config.fullyQualifiedName.root
+            == entity.databaseSchema.fullyQualifiedName
+            and get_sample_storage_config(schema_config)
+        ):
+            return get_sample_storage_config(schema_config)
+
+    for database_config in profiler_config.databaseConfig:
+        if (
+            database_config.fullyQualifiedName.root
+            == entity.database.fullyQualifiedName
+            and get_sample_storage_config(database_config)
+        ):
+            return get_sample_storage_config(database_config)
+
+    if get_sample_storage_config(schema_profiler_config):
+        return get_sample_storage_config(schema_profiler_config)
+
+    if get_sample_storage_config(database_profiler_config):
+        return get_sample_storage_config(database_profiler_config)
+
+    try:
+        return db_service.connection.config.sampleDataStorageConfig.config
+    except AttributeError:
+        pass
+
+    return None
+
+
+def get_profile_sample_config(
+    entity: Table,
+    schema_profiler_config: Optional[DatabaseSchemaProfilerConfig],
+    database_profiler_config: Optional[DatabaseProfilerConfig],
+    entity_config: Optional[Union[TableConfig, DatabaseAndSchemaConfig]],
+    source_config: DatabaseServiceProfilerPipeline,
+) -> Optional[ProfileSampleConfig]:
+    """Get profile sample config for a specific entity"""
+    for config in (
+        entity_config,
+        entity.tableProfilerConfig,
+        schema_profiler_config,
+        database_profiler_config,
+        source_config,
+    ):
+        try:
+            if config and config.profileSample:
+                return ProfileSampleConfig(
+                    profile_sample=config.profileSample,
+                    profile_sample_type=config.profileSampleType,
+                    sampling_method_type=config.samplingMethodType,
+                )
+        except AttributeError:
+            pass
+
+    return None
+
+
+def get_partition_details(
+    entity: Table,
+    entity_config: Optional[TableConfig] = None,
+) -> Optional[PartitionProfilerConfig]:
+    """Get partition details from the table config, falling back to the entity
+
+    Args:
+        entity (Table): table entity object
+        entity_config (Optional[TableConfig]): entity configuration
+
+    Returns:
+        Optional[PartitionProfilerConfig]:
+    """
+    if entity_config:
+        return entity_config.partitionConfig
+
+    # Delegate to the shared partition helper; using the module-qualified call
+    # avoids recursing into this same-named wrapper.
+    return partition.get_partition_details(entity)
+
+
+def get_profile_query(
+    entity: Table, entity_config: Optional[TableConfig]
+) -> Optional[str]:
+    """get profile query for sampling
+
+    Args:
+        entity (Table): table entity object
+        entity_config (Optional[TableConfig]): entity configuration
+
+    Returns:
+        Optional[str]:
+    """
+    if entity_config:
+        return entity_config.profileQuery
+
+    if entity.tableProfilerConfig:
+        return entity.tableProfilerConfig.profileQuery
+
+    return None
+
+
+def get_sample_data_count_config(
+    entity: Table,
+    schema_profiler_config: Optional[DatabaseSchemaProfilerConfig],
+    database_profiler_config: Optional[DatabaseProfilerConfig],
+    entity_config: Optional[TableConfig],
+    source_config: DatabaseServiceProfilerPipeline,
+) -> Optional[int]:
+    """Get the sample data count, cascading from the most specific (table)
+    to the least specific (pipeline) configuration
+
+    Args:
+        entity_config (Optional[TableConfig]): table config object from yaml/json file
+        source_config (DatabaseServiceProfilerPipeline): profiler pipeline details
+    Returns:
+        Optional[int]: sample data count
+    """
+
+    for config in (
+        entity_config,
+        entity.tableProfilerConfig,
+        schema_profiler_config,
+        database_profiler_config,
+    ):
+        if config and config.sampleDataCount:
+            return config.sampleDataCount
+
+    return source_config.sampleDataCount
diff --git a/ingestion/src/metadata/sampler/models.py b/ingestion/src/metadata/sampler/models.py
new file mode 100644
index 000000000000..333019d5a201
--- /dev/null
+++ b/ingestion/src/metadata/sampler/models.py
@@ -0,0 +1,101 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Sampling Models
+"""
+from typing import List, Optional, Union
+
+from pydantic import Field
+from typing_extensions import Annotated
+
+from metadata.config.common import ConfigModel
+from metadata.generated.schema.entity.data.table import (
+    ColumnProfilerConfig,
+    PartitionProfilerConfig,
+    ProfileSampleType,
+    SamplingMethodType,
+    Table,
+    TableData,
+)
+from metadata.generated.schema.entity.services.connections.connectionBasicType import (
+    SampleDataStorageConfig,
+)
+from metadata.generated.schema.type.basic import FullyQualifiedEntityName
+from metadata.ingestion.models.custom_pydantic import BaseModel
+from metadata.ingestion.models.table_metadata import ColumnTag
+
+
+class BaseProfileConfig(ConfigModel):
+    """base profile config"""
+
+    fullyQualifiedName: FullyQualifiedEntityName
+    profileSample: Optional[Union[float, int]] = None
+    profileSampleType: Optional[ProfileSampleType] = None
+    samplingMethodType: Optional[SamplingMethodType] = None
+    sampleDataCount: Optional[int] = 100
+
+
+class ColumnConfig(ConfigModel):
+    """Column config for profiler"""
+
+    excludeColumns: Optional[List[str]] = None
+    includeColumns: Optional[List[ColumnProfilerConfig]] = None
+
+
+class TableConfig(BaseProfileConfig):
+    """table profile config"""
+
+    profileQuery: Optional[str] = None
+    partitionConfig: Optional[PartitionProfilerConfig] = None
+    columnConfig: Optional[ColumnConfig] = None
+
+    @classmethod
+    def from_database_and_schema_config(
+        cls, config: "DatabaseAndSchemaConfig", table_fqn: str
+    ):
+        table_config = TableConfig(
+            fullyQualifiedName=table_fqn,
+            profileSample=config.profileSample,
+            profileSampleType=config.profileSampleType,
+            sampleDataCount=config.sampleDataCount,
+            samplingMethodType=config.samplingMethodType,
+        )
+        return table_config
+
+
+class DatabaseAndSchemaConfig(BaseProfileConfig):
+    """schema profile config"""
+
+    # TODO: do we even need this now?
+    sampleDataStorageConfig: Optional[SampleDataStorageConfig] = None
+
+
+class SampleData(BaseModel):
+    """TableData wrapper to handle ephemeral SampleData"""
+
+    data: Annotated[TableData, Field(None, description="Table Sample Data")]
+    store: Annotated[
+        bool, Field(False, description="Whether the sample data should be stored or not")
+    ]
+
+
+class SamplerResponse(ConfigModel):
+    """PII & Sampler Workflow Response.
For a given table, return all the tags and sample data"""
+
+    table: Table
+    sample_data: Optional[SampleData] = None
+    column_tags: Optional[List[ColumnTag]] = None
+
+    def __str__(self):
+        """Return the table name being processed"""
+        return f"Table [{self.table.name.root}]"
+
+
+class SampleConfig(ConfigModel):
+    """Profile Sample Config"""
+
+    profile_sample: Optional[Union[float, int]] = None
+    profile_sample_type: Optional[ProfileSampleType] = ProfileSampleType.PERCENTAGE
+    sampling_method_type: Optional[SamplingMethodType] = None
diff --git a/ingestion/src/metadata/sampler/nosql/sampler.py b/ingestion/src/metadata/sampler/nosql/sampler.py
new file mode 100644
index 000000000000..381570450081
--- /dev/null
+++ b/ingestion/src/metadata/sampler/nosql/sampler.py
@@ -0,0 +1,77 @@
+from typing import Any, Dict, List, Optional, Tuple
+
+from metadata.generated.schema.entity.data.table import ProfileSampleType, TableData
+from metadata.profiler.adaptors.nosql_adaptor import NoSQLAdaptor
+from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface
+from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT
+from metadata.utils.sqa_like_column import SQALikeColumn
+
+
+class NoSQLSampler(SamplerInterface):
+    client: NoSQLAdaptor
+
+    def _rdn_sample_from_user_query(self) -> List[Dict[str, Any]]:
+        """
+        Get random sample from user query
+        """
+        limit = self._get_limit()
+        return self.client.query(
+            self.table, self.table.columns, self._profile_sample_query, limit
+        )
+
+    def _fetch_sample_data_from_user_query(self) -> TableData:
+        """
+        Fetch sample data based on a user query, assuming the engine supports one
+        (example: MongoDB). If the engine does not support a custom query, an error
+        will be raised.
+        """
+        records = self._rdn_sample_from_user_query()
+        columns = [
+            SQALikeColumn(name=column.name.root, type=column.dataType)
+            for column in self.table.columns
+        ]
+        rows, cols = self.transpose_records(records, columns)
+        return TableData(
+            rows=[list(map(str, row)) for row in rows], columns=[c.name for c in cols]
+        )
+
+    def random_sample(self):
+        pass
+
+    def fetch_sample_data(self, columns: List[SQALikeColumn]) -> TableData:
+        if self._profile_sample_query:
+            return self._fetch_sample_data_from_user_query()
+        return self._fetch_sample_data(columns)
+
+    def _fetch_sample_data(self, columns: List[SQALikeColumn]) -> TableData:
+        """
+        returns sampled ometa dataframes
+        """
+        limit = self._get_limit()
+        records = self.client.scan(self.table, self.table.columns, limit)
+        rows, cols = self.transpose_records(records, columns)
+        return TableData(
+            rows=[list(map(str, row)) for row in rows],
+            columns=[col.name for col in cols],
+        )
+
+    def _get_limit(self) -> Optional[int]:
+        num_rows = self.client.item_count(self.table)
+        if self.profile_sample_type == ProfileSampleType.PERCENTAGE:
+            limit = num_rows * (self.profile_sample / 100)
+        elif self.profile_sample_type == ProfileSampleType.ROWS:
+            limit = self.profile_sample
+        else:
+            limit = SAMPLE_DATA_DEFAULT_COUNT
+        return limit
+
+    @staticmethod
+    def transpose_records(
+        records: List[Dict[str, Any]], columns: List[SQALikeColumn]
+    ) -> Tuple[List[List[Any]], List[SQALikeColumn]]:
+        rows = []
+        for record in records:
+            row = []
+            for column in columns:
+                row.append(record.get(column.name))
+            rows.append(row)
+        return rows, columns
diff --git a/ingestion/src/metadata/sampler/pandas/sampler.py b/ingestion/src/metadata/sampler/pandas/sampler.py
new file mode 100644
index 000000000000..572796dc71ff
--- /dev/null
+++
b/ingestion/src/metadata/sampler/pandas/sampler.py
@@ -0,0 +1,170 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Helper module to handle data sampling
+for the profiler
+"""
+import math
+import random
+from typing import List, Optional, cast
+
+from metadata.data_quality.validations.table.pandas.tableRowInsertedCountToBeBetween import (
+    TableRowInsertedCountToBeBetweenValidator,
+)
+from metadata.generated.schema.entity.data.table import (
+    PartitionIntervalTypes,
+    PartitionProfilerConfig,
+    ProfileSampleType,
+    TableData,
+)
+from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface
+from metadata.utils.sqa_like_column import SQALikeColumn
+
+
+class DatalakeSampler(SamplerInterface):
+    """
+    Generates a sample of the data to not
+    run the query in the whole table.
+    """
+
+    def _partitioned_table(self):
+        """Get partitioned table"""
+        self._partition_details = cast(PartitionProfilerConfig, self._partition_details)
+        partition_field = self._partition_details.partitionColumnName
+        if (
+            self._partition_details.partitionIntervalType
+            == PartitionIntervalTypes.COLUMN_VALUE
+        ):
+            return [
+                df[df[partition_field].isin(self._partition_details.partitionValues)]
+                for df in self.table
+            ]
+        if (
+            self._partition_details.partitionIntervalType
+            == PartitionIntervalTypes.INTEGER_RANGE
+        ):
+            return [
+                df[
+                    df[partition_field].between(
+                        self._partition_details.partitionIntegerRangeStart,
+                        self._partition_details.partitionIntegerRangeEnd,
+                    )
+                ]
+                for df in self.table
+            ]
+        return [
+            df[
+                df[partition_field]
+                >= TableRowInsertedCountToBeBetweenValidator._get_threshold_date(  # pylint: disable=protected-access
+                    self._partition_details.partitionIntervalUnit.value,
+                    self._partition_details.partitionInterval,
+                )
+            ]
+            for df in self.table
+        ]
+
+    def _fetch_sample_data_from_user_query(self) -> TableData:
+        """Fetch sample data from user query"""
+        cols, rows = self.get_col_row(data_frame=self._rdn_sample_from_user_query())
+        return TableData(columns=cols, rows=rows)
+
+    def _rdn_sample_from_user_query(self):
+        """Generate sample from user query"""
+        return [df.query(self._profile_sample_query) for df in self.table]
+
+    def _get_sampled_dataframe(self):
+        """
+        Returns sampled ometa dataframes
+        """
+        random.shuffle(self.table)  # we'll shuffle the list of dataframes
+        # sampling data based on profiler config (if any)
+        if self.profile_sample_type == ProfileSampleType.PERCENTAGE:
+            try:
+                profile_sample = self.profile_sample / 100
+            except TypeError:
+                # if the profile sample is not a number or is None,
+                # default to sampling the whole table (a fraction of 1 = 100%)
+                self.profile_sample = 100
+                profile_sample = 1
+            return [
+                df.sample(
+                    frac=profile_sample,
+                    random_state=random.randint(0, 100),
+                    replace=True,
+                )
+                for df in self.table
+            ]
+
+        # we'll distribute the sample size equally among the dataframes
+        sample_rows_per_chunk: int = math.floor(self.profile_sample / len(self.table))
+        num_rows = sum(len(df) for df in self.table)
+
+        # if we have fewer rows than the sample size
+ # we'll return the whole table + if sample_rows_per_chunk > num_rows: + return self.table + return [ + df.sample( + n=sample_rows_per_chunk, + random_state=random.randint(0, 100), + replace=True, + ) + for df in self.table + ] + + def get_col_row(self, data_frame, columns: Optional[List[SQALikeColumn]] = None): + """ + Fetches columns and rows from the data_frame + """ + if columns: + cols = [col.name for col in columns] + else: + # we'll use the first dataframe to get the columns + cols = data_frame[0].columns.tolist() + rows = [] + # Sample Data should not exceed sample limit + for chunk in data_frame: + rows.extend(self._fetch_rows(chunk[cols])[: self.sample_limit]) + if len(rows) >= self.sample_limit: + break + return cols, rows + + def random_sample(self, is_sampled: bool = False): + """Generate random sample from the table + + Returns: + List[DataFrame] + """ + if self._profile_sample_query: + return self._rdn_sample_from_user_query() + + if self._partition_details: + self.table = self._partitioned_table() + + if not self.profile_sample or is_sampled: + return self.table + return self._get_sampled_dataframe() + + def _fetch_rows(self, data_frame): + return data_frame.dropna().values.tolist() + + def fetch_sample_data( + self, columns: Optional[List[SQALikeColumn]] = None + ) -> TableData: + """Fetch sample data from the table + + Returns: + TableData: + """ + if self._profile_sample_query: + return self._fetch_sample_data_from_user_query() + + cols, rows = self.get_col_row(data_frame=self.table, columns=columns) + return TableData(columns=cols, rows=rows) diff --git a/ingestion/src/metadata/utils/partition.py b/ingestion/src/metadata/sampler/partition.py similarity index 100% rename from ingestion/src/metadata/utils/partition.py rename to ingestion/src/metadata/sampler/partition.py diff --git a/ingestion/src/metadata/sampler/processor.py b/ingestion/src/metadata/sampler/processor.py new file mode 100644 index 000000000000..774f7e6ceecf --- /dev/null +++ b/ingestion/src/metadata/sampler/processor.py @@ -0,0 +1,53 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Data Sampler for the PII Workflow +""" +from typing import Optional + +from metadata.ingestion.api.steps import Processor +from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.profiler.processor.core import Profiler +from metadata.profiler.source.metadata import ProfilerSourceAndEntity +from metadata.sampler.models import SamplerResponse + + +class SamplerProcessor(Processor): + """Use the profiler interface to fetch the sample data""" + + def _run(self, record: ProfilerSourceAndEntity) -> SamplerResponse: + """Fetch the sample data and pass it down the pipeline""" + profiler_runner: Profiler = record.profiler_source.get_profiler_runner( + record.entity, self.profiler_config + ) + + # We need the sample data for Sample Data or PII Sensitive processing. + # We'll nullify the Sample Data after the PII processing so that it's not stored. 
+        if (
+            self.source_config.generateSampleData
+            or self.source_config.processPiiSensitive
+        ):
+            sample_data = self.generate_sample_data()
+        else:
+            sample_data = None
+
+        # Skeleton: wrap whatever we sampled for the downstream PII processor
+        return SamplerResponse(
+            table=record.entity,
+            sample_data=sample_data,
+        )
+
+    @classmethod
+    def create(
+        cls,
+        config_dict: dict,
+        metadata: OpenMetadata,
+        pipeline_name: Optional[str] = None,
+    ) -> "Step":
+        pass
+
+    def close(self) -> None:
+        """Nothing to close"""
diff --git a/ingestion/src/metadata/sampler/sampler_factory.py b/ingestion/src/metadata/sampler/sampler_factory.py
new file mode 100644
index 000000000000..2a5e52e7211e
--- /dev/null
+++ b/ingestion/src/metadata/sampler/sampler_factory.py
@@ -0,0 +1,100 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Factory class for creating sampler objects
+"""
+
+from typing import Union
+
+from metadata.generated.schema.entity.services.connections.database.azureSQLConnection import (
+    AzureSQLConnection,
+)
+from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import (
+    BigQueryConnection,
+)
+from metadata.generated.schema.entity.services.connections.database.datalakeConnection import (
+    DatalakeConnection,
+)
+from metadata.generated.schema.entity.services.connections.database.dynamoDBConnection import (
+    DynamoDBConnection,
+)
+from metadata.generated.schema.entity.services.connections.database.mongoDBConnection import (
+    MongoDBConnection,
+)
+from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import (
+    SnowflakeConnection,
+)
+from metadata.generated.schema.entity.services.connections.database.trinoConnection import (
+    TrinoConnection,
+)
+from metadata.generated.schema.entity.services.databaseService import DatabaseConnection
+from metadata.profiler.processor.sampler.nosql.sampler import NoSQLSampler
+from metadata.profiler.processor.sampler.pandas.sampler import DatalakeSampler
+from metadata.profiler.processor.sampler.sqlalchemy.azuresql.sampler import (
+    AzureSQLSampler,
+)
+from metadata.profiler.processor.sampler.sqlalchemy.bigquery.sampler import (
+    BigQuerySampler,
+)
+from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler
+from metadata.profiler.processor.sampler.sqlalchemy.snowflake.sampler import (
+    SnowflakeSampler,
+)
+from metadata.profiler.processor.sampler.sqlalchemy.trino.sampler import TrinoSampler
+
+
+class SamplerFactory:
+    """Creational factory for sampler objects"""
+
+    def __init__(self):
+        self._sampler_type = {}
+
+    def register(self, source_type: str, sampler_class):
+        """Register a new source type"""
+        self._sampler_type[source_type] = sampler_class
+
+    def create(
+        self, source_type: str, *args, **kwargs
+    ) -> Union[SQASampler, DatalakeSampler]:
+        """Create source object based on source type"""
+        sampler_class = self._sampler_type.get(source_type)
+        if not sampler_class:
+            # Fall back to the generic SQLAlchemy sampler for services
+            # without a dedicated implementation
+            sampler_class = self._sampler_type[DatabaseConnection.__name__]
+        return sampler_class(*args, **kwargs)
+
+
+sampler_factory_ = SamplerFactory()
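+# The registry below is keyed by the connection type's class name; create()
+# falls back to the generic SQASampler for any type not registered here.
+# Illustrative lookup, assuming a client and table entity already exist:
+#   sampler = sampler_factory_.create(SnowflakeConnection.__name__, client, table)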
+sampler_factory_.register(
+    source_type=DatabaseConnection.__name__, sampler_class=SQASampler
+)
+sampler_factory_.register(
+    source_type=BigQueryConnection.__name__, sampler_class=BigQuerySampler
+)
+sampler_factory_.register(
+    source_type=DatalakeConnection.__name__, sampler_class=DatalakeSampler
+)
+sampler_factory_.register(
+    source_type=TrinoConnection.__name__, sampler_class=TrinoSampler
+)
+sampler_factory_.register(
+    source_type=MongoDBConnection.__name__, sampler_class=NoSQLSampler
+)
+sampler_factory_.register(
+    source_type=SnowflakeConnection.__name__, sampler_class=SnowflakeSampler
+)
+sampler_factory_.register(
+    source_type=DynamoDBConnection.__name__, sampler_class=NoSQLSampler
+)
+sampler_factory_.register(
+    source_type=AzureSQLConnection.__name__, sampler_class=AzureSQLSampler
+)
diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py
new file mode 100644
index 000000000000..46c37df2e88d
--- /dev/null
+++ b/ingestion/src/metadata/sampler/sampler_interface.py
@@ -0,0 +1,146 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Interface for sampler
+"""
+import traceback
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Union
+
+from sqlalchemy import Column
+
+from metadata.generated.schema.entity.data.database import DatabaseProfilerConfig
+from metadata.generated.schema.entity.data.databaseSchema import (
+    DatabaseSchemaProfilerConfig,
+)
+from metadata.generated.schema.entity.data.table import (
+    PartitionProfilerConfig,
+    Table,
+    TableData,
+)
+from metadata.generated.schema.entity.services.connections.connectionBasicType import (
+    DataStorageConfig,
+)
+from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import (
+    DatabaseServiceProfilerPipeline,
+)
+from metadata.profiler.api.models import TableConfig
+from metadata.profiler.processor.sample_data_handler import upload_sample_data
+from metadata.sampler.models import SampleConfig
+from metadata.sampler.partition import get_partition_details
+from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT
+from metadata.utils.execution_time_tracker import calculate_execution_time
+from metadata.utils.logger import sampler_logger
+from metadata.utils.sqa_like_column import SQALikeColumn
+
+logger = sampler_logger()
+
+
+class SamplerInterface(ABC):
+    """Sampler interface"""
+
+    def __init__(
+        self,
+        client,
+        table: Table,
+        sample_config: Optional[SampleConfig] = None,
+        partition_details: Optional[Dict] = None,
+        profile_sample_query: Optional[str] = None,
+        storage_config: Optional[DataStorageConfig] = None,
+        sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT,
+    ):
+        self.profile_sample = None
+        self.profile_sample_type = None
+        if sample_config:
+            self.profile_sample = sample_config.profile_sample
+            self.profile_sample_type = sample_config.profile_sample_type
+        self.client = client
+        self.table = table
+        self._profile_sample_query = profile_sample_query
+        self.sample_limit = sample_data_count
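+        # Upper bound applied when trimming the fetched rows in generate_sample_data()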
+        self._sample_rows = None
+        self._partition_details = partition_details
+        self.storage_config = storage_config
+
+    @classmethod
+    def create(
+        cls,
+        client,
+        table: Table,
+        sample_config: Optional[SampleConfig] = None,
+        partition_details: Optional[Dict] = None,
+        profile_sample_query: Optional[str] = None,
+        storage_config: Optional[DataStorageConfig] = None,
+        sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT,
+    ) -> "SamplerInterface":
+        """Create sampler"""
+
+        return cls(
+            client=client,
+            table=table,
+            sample_config=sample_config,
+            partition_details=partition_details,
+            profile_sample_query=profile_sample_query,
+            storage_config=storage_config,
+            sample_data_count=sample_data_count,
+        )
+
+    @abstractmethod
+    def _rdn_sample_from_user_query(self):
+        """Get random sample from user query"""
+        raise NotImplementedError
+
+    @abstractmethod
+    def _fetch_sample_data_from_user_query(self) -> TableData:
+        """Fetch sample data from user query"""
+        raise NotImplementedError
+
+    @abstractmethod
+    def random_sample(self):
+        """Get random sample"""
+        raise NotImplementedError
+
+    @abstractmethod
+    def fetch_sample_data(
+        self, columns: Optional[Union[List[Column], List[SQALikeColumn]]]
+    ) -> TableData:
+        """Fetch sample data
+
+        Args:
+            columns (Optional[List]): List of columns to fetch
+        """
+        raise NotImplementedError
+
+    @calculate_execution_time(store=False)
+    def generate_sample_data(self) -> Optional[TableData]:
+        """Fetch and ingest sample data
+
+        Returns:
+            TableData: sample data
+        """
+        try:
+            logger.debug(
+                "Fetching sample data for "
+                f"{self.table.fullyQualifiedName.root}..."  # type: ignore
+            )
+            # TODO: GET COLUMNS? Passing None samples every column for now;
+            # implementations can narrow this once column wiring lands.
+            table_data = self.fetch_sample_data(columns=None)
+            upload_sample_data(
+                data=table_data,
+                entity=self.table,
+                sample_storage_config=self.storage_config,
+            )
+            table_data.rows = table_data.rows[
+                : min(SAMPLE_DATA_DEFAULT_COUNT, self.sample_limit)
+            ]
+            return table_data
+
+        except Exception as err:
+            logger.debug(traceback.format_exc())
+            logger.warning(f"Error fetching sample data: {err}")
+            return None
diff --git a/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py
new file mode 100644
index 000000000000..fb48f80e6623
--- /dev/null
+++ b/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py
@@ -0,0 +1,40 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Helper module to handle data sampling
+for the profiler
+"""
+from typing import List, Optional
+
+from sqlalchemy import Column
+
+from metadata.generated.schema.entity.data.table import TableData
+from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler
+
+
+class AzureSQLSampler(SQASampler):
+    """
+    Generates a sample of the data to not
+    run the query in the whole table.
+    """
+
+    # These types are not supported by pyodbc - it throws
+    # an error when trying to fetch data from these columns
+    # pyodbc.ProgrammingError: ('ODBC SQL type -151 is not yet supported. column-index=x type=-151', 'HY106')
+    NOT_COMPUTE_PYODBC = {"SQASGeography", "UndeterminedType"}
+
+    def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData:
+        sqa_columns = []
+        if columns:
+            for col in columns:
+                if col.type.__class__.__name__ not in self.NOT_COMPUTE_PYODBC:
+                    sqa_columns.append(col)
+        return super().fetch_sample_data(sqa_columns or columns)
diff --git a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py
new file mode 100644
index 000000000000..d3174ecd48aa
--- /dev/null
+++ b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py
@@ -0,0 +1,97 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Helper module to handle data sampling
+for the profiler
+"""
+from typing import Dict, Optional
+
+from sqlalchemy import Column
+from sqlalchemy.orm import Query
+
+from metadata.generated.schema.entity.data.table import ProfileSampleType, TableType
+from metadata.profiler.api.models import ProfileSampleConfig
+from metadata.profiler.processor.handle_partition import partition_filter_handler
+from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler
+from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT
+
+
+class BigQuerySampler(SQASampler):
+    """
+    Generates a sample of the data to not
+    run the query in the whole table.
+    """
+
+    # pylint: disable=too-many-arguments
+    def __init__(
+        self,
+        client,
+        table,
+        profile_sample_config: Optional[ProfileSampleConfig] = None,
+        partition_details: Optional[Dict] = None,
+        profile_sample_query: Optional[str] = None,
+        sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT,
+        table_type: Optional[TableType] = None,
+    ):
+        super().__init__(
+            client,
+            table,
+            profile_sample_config,
+            partition_details,
+            profile_sample_query,
+            sample_data_count,
+        )
+        self.table_type: Optional[TableType] = table_type
+
+    def _base_sample_query(self, column: Optional[Column], label=None):
+        """Base query for sampling
+
+        Args:
+            column (Optional[Column]): if computing a column metric only sample for the column
+            label (_type_, optional):
+
+        Returns:
+        """
+        # pylint: disable=import-outside-toplevel
+        from sqlalchemy_bigquery import STRUCT
+
+        if column is not None:
+            column_parts = column.name.split(".")
+            if len(column_parts) > 1:
+                # for struct columns (e.g. `foo.bar`) we need to create a new column corresponding to
+                # the struct (e.g. `foo`) and then use that in the sample query as the column that
+                # will be queried is `foo.bar`.
+                # e.g.
WITH sample AS (SELECT `foo` FROM table) SELECT `foo.bar`
+                # FROM sample TABLESAMPLE SYSTEM (n PERCENT)
+                column = Column(column_parts[0], STRUCT)
+                # pylint: disable=protected-access
+                column._set_parent(self.table.__table__)
+                # pylint: enable=protected-access
+
+        return super()._base_sample_query(column, label=label)
+
+    @partition_filter_handler(build_sample=True)
+    def get_sample_query(self, *, column=None) -> Query:
+        """get query for sample data"""
+        # TABLESAMPLE SYSTEM is not supported for views
+        if (
+            self.profile_sample_type == ProfileSampleType.PERCENTAGE
+            and self.table_type != TableType.View
+        ):
+            return (
+                self._base_sample_query(column)
+                .suffix_with(
+                    f"TABLESAMPLE SYSTEM ({self.profile_sample or 100} PERCENT)",
+                )
+                .cte(f"{self.table.__tablename__}_sample")
+            )
+
+        return super().get_sample_query(column=column)
diff --git a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
new file mode 100644
index 000000000000..3398d90e4b66
--- /dev/null
+++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py
@@ -0,0 +1,257 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Helper module to handle data sampling
+for the profiler
+"""
+import traceback
+from typing import List, Optional, Union, cast
+
+from sqlalchemy import Column, inspect, text
+from sqlalchemy.orm import DeclarativeMeta, Query, aliased
+from sqlalchemy.orm.util import AliasedClass
+from sqlalchemy.sql.sqltypes import Enum
+
+from metadata.generated.schema.entity.data.table import (
+    PartitionIntervalTypes,
+    PartitionProfilerConfig,
+    ProfileSampleType,
+    TableData,
+)
+from metadata.profiler.orm.functions.modulo import ModuloFn
+from metadata.profiler.orm.functions.random_num import RandomNumFn
+from metadata.profiler.processor.handle_partition import partition_filter_handler
+from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface
+from metadata.utils.helpers import is_safe_sql_query
+from metadata.utils.logger import profiler_interface_registry_logger
+from metadata.utils.sqa_utils import (
+    build_query_filter,
+    dispatch_to_date_or_datetime,
+    get_integer_range_filter,
+    get_partition_col_type,
+    get_value_filter,
+)
+
+logger = profiler_interface_registry_logger()
+
+RANDOM_LABEL = "random"
+
+
+def _object_value_for_elem(self, elem):
+    """
+    We have mapped DataType.ENUM to sqlalchemy.Enum. If the lookup defaulted
+    to None we would always get None back, because there is no enum map to
+    look up. So what we do here is trust the database: it stores the correct
+    map key, and we display that key directly on the UI. With this approach
+    we can only show what the database has stored (i.e. the key), not the
+    actual value behind it.
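+    Falling back to the raw element also avoids the LookupError that
+    SQLAlchemy's default implementation raises for unmapped enum values.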
+    """
+    return self._object_lookup.get(elem, elem)  # pylint: disable=protected-access
+
+
+Enum._object_value_for_elem = _object_value_for_elem  # pylint: disable=protected-access
+
+
+class SQASampler(SamplerInterface):
+    """
+    Generates a sample of the data to not
+    run the query in the whole table.
+    """
+
+    def _base_sample_query(self, column: Optional[Column], label=None):
+        """Base query for sampling
+
+        Args:
+            column (Optional[Column]): if computing a column metric only sample for the column
+            label (_type_, optional):
+
+        Returns:
+        """
+        # only sample the column if we are computing a column metric to limit the amount of data scanned
+        entity = self.table if column is None else column
+        if label is not None:
+            return self.client.query(entity, label)
+        return self.client.query(entity)
+
+    @partition_filter_handler(build_sample=True)
+    def get_sample_query(self, *, column=None) -> Query:
+        """get query for sample data"""
+        if self.profile_sample_type == ProfileSampleType.PERCENTAGE:
+            rnd = self._base_sample_query(
+                column,
+                (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL),
+            ).cte(f"{self.table.__tablename__}_rnd")
+            session_query = self.client.query(rnd)
+            return session_query.where(rnd.c.random <= self.profile_sample).cte(
+                f"{self.table.__tablename__}_sample"
+            )
+
+        table_query = self.client.query(self.table)
+        session_query = self._base_sample_query(
+            column,
+            (ModuloFn(RandomNumFn(), table_query.count())).label(RANDOM_LABEL),
+        )
+        return (
+            session_query.order_by(RANDOM_LABEL)
+            .limit(self.profile_sample)
+            .cte(f"{self.table.__tablename__}_rnd")
+        )
+
+    def random_sample(self, column=None) -> Union[DeclarativeMeta, AliasedClass]:
+        """
+        Either return a sampled CTE of table, or
+        the full table if no sampling is required.
+        """
+        if self._profile_sample_query:
+            return self._rdn_sample_from_user_query()
+
+        if not self.profile_sample or int(self.profile_sample) == 100:
+            if self._partition_details:
+                return self._partitioned_table()
+
+            return self.table
+
+        # Add new RandomNumFn column
+        sampled = self.get_sample_query(column=column)
+
+        # Assign as an alias
+        return aliased(self.table, sampled)
+
+    def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData:
+        """
+        Use the sampler to retrieve sample data rows as per limit given by user
+
+        Args:
+            columns (Optional[List]): List of columns to fetch
+        Returns:
+            TableData to be added to the Table Entity
+        """
+        if self._profile_sample_query:
+            return self._fetch_sample_data_from_user_query()
+
+        # Add new RandomNumFn column
+        rnd = self.get_sample_query()
+        if not columns:
+            sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL]
+        else:
+            # we can't directly use columns as it is bound to self.table and not the rnd table.
+            # If we use it, it will result in a cross join between self.table and rnd table
+            names = [col.name for col in columns]
+            sqa_columns = [
+                col
+                for col in inspect(rnd).c
+                if col.name != RANDOM_LABEL and col.name in names
+            ]
+
+        try:
+            sqa_sample = (
+                self.client.query(*sqa_columns)
+                .select_from(rnd)
+                .limit(self.sample_limit)
+                .all()
+            )
+        except Exception:
+            logger.debug(
+                "Cannot fetch sample data with random sampling. Falling back to 100 rows."
+            )
+            logger.debug(traceback.format_exc())
+            sqa_columns = list(inspect(self.table).c)
+            sqa_sample = (
+                self.client.query(*sqa_columns).select_from(self.table).limit(100).all()
+            )
+
+        return TableData(
+            columns=[column.name for column in sqa_columns],
+            rows=[list(row) for row in sqa_sample],
+        )
+
+    def _fetch_sample_data_from_user_query(self) -> TableData:
+        """Returns a table data object using results from query execution"""
+        if not is_safe_sql_query(self._profile_sample_query):
+            raise RuntimeError(
+                f"SQL expression is not safe\n\n{self._profile_sample_query}"
+            )
+
+        rnd = self.client.execute(f"{self._profile_sample_query}")
+        try:
+            columns = [col.name for col in rnd.cursor.description]
+        except AttributeError:
+            columns = list(rnd.keys())
+        return TableData(
+            columns=columns,
+            rows=[list(row) for row in rnd.fetchmany(100)],
+        )
+
+    def _rdn_sample_from_user_query(self) -> Query:
+        """Returns a SQLAlchemy object to use when running profiling"""
+        if not is_safe_sql_query(self._profile_sample_query):
+            raise RuntimeError(
+                f"SQL expression is not safe\n\n{self._profile_sample_query}"
+            )
+
+        return self.client.query(self.table).from_statement(
+            text(f"{self._profile_sample_query}")
+        )
+
+    def _partitioned_table(self) -> Query:
+        """Return the Query object for partitioned tables"""
+        return aliased(self.get_partitioned_query().subquery())
+
+    def get_partitioned_query(self) -> Query:
+        """Return the partitioned query"""
+        self._partition_details = cast(
+            PartitionProfilerConfig, self._partition_details
+        )  # satisfying type checker
+        partition_field = self._partition_details.partitionColumnName
+
+        type_ = get_partition_col_type(
+            partition_field,
+            self.table.__table__.c,
+        )
+
+        if (
+            self._partition_details.partitionIntervalType
+            == PartitionIntervalTypes.COLUMN_VALUE
+        ):
+            return self.client.query(self.table).filter(
+                get_value_filter(
+                    Column(partition_field),
+                    self._partition_details.partitionValues,
+                )
+            )
+        if (
+            self._partition_details.partitionIntervalType
+            == PartitionIntervalTypes.INTEGER_RANGE
+        ):
+            return self.client.query(self.table).filter(
+                get_integer_range_filter(
+                    Column(partition_field),
+                    self._partition_details.partitionIntegerRangeStart,
+                    self._partition_details.partitionIntegerRangeEnd,
+                )
+            )
+        return self.client.query(self.table).filter(
+            build_query_filter(
+                [
+                    (
+                        Column(partition_field),
+                        "ge",
+                        dispatch_to_date_or_datetime(
+                            self._partition_details.partitionInterval,
+                            text(self._partition_details.partitionIntervalUnit.value),
+                            type_,
+                        ),
+                    )
+                ],
+                False,
+            )
+        )
diff --git a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py
new file mode 100644
index 000000000000..5757782b17d4
--- /dev/null
+++ b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py
@@ -0,0 +1,83 @@
+# Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
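+# Note: instead of the generic random-number CTE built by SQASampler, this
+# sampler delegates to Snowflake's native SAMPLE <method> (<percent>) and
+# TABLESAMPLE (<n> ROWS) clauses (see get_sample_query below).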
+""" +Helper module to handle data sampling +for the profiler +""" + +from typing import Dict, Optional, cast + +from sqlalchemy import Table +from sqlalchemy.sql.selectable import CTE + +from metadata.generated.schema.entity.data.table import ( + ProfileSampleType, + SamplingMethodType, +) +from metadata.profiler.api.models import ProfileSampleConfig +from metadata.profiler.processor.handle_partition import partition_filter_handler +from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler +from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT + + +class SnowflakeSampler(SQASampler): + """ + Generates a sample of the data to not + run the query in the whole table. + """ + + def __init__( + self, + client, + table, + profile_sample_config: Optional[ProfileSampleConfig] = None, + partition_details: Optional[Dict] = None, + profile_sample_query: Optional[str] = None, + sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, + ): + super().__init__( + client, + table, + profile_sample_config, + partition_details, + profile_sample_query, + sample_data_count, + ) + self.sampling_method_type = SamplingMethodType.BERNOULLI + if profile_sample_config and profile_sample_config.sampling_method_type: + self.sampling_method_type = profile_sample_config.sampling_method_type + + @partition_filter_handler(build_sample=True) + def get_sample_query(self, *, column=None) -> CTE: + """get query for sample data""" + # TABLESAMPLE SYSTEM is not supported for views + self.table = cast(Table, self.table) + + if self.profile_sample_type == ProfileSampleType.PERCENTAGE: + rnd = ( + self._base_sample_query( + column, + ) + .suffix_with( + f"SAMPLE {self.sampling_method_type.value} ({self.profile_sample or 100})", + ) + .cte(f"{self.table.__tablename__}_rnd") + ) + session_query = self.client.query(rnd) + return session_query.cte(f"{self.table.__tablename__}_sample") + + return ( + self._base_sample_query(column) + .suffix_with( + f"TABLESAMPLE ({self.profile_sample or 100} ROWS)", + ) + .cte(f"{self.table.__tablename__}_sample") + ) diff --git a/ingestion/src/metadata/sampler/sqlalchemy/trino/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/trino/sampler.py new file mode 100644 index 000000000000..6cfc69472386 --- /dev/null +++ b/ingestion/src/metadata/sampler/sqlalchemy/trino/sampler.py @@ -0,0 +1,47 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Helper module to handle data sampling +for the profiler +""" +from sqlalchemy import inspect, or_, text + +from metadata.profiler.orm.registry import FLOAT_SET +from metadata.profiler.processor.handle_partition import RANDOM_LABEL +from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler + + +class TrinoSampler(SQASampler): + """ + Generates a sample of the data to not + run the query in the whole table. 
+ """ + + def __init__(self, *args, **kwargs): + # pylint: disable=import-outside-toplevel + from trino.sqlalchemy.dialect import TrinoDialect + + TrinoDialect._json_deserializer = None + + super().__init__(*args, **kwargs) + + def _base_sample_query(self, column, label=None): + sqa_columns = [col for col in inspect(self.table).c if col.name != RANDOM_LABEL] + entity = self.table if column is None else column + return self.client.query(entity, label).where( + or_( + *[ + text(f"is_nan({cols.name}) = False") + for cols in sqa_columns + if type(cols.type) in FLOAT_SET + ] + ) + ) diff --git a/ingestion/src/metadata/utils/logger.py b/ingestion/src/metadata/utils/logger.py index c33eb3b96d19..b0058f8f4b4c 100644 --- a/ingestion/src/metadata/utils/logger.py +++ b/ingestion/src/metadata/utils/logger.py @@ -49,6 +49,7 @@ class Loggers(Enum): OMETA = "OMetaAPI" CLI = "Metadata" PROFILER = "Profiler" + SAMPLER = "Sampler" PII = "PII" INGESTION = "Ingestion" UTILS = "Utils" @@ -100,6 +101,14 @@ def profiler_logger(): return logging.getLogger(Loggers.PROFILER.value) +def sampler_logger(): + """ + Method to get the SAMPLER logger + """ + + return logging.getLogger(Loggers.SAMPLER.value) + + def pii_logger(): """ Method to get the PROFILER logger diff --git a/ingestion/src/metadata/utils/service_spec/default.py b/ingestion/src/metadata/utils/service_spec/default.py index 92558a1409a4..10067961cd34 100644 --- a/ingestion/src/metadata/utils/service_spec/default.py +++ b/ingestion/src/metadata/utils/service_spec/default.py @@ -7,9 +7,11 @@ from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) +from metadata.sampler.sqlalchemy.sampler import SQASampler from metadata.utils.importer import get_class_path from metadata.utils.service_spec.service_spec import BaseSpec class DefaultDatabaseSpec(BaseSpec): profiler_class: Optional[str] = get_class_path(SQAProfilerInterface) + sampler_class: Optional[str] = get_class_path(SQASampler) diff --git a/ingestion/src/metadata/utils/service_spec/service_spec.py b/ingestion/src/metadata/utils/service_spec/service_spec.py index a401ba0e8154..808626a11023 100644 --- a/ingestion/src/metadata/utils/service_spec/service_spec.py +++ b/ingestion/src/metadata/utils/service_spec/service_spec.py @@ -9,6 +9,8 @@ from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.ingestion.api.steps import Source from metadata.ingestion.models.custom_pydantic import BaseModel +from metadata.profiler.interface.profiler_interface import ProfilerInterface +from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.importer import ( TYPE_SEPARATOR, get_class_path, @@ -97,3 +99,17 @@ def import_source_class( Type[Source], import_from_module(spec.model_dump()[field]), ) + + +def import_profiler_class( + service_type: ServiceType, source_type: str +) -> Type[ProfilerInterface]: + class_path = BaseSpec.get_for_source(service_type, source_type).profiler_class + return cast(Type[ProfilerInterface], import_from_module(class_path)) + + +def import_sampler_class( + service_type: ServiceType, source_type: str +) -> Type[SamplerInterface]: + class_path = BaseSpec.get_for_source(service_type, source_type).sampler_class + return cast(Type[SamplerInterface], import_from_module(class_path)) diff --git a/ingestion/src/metadata/workflow/pii.py b/ingestion/src/metadata/workflow/pii.py new file mode 100644 index 000000000000..3144714348b4 --- /dev/null +++ b/ingestion/src/metadata/workflow/pii.py @@ -0,0 +1,41 @@ +# 
Copyright 2021 Collate
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Workflow definition for the PII workflow
+"""
+
+from metadata.ingestion.api.steps import Processor
+from metadata.pii.processor import PIIProcessor
+from metadata.utils.logger import profiler_logger
+from metadata.workflow.profiler import ProfilerWorkflow
+
+logger = profiler_logger()
+
+
+class PIIWorkflow(ProfilerWorkflow):
+    """PII workflow implementation. Based on the Profiler logic with different steps"""
+
+    def set_steps(self):
+        source_class = self._get_source_class()
+        self.source = source_class.create(self.config.model_dump(), self.metadata)
+
+        sink = self._get_sink()
+        pii_processor = self._get_pii_processor()
+
+        # OM Source -> sampler -> PII -> Sink
+        self.steps = (pii_processor, sink)
+
+    def _get_pii_processor(self) -> Processor:
+        return PIIProcessor.create(self.config.model_dump(), self.metadata)
+
+    def _get_sampler_processor(self) -> Processor:
+        # TODO
+        return ...
diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServicePIIPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServicePIIPipeline.json
new file mode 100644
index 000000000000..bca61cbcef6e
--- /dev/null
+++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServicePIIPipeline.json
@@ -0,0 +1,88 @@
+{
+  "$id": "https://open-metadata.org/schema/metadataIngestion/databaseServicePIIPipeline.json",
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "title": "DatabaseServicePIIPipeline",
+  "description": "DatabaseService PII Pipeline Configuration.",
+  "type": "object",
+  "definitions": {
+    "PIIConfigType": {
+      "description": "PII Source Config Pipeline type",
+      "type": "string",
+      "enum": ["PII"],
+      "default": "PII"
+    }
+  },
+  "properties": {
+    "type": {
+      "description": "Pipeline type",
+      "$ref": "#/definitions/PIIConfigType",
+      "default": "PII"
+    },
+    "schemaFilterPattern": {
+      "description": "Regex to only fetch tables or databases that match the pattern.",
+      "$ref": "../type/filterPattern.json#/definitions/filterPattern",
+      "title": "Schema Filter Pattern"
+    },
+    "tableFilterPattern": {
+      "description": "Regex to exclude tables or databases that match the pattern.",
+      "$ref": "../type/filterPattern.json#/definitions/filterPattern",
+      "title": "Table Filter Pattern"
+    },
+    "databaseFilterPattern": {
+      "description": "Regex to only fetch databases that match the pattern.",
+      "$ref": "../type/filterPattern.json#/definitions/filterPattern",
+      "title": "Database Filter Pattern"
+    },
+    "includeViews": {
+      "description": "Optional configuration to turn off fetching metadata for views.",
+      "type": "boolean",
+      "default": true,
+      "title": "Include Views"
+    },
+    "useFqnForFiltering": {
+      "description": "Regex will be applied on fully qualified name (e.g. service_name.db_name.schema_name.table_name) instead of raw name (e.g.
table_name)", + "type": "boolean", + "default": false, + "title": "Use FQN For Filtering" + }, + "generateSampleData": { + "description": "Option to turn on/off generating sample data. If enabled, profiler will ingest sample data for each table.", + "type": "boolean", + "default": true, + "title": "Generate Sample Data" + }, + "processPiiSensitive": { + "description": "Optional configuration to automatically tag columns that might contain sensitive information", + "type": "boolean", + "default": false, + "title": "Auto Tag PII" + }, + "confidence": { + "description": "Set the Confidence value for which you want the column to be tagged as PII. Confidence value ranges from 0 to 100. A higher number will yield less false positives but more false negatives. A lower number will yield more false positives but less false negatives.", + "type": "number", + "default": 80, + "title": "PII Inference Confidence Level" + }, + "profileSampleType": { + "$ref": "../entity/data/table.json#/definitions/profileSampleType", + "title": "Profile Sample Type" + }, + "profileSample": { + "description": "Percentage of data or no. of rows used to compute the profiler metrics and run data quality tests", + "type": "number", + "default": null, + "title": "Profile Sample" + }, + "samplingMethodType": { + "$ref": "../entity/data/table.json#/definitions/samplingMethodType", + "title": "Sampling Method Type" + }, + "sampleDataCount": { + "description": "Number of sample rows to ingest when 'Generate Sample Data' is enabled", + "type": "integer", + "default": 50, + "title": "Sample Data Rows Count" + } + }, + "additionalProperties": false +} diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json index 1703f4b5b805..e8e08a8e0475 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json @@ -30,6 +30,9 @@ { "$ref": "databaseServiceProfilerPipeline.json" }, + { + "$ref": "databaseServicePIIPipeline.json" + }, { "$ref": "pipelineServiceMetadataPipeline.json" }, From 402b64e7faceb7ef8b7bcab34a8c31f597b13389 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Mon, 11 Nov 2024 18:19:32 +0100 Subject: [PATCH 02/29] separate sampler --- .../sqlalchemy/sqa_test_suite_interface.py | 2 +- .../interface/test_suite_interface.py | 8 +- .../runtime_param_setter/param_setter.py | 5 +- .../metadata/ingestion/sink/metadata_rest.py | 41 +-- .../metadata/mixins/sqalchemy/sqa_mixin.py | 16 +- ingestion/src/metadata/pii/models.py | 1 - ingestion/src/metadata/pii/processor.py | 5 +- ingestion/src/metadata/profiler/api/models.py | 6 +- ingestion/src/metadata/profiler/config.py | 13 +- .../interface/nosql/profiler_interface.py | 23 -- .../interface/pandas/profiler_interface.py | 37 +-- .../profiler/interface/profiler_interface.py | 61 +---- .../sqlalchemy/bigquery/profiler_interface.py | 19 -- .../databricks/profiler_interface.py | 19 +- .../sqlalchemy/profiler_interface.py | 112 ++------ .../metadata/profiler/orm/converter/base.py | 30 +- .../src/metadata/profiler/processor/core.py | 12 +- .../profiler/processor/sample_data_handler.py | 6 +- .../processor/sampler/nosql/sampler.py | 77 ------ .../processor/sampler/pandas/sampler.py | 170 ------------ .../processor/sampler/sampler_factory.py | 100 ------- .../processor/sampler/sampler_interface.py | 74 ----- .../sampler/sqlalchemy/azuresql/sampler.py | 40 
--- .../sampler/sqlalchemy/bigquery/sampler.py | 97 ------- .../processor/sampler/sqlalchemy/sampler.py | 257 ------------------ .../sampler/sqlalchemy/snowflake/sampler.py | 83 ------ .../sampler/sqlalchemy/trino/sampler.py | 47 ---- .../profiler/source/base/profiler_source.py | 61 +++-- ingestion/src/metadata/sampler/config.py | 106 ++++---- ingestion/src/metadata/sampler/models.py | 22 +- .../src/metadata/sampler/sampler_factory.py | 100 ------- .../src/metadata/sampler/sampler_interface.py | 141 +++++++--- .../sampler/sqlalchemy/azuresql/sampler.py | 2 +- .../sampler/sqlalchemy/bigquery/sampler.py | 6 +- .../metadata/sampler/sqlalchemy/sampler.py | 65 +++-- .../sampler/sqlalchemy/snowflake/sampler.py | 6 +- .../sampler/sqlalchemy/trino/sampler.py | 2 +- .../utils/service_spec/service_spec.py | 1 + ingestion/src/metadata/workflow/profiler.py | 13 +- 39 files changed, 359 insertions(+), 1527 deletions(-) delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/nosql/sampler.py delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/pandas/sampler.py delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/sampler_factory.py delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/sampler_interface.py delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/azuresql/sampler.py delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/bigquery/sampler.py delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/snowflake/sampler.py delete mode 100644 ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/trino/sampler.py delete mode 100644 ingestion/src/metadata/sampler/sampler_factory.py diff --git a/ingestion/src/metadata/data_quality/interface/sqlalchemy/sqa_test_suite_interface.py b/ingestion/src/metadata/data_quality/interface/sqlalchemy/sqa_test_suite_interface.py index cd2d79b48e84..6a04ad05daf5 100644 --- a/ingestion/src/metadata/data_quality/interface/sqlalchemy/sqa_test_suite_interface.py +++ b/ingestion/src/metadata/data_quality/interface/sqlalchemy/sqa_test_suite_interface.py @@ -29,7 +29,7 @@ from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.mixins.sqalchemy.sqa_mixin import SQAInterfaceMixin from metadata.profiler.processor.runner import QueryRunner -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler +from metadata.sampler.sqlalchemy.sampler import SQASampler from metadata.utils.constants import TEN_MIN from metadata.utils.logger import test_suite_logger from metadata.utils.ssl_manager import get_ssl_connection diff --git a/ingestion/src/metadata/data_quality/interface/test_suite_interface.py b/ingestion/src/metadata/data_quality/interface/test_suite_interface.py index 9e98cf66a1ae..57ea7358f511 100644 --- a/ingestion/src/metadata/data_quality/interface/test_suite_interface.py +++ b/ingestion/src/metadata/data_quality/interface/test_suite_interface.py @@ -31,9 +31,9 @@ from metadata.generated.schema.tests.testCase import TestCase from metadata.generated.schema.tests.testDefinition import TestDefinition from metadata.ingestion.ometa.ometa_api import OpenMetadata -from metadata.profiler.api.models import ProfileSampleConfig +from metadata.sampler.config import get_partition_details +from metadata.sampler.models import SampleConfig from metadata.utils.logger import test_suite_logger -from metadata.utils.partition import 
get_partition_details logger = test_suite_logger() @@ -139,10 +139,10 @@ def _get_sample_query(self) -> Optional[str]: return None - def _get_profile_sample(self) -> Optional[ProfileSampleConfig]: + def _get_profile_sample(self) -> Optional[SampleConfig]: try: if self.table_entity.tableProfilerConfig.profileSample: - return ProfileSampleConfig( + return SampleConfig( profile_sample=self.table_entity.tableProfilerConfig.profileSample, profile_sample_type=self.table_entity.tableProfilerConfig.profileSampleType, ) diff --git a/ingestion/src/metadata/data_quality/validations/runtime_param_setter/param_setter.py b/ingestion/src/metadata/data_quality/validations/runtime_param_setter/param_setter.py index b1a44e0eddba..60223587e219 100644 --- a/ingestion/src/metadata/data_quality/validations/runtime_param_setter/param_setter.py +++ b/ingestion/src/metadata/data_quality/validations/runtime_param_setter/param_setter.py @@ -15,14 +15,13 @@ from metadata.generated.schema.entity.data.table import Table from metadata.ingestion.ometa.ometa_api import OpenMetadata -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler class RuntimeParameterSetter(ABC): """ Set runtime parameters for a test case. This can be useful for resolving parameters based on the state of OpenMetadata entities. - You can see an example implemnentation in `TableDiffParamsSetter`. + You can see an example implementation in `TableDiffParamsSetter`. """ def __init__( @@ -30,7 +29,7 @@ def __init__( ometa_client: OpenMetadata, service_connection_config, table_entity: Table, - sampler: SQASampler, + sampler, ): self.ometa_client = ometa_client self.service_connection_config = service_connection_config diff --git a/ingestion/src/metadata/ingestion/sink/metadata_rest.py b/ingestion/src/metadata/ingestion/sink/metadata_rest.py index 01d6fdd932ef..ad8d2eefe4e8 100644 --- a/ingestion/src/metadata/ingestion/sink/metadata_rest.py +++ b/ingestion/src/metadata/ingestion/sink/metadata_rest.py @@ -84,6 +84,7 @@ from metadata.ingestion.source.dashboard.dashboard_service import DashboardUsage from metadata.ingestion.source.database.database_service import DataModelLink from metadata.profiler.api.models import ProfilerResponse +from metadata.sampler.models import SamplerResponse from metadata.utils.execution_time_tracker import calculate_execution_time from metadata.utils.logger import get_log_name, ingestion_logger @@ -549,22 +550,8 @@ def write_life_cycle_data(self, record: OMetaLifeCycleData) -> Either[Entity]: ) @_run_dispatch.register - def write_profiler_response(self, record: ProfilerResponse) -> Either[Table]: - """Cleanup "`" character in columns and ingest""" - column_profile = record.profile.columnProfile - for column in column_profile: - column.name = column.name.replace("`", "") - - record.profile.columnProfile = column_profile - - table = self.metadata.ingest_profile_data( - table=record.table, - profile_request=record.profile, - ) - logger.debug( - f"Successfully ingested profile metrics for {record.table.fullyQualifiedName.root}" - ) - + def write_sampler_response(self, record: SamplerResponse) -> Either[Table]: + """Ingest the sample data - if needed - and the PII tags""" if record.sample_data and record.sample_data.store: table_data = self.metadata.ingest_table_sample_data( table=record.table, sample_data=record.sample_data.data @@ -572,7 +559,7 @@ def write_profiler_response(self, record: ProfilerResponse) -> Either[Table]: if not table_data: self.status.failed( StackTraceError( - 
name=table.fullyQualifiedName.root, + name=record.table.fullyQualifiedName.root, error="Error trying to ingest sample data for table", ) ) @@ -587,7 +574,7 @@ def write_profiler_response(self, record: ProfilerResponse) -> Either[Table]: ) if not patched: self.status.warning( - key=table.fullyQualifiedName.root, + key=record.table.fullyQualifiedName.root, reason="Error patching tags for table", ) else: @@ -595,6 +582,24 @@ def write_profiler_response(self, record: ProfilerResponse) -> Either[Table]: f"Successfully patched tag {record.column_tags} for {record.table.fullyQualifiedName.root}" ) + return Either(right=record.table) + + @_run_dispatch.register + def write_profiler_response(self, record: ProfilerResponse) -> Either[Table]: + """Cleanup "`" character in columns and ingest""" + column_profile = record.profile.columnProfile + for column in column_profile: + column.name = column.name.replace("`", "") + + record.profile.columnProfile = column_profile + + table = self.metadata.ingest_profile_data( + table=record.table, + profile_request=record.profile, + ) + logger.debug( + f"Successfully ingested profile metrics for {record.table.fullyQualifiedName.root}" + ) return Either(right=table) @_run_dispatch.register diff --git a/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py b/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py index 78768a1bf529..dcc583fe4ba2 100644 --- a/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py +++ b/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py @@ -33,11 +33,10 @@ from metadata.ingestion.source.database.snowflake.queries import ( SNOWFLAKE_SESSION_TAG_QUERY, ) -from metadata.profiler.orm.converter.base import ometa_to_sqa_orm class SQAInterfaceMixin: - """SQLAlchemy inteface mixin grouping shared methods between sequential and threaded executor""" + """SQLAlchemy interface mixin grouping shared methods between sequential and threaded executor""" def _get_engine(self): """Get engine for database @@ -51,19 +50,6 @@ def _get_engine(self): return engine - def _convert_table_to_orm_object( - self, - sqa_metadata_obj: Optional[MetaData] = None, - ) -> DeclarativeMeta: - """Given a table entity return a SQA ORM object - - Args: - sqa_metadata_obj: sqa metadata registry - Returns: - DeclarativeMeta - """ - return ometa_to_sqa_orm(self.table_entity, self.ometa_client, sqa_metadata_obj) - def get_columns(self) -> Column: """get columns from an orm object""" return inspect(self.table).c diff --git a/ingestion/src/metadata/pii/models.py b/ingestion/src/metadata/pii/models.py index a2b23b6e6a12..64dbd8a8bb00 100644 --- a/ingestion/src/metadata/pii/models.py +++ b/ingestion/src/metadata/pii/models.py @@ -16,7 +16,6 @@ from pydantic import BaseModel - class TagType(Enum): SENSITIVE = "Sensitive" NONSENSITIVE = "NonSensitive" diff --git a/ingestion/src/metadata/pii/processor.py b/ingestion/src/metadata/pii/processor.py index 74dd02872e2b..85ca756043d5 100644 --- a/ingestion/src/metadata/pii/processor.py +++ b/ingestion/src/metadata/pii/processor.py @@ -40,7 +40,8 @@ from metadata.pii.constants import PII from metadata.pii.scanners.column_name_scanner import ColumnNameScanner from metadata.pii.scanners.ner_scanner import NERScanner -from metadata.profiler.api.models import PIIResponse, ProfilerResponse +from metadata.profiler.api.models import ProfilerResponse +from metadata.sampler.models import SamplerResponse from metadata.utils.logger import profiler_logger logger = profiler_logger() @@ -153,7 +154,7 @@ def process_column( def _run( self, record: 
ProfilerResponse, - ) -> Either[PIIResponse]: + ) -> Either[SamplerResponse]: """ Main entrypoint for the scanner. diff --git a/ingestion/src/metadata/profiler/api/models.py b/ingestion/src/metadata/profiler/api/models.py index 4b1a577fee59..d236e0caa315 100644 --- a/ingestion/src/metadata/profiler/api/models.py +++ b/ingestion/src/metadata/profiler/api/models.py @@ -26,13 +26,11 @@ from metadata.generated.schema.api.data.createTableProfile import ( CreateTableProfileRequest, ) -from metadata.generated.schema.entity.data.table import ( - Table, -) +from metadata.generated.schema.entity.data.table import Table from metadata.generated.schema.tests.customMetric import CustomMetric from metadata.profiler.metrics.core import Metric, MetricTypes from metadata.profiler.processor.models import ProfilerDef -from metadata.sampler.models import TableConfig, DatabaseAndSchemaConfig +from metadata.sampler.models import DatabaseAndSchemaConfig, TableConfig from metadata.utils.sqa_like_column import SQALikeColumn diff --git a/ingestion/src/metadata/profiler/config.py b/ingestion/src/metadata/profiler/config.py index 9edc2af5fc96..76a1ad790643 100644 --- a/ingestion/src/metadata/profiler/config.py +++ b/ingestion/src/metadata/profiler/config.py @@ -13,13 +13,18 @@ """ from typing import Optional -from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema, DatabaseSchemaProfilerConfig - -from metadata.generated.schema.entity.data.database import Database, DatabaseProfilerConfig +from metadata.generated.schema.entity.data.database import ( + Database, + DatabaseProfilerConfig, +) +from metadata.generated.schema.entity.data.databaseSchema import ( + DatabaseSchema, + DatabaseSchemaProfilerConfig, +) def get_database_profiler_config( - database_entity: Optional[Database], + database_entity: Optional[Database], ) -> Optional[DatabaseProfilerConfig]: if database_entity and database_entity.databaseProfilerConfig: return database_entity.databaseProfilerConfig diff --git a/ingestion/src/metadata/profiler/interface/nosql/profiler_interface.py b/ingestion/src/metadata/profiler/interface/nosql/profiler_interface.py index 4a206bf05f92..7b80ce48ef64 100644 --- a/ingestion/src/metadata/profiler/interface/nosql/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/nosql/profiler_interface.py @@ -30,7 +30,6 @@ from metadata.profiler.metrics.core import Metric, MetricTypes from metadata.profiler.metrics.registry import Metrics from metadata.profiler.processor.metric_filter import MetricFilter -from metadata.profiler.processor.sampler.nosql.sampler import NoSQLSampler from metadata.utils.logger import profiler_interface_registry_logger from metadata.utils.sqa_like_column import SQALikeColumn @@ -45,10 +44,6 @@ class NoSQLProfilerInterface(ProfilerInterface): # pylint: disable=too-many-arguments - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.sampler = self._get_sampler() - def _compute_table_metrics( self, metrics: List[Type[Metric]], @@ -157,24 +152,6 @@ def compute_metrics( def fetch_sample_data(self, table, columns: List[SQALikeColumn]) -> TableData: return self.sampler.fetch_sample_data(columns) - def _get_sampler(self) -> NoSQLSampler: - """Get NoSQL sampler from config""" - from metadata.profiler.processor.sampler.sampler_factory import ( # pylint: disable=import-outside-toplevel - sampler_factory_, - ) - - return sampler_factory_.create( - self.service_connection_config.__class__.__name__, - table=self.table, - client=factory.create( - 
self.service_connection_config.__class__.__name__, - client=self.connection, - ), - profile_sample_config=self.profile_sample_config, - partition_details=self.partition_details, - profile_sample_query=self.profile_query, - ) - def get_composed_metrics( self, column: Column, metric: Metrics, column_results: Dict ): diff --git a/ingestion/src/metadata/profiler/interface/pandas/profiler_interface.py b/ingestion/src/metadata/profiler/interface/pandas/profiler_interface.py index a54d8b706af6..8031e1cb1ac7 100644 --- a/ingestion/src/metadata/profiler/interface/pandas/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/pandas/profiler_interface.py @@ -22,14 +22,7 @@ from sqlalchemy import Column -from metadata.generated.schema.entity.data.table import ( - CustomMetricProfile, - DataType, - TableData, -) -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) +from metadata.generated.schema.entity.data.table import CustomMetricProfile, DataType from metadata.generated.schema.tests.customMetric import CustomMetric from metadata.mixins.pandas.pandas_mixin import PandasInterfaceMixin from metadata.profiler.api.models import ThreadPoolMetrics @@ -92,7 +85,6 @@ def __init__( table=self.table_entity, profile_sample_config=profile_sample_config, ) - self.sampler = self._get_sampler() self.complex_dataframe_sample = deepcopy( self.sampler.random_sample(is_sampled=True) ) @@ -129,21 +121,6 @@ def complex_df(self): logger.warning(f"NaN/NoneType found in the Dataframe: {err}") break - def _get_sampler(self): - """Get dataframe sampler from config""" - from metadata.profiler.processor.sampler.sampler_factory import ( # pylint: disable=import-outside-toplevel - sampler_factory_, - ) - - return sampler_factory_.create( - DatalakeConnection.__name__, - client=self.client._client, # pylint: disable=W0212 - table=self.dfs, - profile_sample_config=self.profile_sample_config, - partition_details=self.partition_details, - profile_sample_query=self.profile_query, - ) - def _compute_table_metrics( self, metrics: List[Metrics], @@ -326,18 +303,6 @@ def compute_metrics( column = None return row, column, metric_func.metric_type.value - def fetch_sample_data(self, table, columns: SQALikeColumn) -> TableData: - """Fetch sample data from database - - Args: - table: ORM declarative table - - Returns: - TableData: sample table data - """ - sampler = self._get_sampler() - return sampler.fetch_sample_data(columns) - def get_composed_metrics( self, column: Column, metric: Metrics, column_results: Dict ): diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface.py b/ingestion/src/metadata/profiler/interface/profiler_interface.py index 22680cf973f6..efb71f174fc2 100644 --- a/ingestion/src/metadata/profiler/interface/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface.py @@ -17,27 +17,13 @@ from abc import ABC, abstractmethod from typing import Any, Dict, List, Optional, Type, Union -from metadata.profiler.config import get_schema_profiler_config, get_database_profiler_config -from metadata.sampler.sampler_interface import SamplerInterface from sqlalchemy import Column -from metadata.generated.schema.entity.data.database import ( - Database, -) -from metadata.generated.schema.entity.data.databaseSchema import ( - DatabaseSchema, -) -from metadata.generated.schema.entity.data.table import ( - PartitionProfilerConfig, - SystemProfile, - Table, -) +from metadata.generated.schema.entity.data.table 
import SystemProfile, Table from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) -from metadata.generated.schema.entity.services.databaseService import ( - DatabaseConnection, -) +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection from metadata.generated.schema.entity.services.ingestionPipelines.status import ( StackTraceError, ) @@ -47,14 +33,11 @@ from metadata.generated.schema.tests.customMetric import CustomMetric from metadata.ingestion.api.status import Status from metadata.ingestion.ometa.ometa_api import OpenMetadata -from metadata.profiler.api.models import ( - ProfileSampleConfig, - TableConfig, -) from metadata.profiler.metrics.core import MetricTypes from metadata.profiler.metrics.registry import Metrics from metadata.profiler.metrics.system.system import System from metadata.profiler.processor.runner import QueryRunner +from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.ssl_manager import get_ssl_connection @@ -86,10 +69,8 @@ def __init__( service_connection_config: Union[DatabaseConnection, DatalakeConnection], ometa_client: OpenMetadata, entity: Table, - profile_sample_config: Optional[ProfileSampleConfig], source_config: DatabaseServiceProfilerPipeline, - sample_query: Optional[str], - table_partition_config: Optional[PartitionProfilerConfig], + sampler: SamplerInterface, thread_count: int = 5, timeout_seconds: int = 43200, **kwargs, @@ -108,11 +89,7 @@ def __init__( self.status.entity = None else: self.status.entity = fqn.root if fqn else None - self.profile_sample_config = profile_sample_config - self.profile_query = sample_query - self.partition_details = ( - table_partition_config if not self.profile_query else None - ) + self.sampler = sampler self.timeout_seconds = timeout_seconds self._get_metric_fn = { @@ -129,9 +106,6 @@ def __init__( def create( cls, entity: Table, - database_schema: DatabaseSchema, - database: Database, - entity_config: Optional[TableConfig], source_config: DatabaseServiceProfilerPipeline, service_connection_config, sampler: SamplerInterface, @@ -157,36 +131,13 @@ def create( """ thread_count = source_config.threadCount timeout_seconds = source_config.timeoutSeconds - database_profiler_config = get_database_profiler_config( - database_entity=database - ) - schema_profiler_config = get_schema_profiler_config( - schema_entity=database_schema - ) - - if not sampler.get_profile_query(entity, entity_config): - profile_sample_config = sampler.get_profile_sample_config( - entity, - schema_profiler_config, - database_profiler_config, - entity_config, - source_config, - ) - table_partition_config = sampler.get_partition_details(entity, entity_config) - sample_query = None - else: - sample_query = sampler.get_profile_query(entity, entity_config) - profile_sample_config = None - table_partition_config = None return cls( service_connection_config=service_connection_config, ometa_client=ometa_client, entity=entity, - profile_sample_config=profile_sample_config, source_config=source_config, - sample_query=sample_query, - table_partition_config=table_partition_config, + sampler=sampler, thread_count=thread_count, timeout_seconds=timeout_seconds, **kwargs, diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/bigquery/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/bigquery/profiler_interface.py index f2e5a0e3e76c..aa0de308b1a4 100644 --- 
a/ingestion/src/metadata/profiler/interface/sqlalchemy/bigquery/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/bigquery/profiler_interface.py @@ -43,25 +43,6 @@ def _get_struct_columns(self, columns: dict, parent: str): columns_list.extend(col) return columns_list - def _get_sampler(self, **kwargs): - """get sampler object""" - from metadata.profiler.processor.sampler.sampler_factory import ( # pylint: disable=import-outside-toplevel - sampler_factory_, - ) - - session = kwargs.get("session") - table = kwargs["table"] - - return sampler_factory_.create( - self.service_connection_config.__class__.__name__, - client=session or self.session, - table=table, - profile_sample_config=self.profile_sample_config, - partition_details=self.partition_details, - profile_sample_query=self.profile_query, - table_type=self.table_entity.tableType, - ) - def get_columns(self) -> Column: """Get columns from table""" # pylint: disable=import-outside-toplevel diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/databricks/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/databricks/profiler_interface.py index 60e3f7c7cc34..266b4cd17072 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/databricks/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/databricks/profiler_interface.py @@ -16,10 +16,10 @@ from typing import List from pyhive.sqlalchemy_hive import HiveCompiler -from sqlalchemy import Column, inspect +from sqlalchemy import Column from metadata.generated.schema.entity.data.table import Column as OMColumn -from metadata.generated.schema.entity.data.table import ColumnName, DataType, TableData +from metadata.generated.schema.entity.data.table import ColumnName, DataType from metadata.generated.schema.entity.services.databaseService import ( DatabaseServiceType, ) @@ -97,18 +97,3 @@ def get_columns(self) -> Column: ) columns.append(col) return columns - - def fetch_sample_data(self, table, columns) -> TableData: - """Fetch sample data from database - - Args: - table: ORM declarative table - - Returns: - TableData: sample table data - """ - sampler = self._get_sampler( - table=table, - ) - - return sampler.fetch_sample_data(list(inspect(self.table).c)) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index b0263741d607..2a14fe6e7d8f 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -21,19 +21,24 @@ import traceback from collections import defaultdict from datetime import datetime -from typing import Any, Dict, List, Optional, Type +from typing import Any, Dict, List, Optional, Type, Union +from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import DatabaseServiceProfilerPipeline + +from metadata.generated.schema.entity.services.connections.database.datalakeConnection import DatalakeConnection + +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection from sqlalchemy import Column, inspect, text from sqlalchemy.exc import DBAPIError, ProgrammingError, ResourceClosedError -from sqlalchemy.orm import scoped_session +from sqlalchemy.orm import scoped_session, DeclarativeMeta from metadata.generated.schema.entity.data.table import ( CustomMetricProfile, - SystemProfile, - TableData, + SystemProfile, 
Table, ) from metadata.generated.schema.tests.customMetric import CustomMetric from metadata.ingestion.connections.session import create_and_bind_thread_safe_session +from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.mixins.sqalchemy.sqa_mixin import SQAInterfaceMixin from metadata.profiler.api.models import ThreadPoolMetrics from metadata.profiler.interface.profiler_interface import ProfilerInterface @@ -47,6 +52,7 @@ from metadata.profiler.orm.registry import Dialects from metadata.profiler.processor.metric_filter import MetricFilter from metadata.profiler.processor.runner import QueryRunner +from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT from metadata.utils.custom_thread_pool import CustomThreadPoolExecutor from metadata.utils.helpers import is_safe_sql_query @@ -78,37 +84,31 @@ class SQAProfilerInterface(ProfilerInterface, SQAInterfaceMixin): def __init__( self, - service_connection_config, - ometa_client, - entity, - storage_config, - profile_sample_config, - source_config, - sample_query, - table_partition_config, + service_connection_config: Union[DatabaseConnection, DatalakeConnection], + ometa_client: OpenMetadata, + entity: Table, + source_config: DatabaseServiceProfilerPipeline, + sampler: SamplerInterface, thread_count: int = 5, timeout_seconds: int = 43200, - sqa_metadata=None, - sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, + orm_table: Optional[DeclarativeMeta] = None, **kwargs, ): """Instantiate SQA Interface object""" + self.session_factory = None + self.session = None super().__init__( - service_connection_config, - ometa_client, - entity, - storage_config, - profile_sample_config, - source_config, - sample_query, - table_partition_config, - thread_count, - timeout_seconds, - sample_data_count, + service_connection_config=service_connection_config, + ometa_client=ometa_client, + entity=entity, + source_config=source_config, + sampler=sampler, + thread_count=thread_count, + timeout_seconds=timeout_seconds, ) - self._table = self._convert_table_to_orm_object(sqa_metadata) + self._table = orm_table self.create_session() self.system_metrics_computer = self.initialize_system_metrics_computer() @@ -126,25 +126,6 @@ def create_session(self): def table(self): return self._table - def _get_sampler(self, **kwargs): - """get sampler object""" - from metadata.profiler.processor.sampler.sampler_factory import ( # pylint: disable=import-outside-toplevel - sampler_factory_, - ) - - session = kwargs.get("session") - table = kwargs["table"] - - return sampler_factory_.create( - self.service_connection_config.__class__.__name__, - client=session or self.session, - table=table, - profile_sample_config=self.profile_sample_config, - partition_details=self.partition_details, - profile_sample_query=self.profile_query, - sample_data_count=self.sample_data_count, - ) - def _session_factory(self) -> scoped_session: """Create thread safe session that will be automatically garbage collected once the application thread ends @@ -393,19 +374,6 @@ def _compute_system_metrics( logger.debug(f"Computing system metrics for {runner.table.__tablename__}") return self.system_metrics_computer.get_system_metrics(runner.table) - def _create_thread_safe_sampler( - self, - session, - table, - ): - """Create thread safe runner""" - if not hasattr(thread_local, "sampler"): - thread_local.sampler = self._get_sampler( - table=table, - session=session, - ) - return thread_local.sampler - def 
_create_thread_safe_runner( self, session, @@ -418,8 +386,8 @@ def _create_thread_safe_runner( session=session, table=table, sample=sample, - partition_details=self.partition_details, - profile_sample_query=self.profile_query, + partition_details=self.sampler.partition_details, + profile_sample_query=self.sampler.profile_sample_query, ) return thread_local.runner thread_local.runner._sample = sample # pylint: disable=protected-access @@ -437,11 +405,7 @@ def compute_metrics_in_thread( with Session() as session: self.set_session_tag(session) self.set_catalog(session) - sampler = self._create_thread_safe_sampler( - session, - metric_func.table, - ) - sample = sampler.random_sample(metric_func.column) + sample = self.sampler.random_sample(metric_func.column) runner = self._create_thread_safe_runner( session, metric_func.table, @@ -546,21 +510,6 @@ def get_all_metrics( return profile_results - def fetch_sample_data(self, table, columns) -> TableData: - """Fetch sample data from database - - Args: - table: ORM declarative table - - Returns: - TableData: sample table data - """ - sampler = self._get_sampler( - table=table, - ) - - return sampler.fetch_sample_data(columns) - def get_composed_metrics( self, column: Column, metric: Metrics, column_results: Dict ): @@ -593,8 +542,7 @@ def get_hybrid_metrics( Returns: dictionnary of results """ - sampler = self._get_sampler(table=kwargs.get("table")) - sample = sampler.random_sample(column) + sample = self.sampler.random_sample(column) try: return metric(column).fn(sample, column_results, self.session) except Exception as exc: diff --git a/ingestion/src/metadata/profiler/orm/converter/base.py b/ingestion/src/metadata/profiler/orm/converter/base.py index 865cf7c19921..59ed39665b2a 100644 --- a/ingestion/src/metadata/profiler/orm/converter/base.py +++ b/ingestion/src/metadata/profiler/orm/converter/base.py @@ -110,9 +110,22 @@ def ometa_to_sqa_orm( `type` and passing SQLAlchemy `Base` class as the bases tuple for inheritance. """ + _metadata = sqa_metadata_obj or Base.metadata table.serviceType = cast( databaseService.DatabaseServiceType, table.serviceType ) # satisfy mypy + + orm_database_name = get_orm_database(table, metadata) + # SQLite does not support schemas + orm_schema_name = get_orm_schema(table, metadata) if table.serviceType != databaseService.DatabaseServiceType.SQLite else None + orm_name = f"{orm_database_name}_{orm_schema_name}_{table.name.root}".replace( + ".", "_" + ) + orm_key = f"{orm_schema_name}.{table.name.root}" if orm_schema_name else table.name.root + visited = False + if orm_key in _metadata.tables: + visited = True + cols = { ( col.name.root + "_" @@ -122,12 +135,6 @@ def ometa_to_sqa_orm( for idx, col in enumerate(table.columns) } - orm_database_name = get_orm_database(table, metadata) - orm_schema_name = get_orm_schema(table, metadata) - orm_name = f"{orm_database_name}_{orm_schema_name}_{table.name.root}".replace( - ".", "_" - ) - # Type takes positional arguments in the form of (name, bases, dict) orm = type( orm_name, # Output class name @@ -135,17 +142,14 @@ def ometa_to_sqa_orm( { "__tablename__": str(table.name.root), "__table_args__": { - # SQLite does not support schemas - "schema": orm_schema_name - if table.serviceType != databaseService.DatabaseServiceType.SQLite - else None, + "schema": orm_schema_name, "extend_existing": True, # Recreates the table ORM object if it already exists. 
Useful for testing - "quote": check_snowflake_case_sensitive( + **({"quote": check_snowflake_case_sensitive( table.serviceType, table.name.root - ), + )} if not visited else {}), }, **cols, - "metadata": sqa_metadata_obj or Base.metadata, + "metadata": _metadata, }, ) diff --git a/ingestion/src/metadata/profiler/processor/core.py b/ingestion/src/metadata/profiler/processor/core.py index 43bf81875a0c..55e2a739dc46 100644 --- a/ingestion/src/metadata/profiler/processor/core.py +++ b/ingestion/src/metadata/profiler/processor/core.py @@ -104,7 +104,6 @@ def __init__( self.exclude_columns = exclude_columns self._metrics = metrics self._profile_ts = Timestamp(int(datetime.now().timestamp() * 1000)) - self.profile_sample_config = self.profiler_interface.profile_sample_config self.metric_filter = MetricFilter( metrics=self.metrics, @@ -485,8 +484,6 @@ def process(self) -> ProfilerResponse: ) self.compute_metrics() - - profile = self.get_profile() if self.source_config.computeMetrics: self._check_profile_and_handle(profile) @@ -494,7 +491,6 @@ def process(self) -> ProfilerResponse: table_profile = ProfilerResponse( table=self.profiler_interface.table_entity, profile=profile, - sample_data=sample_data, ) return table_profile @@ -553,13 +549,13 @@ def get_profile(self) -> CreateTableProfileRequest: createDateTime=raw_create_date, sizeInByte=self._table_results.get("sizeInBytes"), profileSample=( - self.profile_sample_config.profile_sample - if self.profile_sample_config + self.profiler_interface.sampler.sample_config.profile_sample + if self.profiler_interface.sampler.sample_config else None ), profileSampleType=( - self.profile_sample_config.profile_sample_type - if self.profile_sample_config + self.profiler_interface.sampler.sample_config.profile_sample_type + if self.profiler_interface.sampler.sample_config else None ), customMetrics=self._table_results.get("customMetrics"), diff --git a/ingestion/src/metadata/profiler/processor/sample_data_handler.py b/ingestion/src/metadata/profiler/processor/sample_data_handler.py index 7cbbaeec177a..9314816c9eda 100644 --- a/ingestion/src/metadata/profiler/processor/sample_data_handler.py +++ b/ingestion/src/metadata/profiler/processor/sample_data_handler.py @@ -72,7 +72,11 @@ def _get_object_key( return file_name -def upload_sample_data(data: TableData, entity: Table, sample_storage_config: Optional[DataStorageConfig] = None) -> None: +def upload_sample_data( + data: TableData, + entity: Table, + sample_storage_config: Optional[DataStorageConfig] = None, +) -> None: """ Upload Sample data to storage config """ diff --git a/ingestion/src/metadata/profiler/processor/sampler/nosql/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/nosql/sampler.py deleted file mode 100644 index 381570450081..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/nosql/sampler.py +++ /dev/null @@ -1,77 +0,0 @@ -from typing import Dict, List, Optional, Tuple - -from metadata.generated.schema.entity.data.table import ProfileSampleType, TableData -from metadata.profiler.adaptors.nosql_adaptor import NoSQLAdaptor -from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT -from metadata.utils.sqa_like_column import SQALikeColumn - - -class NoSQLSampler(SamplerInterface): - client: NoSQLAdaptor - - def _rdn_sample_from_user_query(self) -> List[Dict[str, any]]: - """ - Get random sample from user query - """ - limit = self._get_limit() - return self.client.query( - 
self.table, self.table.columns, self._profile_sample_query, limit - ) - - def _fetch_sample_data_from_user_query(self) -> TableData: - """ - Fetch sample data based on a user query. Assuming the enging has one (example: MongoDB) - If the engine does not support a custom query, an error will be raised. - """ - records = self._rdn_sample_from_user_query() - columns = [ - SQALikeColumn(name=column.name.root, type=column.dataType) - for column in self.table.columns - ] - rows, cols = self.transpose_records(records, columns) - return TableData( - rows=[list(map(str, row)) for row in rows], columns=[c.name for c in cols] - ) - - def random_sample(self): - pass - - def fetch_sample_data(self, columns: List[SQALikeColumn]) -> TableData: - if self._profile_sample_query: - return self._fetch_sample_data_from_user_query() - return self._fetch_sample_data(columns) - - def _fetch_sample_data(self, columns: List[SQALikeColumn]) -> TableData: - """ - returns sampled ometa dataframes - """ - limit = self._get_limit() - records = self.client.scan(self.table, self.table.columns, limit) - rows, cols = self.transpose_records(records, columns) - return TableData( - rows=[list(map(str, row)) for row in rows], - columns=[col.name for col in cols], - ) - - def _get_limit(self) -> Optional[int]: - num_rows = self.client.item_count(self.table) - if self.profile_sample_type == ProfileSampleType.PERCENTAGE: - limit = num_rows * (self.profile_sample / 100) - elif self.profile_sample_type == ProfileSampleType.ROWS: - limit = self.profile_sample - else: - limit = SAMPLE_DATA_DEFAULT_COUNT - return limit - - @staticmethod - def transpose_records( - records: List[Dict[str, any]], columns: List[SQALikeColumn] - ) -> Tuple[List[List[any]], List[SQALikeColumn]]: - rows = [] - for record in records: - row = [] - for column in columns: - row.append(record.get(column.name)) - rows.append(row) - return rows, columns diff --git a/ingestion/src/metadata/profiler/processor/sampler/pandas/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/pandas/sampler.py deleted file mode 100644 index 572796dc71ff..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/pandas/sampler.py +++ /dev/null @@ -1,170 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Helper module to handle data sampling -for the profiler -""" -import math -import random -from typing import List, Optional, cast - -from metadata.data_quality.validations.table.pandas.tableRowInsertedCountToBeBetween import ( - TableRowInsertedCountToBeBetweenValidator, -) -from metadata.generated.schema.entity.data.table import ( - PartitionIntervalTypes, - PartitionProfilerConfig, - ProfileSampleType, - TableData, -) -from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface -from metadata.utils.sqa_like_column import SQALikeColumn - - -class DatalakeSampler(SamplerInterface): - """ - Generates a sample of the data to not - run the query in the whole table. 
- """ - - def _partitioned_table(self): - """Get partitioned table""" - self._partition_details = cast(PartitionProfilerConfig, self._partition_details) - partition_field = self._partition_details.partitionColumnName - if ( - self._partition_details.partitionIntervalType - == PartitionIntervalTypes.COLUMN_VALUE - ): - return [ - df[df[partition_field].isin(self._partition_details.partitionValues)] - for df in self.table - ] - if ( - self._partition_details.partitionIntervalType - == PartitionIntervalTypes.INTEGER_RANGE - ): - return [ - df[ - df[partition_field].between( - self._partition_details.partitionIntegerRangeStart, - self._partition_details.partitionIntegerRangeEnd, - ) - ] - for df in self.table - ] - return [ - df[ - df[partition_field] - >= TableRowInsertedCountToBeBetweenValidator._get_threshold_date( # pylint: disable=protected-access - self._partition_details.partitionIntervalUnit.value, - self._partition_details.partitionInterval, - ) - ] - for df in self.table - ] - - def _fetch_sample_data_from_user_query(self) -> TableData: - """Fetch sample data from user query""" - cols, rows = self.get_col_row(data_frame=self._rdn_sample_from_user_query()) - return TableData(columns=cols, rows=rows) - - def _rdn_sample_from_user_query(self): - """Generate sample from user query""" - return [df.query(self._profile_sample_query) for df in self.table] - - def _get_sampled_dataframe(self): - """ - returns sampled ometa dataframes - """ - random.shuffle(self.table) # we'll shuffle the list of dataframes - # sampling data based on profiler config (if any) - if self.profile_sample_type == ProfileSampleType.PERCENTAGE: - try: - profile_sample = self.profile_sample / 100 - except TypeError: - # if the profile sample is not a number or is None - # we'll set it to 100 - profile_sample = self.profile_sample = 100 - return [ - df.sample( - frac=profile_sample, - random_state=random.randint(0, 100), - replace=True, - ) - for df in self.table - ] - - # we'll distribute the sample size equally among the dataframes - sample_rows_per_chunk: int = math.floor(self.profile_sample / len(self.table)) - num_rows = sum(len(df) for df in self.table) - - # if we have less rows than the sample size - # we'll return the whole table - if sample_rows_per_chunk > num_rows: - return self.table - return [ - df.sample( - n=sample_rows_per_chunk, - random_state=random.randint(0, 100), - replace=True, - ) - for df in self.table - ] - - def get_col_row(self, data_frame, columns: Optional[List[SQALikeColumn]] = None): - """ - Fetches columns and rows from the data_frame - """ - if columns: - cols = [col.name for col in columns] - else: - # we'll use the first dataframe to get the columns - cols = data_frame[0].columns.tolist() - rows = [] - # Sample Data should not exceed sample limit - for chunk in data_frame: - rows.extend(self._fetch_rows(chunk[cols])[: self.sample_limit]) - if len(rows) >= self.sample_limit: - break - return cols, rows - - def random_sample(self, is_sampled: bool = False): - """Generate random sample from the table - - Returns: - List[DataFrame] - """ - if self._profile_sample_query: - return self._rdn_sample_from_user_query() - - if self._partition_details: - self.table = self._partitioned_table() - - if not self.profile_sample or is_sampled: - return self.table - return self._get_sampled_dataframe() - - def _fetch_rows(self, data_frame): - return data_frame.dropna().values.tolist() - - def fetch_sample_data( - self, columns: Optional[List[SQALikeColumn]] = None - ) -> TableData: - """Fetch sample 
data from the table - - Returns: - TableData: - """ - if self._profile_sample_query: - return self._fetch_sample_data_from_user_query() - - cols, rows = self.get_col_row(data_frame=self.table, columns=columns) - return TableData(columns=cols, rows=rows) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sampler_factory.py b/ingestion/src/metadata/profiler/processor/sampler/sampler_factory.py deleted file mode 100644 index 2a5e52e7211e..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/sampler_factory.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -""" -Factory class for creating sampler objects -""" - -from typing import Union - -from metadata.generated.schema.entity.services.connections.database.azureSQLConnection import ( - AzureSQLConnection, -) -from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( - BigQueryConnection, -) -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.dynamoDBConnection import ( - DynamoDBConnection, -) -from metadata.generated.schema.entity.services.connections.database.mongoDBConnection import ( - MongoDBConnection, -) -from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( - SnowflakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.trinoConnection import ( - TrinoConnection, -) -from metadata.generated.schema.entity.services.databaseService import DatabaseConnection -from metadata.profiler.processor.sampler.nosql.sampler import NoSQLSampler -from metadata.profiler.processor.sampler.pandas.sampler import DatalakeSampler -from metadata.profiler.processor.sampler.sqlalchemy.azuresql.sampler import ( - AzureSQLSampler, -) -from metadata.profiler.processor.sampler.sqlalchemy.bigquery.sampler import ( - BigQuerySampler, -) -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler -from metadata.profiler.processor.sampler.sqlalchemy.snowflake.sampler import ( - SnowflakeSampler, -) -from metadata.profiler.processor.sampler.sqlalchemy.trino.sampler import TrinoSampler - - -class SamplerFactory: - """Creational factory for sampler objects""" - - def __init__(self): - self._sampler_type = {} - - def register(self, source_type: str, sampler_class): - """Register a new source type""" - self._sampler_type[source_type] = sampler_class - - def create( - self, source_type: str, *args, **kwargs - ) -> Union[SQASampler, DatalakeSampler]: - """Create source object based on source type""" - sampler_class = self._sampler_type.get(source_type) - if not sampler_class: - sampler_class = self._sampler_type[DatabaseConnection.__name__] - return sampler_class(*args, **kwargs) - return sampler_class(*args, **kwargs) - - -sampler_factory_ = SamplerFactory() -sampler_factory_.register( - source_type=DatabaseConnection.__name__, 
sampler_class=SQASampler -) -sampler_factory_.register( - source_type=BigQueryConnection.__name__, sampler_class=BigQuerySampler -) -sampler_factory_.register( - source_type=DatalakeConnection.__name__, sampler_class=DatalakeSampler -) -sampler_factory_.register( - source_type=TrinoConnection.__name__, sampler_class=TrinoSampler -) -sampler_factory_.register( - source_type=MongoDBConnection.__name__, sampler_class=NoSQLSampler -) -sampler_factory_.register( - source_type=SnowflakeConnection.__name__, sampler_class=SnowflakeSampler -) -sampler_factory_.register( - source_type=DynamoDBConnection.__name__, sampler_class=NoSQLSampler -) -sampler_factory_.register( - source_type=AzureSQLConnection.__name__, sampler_class=AzureSQLSampler -) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sampler_interface.py b/ingestion/src/metadata/profiler/processor/sampler/sampler_interface.py deleted file mode 100644 index daba85fcebcc..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/sampler_interface.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Interface for sampler -""" - -from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Union - -from sqlalchemy import Column - -from metadata.generated.schema.entity.data.table import Table, TableData -from metadata.profiler.api.models import ProfileSampleConfig -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT -from metadata.utils.sqa_like_column import SQALikeColumn - - -class SamplerInterface(ABC): - """Sampler interface""" - - def __init__( - self, - client, - table: Table, - profile_sample_config: Optional[ProfileSampleConfig] = None, - partition_details: Optional[Dict] = None, - profile_sample_query: Optional[str] = None, - sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, - ): - self.profile_sample = None - self.profile_sample_type = None - if profile_sample_config: - self.profile_sample = profile_sample_config.profile_sample - self.profile_sample_type = profile_sample_config.profile_sample_type - self.client = client - self.table = table - self._profile_sample_query = profile_sample_query - self.sample_limit = sample_data_count - self._sample_rows = None - self._partition_details = partition_details - - @abstractmethod - def _rdn_sample_from_user_query(self): - """Get random sample from user query""" - raise NotImplementedError - - @abstractmethod - def _fetch_sample_data_from_user_query(self) -> TableData: - """Fetch sample data from user query""" - raise NotImplementedError - - @abstractmethod - def random_sample(self): - """Get random sample""" - raise NotImplementedError - - @abstractmethod - def fetch_sample_data( - self, columns: Optional[Union[List[Column], List[SQALikeColumn]]] - ) -> TableData: - """Fetch sample data - - Args: - columns (Optional[List]): List of columns to fetch - """ - raise NotImplementedError diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/azuresql/sampler.py 
b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/azuresql/sampler.py deleted file mode 100644 index fb48f80e6623..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/azuresql/sampler.py +++ /dev/null @@ -1,40 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Helper module to handle data sampling -for the profiler -""" -from typing import List, Optional - -from sqlalchemy import Column - -from metadata.generated.schema.entity.data.table import TableData -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler - - -class AzureSQLSampler(SQASampler): - """ - Generates a sample of the data to not - run the query in the whole table. - """ - - # These types are not supported by pyodbc - it throws - # an error when trying to fetch data from these columns - # pyodbc.ProgrammingError: ('ODBC SQL type -151 is not yet supported. column-index=x type=-151', 'HY106') - NOT_COMPUTE_PYODBC = {"SQASGeography", "UndeterminedType"} - - def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData: - sqa_columns = [] - if columns: - for col in columns: - if col.type.__class__.__name__ not in self.NOT_COMPUTE_PYODBC: - sqa_columns.append(col) - return super().fetch_sample_data(sqa_columns or columns) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/bigquery/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/bigquery/sampler.py deleted file mode 100644 index d3174ecd48aa..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/bigquery/sampler.py +++ /dev/null @@ -1,97 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Helper module to handle data sampling -for the profiler -""" -from typing import Dict, Optional - -from sqlalchemy import Column -from sqlalchemy.orm import Query - -from metadata.generated.schema.entity.data.table import ProfileSampleType, TableType -from metadata.profiler.api.models import ProfileSampleConfig -from metadata.profiler.processor.handle_partition import partition_filter_handler -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT - - -class BigQuerySampler(SQASampler): - """ - Generates a sample of the data to not - run the query in the whole table. 
- """ - - # pylint: disable=too-many-arguments - def __init__( - self, - client, - table, - profile_sample_config: Optional[ProfileSampleConfig] = None, - partition_details: Optional[Dict] = None, - profile_sample_query: Optional[str] = None, - sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, - table_type: TableType = None, - ): - super().__init__( - client, - table, - profile_sample_config, - partition_details, - profile_sample_query, - sample_data_count, - ) - self.table_type: TableType = table_type - - def _base_sample_query(self, column: Optional[Column], label=None): - """Base query for sampling - - Args: - column (Optional[Column]): if computing a column metric only sample for the column - label (_type_, optional): - - Returns: - """ - # pylint: disable=import-outside-toplevel - from sqlalchemy_bigquery import STRUCT - - if column is not None: - column_parts = column.name.split(".") - if len(column_parts) > 1: - # for struct columns (e.g. `foo.bar`) we need to create a new column corresponding to - # the struct (e.g. `foo`) and then use that in the sample query as the column that - # will be query is `foo.bar`. - # e.g. WITH sample AS (SELECT `foo` FROM table) SELECT `foo.bar` - # FROM sample TABLESAMPLE SYSTEM (n PERCENT) - column = Column(column_parts[0], STRUCT) - # pylint: disable=protected-access - column._set_parent(self.table.__table__) - # pylint: enable=protected-access - - return super()._base_sample_query(column, label=label) - - @partition_filter_handler(build_sample=True) - def get_sample_query(self, *, column=None) -> Query: - """get query for sample data""" - # TABLESAMPLE SYSTEM is not supported for views - if ( - self.profile_sample_type == ProfileSampleType.PERCENTAGE - and self.table_type != TableType.View - ): - return ( - self._base_sample_query(column) - .suffix_with( - f"TABLESAMPLE SYSTEM ({self.profile_sample or 100} PERCENT)", - ) - .cte(f"{self.table.__tablename__}_sample") - ) - - return super().get_sample_query(column=column) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py deleted file mode 100644 index 8fe02e9b23e6..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py +++ /dev/null @@ -1,257 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-""" -Helper module to handle data sampling -for the profiler -""" -import traceback -from typing import List, Optional, Union, cast - -from sqlalchemy import Column, inspect, text -from sqlalchemy.orm import DeclarativeMeta, Query, aliased -from sqlalchemy.orm.util import AliasedClass -from sqlalchemy.sql.sqltypes import Enum - -from metadata.generated.schema.entity.data.table import ( - PartitionIntervalTypes, - PartitionProfilerConfig, - ProfileSampleType, - TableData, -) -from metadata.profiler.orm.functions.modulo import ModuloFn -from metadata.profiler.orm.functions.random_num import RandomNumFn -from metadata.profiler.processor.handle_partition import partition_filter_handler -from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface -from metadata.utils.helpers import is_safe_sql_query -from metadata.utils.logger import profiler_interface_registry_logger -from metadata.utils.sqa_utils import ( - build_query_filter, - dispatch_to_date_or_datetime, - get_integer_range_filter, - get_partition_col_type, - get_value_filter, -) - -logger = profiler_interface_registry_logger() - -RANDOM_LABEL = "random" - - -def _object_value_for_elem(self, elem): - """ - we have mapped DataType.ENUM: sqlalchemy.Enum - if map by default return None, - we will always get None because there is no enum map to lookup, - so what we are doing here is basically trusting the database, - that it will be storing the correct map key and showing directly that on the UI, - and in this approach we will be only able to display - what database has stored (i.e the key) and not the actual value of the same! - """ - return self._object_lookup.get(elem, elem) # pylint: disable=protected-access - - -Enum._object_value_for_elem = _object_value_for_elem # pylint: disable=protected-access - - -class SQASampler(SamplerInterface): - """ - Generates a sample of the data to not - run the query in the whole table. - """ - - def _base_sample_query(self, column: Optional[Column], label=None): - """Base query for sampling - - Args: - column (Optional[Column]): if computing a column metric only sample for the column - label (_type_, optional): - - Returns: - """ - # only sample the column if we are computing a column metric to limit the amount of data scaned - entity = self.table if column is None else column - if label is not None: - return self.client.query(entity, label) - return self.client.query(entity) - - @partition_filter_handler(build_sample=True) - def get_sample_query(self, *, column=None) -> Query: - """get query for sample data""" - if self.profile_sample_type == ProfileSampleType.PERCENTAGE: - rnd = self._base_sample_query( - column, - (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL), - ).cte(f"{self.table.__tablename__}_rnd") - session_query = self.client.query(rnd) - return session_query.where(rnd.c.random <= self.profile_sample).cte( - f"{self.table.__tablename__}_sample" - ) - - table_query = self.client.query(self.table) - session_query = self._base_sample_query( - column, - (ModuloFn(RandomNumFn(), table_query.count())).label(RANDOM_LABEL), - ) - return ( - session_query.order_by(RANDOM_LABEL) - .limit(self.profile_sample) - .cte(f"{self.table.__tablename__}_rnd") - ) - - def random_sample(self, ccolumn=None) -> Union[DeclarativeMeta, AliasedClass]: - """ - Either return a sampled CTE of table, or - the full table if no sampling is required. 
- """ - if self._profile_sample_query: - return self._rdn_sample_from_user_query() - - if not self.profile_sample or int(self.profile_sample) == 100: - if self._partition_details: - return self._partitioned_table() - - return self.table - - # Add new RandomNumFn column - sampled = self.get_sample_query(column=ccolumn) - - # Assign as an alias - return aliased(self.table, sampled) - - def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData: - """ - Use the sampler to retrieve sample data rows as per limit given by user - - Args: - columns (Optional[List]): List of columns to fetch - Returns: - TableData to be added to the Table Entity - """ - if self._profile_sample_query: - return self._fetch_sample_data_from_user_query() - - # Add new RandomNumFn column - rnd = self.get_sample_query() - if not columns: - sqa_columns = [col for col in inspect(rnd).c if col.name != RANDOM_LABEL] - else: - # we can't directly use columns as it is bound to self.table and not the rnd table. - # If we use it, it will result in a cross join between self.table and rnd table - names = [col.name for col in columns] - sqa_columns = [ - col - for col in inspect(rnd).c - if col.name != RANDOM_LABEL and col.name in names - ] - - try: - sqa_sample = ( - self.client.query(*sqa_columns) - .select_from(rnd) - .limit(self.sample_limit) - .all() - ) - except Exception: - logger.debug( - "Cannot fetch sample data with random sampling. Falling back to 100 rows." - ) - logger.debug(traceback.format_exc()) - sqa_columns = list(inspect(self.table).c) - sqa_sample = ( - self.client.query(*sqa_columns).select_from(self.table).limit(100).all() - ) - - return TableData( - columns=[column.name for column in sqa_columns], - rows=[list(row) for row in sqa_sample], - ) - - def _fetch_sample_data_from_user_query(self) -> TableData: - """Returns a table data object using results from query execution""" - if not is_safe_sql_query(self._profile_sample_query): - raise RuntimeError( - f"SQL expression is not safe\n\n{self._profile_sample_query}" - ) - - rnd = self.client.execute(f"{self._profile_sample_query}") - try: - columns = [col.name for col in rnd.cursor.description] - except AttributeError: - columns = list(rnd.keys()) - return TableData( - columns=columns, - rows=[list(row) for row in rnd.fetchmany(100)], - ) - - def _rdn_sample_from_user_query(self) -> Query: - """Returns sql alchemy object to use when running profiling""" - if not is_safe_sql_query(self._profile_sample_query): - raise RuntimeError( - f"SQL expression is not safe\n\n{self._profile_sample_query}" - ) - - return self.client.query(self.table).from_statement( - text(f"{self._profile_sample_query}") - ) - - def _partitioned_table(self) -> Query: - """Return the Query object for partitioned tables""" - return aliased(self.get_partitioned_query().subquery()) - - def get_partitioned_query(self) -> Query: - """Return the partitioned query""" - self._partition_details = cast( - PartitionProfilerConfig, self._partition_details - ) # satisfying type checker - partition_field = self._partition_details.partitionColumnName - - type_ = get_partition_col_type( - partition_field, - self.table.__table__.c, - ) - - if ( - self._partition_details.partitionIntervalType - == PartitionIntervalTypes.COLUMN_VALUE - ): - return self.client.query(self.table).filter( - get_value_filter( - Column(partition_field), - self._partition_details.partitionValues, - ) - ) - if ( - self._partition_details.partitionIntervalType - == PartitionIntervalTypes.INTEGER_RANGE - ): - 
return self.client.query(self.table).filter( - get_integer_range_filter( - Column(partition_field), - self._partition_details.partitionIntegerRangeStart, - self._partition_details.partitionIntegerRangeEnd, - ) - ) - return self.client.query(self.table).filter( - build_query_filter( - [ - ( - Column(partition_field), - "ge", - dispatch_to_date_or_datetime( - self._partition_details.partitionInterval, - text(self._partition_details.partitionIntervalUnit.value), - type_, - ), - ) - ], - False, - ) - ) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/snowflake/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/snowflake/sampler.py deleted file mode 100644 index 5757782b17d4..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/snowflake/sampler.py +++ /dev/null @@ -1,83 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Helper module to handle data sampling -for the profiler -""" - -from typing import Dict, Optional, cast - -from sqlalchemy import Table -from sqlalchemy.sql.selectable import CTE - -from metadata.generated.schema.entity.data.table import ( - ProfileSampleType, - SamplingMethodType, -) -from metadata.profiler.api.models import ProfileSampleConfig -from metadata.profiler.processor.handle_partition import partition_filter_handler -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT - - -class SnowflakeSampler(SQASampler): - """ - Generates a sample of the data to not - run the query in the whole table. 
- """ - - def __init__( - self, - client, - table, - profile_sample_config: Optional[ProfileSampleConfig] = None, - partition_details: Optional[Dict] = None, - profile_sample_query: Optional[str] = None, - sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, - ): - super().__init__( - client, - table, - profile_sample_config, - partition_details, - profile_sample_query, - sample_data_count, - ) - self.sampling_method_type = SamplingMethodType.BERNOULLI - if profile_sample_config and profile_sample_config.sampling_method_type: - self.sampling_method_type = profile_sample_config.sampling_method_type - - @partition_filter_handler(build_sample=True) - def get_sample_query(self, *, column=None) -> CTE: - """get query for sample data""" - # TABLESAMPLE SYSTEM is not supported for views - self.table = cast(Table, self.table) - - if self.profile_sample_type == ProfileSampleType.PERCENTAGE: - rnd = ( - self._base_sample_query( - column, - ) - .suffix_with( - f"SAMPLE {self.sampling_method_type.value} ({self.profile_sample or 100})", - ) - .cte(f"{self.table.__tablename__}_rnd") - ) - session_query = self.client.query(rnd) - return session_query.cte(f"{self.table.__tablename__}_sample") - - return ( - self._base_sample_query(column) - .suffix_with( - f"TABLESAMPLE ({self.profile_sample or 100} ROWS)", - ) - .cte(f"{self.table.__tablename__}_sample") - ) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/trino/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/trino/sampler.py deleted file mode 100644 index 6cfc69472386..000000000000 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/trino/sampler.py +++ /dev/null @@ -1,47 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Helper module to handle data sampling -for the profiler -""" -from sqlalchemy import inspect, or_, text - -from metadata.profiler.orm.registry import FLOAT_SET -from metadata.profiler.processor.handle_partition import RANDOM_LABEL -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler - - -class TrinoSampler(SQASampler): - """ - Generates a sample of the data to not - run the query in the whole table. 
- """ - - def __init__(self, *args, **kwargs): - # pylint: disable=import-outside-toplevel - from trino.sqlalchemy.dialect import TrinoDialect - - TrinoDialect._json_deserializer = None - - super().__init__(*args, **kwargs) - - def _base_sample_query(self, column, label=None): - sqa_columns = [col for col in inspect(self.table).c if col.name != RANDOM_LABEL] - entity = self.table if column is None else column - return self.client.query(entity, label).where( - or_( - *[ - text(f"is_nan({cols.name}) = False") - for cols in sqa_columns - if type(cols.type) in FLOAT_SET - ] - ) - ) diff --git a/ingestion/src/metadata/profiler/source/base/profiler_source.py b/ingestion/src/metadata/profiler/source/base/profiler_source.py index 8d3d7b1df64a..3878e45e8fa4 100644 --- a/ingestion/src/metadata/profiler/source/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/base/profiler_source.py @@ -14,9 +14,10 @@ its interface """ from copy import deepcopy -from typing import List, Optional, Tuple, Type, cast +from typing import List, Optional, Tuple, cast from sqlalchemy import MetaData +from sqlalchemy.orm import DeclarativeMeta from metadata.generated.schema.configuration.profilerConfiguration import ( ProfilerConfiguration, @@ -40,18 +41,19 @@ ) from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.profiler.api.models import ProfilerProcessorConfig, TableConfig -from metadata.profiler.config import get_schema_profiler_config, get_database_profiler_config from metadata.profiler.interface.profiler_interface import ProfilerInterface from metadata.profiler.metrics.registry import Metrics +from metadata.profiler.orm.converter.base import ometa_to_sqa_orm from metadata.profiler.processor.core import Profiler from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics from metadata.profiler.source.profiler_source_interface import ProfilerSourceInterface -from metadata.sampler.config import get_profile_query, get_sample_data_count_config -from metadata.sampler.models import ProfileSampleConfig +from metadata.sampler.models import SampleConfig from metadata.sampler.sampler_interface import SamplerInterface -from metadata.utils.importer import import_from_module from metadata.utils.logger import profiler_logger -from metadata.utils.service_spec.service_spec import BaseSpec, import_profiler_class, import_sampler_class +from metadata.utils.service_spec.service_spec import ( + import_profiler_class, + import_sampler_class, +) NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,) @@ -101,6 +103,12 @@ def _set_sqa_metadata(self): return MetaData() return None + def _build_table_orm(self, entity: Table) -> Optional[DeclarativeMeta]: + """Build the ORM table if needed for the sampler and profiler interfaces""" + if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): + return ometa_to_sqa_orm(entity, self.ometa_client, self.sqa_metadata) + return None + @staticmethod def get_config_for_table(entity: Table, profiler_config) -> Optional[TableConfig]: """Get config for a specific entity @@ -199,37 +207,32 @@ def create_profiler_interface( sampler_class = import_sampler_class( ServiceType.Database, source_type=self.profiler_interface_type ) + # This is shared between the sampler and profiler interfaces + _orm = self._build_table_orm(entity) sampler_interface: SamplerInterface = sampler_class.create( - table=entity, - sample_config=ProfileSampleConfig( + service_connection_config=self.service_conn_config, + ometa_client=self.ometa_client, + 
entity=entity, + schema_entity=schema_entity, + database_entity=database_entity, + db_service=db_service, + table_config=config, + profiler_config=profiler_config, + sample_config=SampleConfig( profile_sample=self.source_config.profileSample, profile_sample_type=self.source_config.profileSampleType, sampling_method_type=self.source_config.samplingMethodType, ), - profile_sample_query=get_profile_query(entity=entity, entity_config=config), - storage_config=get_sample_data_count_config( - entity=entity, - schema_profiler_config=get_schema_profiler_config( - schema_entity=schema_entity - ), - database_profiler_config=get_database_profiler_config( - database_entity=database_entity - ), - entity_config=config, - source_config=self.source_config, - ), - sample_data_count=self.source_config.sampleDataCount, + default_sample_data_count=self.source_config.sampleDataCount, + orm_table=_orm, ) profiler_interface: ProfilerInterface = profiler_class.create( - entity, - schema_entity, - database_entity, - config, - self.source_config, - self.service_conn_config, + entity=entity, + source_config=self.source_config, + service_connection_config=self.service_conn_config, sampler=sampler_interface, - self.ometa_client, - sqa_metadata=self.sqa_metadata, + ometa_client=self.ometa_client, + orm_table=_orm, ) # type: ignore self.interface = profiler_interface diff --git a/ingestion/src/metadata/sampler/config.py b/ingestion/src/metadata/sampler/config.py index 767067adf576..82d89f50e460 100644 --- a/ingestion/src/metadata/sampler/config.py +++ b/ingestion/src/metadata/sampler/config.py @@ -11,61 +11,65 @@ """ Sampler configuration helpers """ -from typing import Union, Optional - -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import DatabaseServiceProfilerPipeline - -from metadata.generated.schema.entity.data.table import Table, PartitionProfilerConfig - -from metadata.generated.schema.entity.services.databaseService import DatabaseService - -from metadata.generated.schema.entity.services.connections.connectionBasicType import DataStorageConfig +from typing import Optional, Union from metadata.generated.schema.entity.data.database import DatabaseProfilerConfig - -from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchemaProfilerConfig -from metadata.profiler.api.models import DatabaseAndSchemaConfig, ProfilerProcessorConfig, ProfileSampleConfig -from metadata.sampler.models import TableConfig +from metadata.generated.schema.entity.data.databaseSchema import ( + DatabaseSchemaProfilerConfig, +) +from metadata.generated.schema.entity.data.table import PartitionProfilerConfig, Table +from metadata.generated.schema.entity.services.connections.connectionBasicType import ( + DataStorageConfig, +) +from metadata.generated.schema.entity.services.databaseService import DatabaseService +from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( + DatabaseServiceProfilerPipeline, +) +from metadata.profiler.api.models import ( + DatabaseAndSchemaConfig, + ProfilerProcessorConfig, +) +from metadata.sampler.models import SampleConfig, TableConfig def get_sample_storage_config( - config: Union[ - DatabaseSchemaProfilerConfig, - DatabaseProfilerConfig, - DatabaseAndSchemaConfig, - ], + config: Union[ + DatabaseSchemaProfilerConfig, + DatabaseProfilerConfig, + DatabaseAndSchemaConfig, + ], ) -> Optional[DataStorageConfig]: """Get sample storage config""" if ( - config - and config.sampleDataStorageConfig - and 
config.sampleDataStorageConfig.config + config + and config.sampleDataStorageConfig + and config.sampleDataStorageConfig.config ): return config.sampleDataStorageConfig.config return None def get_storage_config_for_table( - entity: Table, - schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], - database_profiler_config: Optional[DatabaseProfilerConfig], - db_service: Optional[DatabaseService], - profiler_config: ProfilerProcessorConfig, + entity: Table, + schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], + database_profiler_config: Optional[DatabaseProfilerConfig], + db_service: Optional[DatabaseService], + profiler_config: ProfilerProcessorConfig, ) -> Optional[DataStorageConfig]: """Get storage config for a specific entity""" for schema_config in profiler_config.schemaConfig: if ( - schema_config.fullyQualifiedName.root - == entity.databaseSchema.fullyQualifiedName - and get_sample_storage_config(schema_config) + schema_config.fullyQualifiedName.root + == entity.databaseSchema.fullyQualifiedName + and get_sample_storage_config(schema_config) ): return get_sample_storage_config(schema_config) for database_config in profiler_config.databaseConfig: if ( - database_config.fullyQualifiedName.root - == entity.database.fullyQualifiedName - and get_sample_storage_config(database_config) + database_config.fullyQualifiedName.root + == entity.database.fullyQualifiedName + and get_sample_storage_config(database_config) ): return get_sample_storage_config(database_config) @@ -73,9 +77,7 @@ def get_storage_config_for_table( return get_sample_storage_config(schema_profiler_config) if get_sample_storage_config(database_profiler_config): - return get_sample_storage_config( - database_profiler_config - ) + return get_sample_storage_config(database_profiler_config) try: return db_service.connection.config.sampleDataStorageConfig.config @@ -86,23 +88,23 @@ def get_storage_config_for_table( def get_profile_sample_config( - entity: Table, - schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], - database_profiler_config: Optional[DatabaseProfilerConfig], - entity_config: Optional[Union[TableConfig, DatabaseAndSchemaConfig]], - source_config: DatabaseServiceProfilerPipeline, -) -> Optional[ProfileSampleConfig]: + entity: Table, + schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], + database_profiler_config: Optional[DatabaseProfilerConfig], + entity_config: Optional[Union[TableConfig, DatabaseAndSchemaConfig]], + source_config: DatabaseServiceProfilerPipeline, +) -> Optional[SampleConfig]: """Get profile sample config for a specific entity""" for config in ( - entity_config, - entity.tableProfilerConfig, - schema_profiler_config, - database_profiler_config, - source_config, + entity_config, + entity.tableProfilerConfig, + schema_profiler_config, + database_profiler_config, + source_config, ): try: if config and config.profileSample: - return ProfileSampleConfig( + return SampleConfig( profile_sample=config.profileSample, profile_sample_type=config.profileSampleType, sampling_method_type=config.samplingMethodType, @@ -114,8 +116,8 @@ def get_profile_sample_config( def get_partition_details( - entity: Table, - entity_config: Optional[TableConfig] = None, + entity: Table, + entity_config: Optional[TableConfig] = None, ) -> Optional[PartitionProfilerConfig]: """_summary_ @@ -133,7 +135,7 @@ def get_partition_details( def get_profile_query( - entity: Table, entity_config: Optional[TableConfig] + entity: Table, entity_config: Optional[TableConfig] ) -> Optional[str]: 
"""get profile query for sampling @@ -158,7 +160,7 @@ def get_sample_data_count_config( schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], database_profiler_config: Optional[DatabaseProfilerConfig], entity_config: Optional[TableConfig], - source_config: DatabaseServiceProfilerPipeline, + default_sample_data_count: int, ) -> Optional[int]: """_summary_ Args: @@ -177,4 +179,4 @@ def get_sample_data_count_config( if config and config.sampleDataCount: return config.sampleDataCount - return source_config.sampleDataCount + return default_sample_data_count diff --git a/ingestion/src/metadata/sampler/models.py b/ingestion/src/metadata/sampler/models.py index 333019d5a201..d50885b0c63c 100644 --- a/ingestion/src/metadata/sampler/models.py +++ b/ingestion/src/metadata/sampler/models.py @@ -11,18 +11,24 @@ """ Sampling Models """ -from typing import Optional, List, Union +from typing import List, Optional, Union -from metadata.generated.schema.entity.services.connections.connectionBasicType import SampleDataStorageConfig - -from metadata.generated.schema.type.basic import FullyQualifiedEntityName from pydantic import Field - -from metadata.generated.schema.entity.data.table import TableData, Table, ProfileSampleType, SamplingMethodType, \ - ColumnProfilerConfig, PartitionProfilerConfig from typing_extensions import Annotated from metadata.config.common import ConfigModel +from metadata.generated.schema.entity.data.table import ( + ColumnProfilerConfig, + PartitionProfilerConfig, + ProfileSampleType, + SamplingMethodType, + Table, + TableData, +) +from metadata.generated.schema.entity.services.connections.connectionBasicType import ( + SampleDataStorageConfig, +) +from metadata.generated.schema.type.basic import FullyQualifiedEntityName from metadata.ingestion.models.custom_pydantic import BaseModel from metadata.ingestion.models.table_metadata import ColumnTag @@ -67,8 +73,6 @@ def from_database_and_schema_config( class DatabaseAndSchemaConfig(BaseProfileConfig): """schema profile config""" - - # TODO: do we even need this now? sampleDataStorageConfig: Optional[SampleDataStorageConfig] = None diff --git a/ingestion/src/metadata/sampler/sampler_factory.py b/ingestion/src/metadata/sampler/sampler_factory.py deleted file mode 100644 index 2a5e52e7211e..000000000000 --- a/ingestion/src/metadata/sampler/sampler_factory.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" -Factory class for creating sampler objects -""" - -from typing import Union - -from metadata.generated.schema.entity.services.connections.database.azureSQLConnection import ( - AzureSQLConnection, -) -from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( - BigQueryConnection, -) -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.dynamoDBConnection import ( - DynamoDBConnection, -) -from metadata.generated.schema.entity.services.connections.database.mongoDBConnection import ( - MongoDBConnection, -) -from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( - SnowflakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.trinoConnection import ( - TrinoConnection, -) -from metadata.generated.schema.entity.services.databaseService import DatabaseConnection -from metadata.profiler.processor.sampler.nosql.sampler import NoSQLSampler -from metadata.profiler.processor.sampler.pandas.sampler import DatalakeSampler -from metadata.profiler.processor.sampler.sqlalchemy.azuresql.sampler import ( - AzureSQLSampler, -) -from metadata.profiler.processor.sampler.sqlalchemy.bigquery.sampler import ( - BigQuerySampler, -) -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler -from metadata.profiler.processor.sampler.sqlalchemy.snowflake.sampler import ( - SnowflakeSampler, -) -from metadata.profiler.processor.sampler.sqlalchemy.trino.sampler import TrinoSampler - - -class SamplerFactory: - """Creational factory for sampler objects""" - - def __init__(self): - self._sampler_type = {} - - def register(self, source_type: str, sampler_class): - """Register a new source type""" - self._sampler_type[source_type] = sampler_class - - def create( - self, source_type: str, *args, **kwargs - ) -> Union[SQASampler, DatalakeSampler]: - """Create source object based on source type""" - sampler_class = self._sampler_type.get(source_type) - if not sampler_class: - sampler_class = self._sampler_type[DatabaseConnection.__name__] - return sampler_class(*args, **kwargs) - return sampler_class(*args, **kwargs) - - -sampler_factory_ = SamplerFactory() -sampler_factory_.register( - source_type=DatabaseConnection.__name__, sampler_class=SQASampler -) -sampler_factory_.register( - source_type=BigQueryConnection.__name__, sampler_class=BigQuerySampler -) -sampler_factory_.register( - source_type=DatalakeConnection.__name__, sampler_class=DatalakeSampler -) -sampler_factory_.register( - source_type=TrinoConnection.__name__, sampler_class=TrinoSampler -) -sampler_factory_.register( - source_type=MongoDBConnection.__name__, sampler_class=NoSQLSampler -) -sampler_factory_.register( - source_type=SnowflakeConnection.__name__, sampler_class=SnowflakeSampler -) -sampler_factory_.register( - source_type=DynamoDBConnection.__name__, sampler_class=NoSQLSampler -) -sampler_factory_.register( - source_type=AzureSQLConnection.__name__, sampler_class=AzureSQLSampler -) diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py index 46c37df2e88d..50318982d1fa 100644 --- a/ingestion/src/metadata/sampler/sampler_interface.py +++ b/ingestion/src/metadata/sampler/sampler_interface.py @@ -15,26 +15,40 @@ from abc import ABC, abstractmethod from typing import Dict, List, Optional, Union -from 
metadata.generated.schema.entity.services.connections.connectionBasicType import DataStorageConfig - - -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import DatabaseServiceProfilerPipeline - -from metadata.generated.schema.entity.data.database import DatabaseProfilerConfig +from metadata.ingestion.ometa.ometa_api import OpenMetadata +from sqlalchemy import Column -from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchemaProfilerConfig +from metadata.generated.schema.entity.data.database import Database +from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema +from metadata.generated.schema.entity.data.table import Table, TableData +from metadata.generated.schema.entity.services.connections.connectionBasicType import ( + DataStorageConfig, +) +from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( + DatalakeConnection, +) +from metadata.generated.schema.entity.services.databaseService import ( + DatabaseConnection, + DatabaseService, +) +from metadata.profiler.api.models import ProfilerProcessorConfig, TableConfig +from metadata.profiler.config import ( + get_database_profiler_config, + get_schema_profiler_config, +) from metadata.profiler.processor.sample_data_handler import upload_sample_data +from metadata.sampler.config import ( + get_profile_query, + get_sample_data_count_config, + get_storage_config_for_table, +) from metadata.sampler.models import SampleConfig - from metadata.sampler.partition import get_partition_details +from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT from metadata.utils.execution_time_tracker import calculate_execution_time from metadata.utils.logger import sampler_logger -from sqlalchemy import Column - -from metadata.generated.schema.entity.data.table import Table, TableData, PartitionProfilerConfig -from metadata.profiler.api.models import TableConfig -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT from metadata.utils.sqa_like_column import SQALikeColumn +from metadata.utils.ssl_manager import get_ssl_connection logger = sampler_logger() @@ -44,50 +58,97 @@ class SamplerInterface(ABC): def __init__( self, - client, - table: Table, + service_connection_config: Union[DatabaseConnection, DatalakeConnection], + ometa_client: OpenMetadata, + entity: Table, sample_config: Optional[SampleConfig] = None, partition_details: Optional[Dict] = None, profile_sample_query: Optional[str] = None, storage_config: DataStorageConfig = None, sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, + **kwargs, ): - self.profile_sample = None - self.profile_sample_type = None - if sample_config: - self.profile_sample = sample_config.profile_sample - self.profile_sample_type = sample_config.profile_sample_type - self.client = client - self.table = table - self._profile_sample_query = profile_sample_query - self.sample_limit = sample_data_count + self.ometa_client = ometa_client self._sample_rows = None - self._partition_details = partition_details + self.sample_config = sample_config + + self.entity = entity + self.profile_sample_query = profile_sample_query + self.sample_limit = sample_data_count + self.partition_details = partition_details self.storage_config = storage_config + self.service_connection_config = service_connection_config + self.connection = get_ssl_connection(self.service_connection_config) + self.client = self.get_client() + @classmethod def create( cls, - client, - table: Table, + service_connection_config: 
Union[DatabaseConnection, DatalakeConnection], + ometa_client: OpenMetadata, + entity: Table, + schema_entity: DatabaseSchema, + database_entity: Database, + db_service: DatabaseService, + table_config: TableConfig, + profiler_config: ProfilerProcessorConfig, sample_config: Optional[SampleConfig] = None, - partition_details: Optional[Dict] = None, - profile_sample_query: Optional[str] = None, - storage_config: DataStorageConfig = None, - sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, + default_sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, + **kwargs, ) -> "SamplerInterface": """Create sampler""" + schema_profiler_config = get_schema_profiler_config(schema_entity=schema_entity) + database_profiler_config = get_database_profiler_config( + database_entity=database_entity + ) + + storage_config = get_storage_config_for_table( + entity=entity, + schema_profiler_config=schema_profiler_config, + database_profiler_config=database_profiler_config, + db_service=db_service, + profiler_config=profiler_config, + ) + + sample_data_count = get_sample_data_count_config( + entity=entity, + schema_profiler_config=schema_profiler_config, + database_profiler_config=database_profiler_config, + entity_config=table_config, + default_sample_data_count=default_sample_data_count, + ) + + profile_sample_query = get_profile_query( + entity=entity, entity_config=table_config + ) + + partition_details = get_partition_details(entity=entity) + return cls( - client=client, - table=table, + service_connection_config=service_connection_config, + ometa_client=ometa_client, + entity=entity, sample_config=sample_config, partition_details=partition_details, profile_sample_query=profile_sample_query, storage_config=storage_config, sample_data_count=sample_data_count, + **kwargs, ) + @property + @abstractmethod + def table(self): + """Table object to run the sampling""" + raise NotImplementedError + + @abstractmethod + def get_client(self): + """Get client""" + raise NotImplementedError + @abstractmethod def _rdn_sample_from_user_query(self): """Get random sample from user query""" @@ -127,17 +188,15 @@ def generate_sample_data(self) -> Optional[TableData]: f"{self.profiler_interface.table_entity.fullyQualifiedName.root}..." # type: ignore ) # TODO: GET COLUMNS? 
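# The refactored SamplerInterface turns sampler construction into a template:
# `create()` resolves entity-level settings once, and each backend only supplies
# `get_client()` and the `table` property. A stripped-down, hypothetical
# rendering of that shape (names and defaults are illustrative, not the real
# classes):
from abc import ABC, abstractmethod
from typing import Any


class MiniSampler(ABC):
    def __init__(self, entity: Any, sample_limit: int = 100, **kwargs: Any):
        self.entity = entity
        self.sample_limit = sample_limit
        self.client = self.get_client()

    @classmethod
    def create(cls, entity: Any, default_limit: int = 100, **kwargs: Any) -> "MiniSampler":
        # per-entity overrides win over the pipeline default, as in
        # get_sample_data_count_config()
        limit = getattr(entity, "sampleDataCount", None) or default_limit
        return cls(entity=entity, sample_limit=limit, **kwargs)

    @abstractmethod
    def get_client(self) -> Any:
        """Build the backend-specific client (SQA session, NoSQL adaptor, ...)."""


class MiniSQASampler(MiniSampler):
    def __init__(self, *args: Any, **kwargs: Any):
        self._table = kwargs.pop("orm_table", None)
        super().__init__(*args, **kwargs)

    @property
    def table(self) -> Any:
        return self._table

    def get_client(self) -> Any:
        return object()  # stands in for a thread-safe SQLAlchemy session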
- table_data = self.fetch_sample_data( - self.table, self.columns - ) + table_data = self.fetch_sample_data(self.columns) upload_sample_data( - data=table_data, entity=self.table, sample_storage_config=self.storage_config, + data=table_data, + entity=self.entity, + sample_storage_config=self.storage_config, ) table_data.rows = table_data.rows[ - : min( - SAMPLE_DATA_DEFAULT_COUNT, self.sample_limit - ) - ] + : min(SAMPLE_DATA_DEFAULT_COUNT, self.sample_limit) + ] return table_data except Exception as err: diff --git a/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py index fb48f80e6623..d59d1604bd5f 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/azuresql/sampler.py @@ -17,7 +17,7 @@ from sqlalchemy import Column from metadata.generated.schema.entity.data.table import TableData -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler +from metadata.sampler.sqlalchemy.sampler import SQASampler class AzureSQLSampler(SQASampler): diff --git a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py index d3174ecd48aa..1e67f07ec398 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py @@ -18,9 +18,9 @@ from sqlalchemy.orm import Query from metadata.generated.schema.entity.data.table import ProfileSampleType, TableType -from metadata.profiler.api.models import ProfileSampleConfig from metadata.profiler.processor.handle_partition import partition_filter_handler -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler +from metadata.sampler.models import SampleConfig +from metadata.sampler.sqlalchemy.sampler import SQASampler from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT @@ -35,7 +35,7 @@ def __init__( self, client, table, - profile_sample_config: Optional[ProfileSampleConfig] = None, + profile_sample_config: Optional[SampleConfig] = None, partition_details: Optional[Dict] = None, profile_sample_query: Optional[str] = None, sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, diff --git a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py index 3398d90e4b66..0839ff230fe5 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py @@ -26,10 +26,12 @@ ProfileSampleType, TableData, ) +from metadata.ingestion.connections.session import create_and_bind_thread_safe_session +from metadata.profiler.orm.converter.base import ometa_to_sqa_orm from metadata.profiler.orm.functions.modulo import ModuloFn from metadata.profiler.orm.functions.random_num import RandomNumFn from metadata.profiler.processor.handle_partition import partition_filter_handler -from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface +from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.helpers import is_safe_sql_query from metadata.utils.logger import profiler_interface_registry_logger from metadata.utils.sqa_utils import ( @@ -67,6 +69,19 @@ class SQASampler(SamplerInterface): run the query in the whole table. 
""" + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._table = kwargs["orm_table"] + + @property + def table(self): + return self._table + + def get_client(self): + """Build the SQA Client""" + session_factory = create_and_bind_thread_safe_session(self.connection) + return session_factory() + def _base_sample_query(self, column: Optional[Column], label=None): """Base query for sampling @@ -85,13 +100,13 @@ def _base_sample_query(self, column: Optional[Column], label=None): @partition_filter_handler(build_sample=True) def get_sample_query(self, *, column=None) -> Query: """get query for sample data""" - if self.profile_sample_type == ProfileSampleType.PERCENTAGE: + if self.sample_config.profile_sample_type == ProfileSampleType.PERCENTAGE: rnd = self._base_sample_query( column, (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL), ).cte(f"{self.table.__tablename__}_rnd") session_query = self.client.query(rnd) - return session_query.where(rnd.c.random <= self.profile_sample).cte( + return session_query.where(rnd.c.random <= self.sample_config.profile_sample).cte( f"{self.table.__tablename__}_sample" ) @@ -102,7 +117,7 @@ def get_sample_query(self, *, column=None) -> Query: ) return ( session_query.order_by(RANDOM_LABEL) - .limit(self.profile_sample) + .limit(self.sample_config.profile_sample) .cte(f"{self.table.__tablename__}_rnd") ) @@ -111,11 +126,11 @@ def random_sample(self, ccolumn=None) -> Union[DeclarativeMeta, AliasedClass]: Either return a sampled CTE of table, or the full table if no sampling is required. """ - if self._profile_sample_query: + if self.profile_sample_query: return self._rdn_sample_from_user_query() - if not self.profile_sample or int(self.profile_sample) == 100: - if self._partition_details: + if not self.sample_config.profile_sample or int(self.sample_config.profile_sample) == 100: + if self.partition_details: return self._partitioned_table() return self.table @@ -132,10 +147,10 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData Args: columns (Optional[List]): List of columns to fetch - Retunrs: + Returns: TableData to be added to the Table Entity """ - if self._profile_sample_query: + if self.profile_sample_query: return self._fetch_sample_data_from_user_query() # Add new RandomNumFn column @@ -176,12 +191,12 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData def _fetch_sample_data_from_user_query(self) -> TableData: """Returns a table data object using results from query execution""" - if not is_safe_sql_query(self._profile_sample_query): + if not is_safe_sql_query(self.profile_sample_query): raise RuntimeError( - f"SQL expression is not safe\n\n{self._profile_sample_query}" + f"SQL expression is not safe\n\n{self.profile_sample_query}" ) - rnd = self.client.execute(f"{self._profile_sample_query}") + rnd = self.client.execute(f"{self.profile_sample_query}") try: columns = [col.name for col in rnd.cursor.description] except AttributeError: @@ -193,13 +208,13 @@ def _fetch_sample_data_from_user_query(self) -> TableData: def _rdn_sample_from_user_query(self) -> Query: """Returns sql alchemy object to use when running profiling""" - if not is_safe_sql_query(self._profile_sample_query): + if not is_safe_sql_query(self.profile_sample_query): raise RuntimeError( - f"SQL expression is not safe\n\n{self._profile_sample_query}" + f"SQL expression is not safe\n\n{self.profile_sample_query}" ) return self.client.query(self.table).from_statement( - 
text(f"{self._profile_sample_query}") + text(f"{self.profile_sample_query}") ) def _partitioned_table(self) -> Query: @@ -208,10 +223,10 @@ def _partitioned_table(self) -> Query: def get_partitioned_query(self) -> Query: """Return the partitioned query""" - self._partition_details = cast( - PartitionProfilerConfig, self._partition_details + self.partition_details = cast( + PartitionProfilerConfig, self.partition_details ) # satisfying type checker - partition_field = self._partition_details.partitionColumnName + partition_field = self.partition_details.partitionColumnName type_ = get_partition_col_type( partition_field, @@ -219,24 +234,24 @@ def get_partitioned_query(self) -> Query: ) if ( - self._partition_details.partitionIntervalType + self.partition_details.partitionIntervalType == PartitionIntervalTypes.COLUMN_VALUE ): return self.client.query(self.table).filter( get_value_filter( Column(partition_field), - self._partition_details.partitionValues, + self.partition_details.partitionValues, ) ) if ( - self._partition_details.partitionIntervalType + self.partition_details.partitionIntervalType == PartitionIntervalTypes.INTEGER_RANGE ): return self.client.query(self.table).filter( get_integer_range_filter( Column(partition_field), - self._partition_details.partitionIntegerRangeStart, - self._partition_details.partitionIntegerRangeEnd, + self.partition_details.partitionIntegerRangeStart, + self.partition_details.partitionIntegerRangeEnd, ) ) return self.client.query(self.table).filter( @@ -246,8 +261,8 @@ def get_partitioned_query(self) -> Query: Column(partition_field), "ge", dispatch_to_date_or_datetime( - self._partition_details.partitionInterval, - text(self._partition_details.partitionIntervalUnit.value), + self.partition_details.partitionInterval, + text(self.partition_details.partitionIntervalUnit.value), type_, ), ) diff --git a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py index 5757782b17d4..368c760ed91e 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py @@ -22,9 +22,9 @@ ProfileSampleType, SamplingMethodType, ) -from metadata.profiler.api.models import ProfileSampleConfig from metadata.profiler.processor.handle_partition import partition_filter_handler -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler +from metadata.sampler.models import SampleConfig +from metadata.sampler.sqlalchemy.sampler import SQASampler from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT @@ -38,7 +38,7 @@ def __init__( self, client, table, - profile_sample_config: Optional[ProfileSampleConfig] = None, + profile_sample_config: Optional[SampleConfig] = None, partition_details: Optional[Dict] = None, profile_sample_query: Optional[str] = None, sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, diff --git a/ingestion/src/metadata/sampler/sqlalchemy/trino/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/trino/sampler.py index 6cfc69472386..37012b3f3027 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/trino/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/trino/sampler.py @@ -16,7 +16,7 @@ from metadata.profiler.orm.registry import FLOAT_SET from metadata.profiler.processor.handle_partition import RANDOM_LABEL -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler +from metadata.sampler.sqlalchemy.sampler import SQASampler class 
TrinoSampler(SQASampler): diff --git a/ingestion/src/metadata/utils/service_spec/service_spec.py b/ingestion/src/metadata/utils/service_spec/service_spec.py index 808626a11023..1ed750a9b3a6 100644 --- a/ingestion/src/metadata/utils/service_spec/service_spec.py +++ b/ingestion/src/metadata/utils/service_spec/service_spec.py @@ -48,6 +48,7 @@ class BaseSpec(BaseModel): metadata_source_class: str lineage_source_class: Optional[str] = None usage_source_class: Optional[str] = None + sampler_class: Optional[str] = None @model_validator(mode="before") @classmethod diff --git a/ingestion/src/metadata/workflow/profiler.py b/ingestion/src/metadata/workflow/profiler.py index 5f8d692d2f20..6ed66d3ae99d 100644 --- a/ingestion/src/metadata/workflow/profiler.py +++ b/ingestion/src/metadata/workflow/profiler.py @@ -68,15 +68,7 @@ def set_steps(self): profiler_processor = self._get_profiler_processor() sink = self._get_sink() - # Only instantiate the PII Processor on demand - source_config: DatabaseServiceProfilerPipeline = cast( - DatabaseServiceProfilerPipeline, self.config.source.sourceConfig.config - ) - if source_config.processPiiSensitive: - pii_processor = self._get_pii_processor() - self.steps = (profiler_processor, pii_processor, sink) - else: - self.steps = (profiler_processor, sink) + self.steps = (profiler_processor, sink) def test_connection(self) -> None: service_config = self.config.source.serviceConnection.root.config @@ -97,6 +89,3 @@ def _get_sink(self) -> Sink: def _get_profiler_processor(self) -> Processor: return ProfilerProcessor.create(self.config.model_dump(), self.metadata) - - def _get_pii_processor(self) -> Processor: - return PIIProcessor.create(self.config.model_dump(), self.metadata) From 9ad37c40db595515426989a1d9e925fe0e9871de Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Tue, 12 Nov 2024 12:55:24 +0100 Subject: [PATCH 03/29] separate sampler --- .../pandas/pandas_test_suite_interface.py | 29 ++-- .../sqlalchemy/sqa_test_suite_interface.py | 41 ++--- .../interface/test_suite_interface.py | 72 ++++---- .../interface/test_suite_interface_factory.py | 158 ------------------ .../runner/base_test_suite_source.py | 67 ++++++-- .../source/database/azuresql/service_spec.py | 2 + .../source/database/bigquery/service_spec.py | 2 + .../database/databricks/service_spec.py | 4 + .../source/database/datalake/service_spec.py | 9 +- .../source/database/dynamodb/service_spec.py | 5 +- .../source/database/mongodb/service_spec.py | 5 +- .../source/database/snowflake/service_spec.py | 6 + .../source/database/trino/service_spec.py | 2 + .../database/unitycatalog/service_spec.py | 4 + .../metadata/mixins/sqalchemy/sqa_mixin.py | 5 +- .../interface/pandas/profiler_interface.py | 60 +++---- .../sqlalchemy/profiler_interface.py | 20 ++- .../metadata/profiler/orm/converter/base.py | 14 +- .../profiler/source/base/profiler_source.py | 50 +----- ingestion/src/metadata/sampler/config.py | 29 +++- ingestion/src/metadata/sampler/models.py | 1 + .../src/metadata/sampler/nosql/sampler.py | 26 ++- .../src/metadata/sampler/pandas/sampler.py | 55 +++--- .../src/metadata/sampler/sampler_interface.py | 66 +++----- .../sampler/sqlalchemy/bigquery/sampler.py | 4 +- .../metadata/sampler/sqlalchemy/sampler.py | 32 ++-- .../sampler/sqlalchemy/snowflake/sampler.py | 8 +- .../src/metadata/utils/profiler_utils.py | 41 +++++ .../metadata/utils/service_spec/default.py | 4 + .../utils/service_spec/service_spec.py | 9 + ingestion/src/metadata/workflow/profiler.py | 5 - .../tests/unit/profiler/pandas/test_sample.py 
| 2 +- .../tests/unit/test_suite/test_factories.py | 120 ------------- 33 files changed, 372 insertions(+), 585 deletions(-) delete mode 100644 ingestion/src/metadata/data_quality/interface/test_suite_interface_factory.py delete mode 100644 ingestion/tests/unit/test_suite/test_factories.py diff --git a/ingestion/src/metadata/data_quality/interface/pandas/pandas_test_suite_interface.py b/ingestion/src/metadata/data_quality/interface/pandas/pandas_test_suite_interface.py index 2546242e6bd5..ed7db1218083 100644 --- a/ingestion/src/metadata/data_quality/interface/pandas/pandas_test_suite_interface.py +++ b/ingestion/src/metadata/data_quality/interface/pandas/pandas_test_suite_interface.py @@ -13,20 +13,17 @@ Interfaces with database for all database engine supporting sqlalchemy abstraction layer """ - from metadata.data_quality.builders.i_validator_builder import IValidatorBuilder from metadata.data_quality.builders.pandas_validator_builder import ( PandasValidatorBuilder, ) from metadata.data_quality.interface.test_suite_interface import TestSuiteInterface from metadata.generated.schema.entity.data.table import Table -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection from metadata.generated.schema.tests.testCase import TestCase from metadata.ingestion.ometa.ometa_api import OpenMetadata -from metadata.ingestion.source.connections import get_connection from metadata.mixins.pandas.pandas_mixin import PandasInterfaceMixin +from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.logger import test_suite_logger logger = test_suite_logger() @@ -41,15 +38,18 @@ class PandasTestSuiteInterface(TestSuiteInterface, PandasInterfaceMixin): def __init__( self, - service_connection_config: DatalakeConnection, + service_connection_config: DatabaseConnection, ometa_client: OpenMetadata, - table_entity: Table = None, + sampler: SamplerInterface, + table_entity: Table, **kwargs, # pylint: disable=unused-argument ): - self.table_entity = table_entity - - self.ometa_client = ometa_client - self.service_connection_config = service_connection_config + super().__init__( + service_connection_config, + ometa_client, + sampler, + table_entity, + ) ( self.table_sample_query, @@ -58,12 +58,7 @@ def __init__( ) = self._get_table_config() # add partition logic to test suite - self.dfs = self.return_ometa_dataframes_sampled( - service_connection_config=self.service_connection_config, - client=get_connection(self.service_connection_config).client._client, - table=self.table_entity, - profile_sample_config=self.table_sample_config, - ) + self.dfs = self.sampler.table if self.dfs and self.table_partition_config: self.dfs = self.get_partitioned_df(self.dfs) diff --git a/ingestion/src/metadata/data_quality/interface/sqlalchemy/sqa_test_suite_interface.py b/ingestion/src/metadata/data_quality/interface/sqlalchemy/sqa_test_suite_interface.py index 6a04ad05daf5..2d37b4fac006 100644 --- a/ingestion/src/metadata/data_quality/interface/sqlalchemy/sqa_test_suite_interface.py +++ b/ingestion/src/metadata/data_quality/interface/sqlalchemy/sqa_test_suite_interface.py @@ -29,7 +29,7 @@ from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.mixins.sqalchemy.sqa_mixin import SQAInterfaceMixin from metadata.profiler.processor.runner import QueryRunner -from metadata.sampler.sqlalchemy.sampler import SQASampler +from 
metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.constants import TEN_MIN from metadata.utils.logger import test_suite_logger from metadata.utils.ssl_manager import get_ssl_connection @@ -49,14 +49,18 @@ def __init__( self, service_connection_config: DatabaseConnection, ometa_client: OpenMetadata, + sampler: SamplerInterface, table_entity: Table = None, - sqa_metadata=None, + orm_table=None, ): - self.ometa_client = ometa_client - self.table_entity = table_entity - self.service_connection_config = service_connection_config + super().__init__( + service_connection_config, + ometa_client, + sampler, + table_entity, + ) self.create_session() - self._table = self._convert_table_to_orm_object(sqa_metadata) + self._table = orm_table ( self.table_sample_query, @@ -64,7 +68,6 @@ def __init__( self.table_partition_config, ) = self._get_table_config() - self._sampler = self._create_sampler() self._runner = self._create_runner() def create_session(self): @@ -95,15 +98,6 @@ def runner(self) -> QueryRunner: """ return self._runner - @property - def sampler(self) -> SQASampler: - """getter method for the Runner object - - Returns: - Sampler: sampler object - """ - return self._sampler - @property def table(self): """getter method for the table object @@ -113,21 +107,6 @@ def table(self): """ return self._table - def _create_sampler(self) -> SQASampler: - """Create sampler instance""" - from metadata.profiler.processor.sampler.sampler_factory import ( # pylint: disable=import-outside-toplevel - sampler_factory_, - ) - - return sampler_factory_.create( - self.service_connection_config.__class__.__name__, - client=self.session, - table=self.table, - profile_sample_config=self.table_sample_config, - partition_details=self.table_partition_config, - profile_sample_query=self.table_sample_query, - ) - def _create_runner(self) -> None: """Create a QueryRunner Instance""" diff --git a/ingestion/src/metadata/data_quality/interface/test_suite_interface.py b/ingestion/src/metadata/data_quality/interface/test_suite_interface.py index 57ea7358f511..82e58ffa2842 100644 --- a/ingestion/src/metadata/data_quality/interface/test_suite_interface.py +++ b/ingestion/src/metadata/data_quality/interface/test_suite_interface.py @@ -31,8 +31,7 @@ from metadata.generated.schema.tests.testCase import TestCase from metadata.generated.schema.tests.testDefinition import TestDefinition from metadata.ingestion.ometa.ometa_api import OpenMetadata -from metadata.sampler.config import get_partition_details -from metadata.sampler.models import SampleConfig +from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.logger import test_suite_logger logger = test_suite_logger() @@ -43,27 +42,39 @@ class TestSuiteInterface(ABC): runtime_params_setter_fact = RuntimeParameterSetterFactory - @abstractmethod def __init__( self, service_connection_config: DatabaseConnection, ometa_client: OpenMetadata, + sampler: SamplerInterface, table_entity: Table, + *args, + **kwargs, ): """Required attribute for the interface""" self.ometa_client = ometa_client self.service_connection_config = service_connection_config self.table_entity = table_entity + self.sampler = sampler - @property - def sampler(self): - """Get the sampler object - - Note: Overriden in the implementation class. This should be removed from the interface. It has been - implemented as the RuntimeParameterSetter takes the sampler as an argument, though we may want to - remove that dependency. 
- """ - return None + @classmethod + def create( + cls, + service_connection_config: DatabaseConnection, + ometa_client: OpenMetadata, + sampler: SamplerInterface, + table_entity: Table, + *args, + **kwargs, + ): + return cls( + service_connection_config, + ometa_client, + sampler, + table_entity, + *args, + **kwargs, + ) @abstractmethod def _get_validator_builder( @@ -128,37 +139,10 @@ def run_test_case(self, test_case: TestCase) -> Optional[TestCaseResult]: ) raise RuntimeError(err) - def _get_sample_query(self) -> Optional[str]: - """Get the sampling query for the data quality tests - - Args: - entity (Table): _description_ - """ - if self.table_entity.tableProfilerConfig: - return self.table_entity.tableProfilerConfig.profileQuery - - return None - - def _get_profile_sample(self) -> Optional[SampleConfig]: - try: - if self.table_entity.tableProfilerConfig.profileSample: - return SampleConfig( - profile_sample=self.table_entity.tableProfilerConfig.profileSample, - profile_sample_type=self.table_entity.tableProfilerConfig.profileSampleType, - ) - except AttributeError: - # if tableProfilerConfig is None it will indicate that the table has not profiler config - # hence we can return None - return None - return None - def _get_table_config(self): """Get the sampling configuration for the data quality tests""" - sample_query = self._get_sample_query() - sample_config = None - partition_config = None - if not sample_query: - sample_config = self._get_profile_sample() - partition_config = get_partition_details(self.table_entity) - - return sample_query, sample_config, partition_config + return ( + self.sampler.sample_query, + self.sampler.sample_config, + self.sampler.partition_details, + ) diff --git a/ingestion/src/metadata/data_quality/interface/test_suite_interface_factory.py b/ingestion/src/metadata/data_quality/interface/test_suite_interface_factory.py deleted file mode 100644 index 15b76cbc4790..000000000000 --- a/ingestion/src/metadata/data_quality/interface/test_suite_interface_factory.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# pylint: disable=import-outside-toplevel -""" -Interface factory -""" -import traceback -from logging import Logger -from typing import Callable, Dict, Type - -from metadata.data_quality.interface.test_suite_interface import TestSuiteInterface -from metadata.generated.schema.entity.data.table import Table -from metadata.generated.schema.entity.services.connections.database.databricksConnection import ( - DatabricksConnection, -) -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( - SnowflakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import ( - UnityCatalogConnection, -) -from metadata.generated.schema.entity.services.databaseService import DatabaseConnection -from metadata.ingestion.ometa.ometa_api import OpenMetadata -from metadata.utils.logger import test_suite_logger - -logger: Logger = test_suite_logger() - - -class TestSuiteInterfaceFactory: - """Factory class for the data quality interface""" - - def __init__(self): - """Initialize the interface factory""" - self._interface_type: Dict[str, Callable[[], Type[TestSuiteInterface]]] = { - "base": self.sqa, - } - - def register(self, interface_type: str, fn: Callable[[], Type[TestSuiteInterface]]): - """Register the interface - - Args: - interface_type (str): type of the interface - interface (callable): a class that implements the TestSuiteInterface - """ - self._interface_type[interface_type] = fn - - def register_many(self, interface_dict): - """ - Registers multiple profiler interfaces at once. - - Args: - interface_dict: A dictionary mapping connection class names (strings) to their - corresponding profiler interface classes. 
- """ - for interface_type, interface_fn in interface_dict.items(): - self.register(interface_type, interface_fn) - - def create( - self, - service_connection_config: DatabaseConnection, - ometa_client: OpenMetadata, - table_entity: Table, - *args, - **kwargs, - ) -> TestSuiteInterface: - """Create the interface - - Args: - service_connection_config (DatabaseService): a database service object - - Raises: - AttributeError: if no connection is found in the database service object - - Returns: - TestSuiteInterface: - """ - try: - connection_type = service_connection_config.__class__.__name__ - except AttributeError as err: - logger.debug(traceback.format_exc()) - raise AttributeError(f"Could not instantiate interface class: {err}") - interface_fn = self._interface_type.get(connection_type) - - if not interface_fn: - interface_fn = self._interface_type["base"] - - interface_class = interface_fn() - return interface_class( - service_connection_config, ometa_client, table_entity, *args, **kwargs - ) - - @staticmethod - def sqa() -> Type[TestSuiteInterface]: - """Lazy load the SQATestSuiteInterface""" - from metadata.data_quality.interface.sqlalchemy.sqa_test_suite_interface import ( - SQATestSuiteInterface, - ) - - return SQATestSuiteInterface - - @staticmethod - def pandas() -> Type[TestSuiteInterface]: - """Lazy load the PandasTestSuiteInterface""" - from metadata.data_quality.interface.pandas.pandas_test_suite_interface import ( - PandasTestSuiteInterface, - ) - - return PandasTestSuiteInterface - - @staticmethod - def snowflake() -> Type[TestSuiteInterface]: - """Lazy load the SnowflakeTestSuiteInterface""" - from metadata.data_quality.interface.sqlalchemy.snowflake.test_suite_interface import ( - SnowflakeTestSuiteInterface, - ) - - return SnowflakeTestSuiteInterface - - @staticmethod - def unity_catalog() -> Type[TestSuiteInterface]: - """Lazy load the UnityCatalogTestSuiteInterface""" - from metadata.data_quality.interface.sqlalchemy.unity_catalog.test_suite_interface import ( - UnityCatalogTestSuiteInterface, - ) - - return UnityCatalogTestSuiteInterface - - @staticmethod - def databricks() -> Type[TestSuiteInterface]: - """Lazy load the DatabricksTestSuiteInterface""" - from metadata.data_quality.interface.sqlalchemy.databricks.test_suite_interface import ( - DatabricksTestSuiteInterface, - ) - - return DatabricksTestSuiteInterface - - -test_suite_interface = { - DatabaseConnection.__name__: TestSuiteInterfaceFactory.sqa, - DatalakeConnection.__name__: TestSuiteInterfaceFactory.pandas, - SnowflakeConnection.__name__: TestSuiteInterfaceFactory.snowflake, - UnityCatalogConnection.__name__: TestSuiteInterfaceFactory.unity_catalog, - DatabricksConnection.__name__: TestSuiteInterfaceFactory.databricks, -} - -test_suite_interface_factory = TestSuiteInterfaceFactory() -test_suite_interface_factory.register_many(test_suite_interface) diff --git a/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py b/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py index 5bdaefa250b4..697f85195c2d 100644 --- a/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py +++ b/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py @@ -16,22 +16,32 @@ from typing import Optional, cast from sqlalchemy import MetaData +from sqlalchemy.orm import DeclarativeMeta from metadata.data_quality.interface.test_suite_interface import TestSuiteInterface -from metadata.data_quality.interface.test_suite_interface_factory import ( - test_suite_interface_factory, -) from 
metadata.data_quality.runner.core import DataTestsRunner from metadata.generated.schema.entity.data.table import Table from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) from metadata.generated.schema.entity.services.databaseService import DatabaseConnection +from metadata.generated.schema.entity.services.serviceType import ServiceType +from metadata.generated.schema.metadataIngestion.testSuitePipeline import ( + TestSuitePipeline, +) from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, ) from metadata.generated.schema.type.entityReference import EntityReference from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.profiler.orm.converter.base import ometa_to_sqa_orm +from metadata.sampler.models import SampleConfig +from metadata.sampler.sampler_interface import SamplerInterface +from metadata.utils.profiler_utils import get_context_entities +from metadata.utils.service_spec.service_spec import ( + import_sampler_class, + import_test_suite_class, +) NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,) @@ -46,8 +56,12 @@ def __init__( entity: Table, ): self._interface = None + self._interface_type: str = config.source.type.lower() self.entity = entity self.service_conn_config = self._copy_service_config(config, self.entity.database) # type: ignore + self.source_config = TestSuitePipeline.model_validate( + config.source.sourceConfig.config + ) self.ometa_client = ometa_client self.sqa_metadata = self._set_sqa_metadata() @@ -93,22 +107,51 @@ def _set_sqa_metadata(self): return MetaData() return None + def _build_table_orm(self, entity: Table) -> Optional[DeclarativeMeta]: + """Build the ORM table if needed for the sampler and profiler interfaces""" + if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): + return ometa_to_sqa_orm(entity, self.ometa_client, self.sqa_metadata) + return None + def create_data_quality_interface(self) -> TestSuiteInterface: """Create data quality interface Returns: TestSuiteInterface: a data quality interface """ - data_quality_interface: TestSuiteInterface = ( - test_suite_interface_factory.create( - self.service_conn_config, - self.ometa_client, - self.entity, - sqa_metadata=self.sqa_metadata, - ) + schema_entity, database_entity, _ = get_context_entities( + entity=self.entity, metadata=self.ometa_client + ) + test_suite_class = import_test_suite_class( + ServiceType.Database, source_type=self._interface_type + ) + sampler_class = import_sampler_class( + ServiceType.Database, source_type=self._interface_type + ) + # This is shared between the sampler and DQ interfaces + _orm = self._build_table_orm(self.entity) + sampler_interface: SamplerInterface = sampler_class.create( + service_connection_config=self.service_conn_config, + ometa_client=self.ometa_client, + entity=self.entity, + schema_entity=schema_entity, + database_entity=database_entity, + sample_config=SampleConfig( + profile_sample=self.source_config.profileSample, + profile_sample_type=self.source_config.profileSampleType, + sampling_method_type=self.source_config.samplingMethodType, + ), + orm_table=_orm, + ) + + self.interface: TestSuiteInterface = test_suite_class.create( + self.service_conn_config, + self.ometa_client, + sampler_interface, + self.entity, + orm_table=_orm, ) - self.interface = data_quality_interface - return data_quality_interface + return self.interface def get_data_quality_runner(self) -> DataTestsRunner: """Get a data quality runner 
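[Editor's note on the wiring above, before the per-service spec changes that follow: the runner no longer hard-codes interface classes; it asks each source's ServiceSpec for them through `import_test_suite_class` / `import_sampler_class`. The bodies of those helpers are not shown in this patch, so the sketch below is only a minimal illustration of the resolution pattern, assuming specs live in a `service_spec` module under each source package and that `BaseSpec` may store classes as dotted-path strings, as its `sampler_class: Optional[str]` field suggests.]

```python
# Illustrative sketch only -- not this patch's implementation of
# metadata.utils.service_spec.service_spec.import_sampler_class.
import importlib
from typing import Type


def import_sampler_class_sketch(service_type: str, source_type: str) -> Type:
    """Resolve the sampler class registered in a source's ServiceSpec."""
    # Assumption: each source package ships a service_spec module exposing a
    # module-level ServiceSpec object (see the azuresql spec below).
    module = importlib.import_module(
        f"metadata.ingestion.source.{service_type.lower()}.{source_type}.service_spec"
    )
    sampler = module.ServiceSpec.sampler_class
    if isinstance(sampler, str):  # dotted-path form, e.g. "pkg.module.AzureSQLSampler"
        module_path, _, class_name = sampler.rpartition(".")
        sampler = getattr(importlib.import_module(module_path), class_name)
    return sampler
```

[With that wiring, `create_data_quality_interface` stays generic: a source that declares a `sampler_class` in its spec file gets its specialized sampler, and the resolved class is instantiated through `SamplerInterface.create` alongside the matching test-suite interface.]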
diff --git a/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py b/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py index 14072b8618a8..ad1b0d41ac00 100644 --- a/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/azuresql/service_spec.py @@ -1,10 +1,12 @@ from metadata.ingestion.source.database.azuresql.lineage import AzuresqlLineageSource from metadata.ingestion.source.database.azuresql.metadata import AzuresqlSource from metadata.ingestion.source.database.azuresql.usage import AzuresqlUsageSource +from metadata.sampler.sqlalchemy.azuresql.sampler import AzureSQLSampler from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( metadata_source_class=AzuresqlSource, lineage_source_class=AzuresqlLineageSource, usage_source_class=AzuresqlUsageSource, + sampler_class=AzureSQLSampler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py b/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py index bf97171d0982..8140e30143ec 100644 --- a/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/bigquery/service_spec.py @@ -4,6 +4,7 @@ BigQueryProfiler, ) from metadata.ingestion.source.database.bigquery.usage import BigqueryUsageSource +from metadata.sampler.sqlalchemy.bigquery.sampler import BigQuerySampler from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( @@ -11,4 +12,5 @@ lineage_source_class=BigqueryLineageSource, usage_source_class=BigqueryUsageSource, profiler_class=BigQueryProfiler, + sampler_class=BigQuerySampler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py b/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py index 3bf1978a8a99..36f6c42cb4bc 100644 --- a/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/databricks/service_spec.py @@ -1,3 +1,6 @@ +from metadata.data_quality.interface.sqlalchemy.databricks.test_suite_interface import ( + DatabricksTestSuiteInterface, +) from metadata.ingestion.source.database.databricks.lineage import ( DatabricksLineageSource, ) @@ -13,4 +16,5 @@ lineage_source_class=DatabricksLineageSource, usage_source_class=DatabricksUsageSource, profiler_class=DatabricksProfilerInterface, + test_suite_class=DatabricksTestSuiteInterface, ) diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py index bbd36b6f312c..98a417a7e41d 100644 --- a/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/datalake/service_spec.py @@ -1,9 +1,16 @@ +from metadata.data_quality.interface.pandas.pandas_test_suite_interface import ( + PandasTestSuiteInterface, +) from metadata.ingestion.source.database.datalake.metadata import DatalakeSource from metadata.profiler.interface.pandas.profiler_interface import ( PandasProfilerInterface, ) +from metadata.sampler.pandas.sampler import DatalakeSampler from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=DatalakeSource, profiler_class=PandasProfilerInterface + metadata_source_class=DatalakeSource, + 
profiler_class=PandasProfilerInterface, + test_suite_class=PandasTestSuiteInterface, + sampler_class=DatalakeSampler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py index 5c5555707dae..f5372cbf04ff 100644 --- a/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/dynamodb/service_spec.py @@ -1,7 +1,10 @@ from metadata.ingestion.source.database.dynamodb.metadata import DynamodbSource from metadata.profiler.interface.nosql.profiler_interface import NoSQLProfilerInterface +from metadata.sampler.nosql.sampler import NoSQLSampler from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=DynamodbSource, profiler_class=NoSQLProfilerInterface + metadata_source_class=DynamodbSource, + profiler_class=NoSQLProfilerInterface, + sampler_class=NoSQLSampler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py b/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py index b3feafb4a665..e63169751d51 100644 --- a/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/mongodb/service_spec.py @@ -1,7 +1,10 @@ from metadata.ingestion.source.database.mongodb.metadata import MongodbSource from metadata.profiler.interface.nosql.profiler_interface import NoSQLProfilerInterface +from metadata.sampler.nosql.sampler import NoSQLSampler from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( - metadata_source_class=MongodbSource, profiler_class=NoSQLProfilerInterface + metadata_source_class=MongodbSource, + profiler_class=NoSQLProfilerInterface, + sampler_class=NoSQLSampler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py b/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py index 51ffc62ed29e..09aa78a249dc 100644 --- a/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/snowflake/service_spec.py @@ -1,9 +1,13 @@ +from metadata.data_quality.interface.sqlalchemy.snowflake.test_suite_interface import ( + SnowflakeTestSuiteInterface, +) from metadata.ingestion.source.database.snowflake.lineage import SnowflakeLineageSource from metadata.ingestion.source.database.snowflake.metadata import SnowflakeSource from metadata.ingestion.source.database.snowflake.profiler.profiler import ( SnowflakeProfiler, ) from metadata.ingestion.source.database.snowflake.usage import SnowflakeUsageSource +from metadata.sampler.sqlalchemy.snowflake.sampler import SnowflakeSampler from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( @@ -11,4 +15,6 @@ lineage_source_class=SnowflakeLineageSource, usage_source_class=SnowflakeUsageSource, profiler_class=SnowflakeProfiler, + test_suite_class=SnowflakeTestSuiteInterface, + sampler_class=SnowflakeSampler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py b/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py index 4242ea01e9b3..f4c8b5061549 100644 --- a/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/trino/service_spec.py @@ -4,6 +4,7 @@ from 
metadata.profiler.interface.sqlalchemy.trino.profiler_interface import ( TrinoProfilerInterface, ) +from metadata.sampler.sqlalchemy.trino.sampler import TrinoSampler from metadata.utils.service_spec.default import DefaultDatabaseSpec ServiceSpec = DefaultDatabaseSpec( @@ -11,4 +12,5 @@ lineage_source_class=TrinoLineageSource, usage_source_class=TrinoUsageSource, profiler_class=TrinoProfilerInterface, + sampler_class=TrinoSampler, ) diff --git a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py index 676941465306..ca9f17b254e5 100644 --- a/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py +++ b/ingestion/src/metadata/ingestion/source/database/unitycatalog/service_spec.py @@ -1,3 +1,6 @@ +from metadata.data_quality.interface.sqlalchemy.unity_catalog.test_suite_interface import ( + UnityCatalogTestSuiteInterface, +) from metadata.ingestion.source.database.unitycatalog.lineage import ( UnitycatalogLineageSource, ) @@ -15,4 +18,5 @@ lineage_source_class=UnitycatalogLineageSource, usage_source_class=UnitycatalogUsageSource, profiler_class=UnityCatalogProfilerInterface, + test_suite_class=UnityCatalogTestSuiteInterface, ) diff --git a/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py b/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py index dcc583fe4ba2..0246987a15e9 100644 --- a/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py +++ b/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py @@ -15,10 +15,9 @@ """ -from typing import List, Optional +from typing import List -from sqlalchemy import Column, MetaData, inspect -from sqlalchemy.orm import DeclarativeMeta +from sqlalchemy import Column, inspect from metadata.generated.schema.entity.services.connections.database.databricksConnection import ( DatabricksConnection, diff --git a/ingestion/src/metadata/profiler/interface/pandas/profiler_interface.py b/ingestion/src/metadata/profiler/interface/pandas/profiler_interface.py index 8031e1cb1ac7..e4279c4ea14c 100644 --- a/ingestion/src/metadata/profiler/interface/pandas/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/pandas/profiler_interface.py @@ -18,19 +18,32 @@ from collections import defaultdict from copy import deepcopy from datetime import datetime -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union from sqlalchemy import Column -from metadata.generated.schema.entity.data.table import CustomMetricProfile, DataType +from metadata.generated.schema.entity.data.table import ( + CustomMetricProfile, + DataType, + Table, +) +from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( + DatalakeConnection, +) +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection +from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( + DatabaseServiceProfilerPipeline, +) from metadata.generated.schema.tests.customMetric import CustomMetric +from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.mixins.pandas.pandas_mixin import PandasInterfaceMixin from metadata.profiler.api.models import ThreadPoolMetrics from metadata.profiler.interface.profiler_interface import ProfilerInterface from metadata.profiler.metrics.core import MetricTypes from metadata.profiler.metrics.registry import Metrics from metadata.profiler.processor.metric_filter import MetricFilter -from metadata.utils.constants import 
COMPLEX_COLUMN_SEPARATOR, SAMPLE_DATA_DEFAULT_COUNT +from metadata.sampler.pandas.sampler import DatalakeSampler +from metadata.utils.constants import COMPLEX_COLUMN_SEPARATOR from metadata.utils.datalake.datalake_utils import GenericDataFrameColumnParser from metadata.utils.logger import profiler_interface_registry_logger from metadata.utils.sqa_like_column import SQALikeColumn @@ -48,43 +61,30 @@ class PandasProfilerInterface(ProfilerInterface, PandasInterfaceMixin): def __init__( self, - service_connection_config, - ometa_client, - entity, - storage_config, - profile_sample_config, - source_config, - sample_query, - table_partition_config, + service_connection_config: Union[DatabaseConnection, DatalakeConnection], + ometa_client: OpenMetadata, + entity: Table, + source_config: DatabaseServiceProfilerPipeline, + sampler: DatalakeSampler, thread_count: int = 5, timeout_seconds: int = 43200, - sample_data_count: int = SAMPLE_DATA_DEFAULT_COUNT, **kwargs, ): """Instantiate Pandas Interface object""" super().__init__( - service_connection_config, - ometa_client, - entity, - storage_config, - profile_sample_config, - source_config, - sample_query, - table_partition_config, - thread_count, - timeout_seconds, - sample_data_count, + service_connection_config=service_connection_config, + ometa_client=ometa_client, + entity=entity, + source_config=source_config, + sampler=sampler, + thread_count=thread_count, + timeout_seconds=timeout_seconds, **kwargs, ) - self.client = self.connection.client - self.dfs = self.return_ometa_dataframes_sampled( - service_connection_config=self.service_connection_config, - client=self.client._client, - table=self.table_entity, - profile_sample_config=profile_sample_config, - ) + self.client = self.sampler.client + self.dfs = self.sampler.table self.complex_dataframe_sample = deepcopy( self.sampler.random_sample(is_sampled=True) ) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index 2a14fe6e7d8f..ed66cbf7aae5 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -23,18 +23,21 @@ from datetime import datetime from typing import Any, Dict, List, Optional, Type, Union -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import DatabaseServiceProfilerPipeline - -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import DatalakeConnection - -from metadata.generated.schema.entity.services.databaseService import DatabaseConnection from sqlalchemy import Column, inspect, text from sqlalchemy.exc import DBAPIError, ProgrammingError, ResourceClosedError -from sqlalchemy.orm import scoped_session, DeclarativeMeta +from sqlalchemy.orm import DeclarativeMeta, scoped_session from metadata.generated.schema.entity.data.table import ( CustomMetricProfile, - SystemProfile, Table, + SystemProfile, + Table, +) +from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( + DatalakeConnection, +) +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection +from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( + DatabaseServiceProfilerPipeline, ) from metadata.generated.schema.tests.customMetric import CustomMetric from metadata.ingestion.connections.session import 
create_and_bind_thread_safe_session @@ -53,7 +56,6 @@ from metadata.profiler.processor.metric_filter import MetricFilter from metadata.profiler.processor.runner import QueryRunner from metadata.sampler.sampler_interface import SamplerInterface -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT from metadata.utils.custom_thread_pool import CustomThreadPoolExecutor from metadata.utils.helpers import is_safe_sql_query from metadata.utils.logger import profiler_interface_registry_logger @@ -387,7 +389,7 @@ def _create_thread_safe_runner( table=table, sample=sample, partition_details=self.sampler.partition_details, - profile_sample_query=self.sampler.profile_sample_query, + profile_sample_query=self.sampler.sample_query, ) return thread_local.runner thread_local.runner._sample = sample # pylint: disable=protected-access diff --git a/ingestion/src/metadata/profiler/orm/converter/base.py b/ingestion/src/metadata/profiler/orm/converter/base.py index 59ed39665b2a..daa0cd8c9ceb 100644 --- a/ingestion/src/metadata/profiler/orm/converter/base.py +++ b/ingestion/src/metadata/profiler/orm/converter/base.py @@ -117,14 +117,14 @@ def ometa_to_sqa_orm( orm_database_name = get_orm_database(table, metadata) # SQLite does not support schemas - orm_schema_name = get_orm_schema(table, metadata) if table.serviceType != databaseService.DatabaseServiceType.SQLite else None + orm_schema_name = ( + get_orm_schema(table, metadata) + if table.serviceType != databaseService.DatabaseServiceType.SQLite + else None + ) orm_name = f"{orm_database_name}_{orm_schema_name}_{table.name.root}".replace( ".", "_" ) - orm_key = f"{orm_schema_name}.{table.name.root}" if orm_schema_name else table.name.root - visited = False - if orm_key in _metadata.tables: - visited = True cols = { ( @@ -144,9 +144,9 @@ def ometa_to_sqa_orm( "__table_args__": { "schema": orm_schema_name, "extend_existing": True, # Recreates the table ORM object if it already exists. 
Useful for testing - **({"quote": check_snowflake_case_sensitive( + "quote": check_snowflake_case_sensitive( table.serviceType, table.name.root - )} if not visited else {}), + ), }, **cols, "metadata": _metadata, diff --git a/ingestion/src/metadata/profiler/source/base/profiler_source.py b/ingestion/src/metadata/profiler/source/base/profiler_source.py index 3878e45e8fa4..aa81fa473a1d 100644 --- a/ingestion/src/metadata/profiler/source/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/base/profiler_source.py @@ -14,7 +14,7 @@ its interface """ from copy import deepcopy -from typing import List, Optional, Tuple, cast +from typing import List, Optional, cast from sqlalchemy import MetaData from sqlalchemy.orm import DeclarativeMeta @@ -50,6 +50,7 @@ from metadata.sampler.models import SampleConfig from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.logger import profiler_logger +from metadata.utils.profiler_utils import get_context_entities from metadata.utils.service_spec.service_spec import ( import_profiler_class, import_sampler_class, @@ -80,7 +81,7 @@ def __init__( config.processor.model_dump().get("config") ) self.ometa_client = ometa_client - self.profiler_interface_type: str = config.source.type.lower() + self._interface_type: str = config.source.type.lower() self.sqa_metadata = self._set_sqa_metadata() self._interface = None self.global_profiler_configuration = global_profiler_configuration @@ -202,10 +203,10 @@ def create_profiler_interface( ) -> ProfilerInterface: """Create sqlalchemy profiler interface""" profiler_class = import_profiler_class( - ServiceType.Database, source_type=self.profiler_interface_type + ServiceType.Database, source_type=self._interface_type ) sampler_class = import_sampler_class( - ServiceType.Database, source_type=self.profiler_interface_type + ServiceType.Database, source_type=self._interface_type ) # This is shared between the sampler and profiler interfaces _orm = self._build_table_orm(entity) @@ -215,9 +216,7 @@ def create_profiler_interface( entity=entity, schema_entity=schema_entity, database_entity=database_entity, - db_service=db_service, table_config=config, - profiler_config=profiler_config, sample_config=SampleConfig( profile_sample=self.source_config.profileSample, profile_sample_type=self.source_config.profileSampleType, @@ -238,41 +237,6 @@ def create_profiler_interface( self.interface = profiler_interface return self.interface - def _get_context_entities( - self, entity: Table - ) -> Tuple[DatabaseSchema, Database, DatabaseService]: - schema_entity = None - database_entity = None - db_service = None - - if entity.databaseSchema: - schema_entity_list = self.ometa_client.es_search_from_fqn( - entity_type=DatabaseSchema, - fqn_search_string=entity.databaseSchema.fullyQualifiedName, - fields="databaseSchemaProfilerConfig", - ) - if schema_entity_list: - schema_entity = schema_entity_list[0] - - if entity.database: - database_entity_list = self.ometa_client.es_search_from_fqn( - entity_type=Database, - fqn_search_string=entity.database.fullyQualifiedName, - fields="databaseProfilerConfig", - ) - if database_entity_list: - database_entity = database_entity_list[0] - - if entity.service: - db_service_list = self.ometa_client.es_search_from_fqn( - entity_type=DatabaseService, - fqn_search_string=entity.service.fullyQualifiedName, - ) - if db_service_list: - db_service = db_service_list[0] - - return schema_entity, database_entity, db_service - def get_profiler_runner( self, entity: Table, 
profiler_config: ProfilerProcessorConfig ) -> Profiler: @@ -280,8 +244,8 @@ def get_profiler_runner( Returns the runner for the profiler """ table_config = self.get_config_for_table(entity, profiler_config) - schema_entity, database_entity, db_service = self._get_context_entities( - entity=entity + schema_entity, database_entity, db_service = get_context_entities( + entity=entity, metadata=self.ometa_client ) profiler_interface = self.create_profiler_interface( entity, diff --git a/ingestion/src/metadata/sampler/config.py b/ingestion/src/metadata/sampler/config.py index 82d89f50e460..898b12d1dfff 100644 --- a/ingestion/src/metadata/sampler/config.py +++ b/ingestion/src/metadata/sampler/config.py @@ -13,8 +13,12 @@ """ from typing import Optional, Union -from metadata.generated.schema.entity.data.database import DatabaseProfilerConfig +from metadata.generated.schema.entity.data.database import ( + Database, + DatabaseProfilerConfig, +) from metadata.generated.schema.entity.data.databaseSchema import ( + DatabaseSchema, DatabaseSchemaProfilerConfig, ) from metadata.generated.schema.entity.data.table import PartitionProfilerConfig, Table @@ -29,6 +33,10 @@ DatabaseAndSchemaConfig, ProfilerProcessorConfig, ) +from metadata.profiler.config import ( + get_database_profiler_config, + get_schema_profiler_config, +) from metadata.sampler.models import SampleConfig, TableConfig @@ -51,12 +59,17 @@ def get_sample_storage_config( def get_storage_config_for_table( entity: Table, - schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], - database_profiler_config: Optional[DatabaseProfilerConfig], + schema_entity: DatabaseSchema, + database_entity: Database, db_service: Optional[DatabaseService], profiler_config: ProfilerProcessorConfig, ) -> Optional[DataStorageConfig]: """Get storage config for a specific entity""" + schema_profiler_config = get_schema_profiler_config(schema_entity=schema_entity) + database_profiler_config = get_database_profiler_config( + database_entity=database_entity + ) + for schema_config in profiler_config.schemaConfig: if ( schema_config.fullyQualifiedName.root @@ -134,7 +147,7 @@ def get_partition_details( return get_partition_details(entity) -def get_profile_query( +def get_sample_query( entity: Table, entity_config: Optional[TableConfig] ) -> Optional[str]: """get profile query for sampling @@ -157,8 +170,8 @@ def get_profile_query( def get_sample_data_count_config( entity: Table, - schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], - database_profiler_config: Optional[DatabaseProfilerConfig], + schema_entity: DatabaseSchema, + database_entity: Database, entity_config: Optional[TableConfig], default_sample_data_count: int, ) -> Optional[int]: @@ -169,6 +182,10 @@ def get_sample_data_count_config( Returns: Optional[int]: int """ + schema_profiler_config = get_schema_profiler_config(schema_entity=schema_entity) + database_profiler_config = get_database_profiler_config( + database_entity=database_entity + ) for config in ( entity_config, diff --git a/ingestion/src/metadata/sampler/models.py b/ingestion/src/metadata/sampler/models.py index d50885b0c63c..26aa4d9ac95f 100644 --- a/ingestion/src/metadata/sampler/models.py +++ b/ingestion/src/metadata/sampler/models.py @@ -73,6 +73,7 @@ def from_database_and_schema_config( class DatabaseAndSchemaConfig(BaseProfileConfig): """schema profile config""" + sampleDataStorageConfig: Optional[SampleDataStorageConfig] = None diff --git a/ingestion/src/metadata/sampler/nosql/sampler.py 
b/ingestion/src/metadata/sampler/nosql/sampler.py
index 381570450081..199dc1143a55 100644
--- a/ingestion/src/metadata/sampler/nosql/sampler.py
+++ b/ingestion/src/metadata/sampler/nosql/sampler.py
@@ -1,22 +1,34 @@
 from typing import Dict, List, Optional, Tuple
 
 from metadata.generated.schema.entity.data.table import ProfileSampleType, TableData
+from metadata.profiler.adaptors.factory import factory
 from metadata.profiler.adaptors.nosql_adaptor import NoSQLAdaptor
-from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface
+from metadata.sampler.sampler_interface import SamplerInterface
 from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT
 from metadata.utils.sqa_like_column import SQALikeColumn
 
 
 class NoSQLSampler(SamplerInterface):
+    client: NoSQLAdaptor
+
+    @property
+    def table(self):
+        return self.entity
+
+    def get_client(self):
+        return factory.create(
+            self.service_connection_config.__class__.__name__,
+            client=self.connection,
+        )
+
     def _rdn_sample_from_user_query(self) -> List[Dict[str, any]]:
         """
         Get random sample from user query
         """
         limit = self._get_limit()
         return self.client.query(
-            self.table, self.table.columns, self._profile_sample_query, limit
+            self.table, self.table.columns, self.sample_query, limit
         )
 
     def _fetch_sample_data_from_user_query(self) -> TableData:
@@ -38,7 +50,7 @@ def random_sample(self):
         pass
 
     def fetch_sample_data(self, columns: List[SQALikeColumn]) -> TableData:
-        if self._profile_sample_query:
+        if self.sample_query:
             return self._fetch_sample_data_from_user_query()
 
         return self._fetch_sample_data(columns)
@@ -56,10 +68,10 @@ def _fetch_sample_data(self, columns: List[SQALikeColumn]) -> TableData:
 
     def _get_limit(self) -> Optional[int]:
         num_rows = self.client.item_count(self.table)
-        if self.profile_sample_type == ProfileSampleType.PERCENTAGE:
-            limit = num_rows * (self.profile_sample / 100)
-        elif self.profile_sample_type == ProfileSampleType.ROWS:
-            limit = self.profile_sample
+        if self.sample_config.profile_sample_type == ProfileSampleType.PERCENTAGE:
+            limit = num_rows * (self.sample_config.profile_sample / 100)
+        elif self.sample_config.profile_sample_type == ProfileSampleType.ROWS:
+            limit = self.sample_config.profile_sample
         else:
             limit = SAMPLE_DATA_DEFAULT_COUNT
         return limit
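[A quick standalone rendering of the `_get_limit` branching above, since it is the piece that converts both sampling modes into a concrete row budget. The numbers are hypothetical and `SAMPLE_DATA_DEFAULT_COUNT` is assumed to stand in for `metadata.utils.constants`; only the branch structure comes from the diff.]

```python
from enum import Enum
from typing import Optional

SAMPLE_DATA_DEFAULT_COUNT = 50  # assumption: local stand-in for metadata.utils.constants


class ProfileSampleType(Enum):  # local stand-in for the generated schema enum
    PERCENTAGE = "PERCENTAGE"
    ROWS = "ROWS"


def get_limit(
    num_rows: int,
    sample_type: Optional[ProfileSampleType],
    value: Optional[float],
) -> float:
    """Same branching as NoSQLSampler._get_limit, with plain inputs."""
    if sample_type == ProfileSampleType.PERCENTAGE:
        return num_rows * (value / 100)  # e.g. 10_000 rows at 5% -> 500
    if sample_type == ProfileSampleType.ROWS:
        return value  # an explicit row count wins
    return SAMPLE_DATA_DEFAULT_COUNT  # no config: fall back to the default


assert get_limit(10_000, ProfileSampleType.PERCENTAGE, 5) == 500
assert get_limit(10_000, ProfileSampleType.ROWS, 120) == 120
assert get_limit(10_000, None, None) == SAMPLE_DATA_DEFAULT_COUNT
```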
""" + @property + def table(self): + return self.return_ometa_dataframes_sampled( + service_connection_config=self.service_connection_config, + client=self.client._client, + table=self.entity, + profile_sample_config=self.sample_config.profile_sample, + ) + + def get_client(self): + return self.connection.client + def _partitioned_table(self): """Get partitioned table""" - self._partition_details = cast(PartitionProfilerConfig, self._partition_details) - partition_field = self._partition_details.partitionColumnName + self.partition_details = cast(PartitionProfilerConfig, self.partition_details) + partition_field = self.partition_details.partitionColumnName if ( - self._partition_details.partitionIntervalType + self.partition_details.partitionIntervalType == PartitionIntervalTypes.COLUMN_VALUE ): return [ - df[df[partition_field].isin(self._partition_details.partitionValues)] + df[df[partition_field].isin(self.partition_details.partitionValues)] for df in self.table ] if ( - self._partition_details.partitionIntervalType + self.partition_details.partitionIntervalType == PartitionIntervalTypes.INTEGER_RANGE ): return [ df[ df[partition_field].between( - self._partition_details.partitionIntegerRangeStart, - self._partition_details.partitionIntegerRangeEnd, + self.partition_details.partitionIntegerRangeStart, + self.partition_details.partitionIntegerRangeEnd, ) ] for df in self.table @@ -64,8 +77,8 @@ def _partitioned_table(self): df[ df[partition_field] >= TableRowInsertedCountToBeBetweenValidator._get_threshold_date( # pylint: disable=protected-access - self._partition_details.partitionIntervalUnit.value, - self._partition_details.partitionInterval, + self.partition_details.partitionIntervalUnit.value, + self.partition_details.partitionInterval, ) ] for df in self.table @@ -78,7 +91,7 @@ def _fetch_sample_data_from_user_query(self) -> TableData: def _rdn_sample_from_user_query(self): """Generate sample from user query""" - return [df.query(self._profile_sample_query) for df in self.table] + return [df.query(self.sample_query) for df in self.table] def _get_sampled_dataframe(self): """ @@ -86,13 +99,13 @@ def _get_sampled_dataframe(self): """ random.shuffle(self.table) # we'll shuffle the list of dataframes # sampling data based on profiler config (if any) - if self.profile_sample_type == ProfileSampleType.PERCENTAGE: + if self.sample_config.profile_sample_type == ProfileSampleType.PERCENTAGE: try: - profile_sample = self.profile_sample / 100 + profile_sample = self.sample_config.profile_sample / 100 except TypeError: # if the profile sample is not a number or is None # we'll set it to 100 - profile_sample = self.profile_sample = 100 + profile_sample = self.sample_config.profile_sample = 100 return [ df.sample( frac=profile_sample, @@ -103,7 +116,9 @@ def _get_sampled_dataframe(self): ] # we'll distribute the sample size equally among the dataframes - sample_rows_per_chunk: int = math.floor(self.profile_sample / len(self.table)) + sample_rows_per_chunk: int = math.floor( + self.sample_config.profile_sample / len(self.table) + ) num_rows = sum(len(df) for df in self.table) # if we have less rows than the sample size @@ -142,13 +157,13 @@ def random_sample(self, is_sampled: bool = False): Returns: List[DataFrame] """ - if self._profile_sample_query: + if self.sample_query: return self._rdn_sample_from_user_query() - if self._partition_details: + if self.partition_details: self.table = self._partitioned_table() - if not self.profile_sample or is_sampled: + if not self.sample_config.profile_sample 
or is_sampled: return self.table return self._get_sampled_dataframe() @@ -163,7 +178,7 @@ def fetch_sample_data( Returns: TableData: """ - if self._profile_sample_query: + if self.sample_query: return self._fetch_sample_data_from_user_query() cols, rows = self.get_col_row(data_frame=self.table, columns=columns) diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py index 50318982d1fa..48ef04c4282c 100644 --- a/ingestion/src/metadata/sampler/sampler_interface.py +++ b/ingestion/src/metadata/sampler/sampler_interface.py @@ -15,7 +15,6 @@ from abc import ABC, abstractmethod from typing import Dict, List, Optional, Union -from metadata.ingestion.ometa.ometa_api import OpenMetadata from sqlalchemy import Column from metadata.generated.schema.entity.data.database import Database @@ -27,21 +26,11 @@ from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) -from metadata.generated.schema.entity.services.databaseService import ( - DatabaseConnection, - DatabaseService, -) -from metadata.profiler.api.models import ProfilerProcessorConfig, TableConfig -from metadata.profiler.config import ( - get_database_profiler_config, - get_schema_profiler_config, -) +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection +from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.profiler.api.models import TableConfig from metadata.profiler.processor.sample_data_handler import upload_sample_data -from metadata.sampler.config import ( - get_profile_query, - get_sample_data_count_config, - get_storage_config_for_table, -) +from metadata.sampler.config import get_sample_data_count_config, get_sample_query from metadata.sampler.models import SampleConfig from metadata.sampler.partition import get_partition_details from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT @@ -63,7 +52,7 @@ def __init__( entity: Table, sample_config: Optional[SampleConfig] = None, partition_details: Optional[Dict] = None, - profile_sample_query: Optional[str] = None, + sample_query: Optional[str] = None, storage_config: DataStorageConfig = None, sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, **kwargs, @@ -73,7 +62,7 @@ def __init__( self.sample_config = sample_config self.entity = entity - self.profile_sample_query = profile_sample_query + self.sample_query = sample_query self.sample_limit = sample_data_count self.partition_details = partition_details self.storage_config = storage_config @@ -90,39 +79,23 @@ def create( entity: Table, schema_entity: DatabaseSchema, database_entity: Database, - db_service: DatabaseService, - table_config: TableConfig, - profiler_config: ProfilerProcessorConfig, + table_config: Optional[TableConfig] = None, + storage_config: Optional[DataStorageConfig] = None, sample_config: Optional[SampleConfig] = None, default_sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, **kwargs, ) -> "SamplerInterface": """Create sampler""" - schema_profiler_config = get_schema_profiler_config(schema_entity=schema_entity) - database_profiler_config = get_database_profiler_config( - database_entity=database_entity - ) - - storage_config = get_storage_config_for_table( - entity=entity, - schema_profiler_config=schema_profiler_config, - database_profiler_config=database_profiler_config, - db_service=db_service, - profiler_config=profiler_config, - ) - sample_data_count = get_sample_data_count_config( entity=entity, - 
schema_profiler_config=schema_profiler_config, - database_profiler_config=database_profiler_config, + schema_entity=schema_entity, + database_entity=database_entity, entity_config=table_config, default_sample_data_count=default_sample_data_count, ) - profile_sample_query = get_profile_query( - entity=entity, entity_config=table_config - ) + sample_query = get_sample_query(entity=entity, entity_config=table_config) partition_details = get_partition_details(entity=entity) @@ -132,7 +105,7 @@ def create( entity=entity, sample_config=sample_config, partition_details=partition_details, - profile_sample_query=profile_sample_query, + sample_query=sample_query, storage_config=storage_config, sample_data_count=sample_data_count, **kwargs, @@ -160,7 +133,7 @@ def _fetch_sample_data_from_user_query(self) -> TableData: raise NotImplementedError @abstractmethod - def random_sample(self): + def random_sample(self, **kwargs): """Get random sample""" raise NotImplementedError @@ -187,13 +160,14 @@ def generate_sample_data(self) -> Optional[TableData]: "Fetching sample data for " f"{self.profiler_interface.table_entity.fullyQualifiedName.root}..." # type: ignore ) - # TODO: GET COLUMNS? table_data = self.fetch_sample_data(self.columns) - upload_sample_data( - data=table_data, - entity=self.entity, - sample_storage_config=self.storage_config, - ) + # Only store the data if configured to do so + if self.storage_config: + upload_sample_data( + data=table_data, + entity=self.entity, + sample_storage_config=self.storage_config, + ) table_data.rows = table_data.rows[ : min(SAMPLE_DATA_DEFAULT_COUNT, self.sample_limit) ] diff --git a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py index 1e67f07ec398..5d70731cf52a 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py @@ -83,13 +83,13 @@ def get_sample_query(self, *, column=None) -> Query: """get query for sample data""" # TABLESAMPLE SYSTEM is not supported for views if ( - self.profile_sample_type == ProfileSampleType.PERCENTAGE + self.sample_config.profile_sample_type == ProfileSampleType.PERCENTAGE and self.table_type != TableType.View ): return ( self._base_sample_query(column) .suffix_with( - f"TABLESAMPLE SYSTEM ({self.profile_sample or 100} PERCENT)", + f"TABLESAMPLE SYSTEM ({self.sample_config.profile_sample or 100} PERCENT)", ) .cte(f"{self.table.__tablename__}_sample") ) diff --git a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py index 0839ff230fe5..4f3511e26d25 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py @@ -27,7 +27,6 @@ TableData, ) from metadata.ingestion.connections.session import create_and_bind_thread_safe_session -from metadata.profiler.orm.converter.base import ometa_to_sqa_orm from metadata.profiler.orm.functions.modulo import ModuloFn from metadata.profiler.orm.functions.random_num import RandomNumFn from metadata.profiler.processor.handle_partition import partition_filter_handler @@ -106,9 +105,9 @@ def get_sample_query(self, *, column=None) -> Query: (ModuloFn(RandomNumFn(), 100)).label(RANDOM_LABEL), ).cte(f"{self.table.__tablename__}_rnd") session_query = self.client.query(rnd) - return session_query.where(rnd.c.random <= self.sample_config.profile_sample).cte( - f"{self.table.__tablename__}_sample" - ) + return session_query.where( + 
rnd.c.random <= self.sample_config.profile_sample + ).cte(f"{self.table.__tablename__}_sample") table_query = self.client.query(self.table) session_query = self._base_sample_query( @@ -126,10 +125,13 @@ def random_sample(self, ccolumn=None) -> Union[DeclarativeMeta, AliasedClass]: Either return a sampled CTE of table, or the full table if no sampling is required. """ - if self.profile_sample_query: + if self.sample_query: return self._rdn_sample_from_user_query() - if not self.sample_config.profile_sample or int(self.sample_config.profile_sample) == 100: + if ( + not self.sample_config.profile_sample + or int(self.sample_config.profile_sample) == 100 + ): if self.partition_details: return self._partitioned_table() @@ -150,7 +152,7 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData Returns: TableData to be added to the Table Entity """ - if self.profile_sample_query: + if self.sample_query: return self._fetch_sample_data_from_user_query() # Add new RandomNumFn column @@ -191,12 +193,10 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData def _fetch_sample_data_from_user_query(self) -> TableData: """Returns a table data object using results from query execution""" - if not is_safe_sql_query(self.profile_sample_query): - raise RuntimeError( - f"SQL expression is not safe\n\n{self.profile_sample_query}" - ) + if not is_safe_sql_query(self.sample_query): + raise RuntimeError(f"SQL expression is not safe\n\n{self.sample_query}") - rnd = self.client.execute(f"{self.profile_sample_query}") + rnd = self.client.execute(f"{self.sample_query}") try: columns = [col.name for col in rnd.cursor.description] except AttributeError: @@ -208,13 +208,11 @@ def _fetch_sample_data_from_user_query(self) -> TableData: def _rdn_sample_from_user_query(self) -> Query: """Returns sql alchemy object to use when running profiling""" - if not is_safe_sql_query(self.profile_sample_query): - raise RuntimeError( - f"SQL expression is not safe\n\n{self.profile_sample_query}" - ) + if not is_safe_sql_query(self.sample_query): + raise RuntimeError(f"SQL expression is not safe\n\n{self.sample_query}") return self.client.query(self.table).from_statement( - text(f"{self.profile_sample_query}") + text(f"{self.sample_query}") ) def _partitioned_table(self) -> Query: diff --git a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py index 368c760ed91e..8c958a63aa1e 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py @@ -59,15 +59,15 @@ def __init__( def get_sample_query(self, *, column=None) -> CTE: """get query for sample data""" # TABLESAMPLE SYSTEM is not supported for views - self.table = cast(Table, self.table) + self._table = cast(Table, self.table) - if self.profile_sample_type == ProfileSampleType.PERCENTAGE: + if self.sample_config.profile_sample_type == ProfileSampleType.PERCENTAGE: rnd = ( self._base_sample_query( column, ) .suffix_with( - f"SAMPLE {self.sampling_method_type.value} ({self.profile_sample or 100})", + f"SAMPLE {self.sampling_method_type.value} ({self.sample_config.profile_sample or 100})", ) .cte(f"{self.table.__tablename__}_rnd") ) @@ -77,7 +77,7 @@ def get_sample_query(self, *, column=None) -> CTE: return ( self._base_sample_query(column) .suffix_with( - f"TABLESAMPLE ({self.profile_sample or 100} ROWS)", + f"TABLESAMPLE ({self.sample_config.profile_sample or 100} ROWS)", ) 
.cte(f"{self.table.__tablename__}_sample") ) diff --git a/ingestion/src/metadata/utils/profiler_utils.py b/ingestion/src/metadata/utils/profiler_utils.py index c6841e56872f..c9af1d7dd555 100644 --- a/ingestion/src/metadata/utils/profiler_utils.py +++ b/ingestion/src/metadata/utils/profiler_utils.py @@ -20,6 +20,11 @@ import sqlparse from pydantic import BaseModel +from metadata.generated.schema.entity.data.database import Database +from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema +from metadata.generated.schema.entity.data.table import Table +from metadata.generated.schema.entity.services.databaseService import DatabaseService +from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.utils.logger import profiler_logger logger = profiler_logger() @@ -100,3 +105,39 @@ def set_cache(cache: defaultdict, key: str, value): cache[key_] = value break cache = cache[key_] + + +def get_context_entities( + entity: Table, metadata: OpenMetadata +) -> Tuple[DatabaseSchema, Database, DatabaseService]: + schema_entity = None + database_entity = None + db_service = None + + if entity.databaseSchema: + schema_entity_list = metadata.es_search_from_fqn( + entity_type=DatabaseSchema, + fqn_search_string=entity.databaseSchema.fullyQualifiedName, + fields="databaseSchemaProfilerConfig", + ) + if schema_entity_list: + schema_entity = schema_entity_list[0] + + if entity.database: + database_entity_list = metadata.es_search_from_fqn( + entity_type=Database, + fqn_search_string=entity.database.fullyQualifiedName, + fields="databaseProfilerConfig", + ) + if database_entity_list: + database_entity = database_entity_list[0] + + if entity.service: + db_service_list = metadata.es_search_from_fqn( + entity_type=DatabaseService, + fqn_search_string=entity.service.fullyQualifiedName, + ) + if db_service_list: + db_service = db_service_list[0] + + return schema_entity, database_entity, db_service diff --git a/ingestion/src/metadata/utils/service_spec/default.py b/ingestion/src/metadata/utils/service_spec/default.py index 10067961cd34..dbba406faacf 100644 --- a/ingestion/src/metadata/utils/service_spec/default.py +++ b/ingestion/src/metadata/utils/service_spec/default.py @@ -4,6 +4,9 @@ from typing import Optional +from metadata.data_quality.interface.sqlalchemy.sqa_test_suite_interface import ( + SQATestSuiteInterface, +) from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) @@ -15,3 +18,4 @@ class DefaultDatabaseSpec(BaseSpec): profiler_class: Optional[str] = get_class_path(SQAProfilerInterface) sampler_class: Optional[str] = get_class_path(SQASampler) + test_suite_class: Optional[str] = get_class_path(SQATestSuiteInterface) diff --git a/ingestion/src/metadata/utils/service_spec/service_spec.py b/ingestion/src/metadata/utils/service_spec/service_spec.py index 1ed750a9b3a6..d91c2b77704e 100644 --- a/ingestion/src/metadata/utils/service_spec/service_spec.py +++ b/ingestion/src/metadata/utils/service_spec/service_spec.py @@ -6,6 +6,7 @@ from pydantic import model_validator +from metadata.data_quality.interface.test_suite_interface import TestSuiteInterface from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.ingestion.api.steps import Source from metadata.ingestion.models.custom_pydantic import BaseModel @@ -45,6 +46,7 @@ class BaseSpec(BaseModel): """ profiler_class: Optional[str] = None + test_suite_class: Optional[str] = None metadata_source_class: str lineage_source_class: Optional[str] = None 
usage_source_class: Optional[str] = None @@ -109,6 +111,13 @@ def import_profiler_class( return cast(Type[ProfilerInterface], import_from_module(class_path)) +def import_test_suite_class( + service_type: ServiceType, source_type: str +) -> Type[TestSuiteInterface]: + class_path = BaseSpec.get_for_source(service_type, source_type).test_suite_class + return cast(Type[TestSuiteInterface], import_from_module(class_path)) + + def import_sampler_class( service_type: ServiceType, source_type: str ) -> Type[SamplerInterface]: diff --git a/ingestion/src/metadata/workflow/profiler.py b/ingestion/src/metadata/workflow/profiler.py index 6ed66d3ae99d..50b3cde10a06 100644 --- a/ingestion/src/metadata/workflow/profiler.py +++ b/ingestion/src/metadata/workflow/profiler.py @@ -11,11 +11,7 @@ """ Workflow definition for the profiler """ -from typing import cast -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( - DatabaseServiceProfilerPipeline, -) from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, ) @@ -24,7 +20,6 @@ raise_test_connection_exception, ) from metadata.ingestion.source.connections import get_test_connection_fn -from metadata.pii.processor import PIIProcessor from metadata.profiler.processor.processor import ProfilerProcessor from metadata.profiler.source.metadata import OpenMetadataSource from metadata.profiler.source.metadata_ext import OpenMetadataSourceExt diff --git a/ingestion/tests/unit/profiler/pandas/test_sample.py b/ingestion/tests/unit/profiler/pandas/test_sample.py index c95799502e25..c29712a3e170 100644 --- a/ingestion/tests/unit/profiler/pandas/test_sample.py +++ b/ingestion/tests/unit/profiler/pandas/test_sample.py @@ -197,7 +197,7 @@ def test_sample_property(self, mock_get_connection, mocked_dfs): table_partition_config=None, ) - random_sample = datalake_profiler_interface._get_sampler().random_sample() + random_sample = datalake_profiler_interface.sampler.random_sample() res = sum(len(r) for r in random_sample) assert res < 5 diff --git a/ingestion/tests/unit/test_suite/test_factories.py b/ingestion/tests/unit/test_suite/test_factories.py deleted file mode 100644 index cc4b1c0691a4..000000000000 --- a/ingestion/tests/unit/test_suite/test_factories.py +++ /dev/null @@ -1,120 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
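(Editor's aside: a minimal usage sketch of the spec-driven lookup added above; not part
of the patch. "snowflake" is only an illustrative source type — any connector shipping a
service_spec module should resolve the same way.)

    from metadata.generated.schema.entity.services.serviceType import ServiceType
    from metadata.utils.service_spec.service_spec import (
        import_sampler_class,
        import_test_suite_class,
    )

    sampler_cls = import_sampler_class(ServiceType.Database, source_type="snowflake")
    suite_cls = import_test_suite_class(ServiceType.Database, source_type="snowflake")
    # Each call reads the registered class path (for the default spec:
    # SQASampler and SQATestSuiteInterface) and imports it; callers then
    # instantiate the class themselves, typically via <cls>.create(...).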
- -"""Validate our interface factory creates the expected interface instance""" - -from unittest.mock import patch - -from pytest import mark - -from metadata.data_quality.interface.pandas.pandas_test_suite_interface import ( - PandasTestSuiteInterface, -) -from metadata.data_quality.interface.sqlalchemy.databricks.test_suite_interface import ( - DatabricksTestSuiteInterface, -) -from metadata.data_quality.interface.sqlalchemy.snowflake.test_suite_interface import ( - SnowflakeTestSuiteInterface, -) -from metadata.data_quality.interface.sqlalchemy.sqa_test_suite_interface import ( - SQATestSuiteInterface, -) -from metadata.data_quality.interface.sqlalchemy.unity_catalog.test_suite_interface import ( - UnityCatalogTestSuiteInterface, -) -from metadata.data_quality.interface.test_suite_interface_factory import ( - TestSuiteInterfaceFactory, - test_suite_interface_factory, -) -from metadata.generated.schema.entity.services.connections.database.databricksConnection import ( - DatabricksConnection, -) -from metadata.generated.schema.entity.services.connections.database.datalake.s3Config import ( - S3Config, -) -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.mysqlConnection import ( - MysqlConnection, -) -from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( - SnowflakeConnection, -) -from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import ( - UnityCatalogConnection, -) -from metadata.generated.schema.security.credentials.awsCredentials import AWSCredentials - -MYSQL_CONNECTION_CONFIG = MysqlConnection( - username="root", - hostPort="localhost:3306", -) # type: ignore -DATALAKE_CONNECTION_CONFIG = DatalakeConnection( - configSource=S3Config( - securityConfig=AWSCredentials( - awsRegion="us-east-1", - ) # type: ignore - ) -) # type: ignore - - -@patch( - "metadata.data_quality.interface.sqlalchemy.sqa_test_suite_interface.SQATestSuiteInterface.__init__", - return_value=None, -) -@patch( - "metadata.data_quality.interface.pandas.pandas_test_suite_interface.PandasTestSuiteInterface.__init__", - return_value=None, -) -@mark.parametrize( - "service_connection_config,expected_interface", - [ - (MYSQL_CONNECTION_CONFIG, SQATestSuiteInterface), - (DATALAKE_CONNECTION_CONFIG, PandasTestSuiteInterface), - ], -) -def test_interface_factory( - sqa_init, pandas_init, service_connection_config, expected_interface -): - """Test our interface factory creates the expected interface instance type""" - interface = test_suite_interface_factory.create( - service_connection_config=service_connection_config, - ometa_client=None, # type: ignore - table_entity=None, # type: ignore - ) - assert interface.__class__ == expected_interface - - -def test_register_many(): - # Initialize factory - - factory = TestSuiteInterfaceFactory() - - test_suite_interfaces = { - "base": SQATestSuiteInterface, - DatalakeConnection.__name__: PandasTestSuiteInterface, - SnowflakeConnection.__name__: SnowflakeTestSuiteInterface, - UnityCatalogConnection.__name__: UnityCatalogTestSuiteInterface, - DatabricksConnection.__name__: DatabricksTestSuiteInterface, - } - - # Register profiles - factory.register_many(test_suite_interfaces) - - # Assert all expected interfaces are registered - expected_interfaces = set(test_suite_interfaces.keys()) - actual_interfaces = set(factory._interface_type.keys()) - assert 
expected_interfaces == actual_interfaces - - # Assert profiler classes match registered interfaces - for interface_type, interface_class in test_suite_interfaces.items(): - assert factory._interface_type[interface_type] == interface_class From 16c2a581af28085ac4a64fd75686e5f6b1ec56d6 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Tue, 12 Nov 2024 14:53:27 +0100 Subject: [PATCH 04/29] separate sampler --- .../sampler/sqlalchemy/bigquery/sampler.py | 42 +++++++++++++------ .../sampler/sqlalchemy/snowflake/sampler.py | 40 ++++++++++++------ 2 files changed, 57 insertions(+), 25 deletions(-) diff --git a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py index 5d70731cf52a..dc3fd0aea223 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/bigquery/sampler.py @@ -12,12 +12,24 @@ Helper module to handle data sampling for the profiler """ -from typing import Dict, Optional +from typing import Dict, Optional, Union from sqlalchemy import Column from sqlalchemy.orm import Query -from metadata.generated.schema.entity.data.table import ProfileSampleType, TableType +from metadata.generated.schema.entity.data.table import ( + ProfileSampleType, + Table, + TableType, +) +from metadata.generated.schema.entity.services.connections.connectionBasicType import ( + DataStorageConfig, +) +from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( + DatalakeConnection, +) +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection +from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.profiler.processor.handle_partition import partition_filter_handler from metadata.sampler.models import SampleConfig from metadata.sampler.sqlalchemy.sampler import SQASampler @@ -33,21 +45,27 @@ class BigQuerySampler(SQASampler): # pylint: disable=too-many-arguments def __init__( self, - client, - table, - profile_sample_config: Optional[SampleConfig] = None, + service_connection_config: Union[DatabaseConnection, DatalakeConnection], + ometa_client: OpenMetadata, + entity: Table, + sample_config: Optional[SampleConfig] = None, partition_details: Optional[Dict] = None, - profile_sample_query: Optional[str] = None, + sample_query: Optional[str] = None, + storage_config: DataStorageConfig = None, sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, table_type: TableType = None, + **kwargs, ): super().__init__( - client, - table, - profile_sample_config, - partition_details, - profile_sample_query, - sample_data_count, + service_connection_config=service_connection_config, + ometa_client=ometa_client, + entity=entity, + sample_config=sample_config, + partition_details=partition_details, + sample_query=sample_query, + storage_config=storage_config, + sample_data_count=sample_data_count, + **kwargs, ) self.table_type: TableType = table_type diff --git a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py index 8c958a63aa1e..70dbbc9b416e 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py @@ -13,7 +13,7 @@ for the profiler """ -from typing import Dict, Optional, cast +from typing import Dict, Optional, Union, cast from sqlalchemy import Table from sqlalchemy.sql.selectable import CTE @@ -22,6 +22,14 @@ ProfileSampleType, 
SamplingMethodType, ) +from metadata.generated.schema.entity.services.connections.connectionBasicType import ( + DataStorageConfig, +) +from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( + DatalakeConnection, +) +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection +from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.profiler.processor.handle_partition import partition_filter_handler from metadata.sampler.models import SampleConfig from metadata.sampler.sqlalchemy.sampler import SQASampler @@ -36,24 +44,30 @@ class SnowflakeSampler(SQASampler): def __init__( self, - client, - table, - profile_sample_config: Optional[SampleConfig] = None, + service_connection_config: Union[DatabaseConnection, DatalakeConnection], + ometa_client: OpenMetadata, + entity: Table, + sample_config: Optional[SampleConfig] = None, partition_details: Optional[Dict] = None, - profile_sample_query: Optional[str] = None, + sample_query: Optional[str] = None, + storage_config: DataStorageConfig = None, sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, + **kwargs, ): super().__init__( - client, - table, - profile_sample_config, - partition_details, - profile_sample_query, - sample_data_count, + service_connection_config=service_connection_config, + ometa_client=ometa_client, + entity=entity, + sample_config=sample_config, + partition_details=partition_details, + sample_query=sample_query, + storage_config=storage_config, + sample_data_count=sample_data_count, + **kwargs, ) self.sampling_method_type = SamplingMethodType.BERNOULLI - if profile_sample_config and profile_sample_config.sampling_method_type: - self.sampling_method_type = profile_sample_config.sampling_method_type + if sample_config and sample_config.sampling_method_type: + self.sampling_method_type = sample_config.sampling_method_type @partition_filter_handler(build_sample=True) def get_sample_query(self, *, column=None) -> CTE: From f02043d5ba1bd5c7d5736f735b4755110ca5dc13 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Tue, 12 Nov 2024 17:04:01 +0100 Subject: [PATCH 05/29] merge --- ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py | 1 - .../src/metadata/profiler/interface/profiler_interface.py | 6 ------ 2 files changed, 7 deletions(-) diff --git a/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py b/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py index ff1a88a2bbf2..da65b1416128 100644 --- a/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py +++ b/ingestion/src/metadata/mixins/sqalchemy/sqa_mixin.py @@ -32,7 +32,6 @@ from metadata.ingestion.source.database.snowflake.queries import ( SNOWFLAKE_SESSION_TAG_QUERY, ) -from metadata.profiler.orm.converter.base import ometa_to_sqa_orm from metadata.utils.collaborative_super import Root diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface.py b/ingestion/src/metadata/profiler/interface/profiler_interface.py index b12e20612c1b..fc58719a462d 100644 --- a/ingestion/src/metadata/profiler/interface/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface.py @@ -39,7 +39,6 @@ from metadata.profiler.processor.runner import QueryRunner from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.collaborative_super import Root -from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT from metadata.utils.ssl_manager import get_ssl_connection @@ -113,11 +112,6 @@ def __init__( # pylint: 
disable=too-many-arguments
         **kwargs,
     )

-    @abstractmethod
-    def _get_sampler(self):
-        """Get the sampler"""
-        raise NotImplementedError
-
     # pylint: disable=too-many-locals
     @classmethod
     def create(

From 85ab4f53f243c590592acfbaddbecc86e33b5f4c Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull <peremiquelbrull@gmail.com>
Date: Tue, 12 Nov 2024 18:58:52 +0100
Subject: [PATCH 06/29] workflow

---
 ingestion/src/metadata/sampler/processor.py   | 157 ++++++++++++++--
 .../metadata/workflow/{pii.py => sampler.py}  |  27 ++-
 ...on => databaseServiceSamplerPipeline.json} |  28 ++--
 3 files changed, 172 insertions(+), 40 deletions(-)
 rename ingestion/src/metadata/workflow/{pii.py => sampler.py} (56%)
 rename openmetadata-spec/src/main/resources/json/schema/metadataIngestion/{databaseServicePIIPipeline.json => databaseServiceSamplerPipeline.json} (81%)

diff --git a/ingestion/src/metadata/sampler/processor.py b/ingestion/src/metadata/sampler/processor.py
index 774f7e6ceecf..d25f6d6de075 100644
--- a/ingestion/src/metadata/sampler/processor.py
+++ b/ingestion/src/metadata/sampler/processor.py
@@ -11,33 +11,114 @@
 """
 Data Sampler for the PII Workflow
 """
-from typing import Optional
+import traceback
+from copy import deepcopy
+from typing import Optional, cast
+
+from sqlalchemy import MetaData
+from sqlalchemy.orm import DeclarativeMeta
+
+from metadata.generated.schema.entity.data.table import Table
+from metadata.generated.schema.entity.services.databaseService import (
+    DatabaseConnection,
+    DatabaseService,
+)
+from metadata.generated.schema.entity.services.ingestionPipelines.status import (
+    StackTraceError,
+)
+from metadata.generated.schema.entity.services.serviceType import ServiceType
+from metadata.generated.schema.metadataIngestion.databaseServiceSamplerPipeline import (
+    DatabaseServiceSamplerPipeline,
+)
+from metadata.generated.schema.metadataIngestion.workflow import (
+    OpenMetadataWorkflowConfig,
+)
+from metadata.ingestion.api.models import Either
+from metadata.ingestion.api.parser import parse_workflow_config_gracefully
+from metadata.ingestion.api.step import Step
 from metadata.ingestion.api.steps import Processor
 from metadata.ingestion.ometa.ometa_api import OpenMetadata
-from metadata.profiler.processor.core import Profiler
+from metadata.profiler.orm.converter.base import ometa_to_sqa_orm
+from metadata.profiler.source.database.base.profiler_source import (
+    NON_SQA_DATABASE_CONNECTIONS,
+)
 from metadata.profiler.source.metadata import ProfilerSourceAndEntity
-from metadata.sampler.models import SamplerResponse
+from metadata.sampler.models import SampleConfig, SampleData, SamplerResponse
+from metadata.sampler.sampler_interface import SamplerInterface
+from metadata.utils.profiler_utils import get_context_entities
+from metadata.utils.service_spec.service_spec import import_sampler_class


 class SamplerProcessor(Processor):
     """Use the profiler interface to fetch the sample data"""

-    def _run(self, record: ProfilerSourceAndEntity) -> SamplerResponse:
+    def __init__(self, config: OpenMetadataWorkflowConfig, metadata: OpenMetadata):
+        super().__init__()
+
+        self.config = config
+        self.metadata = metadata
+        # Editor's fix: _set_sqa_metadata() inspects service_conn_config, which
+        # was otherwise only assigned inside _run(), so __init__ would hit an
+        # unset attribute. We seed it here from the workflow's serviceConnection
+        # (an assumption on our side); _run() still re-copies it per database.
+        self.service_conn_config = config.source.serviceConnection.root.config
+        self.sqa_metadata = self._set_sqa_metadata()
+
+        self.source_config: DatabaseServiceSamplerPipeline = cast(
+            DatabaseServiceSamplerPipeline, self.config.source.sourceConfig.config
+        )  # Used to satisfy the type checker
+
+        self._interface_type: str = config.source.type.lower()
+
+    @property
+    def name(self) -> str:
+        return "Sampler"
+
+    def _run(self, record: ProfilerSourceAndEntity) -> Either[SamplerResponse]:
"""Fetch the sample data and pass it down the pipeline""" - profiler_runner: Profiler = record.profiler_source.get_profiler_runner( - record.entity, self.profiler_config - ) - # We need the sample data for Sample Data or PII Sensitive processing. - # We'll nullify the Sample Data after the PII processing so that it's not stored. - if ( - self.source_config.generateSampleData - or self.source_config.processPiiSensitive - ): - sample_data = self.generate_sample_data() - else: - sample_data = None + try: + entity = cast(Table, record.entity) + schema_entity, database_entity, db_service = get_context_entities( + entity=entity, metadata=self.metadata + ) + self.service_conn_config = self._copy_service_config( + self.config, db_service + ) + + sampler_class = import_sampler_class( + ServiceType.Database, source_type=self._interface_type + ) + _orm = self._build_table_orm(entity) + sampler_interface: SamplerInterface = sampler_class.create( + service_connection_config=self.service_conn_config, + ometa_client=self.metadata, + entity=entity, + schema_entity=schema_entity, + database_entity=database_entity, + sample_config=SampleConfig( + profile_sample=self.source_config.profileSample, + profile_sample_type=self.source_config.profileSampleType, + sampling_method_type=self.source_config.samplingMethodType, + ), + default_sample_data_count=self.source_config.sampleDataCount, + orm_table=_orm, + ) + sample_data = SampleData( + data=sampler_interface.generate_sample_data(), + store=self.source_config.storeSampleData, + ) + + return Either( + right=SamplerResponse( + table=entity, + sampleData=sample_data, + ) + ) + + except Exception as exc: + self.status.failed( + StackTraceError( + name=record.entity.fullyQualifiedName.root, + error=f"Unexpected exception processing entity {record.entity.fullyQualifiedName.root}: {exc}", + stackTrace=traceback.format_exc(), + ) + ) @classmethod def create( @@ -46,8 +127,48 @@ def create( metadata: OpenMetadata, pipeline_name: Optional[str] = None, ) -> "Step": - pass + config = parse_workflow_config_gracefully(config_dict) + return cls(config=config, metadata=metadata) + + def _set_sqa_metadata(self): + """Set sqlalchemy metadata""" + if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): + return MetaData() + return None + + def _build_table_orm(self, entity: Table) -> Optional[DeclarativeMeta]: + """Build the ORM table if needed for the sampler and profiler interfaces""" + if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): + return ometa_to_sqa_orm(entity, self.metadata, self.sqa_metadata) + return None + + def _copy_service_config( + self, config: OpenMetadataWorkflowConfig, database: DatabaseService + ) -> DatabaseConnection: + """Make a copy of the service config and update the database name + + Args: + database (_type_): a database entity + + Returns: + DatabaseService.__config__ + """ + config_copy = deepcopy( + config.source.serviceConnection.root.config # type: ignore + ) + if hasattr( + config_copy, # type: ignore + "supportsDatabase", + ): + if hasattr(config_copy, "database"): + config_copy.database = database.name.root # type: ignore + if hasattr(config_copy, "catalog"): + config_copy.catalog = database.name.root # type: ignore + + # we know we'll only be working with DatabaseConnection, we cast the type to satisfy type checker + config_copy = cast(DatabaseConnection, config_copy) + + return config_copy def close(self) -> None: """Nothing to close""" - pass diff --git a/ingestion/src/metadata/workflow/pii.py 
b/ingestion/src/metadata/workflow/sampler.py similarity index 56% rename from ingestion/src/metadata/workflow/pii.py rename to ingestion/src/metadata/workflow/sampler.py index 3144714348b4..0cc3d7a6835e 100644 --- a/ingestion/src/metadata/workflow/pii.py +++ b/ingestion/src/metadata/workflow/sampler.py @@ -11,31 +11,42 @@ """ Workflow definition for the profiler """ +from typing import cast +from metadata.generated.schema.metadataIngestion.databaseServiceSamplerPipeline import ( + DatabaseServiceSamplerPipeline, +) from metadata.ingestion.api.steps import Processor from metadata.pii.processor import PIIProcessor +from metadata.sampler.processor import SamplerProcessor from metadata.utils.logger import profiler_logger from metadata.workflow.profiler import ProfilerWorkflow logger = profiler_logger() -class PIIWorkflow(ProfilerWorkflow): - """PII workflow implementation. Based on the Profiler logic with different steps""" +class SamplerWorkflow(ProfilerWorkflow): + """Sampler workflow implementation. Based on the Profiler logic with different steps""" def set_steps(self): source_class = self._get_source_class() self.source = source_class.create(self.config.model_dump(), self.metadata) sink = self._get_sink() - pii_processor = self._get_pii_processor() - - # OM Source -> sampler -> PII -> Sink - self.steps = (pii_processor, sink) + sampler_processor = self._get_sampler_processor() + + # Only instantiate the PII Processor on demand + source_config: DatabaseServiceSamplerPipeline = cast( + DatabaseServiceSamplerPipeline, self.config.source.sourceConfig.config + ) + if source_config.enableAutoClassification: + pii_processor = self._get_pii_processor() + self.steps = (sampler_processor, pii_processor, sink) + else: + self.steps = (sampler_processor, sink) def _get_pii_processor(self) -> Processor: return PIIProcessor.create(self.config.model_dump(), self.metadata) def _get_sampler_processor(self) -> Processor: - # TODO - return ... 
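# (Editor's note, not part of the patch.) Schematically, the resulting step
# layouts are:
#   enableAutoClassification=True  -> source -> SamplerProcessor -> PIIProcessor -> sink
#   enableAutoClassification=False -> source -> SamplerProcessor -> sink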
+ return SamplerProcessor.create(self.config.model_dump(), self.metadata) diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServicePIIPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json similarity index 81% rename from openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServicePIIPipeline.json rename to openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json index bca61cbcef6e..fb313489695f 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServicePIIPipeline.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json @@ -1,22 +1,22 @@ { - "$id": "https://open-metadata.org/schema/metadataIngestion/databaseServicePIIPipeline.json", + "$id": "https://open-metadata.org/schema/metadataIngestion/databaseServiceSamplerPipeline.json", "$schema": "http://json-schema.org/draft-07/schema#", - "title": "DatabaseServicePIIPipeline", - "description": "DatabaseService PII & PII Pipeline Configuration.", + "title": "DatabaseServiceSamplerPipeline", + "description": "DatabaseService Sampler & Auto Classification Pipeline Configuration.", "type": "object", "definitions": { - "PIIConfigType": { + "samplerConfigType": { "description": "Profiler Source Config Pipeline type", "type": "string", - "enum": ["PII"], - "default": "PII" + "enum": ["Sampler"], + "default": "Sampler" } }, "properties": { "type": { "description": "Pipeline type", - "$ref": "#/definitions/PIIConfigType", - "default": "PII" + "$ref": "#/definitions/samplerConfigType", + "default": "Sampler" }, "schemaFilterPattern": { "description": "Regex to only fetch tables or databases that matches the pattern.", @@ -45,23 +45,23 @@ "default": false, "title": "Use FQN For Filtering" }, - "generateSampleData": { - "description": "Option to turn on/off generating sample data. If enabled, profiler will ingest sample data for each table.", + "storeSampleData": { + "description": "Option to turn on/off storing sample data. If enabled, we will ingest sample data for each table.", "type": "boolean", "default": true, - "title": "Generate Sample Data" + "title": "Store Sample Data" }, - "processPiiSensitive": { + "enableAutoClassification": { "description": "Optional configuration to automatically tag columns that might contain sensitive information", "type": "boolean", "default": false, - "title": "Auto Tag PII" + "title": "Enable Auto Classification" }, "confidence": { "description": "Set the Confidence value for which you want the column to be tagged as PII. Confidence value ranges from 0 to 100. A higher number will yield less false positives but more false negatives. 
A lower number will yield more false positives but less false negatives.", "type": "number", "default": 80, - "title": "PII Inference Confidence Level" + "title": "Auto Classification Inference Confidence Level" }, "profileSampleType": { "$ref": "../entity/data/table.json#/definitions/profileSampleType", From bd98a7b31489fdb73d498a0fcc79cb365558e106 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Tue, 12 Nov 2024 19:01:02 +0100 Subject: [PATCH 07/29] schemas --- .../main/resources/json/schema/metadataIngestion/workflow.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json index e8e08a8e0475..f47d99e71126 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json @@ -31,7 +31,7 @@ "$ref": "databaseServiceProfilerPipeline.json" }, { - "$ref": "databaseServicePIIPipeline.json" + "$ref": "databaseServiceSamplerPipeline.json" }, { "$ref": "pipelineServiceMetadataPipeline.json" From f5519b49482f7ac7c149e7b0e81289647e5abbd4 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Thu, 14 Nov 2024 10:44:16 +0100 Subject: [PATCH 08/29] prep --- ingestion/pyproject.toml | 4 +- ingestion/src/metadata/cli/sample.py | 52 +++++++++++++++++++ ingestion/src/metadata/cmd.py | 3 ++ .../profiler/source/fetcher/config.py | 42 +++++++++++++++ .../source/fetcher/fetcher_strategy.py | 6 +-- ingestion/src/metadata/sampler/config.py | 27 +++++----- .../databaseServiceSamplerPipeline.json | 5 ++ 7 files changed, 119 insertions(+), 20 deletions(-) create mode 100644 ingestion/src/metadata/cli/sample.py create mode 100644 ingestion/src/metadata/profiler/source/fetcher/config.py diff --git a/ingestion/pyproject.toml b/ingestion/pyproject.toml index a0b6ab739ece..359348498cde 100644 --- a/ingestion/pyproject.toml +++ b/ingestion/pyproject.toml @@ -114,11 +114,10 @@ ignore-paths = [ "ingestion/src/metadata/utils/datalake/datalake_utils.py", "ingestion/src/metadata/great_expectations/action.py", "ingestion/src/metadata/profiler/interface/nosql/profiler_interface.py", - "ingestion/src/metadata/profiler/processor/sampler/nosql/sampler.py", ".*/src/metadata/ingestion/source/.*/service_spec.py", "ingestion/src/metadata/profiler/metrics", "ingestion/src/metadata/profiler/source/databricks", - + # metadata ingestion sources "ingestion/src/metadata/ingestion/source/api/rest/connection.py", "ingestion/src/metadata/ingestion/source/api/rest/metadata.py", @@ -262,6 +261,7 @@ ignore = [ "src/metadata/parsers/*", "src/metadata/pii/*", "src/metadata/profiler/*", + "src/metadata/sampler/*", "src/metadata/readers/*", "src/metadata/timer/*", "src/metadata/utils/*", diff --git a/ingestion/src/metadata/cli/sample.py b/ingestion/src/metadata/cli/sample.py new file mode 100644 index 000000000000..7236ff157c8f --- /dev/null +++ b/ingestion/src/metadata/cli/sample.py @@ -0,0 +1,52 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
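(Editor's aside, illustrative only: a minimal sourceConfig instance that the renamed
databaseServiceSamplerPipeline.json schema above should accept, written as the Python
dict the workflow parser would receive. Field values are hypothetical.)

    sampler_source_config = {
        "type": "Sampler",
        "storeSampleData": True,
        "enableAutoClassification": False,
        "confidence": 80,
    }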
+# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Sampler utility for the metadata CLI +""" +import sys +import traceback +from pathlib import Path + +from metadata.config.common import load_config_file +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import ( + PipelineType, +) +from metadata.utils.logger import cli_logger, redacted_config +from metadata.workflow.sampler import SamplerWorkflow +from metadata.workflow.workflow_init_error_handler import WorkflowInitErrorHandler + +logger = cli_logger() + + +def run_sample(config_path: Path) -> None: + """ + Run the sampler workflow from a config path + to a JSON or YAML file + :param config_path: Path to load JSON config + """ + + config_dict = None + try: + config_dict = load_config_file(config_path) + logger.debug("Using workflow config:\n%s", redacted_config(config_dict)) + workflow = SamplerWorkflow.create(config_dict) + except Exception as exc: + logger.debug(traceback.format_exc()) + WorkflowInitErrorHandler.print_init_error( + exc, config_dict, PipelineType.metadata + ) + sys.exit(1) + + workflow.execute() + workflow.stop() + workflow.print_status() + workflow.raise_from_status() diff --git a/ingestion/src/metadata/cmd.py b/ingestion/src/metadata/cmd.py index 42745ccc0e93..12fcbd717791 100644 --- a/ingestion/src/metadata/cmd.py +++ b/ingestion/src/metadata/cmd.py @@ -26,6 +26,7 @@ from metadata.cli.ingest import run_ingest from metadata.cli.lineage import run_lineage from metadata.cli.profile import run_profiler +from metadata.cli.sample import run_sample from metadata.cli.usage import run_usage from metadata.utils.logger import cli_logger, set_loggers_level @@ -40,6 +41,7 @@ class MetadataCommands(Enum): WEBHOOK = "webhook" LINEAGE = "lineage" APP = "app" + SAMPLE = "sample" RUN_PATH_METHODS = { @@ -49,6 +51,7 @@ class MetadataCommands(Enum): MetadataCommands.PROFILE.value: run_profiler, MetadataCommands.TEST.value: run_test, MetadataCommands.APP.value: run_app, + MetadataCommands.SAMPLE.value: run_sample, } diff --git a/ingestion/src/metadata/profiler/source/fetcher/config.py b/ingestion/src/metadata/profiler/source/fetcher/config.py new file mode 100644 index 000000000000..b45a4933c2ae --- /dev/null +++ b/ingestion/src/metadata/profiler/source/fetcher/config.py @@ -0,0 +1,42 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Entity Fetcher Configuration Models +""" + +from typing import Optional, Protocol, runtime_checkable + +from metadata.generated.schema.type.filterPattern import FilterPattern + + +@runtime_checkable +class EntityFilterConfigInterface(Protocol): + """Interface for the OM workflow source configs that allow filtering""" + + @property + def classificationFilterPattern(self) -> Optional[FilterPattern]: + ... + + @property + def databaseFilterPattern(self) -> Optional[FilterPattern]: + ... + + @property + def schemaFilterPattern(self) -> Optional[FilterPattern]: + ... 
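    # (Editor's note, not part of the patch.) @runtime_checkable makes
    # isinstance() a purely structural check against these properties, so any
    # pydantic source config exposing the filter patterns -- the profiler and
    # the new sampler pipeline configs alike -- satisfies the protocol. That is
    # what allows fetcher_strategy.py, below, to cast its source_config to
    # EntityFilterConfigInterface.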
+ + @property + def tableFilterPattern(self) -> Optional[FilterPattern]: + ... + + @property + def useFqnForFiltering(self) -> Optional[bool]: + ... diff --git a/ingestion/src/metadata/profiler/source/fetcher/fetcher_strategy.py b/ingestion/src/metadata/profiler/source/fetcher/fetcher_strategy.py index 5f6fc2d15bda..0fd55f84ccdc 100644 --- a/ingestion/src/metadata/profiler/source/fetcher/fetcher_strategy.py +++ b/ingestion/src/metadata/profiler/source/fetcher/fetcher_strategy.py @@ -20,9 +20,6 @@ from metadata.generated.schema.entity.services.ingestionPipelines.status import ( StackTraceError, ) -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( - DatabaseServiceProfilerPipeline, -) from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, ) @@ -31,6 +28,7 @@ from metadata.ingestion.api.status import Status from metadata.ingestion.models.entity_interface import EntityInterfaceWithTags from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.profiler.source.fetcher.config import EntityFilterConfigInterface from metadata.profiler.source.fetcher.profiler_source_factory import ( profiler_source_factory, ) @@ -116,7 +114,7 @@ def __init__( ) -> None: super().__init__(config, metadata, global_profiler_config, status) self.source_config = cast( - DatabaseServiceProfilerPipeline, self.source_config + EntityFilterConfigInterface, self.source_config ) # Satisfy typchecker def _filter_databases(self, databases: Iterable[Database]) -> Iterable[Database]: diff --git a/ingestion/src/metadata/sampler/config.py b/ingestion/src/metadata/sampler/config.py index 898b12d1dfff..5a421d48c18f 100644 --- a/ingestion/src/metadata/sampler/config.py +++ b/ingestion/src/metadata/sampler/config.py @@ -11,7 +11,7 @@ """ Sampler configuration helpers """ -from typing import Optional, Union +from typing import Any, Dict, Optional, Union from metadata.generated.schema.entity.data.database import ( Database, @@ -29,15 +29,12 @@ from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( DatabaseServiceProfilerPipeline, ) -from metadata.profiler.api.models import ( - DatabaseAndSchemaConfig, - ProfilerProcessorConfig, -) +from metadata.profiler.api.models import ProfilerProcessorConfig from metadata.profiler.config import ( get_database_profiler_config, get_schema_profiler_config, ) -from metadata.sampler.models import SampleConfig, TableConfig +from metadata.sampler.models import DatabaseAndSchemaConfig, SampleConfig, TableConfig def get_sample_storage_config( @@ -46,7 +43,7 @@ def get_sample_storage_config( DatabaseProfilerConfig, DatabaseAndSchemaConfig, ], -) -> Optional[DataStorageConfig]: +) -> Optional[Union[DataStorageConfig, Dict[str, Any]]]: """Get sample storage config""" if ( config @@ -63,33 +60,35 @@ def get_storage_config_for_table( database_entity: Database, db_service: Optional[DatabaseService], profiler_config: ProfilerProcessorConfig, -) -> Optional[DataStorageConfig]: +) -> Optional[Union[DataStorageConfig, Dict[str, Any]]]: """Get storage config for a specific entity""" schema_profiler_config = get_schema_profiler_config(schema_entity=schema_entity) database_profiler_config = get_database_profiler_config( database_entity=database_entity ) - for schema_config in profiler_config.schemaConfig: + for schema_config in profiler_config.schemaConfig or []: if ( - schema_config.fullyQualifiedName.root + entity.databaseSchema + and schema_config.fullyQualifiedName.root == 
entity.databaseSchema.fullyQualifiedName and get_sample_storage_config(schema_config) ): return get_sample_storage_config(schema_config) - for database_config in profiler_config.databaseConfig: + for database_config in profiler_config.databaseConfig or []: if ( - database_config.fullyQualifiedName.root + entity.database + and database_config.fullyQualifiedName.root == entity.database.fullyQualifiedName and get_sample_storage_config(database_config) ): return get_sample_storage_config(database_config) - if get_sample_storage_config(schema_profiler_config): + if schema_profiler_config and get_sample_storage_config(schema_profiler_config): return get_sample_storage_config(schema_profiler_config) - if get_sample_storage_config(database_profiler_config): + if database_profiler_config and get_sample_storage_config(database_profiler_config): return get_sample_storage_config(database_profiler_config) try: diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json index fb313489695f..11c5001bdfa0 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json @@ -18,6 +18,11 @@ "$ref": "#/definitions/samplerConfigType", "default": "Sampler" }, + "classificationFilterPattern": { + "description": "Regex to only compute metrics for table that matches the given tag, tiers, gloassary pattern.", + "$ref": "../type/filterPattern.json#/definitions/filterPattern", + "title": "Classification Filter Pattern" + }, "schemaFilterPattern": { "description": "Regex to only fetch tables or databases that matches the pattern.", "$ref": "../type/filterPattern.json#/definitions/filterPattern", From bd619f10f053e3cdd6ba9430f7e25d34a33aec1f Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Thu, 14 Nov 2024 18:06:41 +0100 Subject: [PATCH 09/29] fix --- ingestion/src/metadata/cmd.py | 6 ++ ingestion/src/metadata/pii/processor.py | 15 ++-- .../profiler/processor/handle_partition.py | 4 +- .../src/metadata/profiler/processor/runner.py | 14 ++-- .../source/database/base/profiler_source.py | 80 ++++-------------- ingestion/src/metadata/sampler/config.py | 64 +++++++++++++- .../src/metadata/sampler/nosql/sampler.py | 5 ++ .../src/metadata/sampler/pandas/sampler.py | 50 +++++++++-- ingestion/src/metadata/sampler/processor.py | 56 +++++++------ .../src/metadata/sampler/sampler_interface.py | 83 ++++++++++++++++--- .../metadata/sampler/sqlalchemy/sampler.py | 4 + 11 files changed, 253 insertions(+), 128 deletions(-) diff --git a/ingestion/src/metadata/cmd.py b/ingestion/src/metadata/cmd.py index 12fcbd717791..b01eb2bae4ff 100644 --- a/ingestion/src/metadata/cmd.py +++ b/ingestion/src/metadata/cmd.py @@ -127,6 +127,12 @@ def get_parser(args: Optional[List[str]] = None): help="Workflow for running external applications", ) ) + create_common_config_parser_args( + sub_parser.add_parser( + MetadataCommands.SAMPLE.value, + help="Workflow for running sampling and auto classification", + ) + ) webhook_args( sub_parser.add_parser( MetadataCommands.WEBHOOK.value, diff --git a/ingestion/src/metadata/pii/processor.py b/ingestion/src/metadata/pii/processor.py index 85ca756043d5..cf2e90e36a30 100644 --- a/ingestion/src/metadata/pii/processor.py +++ b/ingestion/src/metadata/pii/processor.py @@ -19,8 +19,8 @@ from 
metadata.generated.schema.entity.services.ingestionPipelines.status import ( StackTraceError, ) -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( - DatabaseServiceProfilerPipeline, +from metadata.generated.schema.metadataIngestion.databaseServiceSamplerPipeline import ( + DatabaseServiceSamplerPipeline, ) from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, @@ -40,7 +40,6 @@ from metadata.pii.constants import PII from metadata.pii.scanners.column_name_scanner import ColumnNameScanner from metadata.pii.scanners.ner_scanner import NERScanner -from metadata.profiler.api.models import ProfilerResponse from metadata.sampler.models import SamplerResponse from metadata.utils.logger import profiler_logger @@ -62,8 +61,8 @@ def __init__( self.metadata = metadata # Init and type the source config - self.source_config: DatabaseServiceProfilerPipeline = cast( - DatabaseServiceProfilerPipeline, self.config.source.sourceConfig.config + self.source_config: DatabaseServiceSamplerPipeline = cast( + DatabaseServiceSamplerPipeline, self.config.source.sourceConfig.config ) # Used to satisfy type checked self._ner_scanner = None @@ -72,7 +71,7 @@ def __init__( @property def name(self) -> str: - return "PII Processor" + return "Auto Classification Processor" @property def ner_scanner(self) -> NERScanner: @@ -153,7 +152,7 @@ def process_column( def _run( self, - record: ProfilerResponse, + record: SamplerResponse, ) -> Either[SamplerResponse]: """ Main entrypoint for the scanner. @@ -163,7 +162,7 @@ def _run( """ # We don't always need to process - if not self.source_config.processPiiSensitive: + if not self.source_config.enableAutoClassification: return Either(right=record) column_tags = [] diff --git a/ingestion/src/metadata/profiler/processor/handle_partition.py b/ingestion/src/metadata/profiler/processor/handle_partition.py index e5a79d1c8ab6..744d0263e81c 100644 --- a/ingestion/src/metadata/profiler/processor/handle_partition.py +++ b/ingestion/src/metadata/profiler/processor/handle_partition.py @@ -105,9 +105,9 @@ def __init__( def __call__(self, func): def handle_and_execute(_self, *args, **kwargs): """Handle partitioned queries""" - if _self._partition_details: + if _self.partition_details: partition_filter = build_partition_predicate( - _self._partition_details, + _self.partition_details, _self.table.__table__.c, ) if self.build_sample: diff --git a/ingestion/src/metadata/profiler/processor/runner.py b/ingestion/src/metadata/profiler/processor/runner.py index e621476353bb..df3467b044bb 100644 --- a/ingestion/src/metadata/profiler/processor/runner.py +++ b/ingestion/src/metadata/profiler/processor/runner.py @@ -55,8 +55,8 @@ def __init__( self._session = session self.table = table self._sample = sample - self._partition_details = partition_details - self._profile_sample_query = profile_sample_query + self.partition_details = partition_details + self.profile_sample_query = profile_sample_query def _build_query(self, *entities, **kwargs) -> Query: return self._session.query(*entities, **kwargs) @@ -77,7 +77,7 @@ def _select_from_user_query(self, *entities, **kwargs): filter_ = get_query_filter_for_runner(kwargs) user_query = self._session.query(self.table).from_statement( - text(f"{self._profile_sample_query}") + text(f"{self.profile_sample_query}") ) query = self._build_query(*entities, **kwargs).select_from(user_query) @@ -92,7 +92,7 @@ def select_first_from_table(self, *entities, **kwargs): """Select first row from the 
table""" filter_ = get_query_filter_for_runner(kwargs) - if self._profile_sample_query: + if self.profile_sample_query: return self._select_from_user_query(*entities, **kwargs).first() query = self._build_query(*entities, **kwargs).select_from(self.table) @@ -106,7 +106,7 @@ def select_all_from_table(self, *entities, **kwargs): """Select all rows from the table""" filter_ = get_query_filter_for_runner(kwargs) - if self._profile_sample_query: + if self.profile_sample_query: return self._select_from_user_query(*entities, **kwargs).all() query = self._build_query(*entities, **kwargs).select_from(self.table) @@ -126,9 +126,9 @@ def select_all_from_sample(self, *entities, **kwargs): def yield_from_sample(self, *entities, **kwargs): query = self._select_from_sample(*entities, **kwargs) - if self._partition_details: + if self.partition_details: partition_filter = build_partition_predicate( - self._partition_details, + self.partition_details, self.table.__table__.c, ) query.filter(partition_filter) diff --git a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py index aa81fa473a1d..728dfc44239c 100644 --- a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py @@ -14,7 +14,7 @@ its interface """ from copy import deepcopy -from typing import List, Optional, cast +from typing import Optional, cast from sqlalchemy import MetaData from sqlalchemy.orm import DeclarativeMeta @@ -24,7 +24,7 @@ ) from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema -from metadata.generated.schema.entity.data.table import ColumnProfilerConfig, Table +from metadata.generated.schema.entity.data.table import Table from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( DatalakeConnection, ) @@ -47,6 +47,11 @@ from metadata.profiler.processor.core import Profiler from metadata.profiler.processor.default import DefaultProfiler, get_default_metrics from metadata.profiler.source.profiler_source_interface import ProfilerSourceInterface +from metadata.sampler.config import ( + get_config_for_table, + get_exclude_columns, + get_include_columns, +) from metadata.sampler.models import SampleConfig from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.logger import profiler_logger @@ -73,10 +78,8 @@ def __init__( ometa_client: OpenMetadata, global_profiler_configuration: ProfilerConfiguration, ): + self.config = config self.service_conn_config = self._copy_service_config(config, database) - self.source_config = DatabaseServiceProfilerPipeline.model_validate( - config.source.sourceConfig.config - ) self.profiler_config = ProfilerProcessorConfig.model_validate( config.processor.model_dump().get("config") ) @@ -110,60 +113,6 @@ def _build_table_orm(self, entity: Table) -> Optional[DeclarativeMeta]: return ometa_to_sqa_orm(entity, self.ometa_client, self.sqa_metadata) return None - @staticmethod - def get_config_for_table(entity: Table, profiler_config) -> Optional[TableConfig]: - """Get config for a specific entity - - Args: - entity: table entity - """ - for table_config in profiler_config.tableConfig or []: - if table_config.fullyQualifiedName.root == entity.fullyQualifiedName.root: - return table_config - - for schema_config in profiler_config.schemaConfig or []: - if ( - 
schema_config.fullyQualifiedName.root - == entity.databaseSchema.fullyQualifiedName - ): - return TableConfig.from_database_and_schema_config( - schema_config, entity.fullyQualifiedName.root - ) - for database_config in profiler_config.databaseConfig or []: - if ( - database_config.fullyQualifiedName.root - == entity.database.fullyQualifiedName - ): - return TableConfig.from_database_and_schema_config( - database_config, entity.fullyQualifiedName.root - ) - - return None - - def _get_include_columns( - self, entity, entity_config: Optional[TableConfig] - ) -> Optional[List[ColumnProfilerConfig]]: - """get included columns""" - if entity_config and entity_config.columnConfig: - return entity_config.columnConfig.includeColumns - - if entity.tableProfilerConfig: - return entity.tableProfilerConfig.includeColumns - - return None - - def _get_exclude_columns( - self, entity, entity_config: Optional[TableConfig] - ) -> Optional[List[str]]: - """get included columns""" - if entity_config and entity_config.columnConfig: - return entity_config.columnConfig.excludeColumns - - if entity.tableProfilerConfig: - return entity.tableProfilerConfig.excludeColumns - - return None - def _copy_service_config( self, config: OpenMetadataWorkflowConfig, database: DatabaseService ) -> DatabaseConnection: @@ -202,6 +151,9 @@ def create_profiler_interface( db_service: Optional[DatabaseService], ) -> ProfilerInterface: """Create sqlalchemy profiler interface""" + self.source_config = DatabaseServiceProfilerPipeline.model_validate( + self.config.source.sourceConfig.config + ) profiler_class = import_profiler_class( ServiceType.Database, source_type=self._interface_type ) @@ -243,7 +195,7 @@ def get_profiler_runner( """ Returns the runner for the profiler """ - table_config = self.get_config_for_table(entity, profiler_config) + table_config = get_config_for_table(entity, profiler_config) schema_entity, database_entity, db_service = get_context_entities( entity=entity, metadata=self.ometa_client ) @@ -259,8 +211,8 @@ def get_profiler_runner( if not profiler_config.profiler: return DefaultProfiler( profiler_interface=profiler_interface, - include_columns=self._get_include_columns(entity, table_config), - exclude_columns=self._get_exclude_columns(entity, table_config), + include_columns=get_include_columns(entity, table_config), + exclude_columns=get_exclude_columns(entity, table_config), global_profiler_configuration=self.global_profiler_configuration, db_service=db_service, ) @@ -278,7 +230,7 @@ def get_profiler_runner( return Profiler( *metrics, # type: ignore profiler_interface=profiler_interface, - include_columns=self._get_include_columns(entity, table_config), - exclude_columns=self._get_exclude_columns(entity, table_config), + include_columns=get_include_columns(entity, table_config), + exclude_columns=get_exclude_columns(entity, table_config), global_profiler_configuration=self.global_profiler_configuration, ) diff --git a/ingestion/src/metadata/sampler/config.py b/ingestion/src/metadata/sampler/config.py index 5a421d48c18f..a42d83160173 100644 --- a/ingestion/src/metadata/sampler/config.py +++ b/ingestion/src/metadata/sampler/config.py @@ -11,7 +11,7 @@ """ Sampler configuration helpers """ -from typing import Any, Dict, Optional, Union +from typing import Any, Dict, List, Optional, Union from metadata.generated.schema.entity.data.database import ( Database, @@ -21,7 +21,11 @@ DatabaseSchema, DatabaseSchemaProfilerConfig, ) -from metadata.generated.schema.entity.data.table import PartitionProfilerConfig, Table 
+from metadata.generated.schema.entity.data.table import (
+    ColumnProfilerConfig,
+    PartitionProfilerConfig,
+    Table,
+)
 from metadata.generated.schema.entity.services.connections.connectionBasicType import (
     DataStorageConfig,
 )
@@ -196,3 +200,59 @@ def get_sample_data_count_config(
         return config.sampleDataCount
 
     return default_sample_data_count
+
+
+def get_config_for_table(entity: Table, profiler_config) -> Optional[TableConfig]:
+    """Get config for a specific entity
+
+    Args:
+        entity: table entity
+    """
+    for table_config in profiler_config.tableConfig or []:
+        if table_config.fullyQualifiedName.root == entity.fullyQualifiedName.root:
+            return table_config
+
+    for schema_config in profiler_config.schemaConfig or []:
+        if (
+            schema_config.fullyQualifiedName.root
+            == entity.databaseSchema.fullyQualifiedName
+        ):
+            return TableConfig.from_database_and_schema_config(
+                schema_config, entity.fullyQualifiedName.root
+            )
+    for database_config in profiler_config.databaseConfig or []:
+        if (
+            database_config.fullyQualifiedName.root
+            == entity.database.fullyQualifiedName
+        ):
+            return TableConfig.from_database_and_schema_config(
+                database_config, entity.fullyQualifiedName.root
+            )
+
+    return None
+
+
+def get_include_columns(
+    entity, entity_config: Optional[TableConfig]
+) -> Optional[List[ColumnProfilerConfig]]:
+    """get included columns"""
+    if entity_config and entity_config.columnConfig:
+        return entity_config.columnConfig.includeColumns
+
+    if entity.tableProfilerConfig:
+        return entity.tableProfilerConfig.includeColumns
+
+    return None
+
+
+def get_exclude_columns(
+    entity, entity_config: Optional[TableConfig]
+) -> Optional[List[str]]:
+    """get excluded columns"""
+    if entity_config and entity_config.columnConfig:
+        return entity_config.columnConfig.excludeColumns
+
+    if entity.tableProfilerConfig:
+        return entity.tableProfilerConfig.excludeColumns
+
+    return None
diff --git a/ingestion/src/metadata/sampler/nosql/sampler.py b/ingestion/src/metadata/sampler/nosql/sampler.py
index 199dc1143a55..6eeb75deca31 100644
--- a/ingestion/src/metadata/sampler/nosql/sampler.py
+++ b/ingestion/src/metadata/sampler/nosql/sampler.py
@@ -87,3 +87,8 @@ def transpose_records(
                 row.append(record.get(column.name))
             rows.append(row)
         return rows, columns
+
+    def get_columns(self) -> List[Optional[SQALikeColumn]]:
+        return [
+            SQALikeColumn(name=c.name.root, type=c.dataType) for c in self.table.columns
+        ]
diff --git a/ingestion/src/metadata/sampler/pandas/sampler.py b/ingestion/src/metadata/sampler/pandas/sampler.py
index 15cc5d760df7..3124ce9889ee 100644
--- a/ingestion/src/metadata/sampler/pandas/sampler.py
+++ b/ingestion/src/metadata/sampler/pandas/sampler.py
@@ -14,6 +14,7 @@
 """
 import math
 import random
+from copy import deepcopy
 from typing import List, Optional, cast
 
 from metadata.data_quality.validations.table.pandas.tableRowInsertedCountToBeBetween import (
@@ -27,6 +28,8 @@
 )
 from metadata.mixins.pandas.pandas_mixin import PandasInterfaceMixin
 from metadata.sampler.sampler_interface import SamplerInterface
+from metadata.utils.constants import COMPLEX_COLUMN_SEPARATOR
+from metadata.utils.datalake.datalake_utils import GenericDataFrameColumnParser
 from metadata.utils.sqa_like_column import SQALikeColumn
 
 
@@ -36,14 +39,22 @@ class DatalakeSampler(SamplerInterface, PandasInterfaceMixin):
     run the query in the whole table.
""" + def __init__(self, *args, **kwargs): + """Init the pandas sampler""" + super().__init__(*args, **kwargs) + self._table = None + self.complex_dataframe_sample = deepcopy(self.random_sample(is_sampled=True)) + @property def table(self): - return self.return_ometa_dataframes_sampled( - service_connection_config=self.service_connection_config, - client=self.client._client, - table=self.entity, - profile_sample_config=self.sample_config.profile_sample, - ) + if not self._table: + self._table = self.return_ometa_dataframes_sampled( + service_connection_config=self.service_connection_config, + client=self.client._client, + table=self.entity, + profile_sample_config=self.sample_config.profile_sample, + ) + return self._table def get_client(self): return self.connection.client @@ -183,3 +194,30 @@ def fetch_sample_data( cols, rows = self.get_col_row(data_frame=self.table, columns=columns) return TableData(columns=cols, rows=rows) + + def get_columns(self) -> List[Optional[SQALikeColumn]]: + """Get SQALikeColumns for datalake to be passed for metric computation""" + sqalike_columns = [] + if self.complex_dataframe_sample: + for column_name in self.complex_dataframe_sample[0].columns: + complex_col_name = None + if COMPLEX_COLUMN_SEPARATOR in column_name: + complex_col_name = ".".join( + column_name.split(COMPLEX_COLUMN_SEPARATOR)[1:] + ) + if complex_col_name: + for df in self.complex_dataframe_sample: + df.rename( + columns={column_name: complex_col_name}, inplace=True + ) + column_name = complex_col_name or column_name + sqalike_columns.append( + SQALikeColumn( + column_name, + GenericDataFrameColumnParser.fetch_col_types( + self.complex_dataframe_sample[0], column_name + ), + ) + ) + return sqalike_columns + return [] diff --git a/ingestion/src/metadata/sampler/processor.py b/ingestion/src/metadata/sampler/processor.py index d25f6d6de075..0b630d8a3bf9 100644 --- a/ingestion/src/metadata/sampler/processor.py +++ b/ingestion/src/metadata/sampler/processor.py @@ -13,6 +13,7 @@ """ import traceback from copy import deepcopy +from functools import lru_cache from typing import Optional, cast from sqlalchemy import MetaData @@ -39,15 +40,22 @@ from metadata.ingestion.api.steps import Processor from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.profiler.orm.converter.base import ometa_to_sqa_orm -from metadata.profiler.source.database.base.profiler_source import ( - NON_SQA_DATABASE_CONNECTIONS, -) from metadata.profiler.source.metadata import ProfilerSourceAndEntity from metadata.sampler.models import SampleConfig, SampleData, SamplerResponse from metadata.sampler.sampler_interface import SamplerInterface from metadata.utils.profiler_utils import get_context_entities from metadata.utils.service_spec.service_spec import import_sampler_class +NON_SQA_DATABASE_CONNECTIONS = ("Datalake",) + + +@lru_cache +def _get_sqa_metadata(conn_type: str) -> Optional[MetaData]: + """Set sqlalchemy metadata""" + if conn_type not in NON_SQA_DATABASE_CONNECTIONS: + return MetaData() + return None + class SamplerProcessor(Processor): """Use the profiler interface to fetch the sample data""" @@ -57,13 +65,15 @@ def __init__(self, config: OpenMetadataWorkflowConfig, metadata: OpenMetadata): self.config = config self.metadata = metadata - self.sqa_metadata = self._set_sqa_metadata() self.source_config: DatabaseServiceSamplerPipeline = cast( DatabaseServiceSamplerPipeline, self.config.source.sourceConfig.config ) # Used to satisfy type checked self._interface_type: str = config.source.type.lower() + 
self.sampler_class = import_sampler_class( + ServiceType.Database, source_type=self._interface_type + ) @property def name(self) -> str: @@ -77,16 +87,12 @@ def _run(self, record: ProfilerSourceAndEntity) -> Either[SamplerResponse]: schema_entity, database_entity, db_service = get_context_entities( entity=entity, metadata=self.metadata ) - self.service_conn_config = self._copy_service_config( - self.config, db_service - ) + service_conn_config = self._copy_service_config(self.config, db_service) + sqa_metadata = _get_sqa_metadata(str(service_conn_config.type.value)) - sampler_class = import_sampler_class( - ServiceType.Database, source_type=self._interface_type - ) - _orm = self._build_table_orm(entity) - sampler_interface: SamplerInterface = sampler_class.create( - service_connection_config=self.service_conn_config, + _orm = self._build_table_orm(entity, sqa_metadata) + sampler_interface: SamplerInterface = self.sampler_class.create( + service_connection_config=service_conn_config, ometa_client=self.metadata, entity=entity, schema_entity=schema_entity, @@ -107,13 +113,13 @@ def _run(self, record: ProfilerSourceAndEntity) -> Either[SamplerResponse]: return Either( right=SamplerResponse( table=entity, - sampleData=sample_data, + sample_data=sample_data, ) ) except Exception as exc: - self.status.failed( - StackTraceError( + return Either( + left=StackTraceError( name=record.entity.fullyQualifiedName.root, error=f"Unexpected exception processing entity {record.entity.fullyQualifiedName.root}: {exc}", stackTrace=traceback.format_exc(), @@ -130,17 +136,15 @@ def create( config = parse_workflow_config_gracefully(config_dict) return cls(config=config, metadata=metadata) - def _set_sqa_metadata(self): - """Set sqlalchemy metadata""" - if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): - return MetaData() - return None - - def _build_table_orm(self, entity: Table) -> Optional[DeclarativeMeta]: + def _build_table_orm( + self, entity: Table, sqa_metadata: Optional[MetaData] + ) -> Optional[DeclarativeMeta]: """Build the ORM table if needed for the sampler and profiler interfaces""" - if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): - return ometa_to_sqa_orm(entity, self.metadata, self.sqa_metadata) - return None + return ( + ometa_to_sqa_orm(entity, self.metadata, sqa_metadata) + if sqa_metadata + else None + ) def _copy_service_config( self, config: OpenMetadataWorkflowConfig, database: DatabaseService diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py index 48ef04c4282c..1501335b31f2 100644 --- a/ingestion/src/metadata/sampler/sampler_interface.py +++ b/ingestion/src/metadata/sampler/sampler_interface.py @@ -13,13 +13,15 @@ """ import traceback from abc import ABC, abstractmethod -from typing import Dict, List, Optional, Union - -from sqlalchemy import Column +from typing import Dict, List, Optional, Set, Union from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema -from metadata.generated.schema.entity.data.table import Table, TableData +from metadata.generated.schema.entity.data.table import ( + ColumnProfilerConfig, + Table, + TableData, +) from metadata.generated.schema.entity.services.connections.connectionBasicType import ( DataStorageConfig, ) @@ -30,7 +32,12 @@ from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.profiler.api.models import TableConfig from 
metadata.profiler.processor.sample_data_handler import upload_sample_data -from metadata.sampler.config import get_sample_data_count_config, get_sample_query +from metadata.sampler.config import ( + get_exclude_columns, + get_include_columns, + get_sample_data_count_config, + get_sample_query, +) from metadata.sampler.models import SampleConfig from metadata.sampler.partition import get_partition_details from metadata.utils.constants import SAMPLE_DATA_DEFAULT_COUNT @@ -50,6 +57,8 @@ def __init__( service_connection_config: Union[DatabaseConnection, DatalakeConnection], ometa_client: OpenMetadata, entity: Table, + include_columns: Optional[List[ColumnProfilerConfig]] = None, + exclude_columns: Optional[List[str]] = None, sample_config: Optional[SampleConfig] = None, partition_details: Optional[Dict] = None, sample_query: Optional[str] = None, @@ -58,10 +67,16 @@ def __init__( **kwargs, ): self.ometa_client = ometa_client - self._sample_rows = None + self._sample = None + self._columns: Optional[List[SQALikeColumn]] = None self.sample_config = sample_config + if not self.sample_config.profile_sample: + self.sample_config.profile_sample = 100 + self.entity = entity + self.include_columns = include_columns + self.exclude_columns = exclude_columns self.sample_query = sample_query self.sample_limit = sample_data_count self.partition_details = partition_details @@ -94,15 +109,17 @@ def create( entity_config=table_config, default_sample_data_count=default_sample_data_count, ) - sample_query = get_sample_query(entity=entity, entity_config=table_config) - partition_details = get_partition_details(entity=entity) + include_columns = get_include_columns(entity, table_config) + exclude_columns = get_exclude_columns(entity, table_config) return cls( service_connection_config=service_connection_config, ometa_client=ometa_client, entity=entity, + include_columns=include_columns, + exclude_columns=exclude_columns, sample_config=sample_config, partition_details=partition_details, sample_query=sample_query, @@ -111,6 +128,44 @@ def create( **kwargs, ) + @property + def columns(self) -> List[SQALikeColumn]: + """ + Return the list of columns to profile + by skipping the columns to ignore. 
+ """ + + if self._columns: + return self._columns + + if self._get_included_columns(): + self._columns = [ + column + for column in self.get_columns() + if column.name in self._get_included_columns() + ] + + if not self._get_included_columns(): + self._columns = [ + column + for column in self._columns or self.get_columns() + if column.name not in self._get_excluded_columns() + ] + + return self._columns + + def _get_excluded_columns(self) -> Optional[Set[str]]: + """Get excluded columns for table being profiled""" + if self.exclude_columns: + return set(self.exclude_columns) + return set() + + def _get_included_columns(self) -> Optional[Set[str]]: + """Get include columns for table being profiled""" + if self.include_columns: + return {include_col.columnName for include_col in self.include_columns} + return set() + @property @abstractmethod def table(self): @@ -138,9 +193,7 @@ def random_sample(self, **kwargs): raise NotImplementedError @abstractmethod - def fetch_sample_data( - self, columns: Optional[Union[List[Column], List[SQALikeColumn]]] - ) -> TableData: + def fetch_sample_data(self, columns: Optional[List[SQALikeColumn]]) -> TableData: """Fetch sample data Args: @@ -148,6 +201,11 @@ def fetch_sample_data( """ raise NotImplementedError + @abstractmethod + def get_columns(self) -> List[SQALikeColumn]: + """get columns""" + raise NotImplementedError + @calculate_execution_time(store=False) def generate_sample_data(self) -> Optional[TableData]: """Fetch and ingest sample data @@ -157,8 +215,7 @@ def generate_sample_data(self) -> Optional[TableData]: """ try: logger.debug( - "Fetching sample data for " - f"{self.profiler_interface.table_entity.fullyQualifiedName.root}..." # type: ignore + f"Fetching sample data for {self.entity.fullyQualifiedName.root}..." 
) table_data = self.fetch_sample_data(self.columns) # Only store the data if configured to do so diff --git a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py index 4f3511e26d25..89c811ee462f 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py @@ -268,3 +268,7 @@ def get_partitioned_query(self) -> Query: False, ) ) + + def get_columns(self): + """get columns from entity""" + return list(inspect(self.table).c) From da82e91b8f8339d8543ffc356af2ed769856aa12 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Thu, 14 Nov 2024 18:07:20 +0100 Subject: [PATCH 10/29] fix --- ingestion/pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/ingestion/pyproject.toml b/ingestion/pyproject.toml index 359348498cde..9192fa2ff434 100644 --- a/ingestion/pyproject.toml +++ b/ingestion/pyproject.toml @@ -273,6 +273,7 @@ ignore = [ "src/metadata/workflow/metadata.py", "src/metadata/workflow/profiler.py", "src/metadata/workflow/usage.py", + "src/metadata/workflow/sampler.py", "src/metadata/workflow/workflow_status_mixin.py", ] From c8956cb156b01824d88bc6cb6e1e3f8db58fe2f2 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Thu, 14 Nov 2024 18:38:30 +0100 Subject: [PATCH 11/29] copy db --- .../source/database/base/profiler_source.py | 4 ++-- .../source/database/bigquery/profiler_source.py | 6 +++--- .../source/database/databricks/profiler_source.py | 5 ++--- ingestion/src/metadata/sampler/processor.py | 14 +++++++------- .../src/metadata/sampler/sampler_interface.py | 2 +- 5 files changed, 15 insertions(+), 16 deletions(-) diff --git a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py index 728dfc44239c..9385ac0c0de3 100644 --- a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py @@ -74,7 +74,7 @@ class ProfilerSource(ProfilerSourceInterface): def __init__( self, config: OpenMetadataWorkflowConfig, - database: DatabaseService, + database: Database, ometa_client: OpenMetadata, global_profiler_configuration: ProfilerConfiguration, ): @@ -114,7 +114,7 @@ def _build_table_orm(self, entity: Table) -> Optional[DeclarativeMeta]: return None def _copy_service_config( - self, config: OpenMetadataWorkflowConfig, database: DatabaseService + self, config: OpenMetadataWorkflowConfig, database: Database ) -> DatabaseConnection: """Make a copy of the service config and update the database name diff --git a/ingestion/src/metadata/profiler/source/database/bigquery/profiler_source.py b/ingestion/src/metadata/profiler/source/database/bigquery/profiler_source.py index 3959264ab51f..92b6ef280b4c 100644 --- a/ingestion/src/metadata/profiler/source/database/bigquery/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/database/bigquery/profiler_source.py @@ -15,10 +15,10 @@ from copy import deepcopy +from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.services.connections.database.bigQueryConnection import ( BigQueryConnection, ) -from metadata.generated.schema.entity.services.databaseService import DatabaseService from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, ) @@ -34,14 +34,14 @@ class BigQueryProfilerSource(ProfilerSource): """override the base profiler source to handle BigQuery 
specific connection configs""" def _copy_service_config( - self, config: OpenMetadataWorkflowConfig, database: DatabaseService + self, config: OpenMetadataWorkflowConfig, database: Database ) -> BigQueryConnection: """Make a copy of the database connection config. If MultiProjectId is used, replace it with SingleProjectId with the database name being profiled. We iterate over all non filtered database in workflow.py `def execute`. Args: - database (DatabaseService): a database entity + database (Database): a database entity Returns: DatabaseConnection diff --git a/ingestion/src/metadata/profiler/source/database/databricks/profiler_source.py b/ingestion/src/metadata/profiler/source/database/databricks/profiler_source.py index f894272ada44..fdd982caca4e 100644 --- a/ingestion/src/metadata/profiler/source/database/databricks/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/database/databricks/profiler_source.py @@ -1,9 +1,8 @@ """Extend the ProfilerSource class to add support for Databricks is_disconnect SQA method""" - from metadata.generated.schema.configuration.profilerConfiguration import ( ProfilerConfiguration, ) -from metadata.generated.schema.entity.services.databaseService import DatabaseService +from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, ) @@ -25,7 +24,7 @@ class DataBricksProfilerSource(ProfilerSource): def __init__( self, config: OpenMetadataWorkflowConfig, - database: DatabaseService, + database: Database, ometa_client: OpenMetadata, global_profiler_config: ProfilerConfiguration, ): diff --git a/ingestion/src/metadata/sampler/processor.py b/ingestion/src/metadata/sampler/processor.py index 0b630d8a3bf9..b856c41bb983 100644 --- a/ingestion/src/metadata/sampler/processor.py +++ b/ingestion/src/metadata/sampler/processor.py @@ -19,11 +19,9 @@ from sqlalchemy import MetaData from sqlalchemy.orm import DeclarativeMeta +from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.data.table import Table -from metadata.generated.schema.entity.services.databaseService import ( - DatabaseConnection, - DatabaseService, -) +from metadata.generated.schema.entity.services.databaseService import DatabaseConnection from metadata.generated.schema.entity.services.ingestionPipelines.status import ( StackTraceError, ) @@ -84,10 +82,12 @@ def _run(self, record: ProfilerSourceAndEntity) -> Either[SamplerResponse]: try: entity = cast(Table, record.entity) - schema_entity, database_entity, db_service = get_context_entities( + schema_entity, database_entity, _ = get_context_entities( entity=entity, metadata=self.metadata ) - service_conn_config = self._copy_service_config(self.config, db_service) + service_conn_config = self._copy_service_config( + self.config, database_entity + ) sqa_metadata = _get_sqa_metadata(str(service_conn_config.type.value)) _orm = self._build_table_orm(entity, sqa_metadata) @@ -147,7 +147,7 @@ def _build_table_orm( ) def _copy_service_config( - self, config: OpenMetadataWorkflowConfig, database: DatabaseService + self, config: OpenMetadataWorkflowConfig, database: Database ) -> DatabaseConnection: """Make a copy of the service config and update the database name diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py index 1501335b31f2..ba95590d3436 100644 --- a/ingestion/src/metadata/sampler/sampler_interface.py +++ 
b/ingestion/src/metadata/sampler/sampler_interface.py @@ -233,4 +233,4 @@ def generate_sample_data(self) -> Optional[TableData]: except Exception as err: logger.debug(traceback.format_exc()) logger.warning(f"Error fetching sample data: {err}") - return None + raise err From 3a56ec877048b0b15de95679bc8eb8bbad39ae2e Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Fri, 15 Nov 2024 10:05:10 +0100 Subject: [PATCH 12/29] fix tests and remove unused method --- .../runner/base_test_suite_source.py | 2 +- .../table_diff_params_setter.py | 4 +- .../sqlalchemy/profiler_interface.py | 1 - .../src/metadata/profiler/processor/core.py | 5 - .../source/database/base/profiler_source.py | 2 +- ingestion/src/metadata/sampler/config.py | 49 ++--- ingestion/src/metadata/sampler/partition.py | 8 +- ingestion/src/metadata/sampler/processor.py | 2 +- .../src/metadata/sampler/sampler_interface.py | 12 +- .../test_table_diff_param_setter.py | 67 ++++--- .../tests/unit/profiler/pandas/test_sample.py | 183 ++++++++++++------ .../snowflake/test_sampling_method.py | 6 +- .../unit/profiler/sqlalchemy/test_profiler.py | 9 +- .../unit/profiler/sqlalchemy/test_runner.py | 4 +- .../unit/profiler/sqlalchemy/test_sample.py | 10 +- .../unit/profiler/test_profiler_interface.py | 92 ++++----- ingestion/tests/unit/test_partition.py | 2 +- 17 files changed, 251 insertions(+), 207 deletions(-) diff --git a/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py b/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py index 697f85195c2d..4afd8219beea 100644 --- a/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py +++ b/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py @@ -136,7 +136,7 @@ def create_data_quality_interface(self) -> TestSuiteInterface: entity=self.entity, schema_entity=schema_entity, database_entity=database_entity, - sample_config=SampleConfig( + default_sample_config=SampleConfig( profile_sample=self.source_config.profileSample, profile_sample_type=self.source_config.profileSampleType, sampling_method_type=self.source_config.samplingMethodType, diff --git a/ingestion/src/metadata/data_quality/validations/runtime_param_setter/table_diff_params_setter.py b/ingestion/src/metadata/data_quality/validations/runtime_param_setter/table_diff_params_setter.py index 09d713d9ee8d..772a0d0e46db 100644 --- a/ingestion/src/metadata/data_quality/validations/runtime_param_setter/table_diff_params_setter.py +++ b/ingestion/src/metadata/data_quality/validations/runtime_param_setter/table_diff_params_setter.py @@ -122,8 +122,8 @@ def build_where_clause(self, test_case) -> Optional[str]: partition_where_clause = ( None if not ( - self.sampler._partition_details - and self.sampler._partition_details.enablePartitioning + self.sampler.partition_details + and self.sampler.partition_details.enablePartitioning ) else self.sampler.get_partitioned_query().whereclause.compile( compile_kwargs={"literal_binds": True} diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index 373ca139061d..512b5f37e434 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -83,7 +83,6 @@ class SQAProfilerInterface(ProfilerInterface, SQAInterfaceMixin): """ # pylint: disable=too-many-arguments - def __init__( self, service_connection_config: Union[DatabaseConnection, 
DatalakeConnection], diff --git a/ingestion/src/metadata/profiler/processor/core.py b/ingestion/src/metadata/profiler/processor/core.py index fded65b23d13..460830e726e1 100644 --- a/ingestion/src/metadata/profiler/processor/core.py +++ b/ingestion/src/metadata/profiler/processor/core.py @@ -245,11 +245,6 @@ def get_custom_metrics( return column.customMetrics or None return None - @property - def sample(self): - """Return the sample used for the profiler""" - return self.profiler_interface.sample - def validate_composed_metric(self) -> None: """ Make sure that all composed metrics have diff --git a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py index 9385ac0c0de3..cce8716418be 100644 --- a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py @@ -169,7 +169,7 @@ def create_profiler_interface( schema_entity=schema_entity, database_entity=database_entity, table_config=config, - sample_config=SampleConfig( + default_sample_config=SampleConfig( profile_sample=self.source_config.profileSample, profile_sample_type=self.source_config.profileSampleType, sampling_method_type=self.source_config.samplingMethodType, diff --git a/ingestion/src/metadata/sampler/config.py b/ingestion/src/metadata/sampler/config.py index a42d83160173..e5c5239f6ae6 100644 --- a/ingestion/src/metadata/sampler/config.py +++ b/ingestion/src/metadata/sampler/config.py @@ -21,18 +21,11 @@ DatabaseSchema, DatabaseSchemaProfilerConfig, ) -from metadata.generated.schema.entity.data.table import ( - ColumnProfilerConfig, - PartitionProfilerConfig, - Table, -) +from metadata.generated.schema.entity.data.table import ColumnProfilerConfig, Table from metadata.generated.schema.entity.services.connections.connectionBasicType import ( DataStorageConfig, ) from metadata.generated.schema.entity.services.databaseService import DatabaseService -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( - DatabaseServiceProfilerPipeline, -) from metadata.profiler.api.models import ProfilerProcessorConfig from metadata.profiler.config import ( get_database_profiler_config, @@ -105,18 +98,23 @@ def get_storage_config_for_table( def get_profile_sample_config( entity: Table, - schema_profiler_config: Optional[DatabaseSchemaProfilerConfig], - database_profiler_config: Optional[DatabaseProfilerConfig], + schema_entity: Optional[DatabaseSchema], + database_entity: Optional[Database], entity_config: Optional[Union[TableConfig, DatabaseAndSchemaConfig]], - source_config: DatabaseServiceProfilerPipeline, -) -> Optional[SampleConfig]: + default_sample_config: Optional[SampleConfig], +) -> SampleConfig: """Get profile sample config for a specific entity""" + schema_profiler_config = get_schema_profiler_config(schema_entity=schema_entity) + database_profiler_config = get_database_profiler_config( + database_entity=database_entity + ) + for config in ( entity_config, entity.tableProfilerConfig, schema_profiler_config, database_profiler_config, - source_config, + default_sample_config, ): try: if config and config.profileSample: @@ -128,26 +126,7 @@ def get_profile_sample_config( except AttributeError: pass - return None - - -def get_partition_details( - entity: Table, - entity_config: Optional[TableConfig] = None, -) -> Optional[PartitionProfilerConfig]: - """_summary_ - - Args: - entity (Table): table entity object - entity_config 
(Optional[TableConfig]): entity configuration - - Returns: - Optional[PartitionProfilerConfig]: - """ - if entity_config: - return entity_config.partitionConfig - - return get_partition_details(entity) + return SampleConfig() def get_sample_query( @@ -173,8 +152,8 @@ def get_sample_query( def get_sample_data_count_config( entity: Table, - schema_entity: DatabaseSchema, - database_entity: Database, + schema_entity: Optional[DatabaseSchema], + database_entity: Optional[Database], entity_config: Optional[TableConfig], default_sample_data_count: int, ) -> Optional[int]: diff --git a/ingestion/src/metadata/sampler/partition.py b/ingestion/src/metadata/sampler/partition.py index 020e8a0f9205..703dd7bb6ff3 100644 --- a/ingestion/src/metadata/sampler/partition.py +++ b/ingestion/src/metadata/sampler/partition.py @@ -25,6 +25,7 @@ from metadata.generated.schema.entity.services.databaseService import ( DatabaseServiceType, ) +from metadata.sampler.models import TableConfig def validate_athena_injected_partitioning( @@ -66,7 +67,9 @@ def validate_athena_injected_partitioning( ) -def get_partition_details(entity: Table) -> Optional[PartitionProfilerConfig]: +def get_partition_details( + entity: Table, entity_config: Optional[TableConfig] = None +) -> Optional[PartitionProfilerConfig]: """Build PartitionProfilerConfig object from entity Args: @@ -74,6 +77,9 @@ def get_partition_details(entity: Table) -> Optional[PartitionProfilerConfig]: Returns: PartitionProfilerConfig """ + if entity_config: + return entity_config.partitionConfig + # Gather service type information service_type = getattr(entity, "serviceType", None) diff --git a/ingestion/src/metadata/sampler/processor.py b/ingestion/src/metadata/sampler/processor.py index b856c41bb983..ecd126674102 100644 --- a/ingestion/src/metadata/sampler/processor.py +++ b/ingestion/src/metadata/sampler/processor.py @@ -97,7 +97,7 @@ def _run(self, record: ProfilerSourceAndEntity) -> Either[SamplerResponse]: entity=entity, schema_entity=schema_entity, database_entity=database_entity, - sample_config=SampleConfig( + default_sample_config=SampleConfig( profile_sample=self.source_config.profileSample, profile_sample_type=self.source_config.profileSampleType, sampling_method_type=self.source_config.samplingMethodType, diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py index ba95590d3436..10dd8fe714d2 100644 --- a/ingestion/src/metadata/sampler/sampler_interface.py +++ b/ingestion/src/metadata/sampler/sampler_interface.py @@ -35,6 +35,7 @@ from metadata.sampler.config import ( get_exclude_columns, get_include_columns, + get_profile_sample_config, get_sample_data_count_config, get_sample_query, ) @@ -59,7 +60,7 @@ def __init__( entity: Table, include_columns: Optional[List[ColumnProfilerConfig]] = None, exclude_columns: Optional[List[str]] = None, - sample_config: Optional[SampleConfig] = None, + sample_config: SampleConfig = SampleConfig(), partition_details: Optional[Dict] = None, sample_query: Optional[str] = None, storage_config: DataStorageConfig = None, @@ -96,7 +97,7 @@ def create( database_entity: Database, table_config: Optional[TableConfig] = None, storage_config: Optional[DataStorageConfig] = None, - sample_config: Optional[SampleConfig] = None, + default_sample_config: Optional[SampleConfig] = None, default_sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, **kwargs, ) -> "SamplerInterface": @@ -109,6 +110,13 @@ def create( entity_config=table_config, 
default_sample_data_count=default_sample_data_count, ) + sample_config = get_profile_sample_config( + entity=entity, + schema_entity=schema_entity, + database_entity=database_entity, + entity_config=table_config, + default_sample_config=default_sample_config, + ) sample_query = get_sample_query(entity=entity, entity_config=table_config) partition_details = get_partition_details(entity=entity) include_columns = get_include_columns(entity, table_config) diff --git a/ingestion/tests/unit/metadata/data_quality/test_table_diff_param_setter.py b/ingestion/tests/unit/metadata/data_quality/test_table_diff_param_setter.py index de8cda77b2c9..0025d4b92f69 100644 --- a/ingestion/tests/unit/metadata/data_quality/test_table_diff_param_setter.py +++ b/ingestion/tests/unit/metadata/data_quality/test_table_diff_param_setter.py @@ -1,4 +1,4 @@ -from unittest.mock import Mock +from unittest.mock import Mock, patch from uuid import uuid4 import pytest @@ -38,7 +38,7 @@ from metadata.generated.schema.type.basic import EntityLink from metadata.generated.schema.type.entityReference import EntityReference from metadata.ingestion.connections.session import create_and_bind_session -from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler +from metadata.sampler.sqlalchemy.sampler import SQASampler MOCK_TABLE = Table( id=uuid4(), @@ -51,6 +51,14 @@ ), ], ) +SERVICE_CONNECTION_CONFIG = MysqlConnection( + username="test", + authType=BasicAuth( + password="test", + ), + hostPort="localhost:5432", + databaseSchema="mysql_db", +) @pytest.mark.parametrize( @@ -78,16 +86,7 @@ DatabaseService( id="85811038-099a-11ed-861d-0242ac120002", name="mysql", - connection=DatabaseConnection( - config=MysqlConnection( - username="test", - authType=BasicAuth( - password="test", - ), - hostPort="localhost:5432", - databaseSchema="mysql_db", - ) - ), + connection=DatabaseConnection(config=SERVICE_CONNECTION_CONFIG), serviceType=DatabaseServiceType.Mysql, ), "mysql://test:test@localhost:5432/mysql_db", @@ -149,21 +148,29 @@ class MyTable(Base): my_column = SAColumn(String(30)) metadata_obj.create_all(engine) - mock_sampler = SQASampler(session, MyTable, Mock()) - mock_sampler._partition_details = input - setter = TableDiffParamsSetter(None, None, MOCK_TABLE, mock_sampler) - test_case = TestCase( - name="test", - testDefinition=EntityReference(id=uuid4(), type="testDefinition"), - testSuite=EntityReference(id=uuid4(), type="testSuite"), - entityLink=EntityLink( - root="<#E::table::POSTGRES_SERVICE.dvdrental.public.customer>" - ), - parameterValues=[ - TestCaseParameterValue( - name="run", - value="y", - ) - ], - ) - assert setter.build_where_clause(test_case) == expected + + with patch.object(SQASampler, "get_client") as mock_get_client: + mock_get_client.return_value = session + mock_sampler = SQASampler( + service_connection_config=SERVICE_CONNECTION_CONFIG, + ometa_client=Mock(), + entity=Mock(), + orm_table=MyTable, + ) + mock_sampler.partition_details = input + setter = TableDiffParamsSetter(None, None, MOCK_TABLE, mock_sampler) + test_case = TestCase( + name="test", + testDefinition=EntityReference(id=uuid4(), type="testDefinition"), + testSuite=EntityReference(id=uuid4(), type="testSuite"), + entityLink=EntityLink( + root="<#E::table::POSTGRES_SERVICE.dvdrental.public.customer>" + ), + parameterValues=[ + TestCaseParameterValue( + name="run", + value="y", + ) + ], + ) + assert setter.build_where_clause(test_case) == expected diff --git a/ingestion/tests/unit/profiler/pandas/test_sample.py 
b/ingestion/tests/unit/profiler/pandas/test_sample.py index c29712a3e170..84be75703317 100644 --- a/ingestion/tests/unit/profiler/pandas/test_sample.py +++ b/ingestion/tests/unit/profiler/pandas/test_sample.py @@ -14,6 +14,7 @@ """ import os from unittest import TestCase, mock +from unittest.mock import Mock, patch from uuid import uuid4 import pytest @@ -26,13 +27,13 @@ DatalakeConnection, ) from metadata.generated.schema.type.entityReference import EntityReference -from metadata.profiler.api.models import ProfileSampleConfig from metadata.profiler.interface.pandas.profiler_interface import ( PandasProfilerInterface, ) from metadata.profiler.metrics.registry import Metrics from metadata.profiler.processor.core import Profiler -from metadata.profiler.processor.sampler.pandas.sampler import DatalakeSampler +from metadata.sampler.models import SampleConfig +from metadata.sampler.pandas.sampler import DatalakeSampler Base = declarative_base() @@ -140,66 +141,102 @@ class DatalakeSampleTest(TestCase): return_value=FakeConnection(), ) @mock.patch( - "metadata.mixins.pandas.pandas_mixin.fetch_dataframe", - return_value=[df1, pd.concat([df2, pd.DataFrame(index=df1.index)])], + "metadata.sampler.sampler_interface.get_ssl_connection", + return_value=FakeConnection(), ) - def setUpClass(cls, mock_get_connection, mocked_dfs) -> None: + def setUpClass(cls, mock_get_connection, mock_sample_get_connection) -> None: """ Prepare Ingredients """ - cls.datalake_profiler_interface = PandasProfilerInterface( - entity=cls.table_entity, - service_connection_config=DatalakeConnection(configSource={}), - storage_config=None, - ometa_client=None, - thread_count=None, - profile_sample_config=ProfileSampleConfig(profile_sample=50.0), - source_config=None, - sample_query=None, - table_partition_config=None, - ) + with ( + patch.object( + DatalakeSampler, "table", new_callable=lambda: [cls.df1, cls.df2] + ), + patch.object(DatalakeSampler, "get_client") as mock_client, + ): + mock_client.return_value = Mock() + sampler = DatalakeSampler( + service_connection_config=DatalakeConnection(configSource={}), + ometa_client=None, + entity=cls.table_entity, + default_sample_config=SampleConfig(profile_sample=50.0), + ) + cls.datalake_profiler_interface = PandasProfilerInterface( + service_connection_config=DatalakeConnection(configSource={}), + ometa_client=None, + entity=cls.table_entity, + source_config=None, + sampler=sampler, + thread_count=None, + ) - def test_random_sampler(self): + @mock.patch( + "metadata.sampler.sampler_interface.get_ssl_connection", + return_value=FakeConnection(), + ) + def test_random_sampler(self, _): """ The random sampler should be able to generate a random subset of data """ - sampler = DatalakeSampler( - client=FakeConnection().client, - table=[self.df1, self.df2], - profile_sample_config=ProfileSampleConfig(profile_sample=50.0), - ) - random_sample = sampler.random_sample() - res = sum(len(r) for r in random_sample) - assert res < 5 + with ( + patch.object( + DatalakeSampler, "table", new_callable=lambda: [self.df1, self.df2] + ), + patch.object(DatalakeSampler, "get_client") as mock_client, + ): + mock_client.return_value = Mock() + sampler = DatalakeSampler( + service_connection_config=DatalakeConnection(configSource={}), + ometa_client=None, + entity=self.table_entity, + default_sample_config=SampleConfig(profile_sample=50.0), + ) + random_sample = sampler.random_sample() + res = sum(len(r) for r in random_sample) + assert res < 5 @mock.patch( 
"metadata.profiler.interface.profiler_interface.get_ssl_connection", return_value=FakeConnection(), ) + @mock.patch( + "metadata.sampler.sampler_interface.get_ssl_connection", + return_value=FakeConnection(), + ) @mock.patch( "metadata.mixins.pandas.pandas_mixin.fetch_dataframe", return_value=[df1, pd.concat([df2, pd.DataFrame(index=df1.index)])], ) - def test_sample_property(self, mock_get_connection, mocked_dfs): + def test_sample_property(self, *_): """ Sample property should be properly generated """ - datalake_profiler_interface = PandasProfilerInterface( - entity=self.table_entity, - service_connection_config=DatalakeConnection(configSource={}), - storage_config=None, - ometa_client=None, - thread_count=None, - profile_sample_config=ProfileSampleConfig(profile_sample=50.0), - source_config=None, - sample_query=None, - table_partition_config=None, - ) + with ( + patch.object( + DatalakeSampler, "table", new_callable=lambda: [self.df1, self.df2] + ), + patch.object(DatalakeSampler, "get_client") as mock_client, + ): + mock_client.return_value = Mock() + sampler = DatalakeSampler( + service_connection_config=DatalakeConnection(configSource={}), + ometa_client=None, + entity=self.table_entity, + default_sample_config=SampleConfig(profile_sample=50.0), + ) + datalake_profiler_interface = PandasProfilerInterface( + service_connection_config=DatalakeConnection(configSource={}), + ometa_client=None, + entity=self.table_entity, + source_config=None, + sampler=sampler, + thread_count=None, + ) - random_sample = datalake_profiler_interface.sampler.random_sample() - res = sum(len(r) for r in random_sample) - assert res < 5 + random_sample = datalake_profiler_interface.sampler.random_sample() + res = sum(len(r) for r in random_sample) + assert res < 5 def test_table_row_count(self): """ @@ -212,7 +249,8 @@ def test_table_row_count(self): profiler_interface=self.datalake_profiler_interface, ) res = profiler.compute_metrics()._table_results - assert res.get(Metrics.ROW_COUNT.name) == 3 + # We expect the full count of the table + assert res.get(Metrics.ROW_COUNT.name) == 4 @pytest.mark.skip(reason="Flaky test due to small sample size") def test_random_sample_histogram(self): @@ -242,31 +280,56 @@ def test_random_sample_histogram(self): # The sum of all frequencies should be sampled assert sum(res.get(User.age.name)[Metrics.HISTOGRAM.name]["frequencies"]) < 30 - def test_sample_data(self): + @mock.patch( + "metadata.sampler.sampler_interface.get_ssl_connection", + return_value=FakeConnection(), + ) + def test_sample_data(self, *_): """ We should be able to pick up sample data from the sampler """ - sampler = DatalakeSampler( - client=FakeConnection().client, - table=[self.df1, self.df2], - ) - sample_data = sampler.fetch_sample_data() + with ( + patch.object( + DatalakeSampler, "table", new_callable=lambda: [self.df1, self.df2] + ), + patch.object(DatalakeSampler, "get_client") as mock_client, + ): + mock_client.return_value = Mock() + sampler = DatalakeSampler( + service_connection_config=DatalakeConnection(configSource={}), + ometa_client=None, + entity=self.table_entity, + default_sample_config=SampleConfig(profile_sample=50.0), + ) + sample_data = sampler.fetch_sample_data() - assert len(sample_data.columns) == 10 - # we drop na values when fecthing sample data - assert len(sample_data.rows) == 4 + assert len(sample_data.columns) == 10 + # we drop na values when fecthing sample data + assert len(sample_data.rows) == 4 - def test_sample_from_user_query(self): + @mock.patch( + 
"metadata.sampler.sampler_interface.get_ssl_connection", + return_value=FakeConnection(), + ) + def test_sample_from_user_query(self, *_): """ Test sample data are returned based on user query """ - stmt = "`age` > 30" - sampler = DatalakeSampler( - client=FakeConnection().client, - table=[self.df1, self.df2], - profile_sample_query=stmt, - ) - sample_data = sampler.fetch_sample_data() + with ( + patch.object( + DatalakeSampler, "table", new_callable=lambda: [self.df1, self.df2] + ), + patch.object(DatalakeSampler, "get_client") as mock_client, + ): + mock_client.return_value = Mock() + sampler = DatalakeSampler( + service_connection_config=DatalakeConnection(configSource={}), + ometa_client=None, + entity=self.table_entity, + default_sample_config=SampleConfig(profile_sample=50.0), + sample_query="`age` > 30", + ) + sample_data = sampler.fetch_sample_data() - assert len(sample_data.columns) == 10 - assert len(sample_data.rows) == 3 + assert len(sample_data.columns) == 10 + assert len(sample_data.rows) == 3 diff --git a/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py b/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py index 0b93e3a9e15c..b62c49ffd151 100644 --- a/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py +++ b/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py @@ -17,7 +17,7 @@ from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import ( SnowflakeConnection, ) -from metadata.profiler.api.models import ProfileSampleConfig +from metadata.profiler.api.models import SampleConfig from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) @@ -64,7 +64,7 @@ def test_omit_sampling_method_type(self): sampler = SnowflakeSampler( client=self.session, table=User, - profile_sample_config=ProfileSampleConfig( + profile_sample_config=SampleConfig( profile_sample_type=ProfileSampleType.PERCENTAGE, profile_sample=50.0 ), ) @@ -82,7 +82,7 @@ def test_specify_sampling_method_type(self): sampler = SnowflakeSampler( client=self.session, table=User, - profile_sample_config=ProfileSampleConfig( + profile_sample_config=SampleConfig( profile_sample_type=ProfileSampleType.PERCENTAGE, profile_sample=50.0, sampling_method_type=sampling_method_type, diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py b/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py index 81aaf56c3544..a9c30d4ecc8b 100644 --- a/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py +++ b/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py @@ -107,12 +107,9 @@ class ProfilerTest(TestCase): ), ], ) - with patch.object( - SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User - ): - sqa_profiler_interface = SQAProfilerInterface( - sqlite_conn, None, table_entity, None, None, None, None, None, 5, 43200 - ) + sqa_profiler_interface = SQAProfilerInterface( + sqlite_conn, None, table_entity, None, None, None, None, None, 5, 43200 + ) @classmethod def setUpClass(cls) -> None: diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py b/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py index 6dd4c1854e94..2a1d6fd2ba5b 100644 --- a/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py +++ b/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py @@ -21,7 +21,7 @@ from sqlalchemy.orm import declarative_base from metadata.ingestion.connections.session import create_and_bind_session -from 
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py b/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py
index 0b93e3a9e15c..b62c49ffd151 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py
@@ -17,7 +17,7 @@
 from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import (
     SnowflakeConnection,
 )
-from metadata.profiler.api.models import ProfileSampleConfig
+from metadata.profiler.api.models import SampleConfig
 from metadata.profiler.interface.sqlalchemy.profiler_interface import (
     SQAProfilerInterface,
 )
@@ -64,7 +64,7 @@ def test_omit_sampling_method_type(self):
         sampler = SnowflakeSampler(
             client=self.session,
             table=User,
-            profile_sample_config=ProfileSampleConfig(
+            profile_sample_config=SampleConfig(
                 profile_sample_type=ProfileSampleType.PERCENTAGE, profile_sample=50.0
             ),
         )
@@ -82,7 +82,7 @@ def test_specify_sampling_method_type(self):
         sampler = SnowflakeSampler(
             client=self.session,
             table=User,
-            profile_sample_config=ProfileSampleConfig(
+            profile_sample_config=SampleConfig(
                 profile_sample_type=ProfileSampleType.PERCENTAGE,
                 profile_sample=50.0,
                 sampling_method_type=sampling_method_type,
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py b/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py
index 81aaf56c3544..a9c30d4ecc8b 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py
@@ -107,12 +107,9 @@ class ProfilerTest(TestCase):
             ),
         ],
     )
-    with patch.object(
-        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-    ):
-        sqa_profiler_interface = SQAProfilerInterface(
-            sqlite_conn, None, table_entity, None, None, None, None, None, 5, 43200
-        )
+    sqa_profiler_interface = SQAProfilerInterface(
+        sqlite_conn, None, table_entity, None, None, None, None, None, 5, 43200
+    )
 
     @classmethod
     def setUpClass(cls) -> None:
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py b/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py
index 6dd4c1854e94..2a1d6fd2ba5b 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py
@@ -21,7 +21,7 @@
 from sqlalchemy.orm import declarative_base
 
 from metadata.ingestion.connections.session import create_and_bind_session
-from metadata.profiler.api.models import ProfileSampleConfig
+from metadata.profiler.api.models import SampleConfig
 from metadata.profiler.processor.runner import QueryRunner
 from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler
 from metadata.utils.timeout import cls_timeout
@@ -63,7 +63,7 @@ class RunnerTest(TestCase):
     sampler = SQASampler(
         client=session,
         table=User,
-        profile_sample_config=ProfileSampleConfig(profile_sample=50.0),
+        profile_sample_config=SampleConfig(profile_sample=50.0),
     )
     sample = sampler.random_sample()
 
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_sample.py b/ingestion/tests/unit/profiler/sqlalchemy/test_sample.py
index 51d7aa06dc4f..e883779ff873 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/test_sample.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/test_sample.py
@@ -26,7 +26,7 @@
     SQLiteConnection,
     SQLiteScheme,
 )
-from metadata.profiler.api.models import ProfileSampleConfig
+from metadata.profiler.api.models import SampleConfig
 from metadata.profiler.interface.sqlalchemy.profiler_interface import (
     SQAProfilerInterface,
 )
@@ -144,7 +144,7 @@ def test_random_sampler(self):
         sampler = SQASampler(
             client=self.session,
             table=User,
-            profile_sample_config=ProfileSampleConfig(profile_sample=50.0),
+            profile_sample_config=SampleConfig(profile_sample=50.0),
         )
         random_sample = sampler.random_sample()
         res = self.session.query(func.count()).select_from(random_sample).first()
@@ -164,7 +164,7 @@ def test_sample_property(self):
             None,
             self.table_entity,
             None,
-            ProfileSampleConfig(profile_sample=50.0),
+            SampleConfig(profile_sample=50.0),
             None,
             None,
             None,
@@ -209,7 +209,7 @@ def test_random_sample_count(self):
             None,
             self.table_entity,
             None,
-            ProfileSampleConfig(profile_sample=50),
+            SampleConfig(profile_sample=50),
             None,
             None,
             None,
@@ -246,7 +246,7 @@ def test_random_sample_histogram(self):
             None,
             self.table_entity,
             None,
-            ProfileSampleConfig(profile_sample=50),
+            SampleConfig(profile_sample=50),
             None,
             None,
             None,
diff --git a/ingestion/tests/unit/profiler/test_profiler_interface.py b/ingestion/tests/unit/profiler/test_profiler_interface.py
index 4392914f021a..48ab47c9d6fa 100644
--- a/ingestion/tests/unit/profiler/test_profiler_interface.py
+++ b/ingestion/tests/unit/profiler/test_profiler_interface.py
@@ -35,12 +35,16 @@
 )
 from metadata.generated.schema.security.credentials.awsCredentials import AWSCredentials
 from metadata.generated.schema.type.entityReference import EntityReference
-from metadata.profiler.api.models import (
-    DatabaseAndSchemaConfig,
-    ProfileSampleConfig,
-    TableConfig,
+from metadata.profiler.api.models import DatabaseAndSchemaConfig, TableConfig
+from metadata.profiler.config import (
+    get_database_profiler_config,
+    get_schema_profiler_config,
 )
-from metadata.profiler.interface.profiler_interface import ProfilerInterface
+from metadata.sampler.config import (
+    get_profile_sample_config,
+    get_sample_data_count_config,
+)
+from metadata.sampler.models import SampleConfig
 
 
 class ProfilerInterfaceTest(TestCase):
@@ -130,49 +134,35 @@ def setUpClass(cls) -> None:
         )
 
     def test_get_schema_profiler_config(self):
-        self.assertIsNone(
-            ProfilerInterface.get_schema_profiler_config(schema_entity=None)
-        )
+        self.assertIsNone(get_schema_profiler_config(schema_entity=None))
         schema_entity_copy = deepcopy(self.schema_entity)
         schema_entity_copy.databaseSchemaProfilerConfig = None
-        self.assertIsNone(
-            ProfilerInterface.get_schema_profiler_config(
-                schema_entity=schema_entity_copy
-            )
-        )
+        self.assertIsNone(get_schema_profiler_config(schema_entity=schema_entity_copy))
         self.assertEqual(
-            ProfilerInterface.get_schema_profiler_config(
-                schema_entity=self.schema_entity
-            ),
+            get_schema_profiler_config(schema_entity=self.schema_entity),
             self.schema_profiler_config,
         )
 
     def test_get_database_profiler_config(self):
-        self.assertIsNone(
-            ProfilerInterface.get_database_profiler_config(database_entity=None)
-        )
+        self.assertIsNone(get_database_profiler_config(database_entity=None))
         database_entity_copy = deepcopy(self.database_entity)
         database_entity_copy.databaseProfilerConfig = None
         self.assertIsNone(
-            ProfilerInterface.get_database_profiler_config(
-                database_entity=database_entity_copy
-            )
+            get_database_profiler_config(database_entity=database_entity_copy)
         )
         self.assertEqual(
-            ProfilerInterface.get_database_profiler_config(
-                database_entity=self.database_entity
-            ),
+            get_database_profiler_config(database_entity=self.database_entity),
             self.database_profiler_config,
         )
 
     def test_get_profile_sample_configs(self):
         source_config = DatabaseServiceProfilerPipeline()
-        expected = ProfileSampleConfig(
+        expected = SampleConfig(
             profile_sample=11,
             profile_sample_type=ProfileSampleType.PERCENTAGE,
         )
-        actual = ProfilerInterface.get_profile_sample_config(
+        actual = get_profile_sample_config(
             entity=self.table,
             schema_profiler_config=self.schema_profiler_config,
             database_profiler_config=self.database_profiler_config,
@@ -186,11 +176,11 @@
             profileSampleType=ProfileSampleType.PERCENTAGE,
             fullyQualifiedName="demo",
         )
-        expected = ProfileSampleConfig(
+        expected = SampleConfig(
             profile_sample=11,
             profile_sample_type=ProfileSampleType.PERCENTAGE,
         )
-        actual = ProfilerInterface.get_profile_sample_config(
+        actual = get_profile_sample_config(
             entity=self.table,
             schema_profiler_config=self.schema_profiler_config,
             database_profiler_config=self.database_profiler_config,
@@ -200,13 +190,13 @@
         self.assertEqual(expected, actual)
 
         profiler = None
-        expected = ProfileSampleConfig(
+        expected = SampleConfig(
             profile_sample=22,
             profile_sample_type=ProfileSampleType.PERCENTAGE,
         )
         table_copy = deepcopy(self.table)
         table_copy.tableProfilerConfig = None
-        actual = ProfilerInterface.get_profile_sample_config(
+        actual = get_profile_sample_config(
             entity=table_copy,
             schema_profiler_config=None,
             database_profiler_config=self.database_profiler_config,
@@ -224,51 +214,51 @@ def test_get_sample_data_count_config(self):
         )
         source_config = DatabaseServiceProfilerPipeline()
 
-        actual = ProfilerInterface.get_sample_data_count_config(
+        actual = get_sample_data_count_config(
             entity=self.table,
-            schema_profiler_config=self.schema_profiler_config,
-            database_profiler_config=self.database_profiler_config,
+            schema_entity=self.schema_entity,
+            database_entity=self.database_entity,
             entity_config=entity_config,
-            source_config=source_config,
+            default_sample_data_count=50,
         )
 
         self.assertEqual(20, actual)
 
-        actual = ProfilerInterface.get_sample_data_count_config(
+        actual = get_sample_data_count_config(
             entity=self.table,
-            schema_profiler_config=self.schema_profiler_config,
-            database_profiler_config=self.database_profiler_config,
+            schema_entity=self.schema_entity,
+            database_entity=self.database_entity,
             entity_config=None,
-            source_config=source_config,
+            default_sample_data_count=50,
         )
 
         self.assertEqual(101, actual)
 
         table_copy = deepcopy(self.table)
         table_copy.tableProfilerConfig = None
 
-        actual = ProfilerInterface.get_sample_data_count_config(
+        actual = get_sample_data_count_config(
             entity=table_copy,
-            schema_profiler_config=self.schema_profiler_config,
-            database_profiler_config=self.database_profiler_config,
+            schema_entity=self.schema_entity,
+            database_entity=self.database_entity,
             entity_config=None,
-            source_config=source_config,
+            default_sample_data_count=50,
        )
 
         self.assertEqual(102, actual)
 
-        actual = ProfilerInterface.get_sample_data_count_config(
+        actual = get_sample_data_count_config(
             entity=table_copy,
-            schema_profiler_config=None,
-            database_profiler_config=self.database_profiler_config,
+            schema_entity=None,
+            database_entity=self.database_entity,
             entity_config=None,
-            source_config=source_config,
+            default_sample_data_count=50,
         )
 
         self.assertEqual(202, actual)
 
-        actual = ProfilerInterface.get_sample_data_count_config(
+        actual = get_sample_data_count_config(
             entity=table_copy,
-            schema_profiler_config=None,
-            database_profiler_config=None,
+            schema_entity=None,
+            database_entity=None,
             entity_config=None,
-            source_config=source_config,
+            default_sample_data_count=50,
         )
 
         self.assertEqual(50, actual)
diff --git a/ingestion/tests/unit/test_partition.py b/ingestion/tests/unit/test_partition.py
index 86922501a767..566c658524d7 100644
--- a/ingestion/tests/unit/test_partition.py
+++ b/ingestion/tests/unit/test_partition.py
@@ -27,7 +27,7 @@
 from metadata.generated.schema.entity.services.databaseService import (
     DatabaseServiceType,
 )
-from metadata.utils.partition import get_partition_details
+from metadata.sampler.partition import get_partition_details
 
 
 class MockTable(BaseModel):

From 76db1b1ad9acc7476a3416cfdf9d77c597bb68a1 Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull
Date: Fri, 15 Nov 2024 11:03:26 +0100
Subject: [PATCH 13/29] fix tests

---
 .../tests/unit/profiler/pandas/test_sample.py |   8 +-
 .../snowflake/test_sampling_method.py         |  39 +++--
 .../unit/profiler/sqlalchemy/test_profiler.py |  36 +++--
 .../unit/profiler/sqlalchemy/test_runner.py   |  42 +++--
 .../unit/profiler/sqlalchemy/test_sample.py   | 153 ++++++++----------
 .../sqlalchemy/test_sqa_profiler_interface.py |  39 +++--
 6 files changed, 159 insertions(+), 158 deletions(-)

diff --git a/ingestion/tests/unit/profiler/pandas/test_sample.py b/ingestion/tests/unit/profiler/pandas/test_sample.py
index 84be75703317..24f72a5235f3 100644
--- a/ingestion/tests/unit/profiler/pandas/test_sample.py
+++ b/ingestion/tests/unit/profiler/pandas/test_sample.py
@@ -159,7 +159,7 @@ def setUpClass(cls, mock_get_connection, mock_sample_get_connection) -> None:
             service_connection_config=DatalakeConnection(configSource={}),
             ometa_client=None,
             entity=cls.table_entity,
-            default_sample_config=SampleConfig(profile_sample=50.0),
+            sample_config=SampleConfig(profile_sample=50.0),
         )
         cls.datalake_profiler_interface = PandasProfilerInterface(
             service_connection_config=DatalakeConnection(configSource={}),
@@ -190,7 +190,7 @@ def test_random_sampler(self, _):
             service_connection_config=DatalakeConnection(configSource={}),
             ometa_client=None,
             entity=self.table_entity,
-            default_sample_config=SampleConfig(profile_sample=50.0),
+            sample_config=SampleConfig(profile_sample=50.0),
         )
         random_sample = sampler.random_sample()
         res = sum(len(r) for r in random_sample)
@@ -223,7 +223,7 @@ def test_sample_property(self, *_):
             service_connection_config=DatalakeConnection(configSource={}),
             ometa_client=None,
             entity=self.table_entity,
-            default_sample_config=SampleConfig(profile_sample=50.0),
+            sample_config=SampleConfig(profile_sample=50.0),
         )
             datalake_profiler_interface = PandasProfilerInterface(
                 service_connection_config=DatalakeConnection(configSource={}),
@@ -299,7 +299,7 @@ def test_sample_data(self, *_):
             service_connection_config=DatalakeConnection(configSource={}),
             ometa_client=None,
             entity=self.table_entity,
-            default_sample_config=SampleConfig(profile_sample=50.0),
+            sample_config=SampleConfig(profile_sample=50.0),
         )
             sample_data = sampler.fetch_sample_data()
 
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py b/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py
index b62c49ffd151..3a97ecddbe63 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/snowflake/test_sampling_method.py
@@ -1,5 +1,4 @@
 from unittest import TestCase
-from unittest.mock import patch
 from uuid import uuid4
 
 from sqlalchemy import Column, Integer
@@ -17,13 +16,12 @@
 from metadata.generated.schema.entity.services.connections.database.snowflakeConnection import (
     SnowflakeConnection,
 )
-from metadata.profiler.api.models import SampleConfig
 from metadata.profiler.interface.sqlalchemy.profiler_interface import (
     SQAProfilerInterface,
 )
-from metadata.profiler.processor.sampler.sqlalchemy.snowflake.sampler import (
-    SnowflakeSampler,
-)
+from metadata.sampler.models import SampleConfig
+from metadata.sampler.sqlalchemy.sampler import SQASampler
+from metadata.sampler.sqlalchemy.snowflake.sampler import SnowflakeSampler
 
 Base = declarative_base()
 
@@ -49,12 +47,15 @@ class SampleTest(TestCase):
         username="myuser", account="myaccount", warehouse="mywarehouse"
     )
 
-    with patch.object(
-        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-    ):
-        sqa_profiler_interface = SQAProfilerInterface(
-            snowflake_conn, None, table_entity, None, None, None, None, None, 5, 43200
-        )
+    sampler = SQASampler(
+        service_connection_config=snowflake_conn,
+        ometa_client=None,
+        entity=None,
+        orm_table=User,
+    )
+    sqa_profiler_interface = SQAProfilerInterface(
+        snowflake_conn, None, table_entity, None, sampler, 5, 43200, orm_table=User
+    )
     session = sqa_profiler_interface.session
 
     def test_omit_sampling_method_type(self):
@@ -62,11 +63,13 @@ def test_omit_sampling_method_type(self):
         use BERNOULLI if sampling method type is not specified.
         """
         sampler = SnowflakeSampler(
-            client=self.session,
-            table=User,
-            profile_sample_config=SampleConfig(
+            service_connection_config=self.snowflake_conn,
+            ometa_client=None,
+            entity=self.table_entity,
+            sample_config=SampleConfig(
                 profile_sample_type=ProfileSampleType.PERCENTAGE, profile_sample=50.0
             ),
+            orm_table=User,
         )
         query: CTE = sampler.get_sample_query()
         assert "FROM users SAMPLE BERNOULLI" in str(query)
@@ -80,13 +83,15 @@ def test_specify_sampling_method_type(self):
             SamplingMethodType.BERNOULLI,
         ]:
             sampler = SnowflakeSampler(
-                client=self.session,
-                table=User,
-                profile_sample_config=SampleConfig(
+                service_connection_config=self.snowflake_conn,
+                ometa_client=None,
+                entity=self.table_entity,
+                sample_config=SampleConfig(
                     profile_sample_type=ProfileSampleType.PERCENTAGE,
                     profile_sample=50.0,
                     sampling_method_type=sampling_method_type,
                 ),
+                orm_table=User,
             )
             query: CTE = sampler.get_sample_query()
             assert f"FROM users SAMPLE {sampling_method_type.value}" in str(query)
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py b/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py
index a9c30d4ecc8b..72a0273aa58e 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/test_profiler.py
@@ -16,7 +16,6 @@
 from concurrent.futures import TimeoutError
 from datetime import datetime
 from unittest import TestCase
-from unittest.mock import patch
 from uuid import uuid4
 
 import pytest
@@ -52,6 +51,7 @@
 from metadata.profiler.metrics.registry import Metrics
 from metadata.profiler.processor.core import MissingMetricException, Profiler
 from metadata.profiler.processor.default import DefaultProfiler
+from metadata.sampler.sqlalchemy.sampler import SQASampler
 
 Base = declarative_base()
 
@@ -107,8 +107,16 @@ class ProfilerTest(TestCase):
             ),
         ],
     )
+
+    sampler = SQASampler(
+        service_connection_config=sqlite_conn,
+        ometa_client=None,
+        entity=table_entity,
+        orm_table=User,
+    )
+
     sqa_profiler_interface = SQAProfilerInterface(
-        sqlite_conn, None, table_entity, None, None, None, None, None, 5, 43200
+        sqlite_conn, None, table_entity, None, sampler, 5, 43200, orm_table=User
     )
 
     @classmethod
@@ -277,20 +285,16 @@ def test__prepare_table_metrics(self):
     def test_profiler_with_timeout(self):
         """check timeout is properly used"""
 
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-        ):
-            sqa_profiler_interface = SQAProfilerInterface(
-                self.sqlite_conn,
-                None,
-                self.table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
-                timeout_seconds=0,
-            )
+        sqa_profiler_interface = SQAProfilerInterface(
+            self.sqlite_conn,
+            None,
+            self.table_entity,
+            None,
+            self.sampler,
+            5,
+            0,
+            orm_table=User,
+        )
 
         simple = DefaultProfiler(
             profiler_interface=sqa_profiler_interface,
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py b/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py
index 2a1d6fd2ba5b..9be685b8fea5 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py
@@ -13,7 +13,8 @@
 Test Sample behavior
 """
 import time
-from unittest import TestCase
+from unittest import TestCase, mock
+from unittest.mock import Mock, patch
 
 import pytest
 from sqlalchemy import TEXT, Column, Integer, String, create_engine, func
@@ -21,9 +22,9 @@
 from sqlalchemy.orm import declarative_base
 
 from metadata.ingestion.connections.session import create_and_bind_session
-from metadata.profiler.api.models import SampleConfig
 from metadata.profiler.processor.runner import QueryRunner
-from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler
+from metadata.sampler.models import SampleConfig
+from metadata.sampler.sqlalchemy.sampler import SQASampler
 from metadata.utils.timeout import cls_timeout
 
 Base = declarative_base()
@@ -60,16 +61,6 @@ class RunnerTest(TestCase):
     engine = create_engine("sqlite+pysqlite:///:memory:", echo=False, future=True)
     session = create_and_bind_session(engine)
 
-    sampler = SQASampler(
-        client=session,
-        table=User,
-        profile_sample_config=SampleConfig(profile_sample=50.0),
-    )
-    sample = sampler.random_sample()
-
-    raw_runner = QueryRunner(session=session, table=User, sample=sample)
-    timeout_runner: Timer = cls_timeout(1)(Timer())
-
     @classmethod
     def setUpClass(cls) -> None:
         """
@@ -77,6 +68,25 @@ def setUpClass(cls) -> None:
         """
         User.__table__.create(bind=cls.engine)
 
+        with (
+            patch.object(SQASampler, "get_client", return_value=cls.session),
+            mock.patch(
+                "metadata.sampler.sampler_interface.get_ssl_connection",
+                return_value=Mock(),
+            ),
+        ):
+            sampler = SQASampler(
+                service_connection_config=Mock(),
+                ometa_client=None,
+                entity=None,
+                sample_config=SampleConfig(profile_sample=50.0),
+                orm_table=User,
+            )
+            cls.sample = sampler.random_sample()
+
+        cls.raw_runner = QueryRunner(session=cls.session, table=User, sample=cls.sample)
+        cls.timeout_runner: Timer = cls_timeout(1)(Timer())
+
         # Insert 30 rows
         for i in range(10):
             data = [
@@ -161,7 +171,7 @@ def test_select_from_statement(self):
         Test querying using `from_statement` returns expected values
         """
         stmt = "SELECT name FROM users"
-        self.raw_runner._profile_sample_query = stmt
+        self.raw_runner.profile_sample_query = stmt
 
         res = self.raw_runner.select_all_from_table(Column(User.name.name))
         assert len(res) == 30
@@ -170,9 +180,9 @@ def test_select_from_statement(self):
         assert len(res) == 1
 
         stmt = "SELECT id FROM users"
-        self.raw_runner._profile_sample_query = stmt
+        self.raw_runner.profile_sample_query = stmt
 
         with pytest.raises(OperationalError):
             self.raw_runner.select_first_from_table(Column(User.name.name))
 
-        self.raw_runner._profile_sample_query = None
+        self.raw_runner.profile_sample_query = None
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_sample.py b/ingestion/tests/unit/profiler/sqlalchemy/test_sample.py
index e883779ff873..6ce8dffa8133 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/test_sample.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/test_sample.py
@@ -14,7 +14,6 @@
 """
 import os
 from unittest import TestCase
-from unittest.mock import patch
 from uuid import uuid4
 
 from sqlalchemy import TEXT, Column, Integer, String, func
@@ -26,14 +25,14 @@
     SQLiteConnection,
     SQLiteScheme,
 )
-from metadata.profiler.api.models import SampleConfig
 from metadata.profiler.interface.sqlalchemy.profiler_interface import (
     SQAProfilerInterface,
 )
 from metadata.profiler.metrics.registry import Metrics
 from metadata.profiler.orm.registry import CustomTypes
 from metadata.profiler.processor.core import Profiler
-from metadata.profiler.processor.sampler.sqlalchemy.sampler import SQASampler
+from metadata.sampler.models import SampleConfig
+from metadata.sampler.sqlalchemy.sampler import SQASampler
 
 Base = declarative_base()
 
@@ -92,20 +91,50 @@ class SampleTest(TestCase):
         ],
     )
 
-    with patch.object(
-        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-    ):
-        sqa_profiler_interface = SQAProfilerInterface(
-            sqlite_conn, None, table_entity, None, None, None, None, None, 5, 43200
-        )
-        engine = sqa_profiler_interface.session.get_bind()
-        session = sqa_profiler_interface.session
-
     @classmethod
     def setUpClass(cls) -> None:
         """
         Prepare Ingredients
         """
+
+        cls.sampler = SQASampler(
+            service_connection_config=cls.sqlite_conn,
+            ometa_client=None,
+            entity=None,
+            sample_config=SampleConfig(profile_sample=50.0),
+            orm_table=User,
+        )
+        cls.sample = cls.sampler.random_sample()
+        cls.sqa_profiler_interface = SQAProfilerInterface(
+            cls.sqlite_conn,
+            None,
+            cls.table_entity,
+            None,
+            cls.sampler,
+            5,
+            43200,
+            orm_table=User,
+        )
+        cls.engine = cls.sqa_profiler_interface.session.get_bind()
+        cls.session = cls.sqa_profiler_interface.session
+
+        cls.full_sampler = SQASampler(
+            service_connection_config=cls.sqlite_conn,
+            ometa_client=None,
+            entity=None,
+            orm_table=User,
+        )
+        cls.full_sqa_profiler_interface = SQAProfilerInterface(
+            cls.sqlite_conn,
+            None,
+            cls.table_entity,
+            None,
+            cls.full_sampler,
+            5,
+            43200,
+            orm_table=User,
+        )
+
         User.__table__.create(bind=cls.engine)
 
         # Insert 30 rows
@@ -141,12 +170,7 @@ def test_random_sampler(self):
         The random sampler should be able to generate a random
         subset of data
         """
-        sampler = SQASampler(
-            client=self.session,
-            table=User,
-            profile_sample_config=SampleConfig(profile_sample=50.0),
-        )
-        random_sample = sampler.random_sample()
+        random_sample = self.sampler.random_sample()
         res = self.session.query(func.count()).select_from(random_sample).first()
         assert res[0] < 30
 
@@ -155,24 +179,7 @@ def test_sample_property(self):
         Sample property should be properly generated
         """
 
-        # Randomly pick table_count to init the Profiler, we don't care for this test
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-        ):
-            sqa_profiler_interface = SQAProfilerInterface(
-                self.sqlite_conn,
-                None,
-                self.table_entity,
-                None,
-                SampleConfig(profile_sample=50.0),
-                None,
-                None,
-                None,
-            )
-
-        sample = sqa_profiler_interface._create_thread_safe_sampler(
-            self.session, User
-        ).random_sample()
+        sample = self.sqa_profiler_interface.sampler.random_sample()
         res = self.session.query(func.count()).select_from(sample).first()
         assert res[0] < 30
 
@@ -199,22 +206,8 @@ def test_random_sample_count(self):
         """
         count = Metrics.COUNT.value
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-        ):
-            profiler = Profiler(
-                count,
-                profiler_interface=SQAProfilerInterface(
-                    self.sqlite_conn,
-                    None,
-                    self.table_entity,
-                    None,
-                    SampleConfig(profile_sample=50),
-                    None,
-                    None,
-                    None,
-                ),
-            )
+
+        profiler = Profiler(count, profiler_interface=self.sqa_profiler_interface)
         res = profiler.compute_metrics()._column_results
         assert res.get(User.name.name)[Metrics.COUNT.name] < 30
 
@@ -230,28 +223,16 @@ def test_random_sample_histogram(self):
         third_quartile = Metrics.THIRD_QUARTILE.value
         iqr = Metrics.IQR.value
 
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-        ):
-            profiler = Profiler(
-                hist,
-                count,
-                min,
-                max,
-                first_quartile,
-                third_quartile,
-                iqr,
-                profiler_interface=SQAProfilerInterface(
-                    self.sqlite_conn,
-                    None,
-                    self.table_entity,
-                    None,
-                    SampleConfig(profile_sample=50),
-                    None,
-                    None,
-                    None,
-                ),
-            )
+        profiler = Profiler(
+            hist,
+            count,
+            min,
+            max,
+            first_quartile,
+            third_quartile,
+            iqr,
+            profiler_interface=self.sqa_profiler_interface,
+        )
 
         res = profiler.compute_metrics()._column_results
 
         # The sum of all frequencies should be sampled
@@ -265,7 +246,7 @@ def test_random_sample_histogram(self):
             first_quartile,
             third_quartile,
             iqr,
-            profiler_interface=self.sqa_profiler_interface,
+            profiler_interface=self.full_sqa_profiler_interface,
         )
 
         res = profiler.compute_metrics()._column_results
@@ -302,11 +283,7 @@ def test_sample_data(self):
         """
         We should be able to pick up sample data from the sampler
         """
-        sampler = SQASampler(
-            client=self.session,
-            table=User,
-        )
-        sample_data = sampler.fetch_sample_data()
+        sample_data = self.full_sampler.fetch_sample_data()
 
         assert len(sample_data.columns) == 6
         assert len(sample_data.rows) == 30
@@ -348,9 +325,12 @@ class UserBinary(Base):
         self.session.commit()
 
         sampler = SQASampler(
-            client=self.session,
-            table=UserBinary,
+            service_connection_config=self.sqlite_conn,
+            ometa_client=None,
+            entity=None,
+            orm_table=UserBinary,
         )
+
         sample_data = sampler.fetch_sample_data()
 
         assert len(sample_data.columns) == 7
@@ -377,9 +357,12 @@ def test_sample_from_user_query(self):
         """
         stmt = "SELECT id, name FROM users"
         sampler = SQASampler(
-            client=self.session,
-            table=User,
-            profile_sample_query=stmt,
+            service_connection_config=self.sqlite_conn,
+            ometa_client=None,
+            entity=None,
+            sample_config=SampleConfig(profile_sample=50.0),
+            orm_table=User,
+            sample_query=stmt,
         )
         sample_data = sampler.fetch_sample_data()
 
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_sqa_profiler_interface.py b/ingestion/tests/unit/profiler/sqlalchemy/test_sqa_profiler_interface.py
index 7b78a6175a60..3bd75d0c0bbf 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/test_sqa_profiler_interface.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/test_sqa_profiler_interface.py
@@ -16,7 +16,6 @@
 import os
 from datetime import datetime
 from unittest import TestCase
-from unittest.mock import patch
 from uuid import uuid4
 
 from sqlalchemy import TEXT, Column, Integer, String, inspect
@@ -51,6 +50,7 @@
 )
 from metadata.profiler.metrics.static.row_count import RowCount
 from metadata.profiler.processor.default import get_default_metrics
+from metadata.sampler.sqlalchemy.sampler import SQASampler
 
 
 class User(declarative_base()):
@@ -78,19 +78,15 @@ def setUp(self) -> None:
         sqlite_conn = SQLiteConnection(
             scheme=SQLiteScheme.sqlite_pysqlite,
         )
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-        ):
-            self.sqa_profiler_interface = SQAProfilerInterface(
-                sqlite_conn,
-                None,
-                table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
-            )
+        sampler = SQASampler(
+            service_connection_config=sqlite_conn,
+            ometa_client=None,
+            entity=None,
+            orm_table=User,
+        )
+        self.sqa_profiler_interface = SQAProfilerInterface(
+            sqlite_conn, None, table_entity, None, sampler, 5, 43200, orm_table=User
+        )
         self.table = User
 
     def test_init_interface(self):
@@ -118,12 +114,15 @@ class SQAInterfaceTestMultiThread(TestCase):
         scheme=SQLiteScheme.sqlite_pysqlite,
         databaseMode=db_path + "?check_same_thread=False",
     )
-    with patch.object(
-        SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-    ):
-        sqa_profiler_interface = SQAProfilerInterface(
-            sqlite_conn, None, table_entity, None, None, None, None, None, 5, 43200
-        )
+    sampler = SQASampler(
+        service_connection_config=sqlite_conn,
+        ometa_client=None,
+        entity=None,
+        orm_table=User,
+    )
+    sqa_profiler_interface = SQAProfilerInterface(
+        sqlite_conn, None, table_entity, None, sampler, 5, 43200, orm_table=User
+    )
 
     @classmethod
     def setUpClass(cls) -> None:
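A note on the constructor calls that repeat throughout this commit: the tests pass SQAProfilerInterface its dependencies positionally. Matching them against the keyword usage in the sampler construction suggests the order below — an inference from these call sites, not a signature taken from the source:

    # Hedged sketch: the argument-role comments are inferred from the tests
    # in this commit, where the sampler is built first and then injected.
    sampler = SQASampler(
        service_connection_config=sqlite_conn,
        ometa_client=None,
        entity=table_entity,
        orm_table=User,
    )
    interface = SQAProfilerInterface(
        sqlite_conn,   # service connection config
        None,          # ometa client
        table_entity,  # table entity
        None,          # source config
        sampler,       # the injected sampler
        5,             # thread count
        43200,         # timeout in seconds
        orm_table=User,
    )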
tests
---
 .../profiler/pandas/test_custom_metrics.py    | 169 +++++++++++-------
 .../profiler/pandas/test_datalake_metrics.py  |  46 +++--
 .../unit/profiler/pandas/test_profiler.py     |  45 +++--
 .../pandas/test_profiler_interface.py         |  45 +++--
 .../unit/profiler/sqlalchemy/test_metrics.py  | 165 +++++++++--------
 .../unit/profiler/test_profiler_interface.py  |  31 ++--
 .../tests/unit/profiler/test_workflow.py      |  53 ++++--
 ingestion/tests/unit/test_suite/conftest.py   |  23 ++-
 8 files changed, 364 insertions(+), 213 deletions(-)

diff --git a/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py b/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py
index cac724d17e1a..372416666246 100644
--- a/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py
+++ b/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py
@@ -12,10 +12,9 @@
 """
 Test Metrics behavior
 """
-# import datetime
 import os
-from unittest import TestCase
-from unittest.mock import patch
+from unittest import TestCase, mock
+from unittest.mock import Mock, patch
 from uuid import uuid4
 
 import pandas as pd
@@ -35,11 +34,22 @@
     PandasProfilerInterface,
 )
 from metadata.profiler.processor.core import Profiler
+from metadata.sampler.pandas.sampler import DatalakeSampler
 
 BUCKET_NAME = "MyBucket"
 REGION = "us-west-1"
 
 
+class FakeClient:
+    def __init__(self):
+        self._client = None
+
+
+class FakeConnection:
+    def __init__(self):
+        self.client = FakeClient()
+
+
 class MetricsTest(TestCase):
     """
     Run checks on different metrics
@@ -98,24 +108,43 @@ class MetricsTest(TestCase):
         ],
     )
 
-    def setUp(self):
-        with patch(
-            "metadata.mixins.pandas.pandas_mixin.fetch_dataframe",
-            return_value=self.dfs,
+    @mock.patch(
+        "metadata.profiler.interface.profiler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
+    )
+    @mock.patch(
+        "metadata.sampler.sampler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
+    )
+    def setUp(self, *_):
+        with (
+            patch.object(DatalakeSampler, "table", new_callable=lambda: self.dfs),
+            patch.object(DatalakeSampler, "get_client") as mock_client,
         ):
-            self.sqa_profiler_interface = PandasProfilerInterface(
-                self.datalake_conn,
-                None,
-                self.table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
+            mock_client.return_value = Mock()
+            self.sampler = DatalakeSampler(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=self.table_entity,
+            )
+            self.datalake_profiler_interface = PandasProfilerInterface(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=self.table_entity,
+                source_config=None,
+                sampler=self.sampler,
                 thread_count=1,
             )
 
-    def test_table_custom_metric(self):
+    @mock.patch(
+        "metadata.profiler.interface.profiler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
+    )
+    @mock.patch(
+        "metadata.sampler.sampler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
+    )
+    def test_table_custom_metric(self, *_):
         table_entity = Table(
             id=uuid4(),
             name="user",
@@ -163,34 +192,45 @@ def test_table_custom_metric(self):
             ),
         ],
     )
-        with patch(
-            "metadata.mixins.pandas.pandas_mixin.fetch_dataframe",
-            return_value=self.dfs,
+
+        with (
+            patch.object(DatalakeSampler, "table", new_callable=lambda: self.dfs),
+            patch.object(DatalakeSampler, "get_client") as mock_client,
         ):
-            self.sqa_profiler_interface = PandasProfilerInterface(
-                self.datalake_conn,
-                None,
-                table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
+            mock_client.return_value = Mock()
+            sampler = DatalakeSampler(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=table_entity,
+            )
+            datalake_profiler_interface = PandasProfilerInterface(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=table_entity,
+                source_config=None,
+                sampler=sampler,
                 thread_count=1,
             )
+            profiler = Profiler(
+                profiler_interface=datalake_profiler_interface,
+            )
+            metrics = profiler.compute_metrics()
+            for k, v in metrics._table_results.items():
+                for metric in v:
+                    if metric.name == "LastNameFilter":
+                        assert metric.value == 1
+                    if metric.name == "notUS":
+                        assert metric.value == 2
 
-        profiler = Profiler(
-            profiler_interface=self.sqa_profiler_interface,
-        )
-        metrics = profiler.compute_metrics()
-        for k, v in metrics._table_results.items():
-            for metric in v:
-                if metric.name == "LastNameFilter":
-                    assert metric.value == 1
-                if metric.name == "notUS":
-                    assert metric.value == 2
-
-    def test_column_custom_metric(self):
+    @mock.patch(
+        "metadata.profiler.interface.profiler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
+    )
+    @mock.patch(
+        "metadata.sampler.sampler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
+    )
+    def test_column_custom_metric(self, *_):
         table_entity = Table(
             id=uuid4(),
             name="user",
@@ -216,29 +256,32 @@ def test_column_custom_metric(self):
             )
         ],
     )
-        with patch(
-            "metadata.mixins.pandas.pandas_mixin.fetch_dataframe",
-            return_value=self.dfs,
+        with (
+            patch.object(DatalakeSampler, "table", new_callable=lambda: self.dfs),
+            patch.object(DatalakeSampler, "get_client") as mock_client,
         ):
-            self.sqa_profiler_interface = PandasProfilerInterface(
-                self.datalake_conn,
-                None,
-                table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
+            mock_client.return_value = Mock()
+            sampler = DatalakeSampler(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=table_entity,
+            )
+            datalake_profiler_interface = PandasProfilerInterface(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=table_entity,
+                source_config=None,
+                sampler=sampler,
                 thread_count=1,
             )
 
-        profiler = Profiler(
-            profiler_interface=self.sqa_profiler_interface,
-        )
-        metrics = profiler.compute_metrics()
-        for k, v in metrics._column_results.items():
-            for metric in v.get("customMetrics", []):
-                if metric.name == "CustomerBornedAfter1991":
-                    assert metric.value == 1
-                if metric.name == "AverageAge":
-                    assert metric.value == 2
+            profiler = Profiler(
+                profiler_interface=datalake_profiler_interface,
+            )
+            metrics = profiler.compute_metrics()
+            for k, v in metrics._column_results.items():
+                for metric in v.get("customMetrics", []):
+                    if metric.name == "CustomerBornAfter1991":
+                        assert metric.value == 1
+                    if metric.name == "AverageAge":
+                        assert metric.value == 2
diff --git a/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py b/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py
index 6a8bf445ef76..db8bdae22fb7 100644
--- a/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py
+++ b/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py
@@ -14,6 +14,7 @@
 """
 import os
 from unittest import TestCase, mock
+from unittest.mock import Mock, patch
 from uuid import uuid4
 
 from sqlalchemy import TEXT, Column, Date, DateTime, Integer, String, Time
@@ -31,6 +32,7 @@
 from metadata.profiler.metrics.core import add_props
 from metadata.profiler.metrics.registry import Metrics
 from metadata.profiler.processor.core import Profiler
+from metadata.sampler.pandas.sampler import DatalakeSampler
 
 Base = declarative_base()
 
@@ -91,15 +93,21 @@ class DatalakeMetricsTest(TestCase):
         "metadata.profiler.interface.profiler_interface.get_ssl_connection",
         return_value=FakeConnection(),
     )
+    @mock.patch(
+        "metadata.sampler.sampler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
+    )
     @mock.patch(
         "metadata.mixins.pandas.pandas_mixin.fetch_dataframe",
         return_value=[df1, pd.concat([df2, pd.DataFrame(index=df1.index)])],
     )
-    def setUpClass(cls, mock_get_connection, mocked_dfs):
+    def setUpClass(cls, mock_get_connection, mock_sample_get_connection, mocked_dfs):
         """
         Setup the test class. We won't mock S3 with moto as we want to test
         that metrics are computed correctly on a list of dataframes.
         """
+        import pandas as pd
+
         table_entity = Table(
             id=uuid4(),
             name="user",
@@ -151,17 +159,31 @@ def setUpClass(cls, mock_get_connection, mocked_dfs):
             ],
         )
 
-        cls.datalake_profiler_interface = PandasProfilerInterface(
-            entity=table_entity,
-            service_connection_config=DatalakeConnection(configSource={}),
-            storage_config=None,
-            ometa_client=None,
-            thread_count=None,
-            profile_sample_config=None,
-            source_config=None,
-            sample_query=None,
-            table_partition_config=None,
-        )
+        with (
+            patch.object(
+                DatalakeSampler,
+                "table",
+                new_callable=lambda: [
+                    cls.df1,
+                    pd.concat([cls.df2, pd.DataFrame(index=cls.df1.index)]),
+                ],
+            ),
+            patch.object(DatalakeSampler, "get_client") as mock_client,
+        ):
+            mock_client.return_value = Mock()
+            sampler = DatalakeSampler(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=table_entity,
+            )
+            cls.datalake_profiler_interface = PandasProfilerInterface(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=table_entity,
+                source_config=None,
+                sampler=sampler,
+                thread_count=None,
+            )
 
     def test_count(self):
         """
diff --git a/ingestion/tests/unit/profiler/pandas/test_profiler.py b/ingestion/tests/unit/profiler/pandas/test_profiler.py
index 1fbb61bdc8f6..75db15a30741 100644
--- a/ingestion/tests/unit/profiler/pandas/test_profiler.py
+++ b/ingestion/tests/unit/profiler/pandas/test_profiler.py
@@ -15,6 +15,7 @@
 import os
 from datetime import datetime
 from unittest import TestCase, mock
+from unittest.mock import Mock, patch
 from uuid import uuid4
 
 import pytest
@@ -49,6 +50,7 @@
 from metadata.profiler.metrics.registry import Metrics
 from metadata.profiler.processor.core import MissingMetricException, Profiler
 from metadata.profiler.processor.default import DefaultProfiler
+from metadata.sampler.pandas.sampler import DatalakeSampler
 
 Base = declarative_base()
 
@@ -155,21 +157,36 @@ class ProfilerTest(TestCase):
         return_value=FakeConnection(),
     )
     @mock.patch(
-        "metadata.mixins.pandas.pandas_mixin.fetch_dataframe",
-        return_value=[df1, pd.concat([df2, pd.DataFrame(index=df1.index)])],
+        "metadata.sampler.sampler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
     )
-    def setUpClass(cls, mock_get_connection, mocked_dfs):
-        cls.datalake_profiler_interface = PandasProfilerInterface(
-            entity=cls.table_entity,
-            service_connection_config=DatalakeConnection(configSource={}),
-            storage_config=None,
-            ometa_client=None,
-            thread_count=None,
-            profile_sample_config=None,
-            source_config=None,
-            sample_query=None,
-            table_partition_config=None,
-        )
+    def setUp(cls, mock_get_connection, *_) -> None:
+        import pandas as pd
+
+        with (
+            patch.object(
+                DatalakeSampler,
+                "table",
+                new_callable=lambda: [
+                    cls.df1,
+                    pd.concat([cls.df2, pd.DataFrame(index=cls.df1.index)]),
+                ],
+            ),
+            patch.object(DatalakeSampler, "get_client") as mock_client,
+        ):
+            mock_client.return_value = Mock()
+            cls.sampler = DatalakeSampler(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=cls.table_entity,
+            )
+            cls.datalake_profiler_interface = PandasProfilerInterface(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=cls.table_entity,
+                source_config=None,
+                sampler=cls.sampler,
+            )
 
     def test_default_profiler(self):
         """
diff --git a/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py b/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py
index 9dc53f933a63..0672f1b83e40 100644
--- a/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py
+++ b/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py
@@ -16,6 +16,7 @@
 import os
 from datetime import datetime
 from unittest import TestCase, mock
+from unittest.mock import Mock, patch
 from uuid import uuid4
 
 from sqlalchemy import TEXT, Column, Integer, String, inspect
@@ -49,6 +50,7 @@
 )
 from metadata.profiler.metrics.static.row_count import RowCount
 from metadata.profiler.processor.default import get_default_metrics
+from metadata.sampler.pandas.sampler import DatalakeSampler
 
 
 class User(declarative_base()):
@@ -150,21 +152,36 @@ class PandasInterfaceTest(TestCase):
         return_value=FakeConnection(),
     )
     @mock.patch(
-        "metadata.mixins.pandas.pandas_mixin.fetch_dataframe",
-        return_value=[df1, pd.concat([df2, pd.DataFrame(index=df1.index)])],
+        "metadata.sampler.sampler_interface.get_ssl_connection",
+        return_value=FakeConnection(),
     )
-    def setUp(cls, mock_get_connection, mocked_dfs) -> None:
-        cls.datalake_profiler_interface = PandasProfilerInterface(
-            entity=cls.table_entity,
-            service_connection_config=DatalakeConnection(configSource={}),
-            storage_config=None,
-            ometa_client=None,
-            thread_count=None,
-            profile_sample_config=None,
-            source_config=None,
-            sample_query=None,
-            table_partition_config=None,
-        )
+    def setUp(cls, mock_get_connection, *_) -> None:
+        import pandas as pd
+
+        with (
+            patch.object(
+                DatalakeSampler,
+                "table",
+                new_callable=lambda: [
+                    cls.df1,
+                    pd.concat([cls.df2, pd.DataFrame(index=cls.df1.index)]),
+                ],
+            ),
+            patch.object(DatalakeSampler, "get_client") as mock_client,
+        ):
+            mock_client.return_value = Mock()
+            cls.sampler = DatalakeSampler(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=cls.table_entity,
+            )
+            cls.datalake_profiler_interface = PandasProfilerInterface(
+                service_connection_config=DatalakeConnection(configSource={}),
+                ometa_client=None,
+                entity=cls.table_entity,
+                source_config=None,
+                sampler=cls.sampler,
+            )
 
     @classmethod
     def setUpClass(cls) -> None:
diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py b/ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py
index 9c31d5ffca56..df16d89a30f2 100644
--- a/ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py
+++ b/ingestion/tests/unit/profiler/sqlalchemy/test_metrics.py
@@ -16,7 +16,6 @@
 import math
 import os
 from unittest import TestCase
-from unittest.mock import patch
 from uuid import uuid4
 
 from sqlalchemy import TEXT, Column, Date, DateTime, Float, Integer, String, Time
@@ -37,6 +36,7 @@
 from metadata.profiler.metrics.system.system import SystemMetricsComputer
 from metadata.profiler.orm.functions.sum import SumFn
 from metadata.profiler.processor.core import Profiler
+from metadata.sampler.sqlalchemy.sampler import SQASampler
 
 Base = declarative_base()
 
@@ -84,20 +84,23 @@ def setUpClass(cls) -> None:
         Prepare Ingredients
         """
 
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-        ):
-            cls.sqa_profiler_interface = SQAProfilerInterface(
-                cls.sqlite_conn,
-                None,
-                cls.table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
-                thread_count=1,
-            )
+        sampler = SQASampler(
+            service_connection_config=cls.sqlite_conn,
+            ometa_client=None,
+            entity=None,
+            orm_table=User,
+        )
+        cls.sqa_profiler_interface = SQAProfilerInterface(
+            cls.sqlite_conn,
+            None,
+            cls.table_entity,
+            None,
+            sampler,
+            1,
+            43200,
+            orm_table=User,
+        )
+
         cls.engine = cls.sqa_profiler_interface.session.get_bind()
 
         User.__table__.create(bind=cls.engine)
@@ -258,21 +261,22 @@ class NonNumericNumbers(Base):
         float_col = Column(Float())  # date of employment
 
         NonNumericNumbers.__table__.create(bind=self.engine)
-        with patch.object(
-            SQAProfilerInterface,
-            "_convert_table_to_orm_object",
-            return_value=NonNumericNumbers,
-        ):
-            sqa_profiler_interface = SQAProfilerInterface(
-                self.sqlite_conn,
-                None,
-                self.table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
-            )
+        sampler = SQASampler(
+            service_connection_config=self.sqlite_conn,
+            ometa_client=None,
+            entity=self.table_entity,
+            orm_table=NonNumericNumbers,
+        )
+        sqa_profiler_interface = SQAProfilerInterface(
+            self.sqlite_conn,
+            None,
+            self.table_entity,
+            None,
+            sampler,
+            1,
+            43200,
+            orm_table=NonNumericNumbers,
+        )
 
         data = [
             NonNumericNumbers(float_col=math.nan),
@@ -792,19 +796,22 @@ class EmptyUser(Base):
 
         EmptyUser.__table__.create(bind=self.engine)
 
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=EmptyUser
-        ):
-            sqa_profiler_interface = SQAProfilerInterface(
-                self.sqlite_conn,
-                None,
-                self.table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
-            )
+        sampler = SQASampler(
+            service_connection_config=self.sqlite_conn,
+            ometa_client=None,
+            entity=self.table_entity,
+            orm_table=EmptyUser,
+        )
+        sqa_profiler_interface = SQAProfilerInterface(
+            self.sqlite_conn,
+            None,
+            self.table_entity,
+            None,
+            sampler,
+            1,
+            43200,
+            orm_table=EmptyUser,
+        )
 
         hist = Metrics.HISTOGRAM.value
         res = (
@@ -943,7 +950,7 @@ def test_table_custom_metric(self):
             ],
             customMetrics=[
                 CustomMetric(
-                    name="CustomerBornedAfter1991",
+                    name="CustomerBornAfter1991",
                     expression="SELECT COUNT(id) FROM users WHERE dob > '1991-01-01'",
                 ),
                 CustomMetric(
@@ -952,28 +959,30 @@ def test_table_custom_metric(self):
                 ),
             ],
         )
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-        ):
-            self.sqa_profiler_interface = SQAProfilerInterface(
-                self.sqlite_conn,
-                None,
-                table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
-                thread_count=1,
-            )
+        sampler = SQASampler(
+            service_connection_config=self.sqlite_conn,
+            ometa_client=None,
+            entity=None,
+            orm_table=User,
+        )
+        sqa_profiler_interface = SQAProfilerInterface(
+            self.sqlite_conn,
+            None,
+            table_entity,
+            None,
+            sampler,
+            1,
+            43200,
+            orm_table=User,
+        )
 
         profiler = Profiler(
-            profiler_interface=self.sqa_profiler_interface,
+            profiler_interface=sqa_profiler_interface,
         )
         metrics = profiler.compute_metrics()
         for k, v in metrics._table_results.items():
             for metric in v:
-                if metric.name == "CustomerBornedAfter1991":
+                if metric.name == "CustomerBornAfter1991":
                     assert metric.value == 2
                 if metric.name == "AverageAge":
                     assert metric.value == 20.0
@@ -988,7 +997,7 @@ def test_column_custom_metric(self):
             dataType=DataType.INT,
             customMetrics=[
                 CustomMetric(
-                    name="CustomerBornedAfter1991",
+                    name="CustomerBornAfter1991",
                     columnName="id",
                     expression="SELECT SUM(id) FROM users WHERE dob > '1991-01-01'",
                 ),
@@ -1001,28 +1010,30 @@ def test_column_custom_metric(self):
             )
         ],
     )
-        with patch.object(
-            SQAProfilerInterface, "_convert_table_to_orm_object", return_value=User
-        ):
-            self.sqa_profiler_interface = SQAProfilerInterface(
-                self.sqlite_conn,
-                None,
-                table_entity,
-                None,
-                None,
-                None,
-                None,
-                None,
-                thread_count=1,
-            )
+        sampler = SQASampler(
+            service_connection_config=self.sqlite_conn,
+            ometa_client=None,
+            entity=None,
+            orm_table=User,
+        )
+        sqa_profiler_interface = SQAProfilerInterface(
+            self.sqlite_conn,
+            None,
+            table_entity,
+            None,
+            sampler,
+            1,
+            43200,
+            orm_table=User,
+        )
 
         profiler = Profiler(
-            profiler_interface=self.sqa_profiler_interface,
+            profiler_interface=sqa_profiler_interface,
         )
         metrics = profiler.compute_metrics()
         for k, v in metrics._column_results.items():
             for metric in v.get("customMetrics", []):
-                if metric.name == "CustomerBornedAfter1991":
+                if metric.name == "CustomerBornAfter1991":
                     assert metric.value == 3.0
                 if metric.name == "AverageAge":
                     assert metric.value == 20.0
diff --git a/ingestion/tests/unit/profiler/test_profiler_interface.py b/ingestion/tests/unit/profiler/test_profiler_interface.py
index 48ab47c9d6fa..a0336bcd31ee 100644
--- a/ingestion/tests/unit/profiler/test_profiler_interface.py
+++ b/ingestion/tests/unit/profiler/test_profiler_interface.py
@@ -164,10 +164,14 @@ def test_get_profile_sample_configs(self):
         )
         actual = get_profile_sample_config(
             entity=self.table,
-            schema_profiler_config=self.schema_profiler_config,
-            database_profiler_config=self.database_profiler_config,
+            schema_entity=self.schema_entity,
+            database_entity=self.database_entity,
             entity_config=None,
-            source_config=source_config,
+            default_sample_config=SampleConfig(
+                profile_sample=source_config.profileSample,
+                profile_sample_type=source_config.profileSampleType,
+                sampling_method_type=source_config.samplingMethodType,
+            ),
         )
 
         self.assertEqual(expected, actual)
@@ -182,10 +186,14 @@ def test_get_profile_sample_configs(self):
         )
         actual = get_profile_sample_config(
             entity=self.table,
-            schema_profiler_config=self.schema_profiler_config,
-            database_profiler_config=self.database_profiler_config,
+            schema_entity=self.schema_entity,
+            database_entity=self.database_entity,
             entity_config=profiler,
-            source_config=source_config,
+            default_sample_config=SampleConfig(
+                profile_sample=source_config.profileSample,
+                profile_sample_type=source_config.profileSampleType,
+                sampling_method_type=source_config.samplingMethodType,
+            ),
         )
 
         self.assertEqual(expected, actual)
@@ -198,10 +206,14 @@ def test_get_profile_sample_configs(self):
         table_copy.tableProfilerConfig = None
         actual = get_profile_sample_config(
             entity=table_copy,
-            schema_profiler_config=None,
-            database_profiler_config=self.database_profiler_config,
+            schema_entity=None,
+            database_entity=self.database_entity,
             entity_config=profiler,
-            source_config=source_config,
+            default_sample_config=SampleConfig(
+                profile_sample=source_config.profileSample,
+                profile_sample_type=source_config.profileSampleType,
+                sampling_method_type=source_config.samplingMethodType,
+            ),
         )
         self.assertEqual(expected, actual)
 
@@ -212,7 +224,6 @@ def test_get_sample_data_count_config(self):
             sampleDataCount=20,
             fullyQualifiedName="demo",
         )
-        source_config = DatabaseServiceProfilerPipeline()
 
         actual = get_sample_data_count_config(
             entity=self.table,
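The expected values asserted earlier in this test (20, then 101, 102, 202 and finally 50) encode the precedence that get_sample_data_count_config is supposed to apply: an explicit entity config wins over the table's own profiler config, which wins over the schema config, the database config, and finally the default passed in by the workflow. A minimal sketch of that resolution order (the real helper in metadata/sampler/config.py walks nested models; this only shows the shape of the logic):

    # Hedged sketch of the fallback chain the assertions pin down.
    def resolve_sample_data_count(entity_config, table_value, schema_value,
                                  database_value, default_value):
        for value in (entity_config, table_value, schema_value, database_value):
            if value is not None:
                return value
        return default_value

    assert resolve_sample_data_count(20, 101, 102, 202, 50) == 20
    assert resolve_sample_data_count(None, None, None, None, 50) == 50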
diff --git a/ingestion/tests/unit/profiler/test_workflow.py b/ingestion/tests/unit/profiler/test_workflow.py index e1c5c605ec81..004da2d49c70 100644 --- a/ingestion/tests/unit/profiler/test_workflow.py +++ b/ingestion/tests/unit/profiler/test_workflow.py @@ -20,6 +20,7 @@ from pytest import raises from sqlalchemy.orm import declarative_base +from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.data.table import ( Column, DataType, @@ -44,6 +45,7 @@ from metadata.profiler.interface.sqlalchemy.profiler_interface import ( SQAProfilerInterface, ) +from metadata.profiler.orm.converter import base from metadata.profiler.processor.default import DefaultProfiler from metadata.profiler.source.database.base.profiler_source import ProfilerSource from metadata.profiler.source.fetcher.fetcher_strategy import DatabaseFetcherStrategy @@ -62,6 +64,9 @@ Column(name="age", dataType=DataType.INT), ], database=EntityReference(id=uuid.uuid4(), name="db", type="database"), + databaseSchema=EntityReference( + id=uuid.uuid4(), name="schema", type="databaseSchema" + ), tableProfilerConfig=TableProfilerConfig( profileSample=80.0, ), # type: ignore @@ -110,15 +115,15 @@ class User(Base): @patch.object( SQAProfilerInterface, - "_convert_table_to_orm_object", - return_value=User, + "table", + new_callable=lambda: User, ) @patch.object( OpenMetadataSource, "_validate_service_name", return_value=True, ) -def test_init_workflow(mocked_method, mocked_orm): # pylint: disable=unused-argument +def test_init_workflow(mocked_method, *_): # pylint: disable=unused-argument """ We can initialise the workflow from a config """ @@ -294,17 +299,27 @@ def test_filter_entities(): assert len(fetcher._filter_entities(all_tables)) == 1 +@patch.object( + base, + "get_orm_database", + return_value="db", +) +@patch.object( + base, + "get_orm_schema", + return_value="schema", +) @patch.object( SQAProfilerInterface, - "_convert_table_to_orm_object", - return_value=User, + "table", + new_callable=lambda: User, ) @patch.object( OpenMetadataSource, "_validate_service_name", return_value=True, ) -def test_profile_def(mocked_method, mocked_orm): # pylint: disable=unused-argument +def test_profile_def(mocked_method, *_): # pylint: disable=unused-argument """ Validate the definitions of the profile in the JSON """ @@ -323,11 +338,13 @@ def test_profile_def(mocked_method, mocked_orm): # pylint: disable=unused-argum profiler_source = ProfilerSource( profile_workflow.config, - DatabaseService( + Database( id=uuid.uuid4(), name="myDataBaseService", - serviceType=DatabaseServiceType.SQLite, - ), # type: ignore + service=EntityReference( + id=uuid.uuid4(), name="my_service", type="databaseService" + ), + ), profile_workflow.metadata, None, ) @@ -342,19 +359,27 @@ def test_profile_def(mocked_method, mocked_orm): # pylint: disable=unused-argum assert config_metrics_label == profiler_obj_metrics +@patch.object( + base, + "get_orm_database", + return_value="db", +) +@patch.object( + base, + "get_orm_schema", + return_value="schema", +) @patch.object( SQAProfilerInterface, - "_convert_table_to_orm_object", - return_value=User, + "table", + new_callable=lambda: User, ) @patch.object( OpenMetadataSource, "_validate_service_name", return_value=True, ) -def test_default_profile_def( - mocked_method, mocked_orm # pylint: disable=unused-argument -): +def test_default_profile_def(mocked_method, *_): # pylint: disable=unused-argument """ If no information is specified for the profiler, let's use the 
SimpleTableProfiler and SimpleProfiler diff --git a/ingestion/tests/unit/test_suite/conftest.py b/ingestion/tests/unit/test_suite/conftest.py index 65ede54b386d..307ee2f61cb8 100644 --- a/ingestion/tests/unit/test_suite/conftest.py +++ b/ingestion/tests/unit/test_suite/conftest.py @@ -15,7 +15,6 @@ import os from datetime import datetime, timedelta -from unittest.mock import patch from uuid import uuid4 import pytest @@ -32,6 +31,7 @@ ) from metadata.generated.schema.tests.testCase import TestCase, TestCaseParameterValue from metadata.generated.schema.type.entityReference import EntityReference +from metadata.sampler.sqlalchemy.sampler import SQASampler Base = declarative_base() @@ -89,14 +89,19 @@ def create_sqlite_table(): databaseMode=db_path + "?check_same_thread=False", ) # type: ignore - with patch.object( - SQATestSuiteInterface, "_convert_table_to_orm_object", return_value=User - ): - sqa_profiler_interface = SQATestSuiteInterface( - sqlite_conn, # type: ignore - table_entity=TABLE, - ometa_client=None, # type: ignore - ) + sampler = SQASampler( + service_connection_config=sqlite_conn, + ometa_client=None, + entity=TABLE, + orm_table=User, + ) + sqa_profiler_interface = SQATestSuiteInterface( + sqlite_conn, + None, + sampler, + TABLE, + orm_table=User, + ) runner = sqa_profiler_interface.runner engine = sqa_profiler_interface.session.get_bind() From 56f66436bbc214c6919dccb854c1fe10849d92b9 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Sat, 16 Nov 2024 10:53:35 +0100 Subject: [PATCH 15/29] linting and tests --- .../pandas/pandas_test_suite_interface.py | 2 +- .../interface/test_suite_interface.py | 2 - .../table_diff_params_setter.py | 1 - .../database/datalake/clients/azure_blob.py | 3 - .../source/database/datalake/clients/base.py | 11 +- .../source/database/datalake/clients/gcs.py | 6 +- .../source/database/datalake/clients/s3.py | 3 - .../profiler/interface/profiler_interface.py | 2 - .../source/database/base/profiler_source.py | 3 + .../profiler/source/fetcher/config.py | 2 + .../src/metadata/sampler/nosql/sampler.py | 16 ++- .../src/metadata/sampler/pandas/sampler.py | 4 +- ingestion/src/metadata/sampler/partition.py | 109 ++++++++++-------- .../src/metadata/sampler/sampler_interface.py | 4 +- .../metadata/sampler/sqlalchemy/sampler.py | 2 +- .../sampler/sqlalchemy/snowflake/sampler.py | 1 + .../src/metadata/utils/profiler_utils.py | 1 + .../unit/profiler/test_profiler_partitions.py | 18 +-- 18 files changed, 110 insertions(+), 80 deletions(-) diff --git a/ingestion/src/metadata/data_quality/interface/pandas/pandas_test_suite_interface.py b/ingestion/src/metadata/data_quality/interface/pandas/pandas_test_suite_interface.py index b67e8e663875..0d894a212a4f 100644 --- a/ingestion/src/metadata/data_quality/interface/pandas/pandas_test_suite_interface.py +++ b/ingestion/src/metadata/data_quality/interface/pandas/pandas_test_suite_interface.py @@ -44,7 +44,7 @@ def __init__( ometa_client: OpenMetadata, sampler: SamplerInterface, table_entity: Table, - **kwargs, # pylint: disable=unused-argument + **__, ): super().__init__( service_connection_config, diff --git a/ingestion/src/metadata/data_quality/interface/test_suite_interface.py b/ingestion/src/metadata/data_quality/interface/test_suite_interface.py index 82e58ffa2842..44e8930e22b7 100644 --- a/ingestion/src/metadata/data_quality/interface/test_suite_interface.py +++ b/ingestion/src/metadata/data_quality/interface/test_suite_interface.py @@ -48,8 +48,6 @@ def __init__( ometa_client: OpenMetadata, sampler: 
SamplerInterface, table_entity: Table, - *args, - **kwargs, ): """Required attribute for the interface""" self.ometa_client = ometa_client diff --git a/ingestion/src/metadata/data_quality/validations/runtime_param_setter/table_diff_params_setter.py b/ingestion/src/metadata/data_quality/validations/runtime_param_setter/table_diff_params_setter.py index 772a0d0e46db..b721e22910ba 100644 --- a/ingestion/src/metadata/data_quality/validations/runtime_param_setter/table_diff_params_setter.py +++ b/ingestion/src/metadata/data_quality/validations/runtime_param_setter/table_diff_params_setter.py @@ -116,7 +116,6 @@ def get_parameters(self, test_case) -> TableDiffRuntimeParameters: whereClause=self.build_where_clause(test_case), ) - # pylint: disable=protected-access def build_where_clause(self, test_case) -> Optional[str]: param_where_clause = self.get_parameter(test_case, "where", None) partition_where_clause = ( diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/clients/azure_blob.py b/ingestion/src/metadata/ingestion/source/database/datalake/clients/azure_blob.py index 40fdaa30d5df..93bed25a52aa 100644 --- a/ingestion/src/metadata/ingestion/source/database/datalake/clients/azure_blob.py +++ b/ingestion/src/metadata/ingestion/source/database/datalake/clients/azure_blob.py @@ -26,9 +26,6 @@ class DatalakeAzureBlobClient(DatalakeBaseClient): - def __init__(self, client: BlobServiceClient): - self._client = client - @classmethod def from_config(cls, config: AzureConfig) -> "DatalakeAzureBlobClient": try: diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/clients/base.py b/ingestion/src/metadata/ingestion/source/database/datalake/clients/base.py index bd9ccbf52a48..b37100ccb8e4 100644 --- a/ingestion/src/metadata/ingestion/source/database/datalake/clients/base.py +++ b/ingestion/src/metadata/ingestion/source/database/datalake/clients/base.py @@ -13,10 +13,19 @@ Datalake Base Client """ from abc import ABC, abstractmethod -from typing import Callable, Iterable, Optional +from typing import Any, Callable, Iterable, Optional class DatalakeBaseClient(ABC): + """Base DL client implementation""" + + def __init__(self, client: Any, **kwargs): + self._client = client + + @property + def client(self) -> Any: + return self._client + @classmethod @abstractmethod def from_config(cls, config) -> "DatalakeBaseClient": diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/clients/gcs.py b/ingestion/src/metadata/ingestion/source/database/datalake/clients/gcs.py index 8b29ae407bec..cf3b5f2d85fe 100644 --- a/ingestion/src/metadata/ingestion/source/database/datalake/clients/gcs.py +++ b/ingestion/src/metadata/ingestion/source/database/datalake/clients/gcs.py @@ -33,9 +33,11 @@ class DatalakeGcsClient(DatalakeBaseClient): def __init__( - self, client: storage.Client, temp_credentials_file_path_list: List[str] + self, + client: storage.Client, + temp_credentials_file_path_list: List[str], ): - self._client = client + super().__init__(client=client) self._temp_credentials_file_path_list = temp_credentials_file_path_list @property diff --git a/ingestion/src/metadata/ingestion/source/database/datalake/clients/s3.py b/ingestion/src/metadata/ingestion/source/database/datalake/clients/s3.py index 2dfdeab5607f..a7dcb85fabde 100644 --- a/ingestion/src/metadata/ingestion/source/database/datalake/clients/s3.py +++ b/ingestion/src/metadata/ingestion/source/database/datalake/clients/s3.py @@ -25,9 +25,6 @@ class DatalakeS3Client(DatalakeBaseClient): - def __init__(self, 
client): - self._client = client - @classmethod def from_config(cls, config: S3Config) -> "DatalakeS3Client": if not config.securityConfig: diff --git a/ingestion/src/metadata/profiler/interface/profiler_interface.py b/ingestion/src/metadata/profiler/interface/profiler_interface.py index fc58719a462d..96811c897191 100644 --- a/ingestion/src/metadata/profiler/interface/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/profiler_interface.py @@ -60,7 +60,6 @@ def failed_profiler(self, error: str, stack_trace: Optional[str] = None) -> None ) -# pylint: disable=too-many-instance-attributes class ProfilerInterface(Root, ABC): """Protocol interface for the profiler processor""" @@ -112,7 +111,6 @@ def __init__( # pylint: disable=too-many-arguments **kwargs, ) - # pylint: disable=too-many-locals @classmethod def create( cls, diff --git a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py index cce8716418be..2f59655cd695 100644 --- a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py @@ -87,6 +87,9 @@ def __init__( self._interface_type: str = config.source.type.lower() self.sqa_metadata = self._set_sqa_metadata() self._interface = None + # We define this in create_profiler_interface to help us reuse + # this method for the sampler, which does not have a DatabaseServiceProfilerPipeline + self.source_config = None self.global_profiler_configuration = global_profiler_configuration @property diff --git a/ingestion/src/metadata/profiler/source/fetcher/config.py b/ingestion/src/metadata/profiler/source/fetcher/config.py index b45a4933c2ae..ef44e9eca1e2 100644 --- a/ingestion/src/metadata/profiler/source/fetcher/config.py +++ b/ingestion/src/metadata/profiler/source/fetcher/config.py @@ -17,6 +17,8 @@ from metadata.generated.schema.type.filterPattern import FilterPattern +# We take the names from the JSON Schema +# pylint: disable=invalid-name @runtime_checkable class EntityFilterConfigInterface(Protocol): """Interface for the OM workflow source configs that allow filtering""" diff --git a/ingestion/src/metadata/sampler/nosql/sampler.py b/ingestion/src/metadata/sampler/nosql/sampler.py index 6eeb75deca31..6c18d8f884d4 100644 --- a/ingestion/src/metadata/sampler/nosql/sampler.py +++ b/ingestion/src/metadata/sampler/nosql/sampler.py @@ -1,3 +1,14 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
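The Datalake client refactor above centralizes `_client` storage in `DatalakeBaseClient`, so subclasses such as the S3 client no longer need their own `__init__` and callers reach the raw client through the `client` property instead of the private attribute. A minimal sketch of the pattern, where `DummyS3Client` and its config dict are illustrative stand-ins rather than the real boto3-backed implementation:

```python
# Sketch of the refactored client hierarchy; DummyS3Client is hypothetical.
from abc import ABC, abstractmethod
from typing import Any


class DatalakeBaseClient(ABC):
    """Base DL client: stores the raw client once, for every subclass."""

    def __init__(self, client: Any, **kwargs):
        self._client = client

    @property
    def client(self) -> Any:
        return self._client

    @classmethod
    @abstractmethod
    def from_config(cls, config) -> "DatalakeBaseClient":
        ...


class DummyS3Client(DatalakeBaseClient):
    @classmethod
    def from_config(cls, config) -> "DummyS3Client":
        # a real implementation would build a boto3 client from the config
        return cls(client=config)


wrapped = DummyS3Client.from_config({"region": "us-west-1"})
assert wrapped.client == {"region": "us-west-1"}  # access via the property
```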
+"""NoSQL Sampler""" from typing import Dict, List, Optional, Tuple from metadata.generated.schema.entity.data.table import ProfileSampleType, TableData @@ -9,6 +20,7 @@ class NoSQLSampler(SamplerInterface): + """NoSQL generic implementation for the sampler""" client: NoSQLAdaptor @@ -46,8 +58,8 @@ def _fetch_sample_data_from_user_query(self) -> TableData: rows=[list(map(str, row)) for row in rows], columns=[c.name for c in cols] ) - def random_sample(self): - pass + def random_sample(self, **__): + """No randomization for NoSQL""" def fetch_sample_data(self, columns: List[SQALikeColumn]) -> TableData: if self.sample_query: diff --git a/ingestion/src/metadata/sampler/pandas/sampler.py b/ingestion/src/metadata/sampler/pandas/sampler.py index 3124ce9889ee..b14633e6eea5 100644 --- a/ingestion/src/metadata/sampler/pandas/sampler.py +++ b/ingestion/src/metadata/sampler/pandas/sampler.py @@ -50,7 +50,7 @@ def table(self): if not self._table: self._table = self.return_ometa_dataframes_sampled( service_connection_config=self.service_connection_config, - client=self.client._client, + client=self.client.client, table=self.entity, profile_sample_config=self.sample_config.profile_sample, ) @@ -162,7 +162,7 @@ def get_col_row(self, data_frame, columns: Optional[List[SQALikeColumn]] = None) break return cols, rows - def random_sample(self, is_sampled: bool = False): + def random_sample(self, is_sampled: bool = False, **__): """Generate random sample from the table Returns: diff --git a/ingestion/src/metadata/sampler/partition.py b/ingestion/src/metadata/sampler/partition.py index 703dd7bb6ff3..e62ad94a0496 100644 --- a/ingestion/src/metadata/sampler/partition.py +++ b/ingestion/src/metadata/sampler/partition.py @@ -106,56 +106,65 @@ def get_partition_details( return profiler_partitioning_config if service_type == DatabaseServiceType.BigQuery: - if table_partition: - column_partitions: Optional[ - List[PartitionColumnDetails] - ] = entity.tablePartition.columns - if not column_partitions: - raise TypeError("table partition missing. Skipping table") - - partiton = column_partitions[0] - - if partiton.intervalType == PartitionIntervalTypes.TIME_UNIT: - return PartitionProfilerConfig( - enablePartitioning=True, - partitionColumnName=partiton.columnName, - partitionIntervalUnit=PartitionIntervalUnit.DAY - if partiton.interval != "HOUR" - else partiton.interval, - partitionInterval=1, - partitionIntervalType=partiton.intervalType.value, - partitionValues=None, - partitionIntegerRangeStart=None, - partitionIntegerRangeEnd=None, - ) - if partiton.intervalType == PartitionIntervalTypes.INGESTION_TIME: - return PartitionProfilerConfig( - enablePartitioning=True, - partitionColumnName="_PARTITIONDATE" - if partiton.interval == "DAY" - else "_PARTITIONTIME", - partitionIntervalUnit=PartitionIntervalUnit.DAY - if partiton.interval != "HOUR" - else partiton.interval, - partitionInterval=1, - partitionIntervalType=partiton.intervalType.value, - partitionValues=None, - partitionIntegerRangeStart=None, - partitionIntegerRangeEnd=None, - ) - if partiton.intervalType == PartitionIntervalTypes.INTEGER_RANGE: - return PartitionProfilerConfig( - enablePartitioning=True, - partitionColumnName=partiton.columnName, - partitionIntervalUnit=None, - partitionInterval=None, - partitionIntervalType=partiton.intervalType.value, - partitionValues=None, - partitionIntegerRangeStart=1, - partitionIntegerRangeEnd=10000, - ) - raise TypeError( - f"Unsupported partition type {partiton.intervalType}. 
Skipping table" + return _handle_bigquery_partition(entity, table_partition) + + return None + + +def _handle_bigquery_partition( + entity: Table, table_partition: TablePartition +) -> Optional[PartitionProfilerConfig]: + """Bigquery specific logic for partitions""" + if table_partition: + column_partitions: Optional[ + List[PartitionColumnDetails] + ] = entity.tablePartition.columns + if not column_partitions: + raise TypeError("table partition missing. Skipping table") + + partition = column_partitions[0] + + if partition.intervalType == PartitionIntervalTypes.TIME_UNIT: + return PartitionProfilerConfig( + enablePartitioning=True, + partitionColumnName=partition.columnName, + partitionIntervalUnit=PartitionIntervalUnit.DAY + if partition.interval != "HOUR" + else partition.interval, + partitionInterval=1, + partitionIntervalType=partition.intervalType.value, + partitionValues=None, + partitionIntegerRangeStart=None, + partitionIntegerRangeEnd=None, + ) + if partition.intervalType == PartitionIntervalTypes.INGESTION_TIME: + return PartitionProfilerConfig( + enablePartitioning=True, + partitionColumnName="_PARTITIONDATE" + if partition.interval == "DAY" + else "_PARTITIONTIME", + partitionIntervalUnit=PartitionIntervalUnit.DAY + if partition.interval != "HOUR" + else partition.interval, + partitionInterval=1, + partitionIntervalType=partition.intervalType.value, + partitionValues=None, + partitionIntegerRangeStart=None, + partitionIntegerRangeEnd=None, ) + if partition.intervalType == PartitionIntervalTypes.INTEGER_RANGE: + return PartitionProfilerConfig( + enablePartitioning=True, + partitionColumnName=partition.columnName, + partitionIntervalUnit=None, + partitionInterval=None, + partitionIntervalType=partition.intervalType.value, + partitionValues=None, + partitionIntegerRangeStart=1, + partitionIntegerRangeEnd=10000, + ) + raise TypeError( + f"Unsupported partition type {partition.intervalType}. 
Skipping table" + ) return None diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py index 10dd8fe714d2..5b7334f0ab45 100644 --- a/ingestion/src/metadata/sampler/sampler_interface.py +++ b/ingestion/src/metadata/sampler/sampler_interface.py @@ -53,6 +53,7 @@ class SamplerInterface(ABC): """Sampler interface""" + # pylint: disable=too-many-instance-attributes, too-many-arguments def __init__( self, service_connection_config: Union[DatabaseConnection, DatalakeConnection], @@ -65,7 +66,7 @@ def __init__( sample_query: Optional[str] = None, storage_config: DataStorageConfig = None, sample_data_count: Optional[int] = SAMPLE_DATA_DEFAULT_COUNT, - **kwargs, + **__, ): self.ometa_client = ometa_client self._sample = None @@ -87,6 +88,7 @@ def __init__( self.connection = get_ssl_connection(self.service_connection_config) self.client = self.get_client() + # pylint: disable=too-many-arguments, too-many-locals @classmethod def create( cls, diff --git a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py index 89c811ee462f..be44cfd95f3d 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/sampler.py @@ -120,7 +120,7 @@ def get_sample_query(self, *, column=None) -> Query: .cte(f"{self.table.__tablename__}_rnd") ) - def random_sample(self, ccolumn=None) -> Union[DeclarativeMeta, AliasedClass]: + def random_sample(self, ccolumn=None, **__) -> Union[DeclarativeMeta, AliasedClass]: """ Either return a sampled CTE of table, or the full table if no sampling is required. diff --git a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py index 70dbbc9b416e..2ba02d097d20 100644 --- a/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py +++ b/ingestion/src/metadata/sampler/sqlalchemy/snowflake/sampler.py @@ -42,6 +42,7 @@ class SnowflakeSampler(SQASampler): run the query in the whole table. 
""" + # pylint: disable=too-many-arguments def __init__( self, service_connection_config: Union[DatabaseConnection, DatalakeConnection], diff --git a/ingestion/src/metadata/utils/profiler_utils.py b/ingestion/src/metadata/utils/profiler_utils.py index c9af1d7dd555..623c56b7fe53 100644 --- a/ingestion/src/metadata/utils/profiler_utils.py +++ b/ingestion/src/metadata/utils/profiler_utils.py @@ -110,6 +110,7 @@ def set_cache(cache: defaultdict, key: str, value): def get_context_entities( entity: Table, metadata: OpenMetadata ) -> Tuple[DatabaseSchema, Database, DatabaseService]: + """Based on the table, get all the parent entities""" schema_entity = None database_entity = None db_service = None diff --git a/ingestion/tests/unit/profiler/test_profiler_partitions.py b/ingestion/tests/unit/profiler/test_profiler_partitions.py index 087cc36ece11..59dfcf3ef582 100644 --- a/ingestion/tests/unit/profiler/test_profiler_partitions.py +++ b/ingestion/tests/unit/profiler/test_profiler_partitions.py @@ -30,7 +30,7 @@ DatabaseServiceType, ) from metadata.generated.schema.type.entityReference import EntityReference -from metadata.profiler.interface.profiler_interface import ProfilerInterface +from metadata.sampler.partition import get_partition_details from metadata.workflow.profiler import ProfilerWorkflow """ @@ -154,7 +154,7 @@ def test_partition_details_time_unit(self): ) table_entity = cast(Table, table_entity) - resp = ProfilerInterface.get_partition_details(table_entity) + resp = get_partition_details(table_entity) if resp: assert resp.partitionColumnName == "e" @@ -171,7 +171,7 @@ def test_partition_details_time_unit(self): ) # type: ignore ) - resp = ProfilerInterface.get_partition_details(table_entity) + resp = get_partition_details(table_entity) if resp: assert resp.partitionColumnName == "e" @@ -195,7 +195,7 @@ def test_partition_details_ingestion_time_date(self): ) table_entity = cast(Table, table_entity) - resp = ProfilerInterface.get_partition_details(table_entity) + resp = get_partition_details(table_entity) if resp: assert resp.partitionColumnName == "_PARTITIONDATE" @@ -212,7 +212,7 @@ def test_partition_details_ingestion_time_date(self): ) # type: ignore ) - resp = ProfilerInterface.get_partition_details(table_entity) + resp = get_partition_details(table_entity) if resp: assert resp.partitionInterval == 10 assert resp.partitionColumnName == "_PARTITIONDATE" @@ -235,7 +235,7 @@ def test_partition_details_ingestion_time_hour(self): ) table_entity = cast(Table, table_entity) - resp = ProfilerInterface.get_partition_details(table_entity) + resp = get_partition_details(table_entity) if resp: assert resp.partitionColumnName == "_PARTITIONTIME" @@ -252,7 +252,7 @@ def test_partition_details_ingestion_time_hour(self): ) # type: ignore ) - resp = ProfilerInterface.get_partition_details(table_entity) + resp = get_partition_details(table_entity) if resp: assert resp.partitionInterval == 1 @@ -284,7 +284,7 @@ def test_partition_non_bq_table_profiler_partition_config(self): ) table_entity = cast(Table, table_entity) - resp = ProfilerInterface.get_partition_details(table_entity) + resp = get_partition_details(table_entity) if resp: assert resp.enablePartitioning assert resp.partitionColumnName == "foo" @@ -309,6 +309,6 @@ def test_partition_non_bq_table_no_profiler_partition_config(self): ) table_entity = cast(Table, table_entity) - resp = ProfilerInterface.get_partition_details(table_entity) + resp = get_partition_details(table_entity) assert resp is None From 48b8b64b899fe2cbcfac83e385317aba96f9af52 
Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull 
Date: Sat, 16 Nov 2024 10:55:38 +0100
Subject: [PATCH 16/29] rename workflow

---
 ...tabaseServiceAutoClassificationPipeline.json} | 16 ++++++++--------
 .../json/schema/metadataIngestion/workflow.json  |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)
 rename openmetadata-spec/src/main/resources/json/schema/metadataIngestion/{databaseServiceSamplerPipeline.json => databaseServiceAutoClassificationPipeline.json} (89%)

diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json
similarity index 89%
rename from openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json
rename to openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json
index 11c5001bdfa0..bc803a2f0f56 100644
--- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceSamplerPipeline.json
+++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json
@@ -1,22 +1,22 @@
 {
-  "$id": "https://open-metadata.org/schema/metadataIngestion/databaseServiceSamplerPipeline.json",
+  "$id": "https://open-metadata.org/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json",
   "$schema": "http://json-schema.org/draft-07/schema#",
-  "title": "DatabaseServiceSamplerPipeline",
-  "description": "DatabaseService Sampler & Auto Classification Pipeline Configuration.",
+  "title": "DatabaseServiceAutoClassificationPipeline",
+  "description": "DatabaseService Auto Classification Pipeline Configuration.",
   "type": "object",
   "definitions": {
-    "samplerConfigType": {
+    "autoClassificationConfigType": {
       "description": "Profiler Source Config Pipeline type",
       "type": "string",
-      "enum": ["Sampler"],
-      "default": "Sampler"
+      "enum": ["AutoClassification"],
+      "default": "AutoClassification"
     }
   },
   "properties": {
     "type": {
       "description": "Pipeline type",
-      "$ref": "#/definitions/samplerConfigType",
-      "default": "Sampler"
+      "$ref": "#/definitions/autoClassificationConfigType",
+      "default": "AutoClassification"
     },
     "classificationFilterPattern": {
       "description": "Regex to only compute metrics for table that matches the given tag, tiers, glossary pattern.",
diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json
index f47d99e71126..9f9fe0d266c6 100644
--- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json
+++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/workflow.json
@@ -31,7 +31,7 @@
       "$ref": "databaseServiceProfilerPipeline.json"
     },
     {
-      "$ref": "databaseServiceSamplerPipeline.json"
+      "$ref": "databaseServiceAutoClassificationPipeline.json"
     },
     {
       "$ref": "pipelineServiceMetadataPipeline.json"

From d58e91684103859d0c5d5a0f3d8337a277e6a437 Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull 
Date: Sat, 16 Nov 2024 11:00:32 +0100
Subject: [PATCH 17/29] rename

---
 .../src/metadata/cli/{sample.py => classify.py}   |  6 +++---
 ingestion/src/metadata/cmd.py                     | 10 +++++-----
 ingestion/src/metadata/pii/processor.py           |  9 +++++----
 ingestion/src/metadata/sampler/processor.py       |  9 +++++----
 .../workflow/{sampler.py => classification.py}    | 13 +++++++------
 5 files changed, 25 
insertions(+), 22 deletions(-) rename ingestion/src/metadata/cli/{sample.py => classify.py} (89%) rename ingestion/src/metadata/workflow/{sampler.py => classification.py} (81%) diff --git a/ingestion/src/metadata/cli/sample.py b/ingestion/src/metadata/cli/classify.py similarity index 89% rename from ingestion/src/metadata/cli/sample.py rename to ingestion/src/metadata/cli/classify.py index 7236ff157c8f..8bdffd04fefe 100644 --- a/ingestion/src/metadata/cli/sample.py +++ b/ingestion/src/metadata/cli/classify.py @@ -21,13 +21,13 @@ PipelineType, ) from metadata.utils.logger import cli_logger, redacted_config -from metadata.workflow.sampler import SamplerWorkflow +from metadata.workflow.classification import AutoClassificationWorkflow from metadata.workflow.workflow_init_error_handler import WorkflowInitErrorHandler logger = cli_logger() -def run_sample(config_path: Path) -> None: +def run_classification(config_path: Path) -> None: """ Run the sampler workflow from a config path to a JSON or YAML file @@ -38,7 +38,7 @@ def run_sample(config_path: Path) -> None: try: config_dict = load_config_file(config_path) logger.debug("Using workflow config:\n%s", redacted_config(config_dict)) - workflow = SamplerWorkflow.create(config_dict) + workflow = AutoClassificationWorkflow.create(config_dict) except Exception as exc: logger.debug(traceback.format_exc()) WorkflowInitErrorHandler.print_init_error( diff --git a/ingestion/src/metadata/cmd.py b/ingestion/src/metadata/cmd.py index b01eb2bae4ff..7f91638d04b1 100644 --- a/ingestion/src/metadata/cmd.py +++ b/ingestion/src/metadata/cmd.py @@ -22,11 +22,11 @@ from metadata.__version__ import get_metadata_version from metadata.cli.app import run_app +from metadata.cli.classify import run_classification from metadata.cli.dataquality import run_test from metadata.cli.ingest import run_ingest from metadata.cli.lineage import run_lineage from metadata.cli.profile import run_profiler -from metadata.cli.sample import run_sample from metadata.cli.usage import run_usage from metadata.utils.logger import cli_logger, set_loggers_level @@ -41,7 +41,7 @@ class MetadataCommands(Enum): WEBHOOK = "webhook" LINEAGE = "lineage" APP = "app" - SAMPLE = "sample" + AUTO_CLASSIFICATION = "classification" RUN_PATH_METHODS = { @@ -51,7 +51,7 @@ class MetadataCommands(Enum): MetadataCommands.PROFILE.value: run_profiler, MetadataCommands.TEST.value: run_test, MetadataCommands.APP.value: run_app, - MetadataCommands.SAMPLE.value: run_sample, + MetadataCommands.AUTO_CLASSIFICATION.value: run_classification, } @@ -129,8 +129,8 @@ def get_parser(args: Optional[List[str]] = None): ) create_common_config_parser_args( sub_parser.add_parser( - MetadataCommands.SAMPLE.value, - help="Workflow for running sampling and auto classification", + MetadataCommands.AUTO_CLASSIFICATION.value, + help="Workflow for running auto classification", ) ) webhook_args( diff --git a/ingestion/src/metadata/pii/processor.py b/ingestion/src/metadata/pii/processor.py index cf2e90e36a30..7395371ab2da 100644 --- a/ingestion/src/metadata/pii/processor.py +++ b/ingestion/src/metadata/pii/processor.py @@ -19,8 +19,8 @@ from metadata.generated.schema.entity.services.ingestionPipelines.status import ( StackTraceError, ) -from metadata.generated.schema.metadataIngestion.databaseServiceSamplerPipeline import ( - DatabaseServiceSamplerPipeline, +from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import ( + DatabaseServiceAutoClassificationPipeline, ) from 
metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, @@ -61,8 +61,9 @@ def __init__( self.metadata = metadata # Init and type the source config - self.source_config: DatabaseServiceSamplerPipeline = cast( - DatabaseServiceSamplerPipeline, self.config.source.sourceConfig.config + self.source_config: DatabaseServiceAutoClassificationPipeline = cast( + DatabaseServiceAutoClassificationPipeline, + self.config.source.sourceConfig.config, ) # Used to satisfy type checked self._ner_scanner = None diff --git a/ingestion/src/metadata/sampler/processor.py b/ingestion/src/metadata/sampler/processor.py index ecd126674102..5159f2b79006 100644 --- a/ingestion/src/metadata/sampler/processor.py +++ b/ingestion/src/metadata/sampler/processor.py @@ -26,8 +26,8 @@ StackTraceError, ) from metadata.generated.schema.entity.services.serviceType import ServiceType -from metadata.generated.schema.metadataIngestion.databaseServiceSamplerPipeline import ( - DatabaseServiceSamplerPipeline, +from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import ( + DatabaseServiceAutoClassificationPipeline, ) from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, @@ -64,8 +64,9 @@ def __init__(self, config: OpenMetadataWorkflowConfig, metadata: OpenMetadata): self.config = config self.metadata = metadata - self.source_config: DatabaseServiceSamplerPipeline = cast( - DatabaseServiceSamplerPipeline, self.config.source.sourceConfig.config + self.source_config: DatabaseServiceAutoClassificationPipeline = cast( + DatabaseServiceAutoClassificationPipeline, + self.config.source.sourceConfig.config, ) # Used to satisfy type checked self._interface_type: str = config.source.type.lower() diff --git a/ingestion/src/metadata/workflow/sampler.py b/ingestion/src/metadata/workflow/classification.py similarity index 81% rename from ingestion/src/metadata/workflow/sampler.py rename to ingestion/src/metadata/workflow/classification.py index 0cc3d7a6835e..4f1417d9da4a 100644 --- a/ingestion/src/metadata/workflow/sampler.py +++ b/ingestion/src/metadata/workflow/classification.py @@ -13,8 +13,8 @@ """ from typing import cast -from metadata.generated.schema.metadataIngestion.databaseServiceSamplerPipeline import ( - DatabaseServiceSamplerPipeline, +from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import ( + DatabaseServiceAutoClassificationPipeline, ) from metadata.ingestion.api.steps import Processor from metadata.pii.processor import PIIProcessor @@ -25,8 +25,8 @@ logger = profiler_logger() -class SamplerWorkflow(ProfilerWorkflow): - """Sampler workflow implementation. Based on the Profiler logic with different steps""" +class AutoClassificationWorkflow(ProfilerWorkflow): + """Auto Classification workflow implementation. 
Based on the Profiler logic with different steps""" def set_steps(self): source_class = self._get_source_class() @@ -36,8 +36,9 @@ def set_steps(self): sampler_processor = self._get_sampler_processor() # Only instantiate the PII Processor on demand - source_config: DatabaseServiceSamplerPipeline = cast( - DatabaseServiceSamplerPipeline, self.config.source.sourceConfig.config + source_config: DatabaseServiceAutoClassificationPipeline = cast( + DatabaseServiceAutoClassificationPipeline, + self.config.source.sourceConfig.config, ) if source_config.enableAutoClassification: pii_processor = self._get_pii_processor() From 11f8156de67c96b8f0b8c027672474fb0788d2cd Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Sat, 16 Nov 2024 11:22:00 +0100 Subject: [PATCH 18/29] prepare ingestion pipeline --- ingestion/src/metadata/cmd.py | 2 +- .../test_auto_classification_workflow.py | 68 ++++++++++++++ .../ingestion/auto_classification.py | 93 +++++++++++++++++++ .../workflows/ingestion/registry.py | 2 + .../ingestionPipelines/ingestionPipeline.json | 2 +- 5 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 ingestion/tests/integration/workflow/test_auto_classification_workflow.py create mode 100644 openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/auto_classification.py diff --git a/ingestion/src/metadata/cmd.py b/ingestion/src/metadata/cmd.py index 7f91638d04b1..e1d0d62ff4c2 100644 --- a/ingestion/src/metadata/cmd.py +++ b/ingestion/src/metadata/cmd.py @@ -41,7 +41,7 @@ class MetadataCommands(Enum): WEBHOOK = "webhook" LINEAGE = "lineage" APP = "app" - AUTO_CLASSIFICATION = "classification" + AUTO_CLASSIFICATION = "classify" RUN_PATH_METHODS = { diff --git a/ingestion/tests/integration/workflow/test_auto_classification_workflow.py b/ingestion/tests/integration/workflow/test_auto_classification_workflow.py new file mode 100644 index 000000000000..c15dbd763607 --- /dev/null +++ b/ingestion/tests/integration/workflow/test_auto_classification_workflow.py @@ -0,0 +1,68 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
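With the subcommand renamed to `classify`, the CLI dispatch stays a plain mapping from command name to runner. A hedged sketch of that dispatch, trimmed to the entry this patch touches; the `dispatch` helper is illustrative and not the real `cmd.py` entrypoint:

```python
# Trimmed sketch of the CLI dispatch after this patch; only the mapping name
# and run_classification come from the source, the rest is illustrative.
from pathlib import Path

from metadata.cli.classify import run_classification

RUN_PATH_METHODS = {
    "classify": run_classification,
}


def dispatch(command: str, config_path: Path) -> None:
    """Resolve the runner for the parsed subcommand and execute it."""
    RUN_PATH_METHODS[command](config_path)


# dispatch("classify", Path("auto_classification.yaml"))  # hypothetical config
```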
+""" +Validate the initialization of the Auto Classification Workflow +""" +from unittest.mock import patch + +import yaml + +from metadata.profiler.source.metadata import OpenMetadataSource +from metadata.workflow.application import ApplicationWorkflow, AppRunner +from metadata.workflow.classification import AutoClassificationWorkflow + + +@patch.object( + OpenMetadataSource, + "_validate_service_name", + return_value=True, +) +def test_init_auto_classification(*_) -> None: + """We can properly instantiate the app""" + + config = """ + source: + type: mysql + serviceName: mysql + serviceConnection: + config: + type: Mysql + username: openmetadata_user + authType: + password: openmetadata_password + hostPort: localhost:3306 + databaseSchema: openmetadata_db + sourceConfig: + config: + type: AutoClassification + storeSampleData: true + enableAutoClassification: true + tableFilterPattern: + includes: + - entity + processor: + type: orm-profiler + config: {} + sink: + type: metadata-rest + config: {} + workflowConfig: + loggerLevel: DEBUG + openMetadataServerConfig: + enableVersionValidation: false + hostPort: 'http://localhost:8585/api' + authProvider: openmetadata + securityConfig: + jwtToken: "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" + """ + + workflow = AutoClassificationWorkflow.create(yaml.safe_load(config)) + assert isinstance(workflow, AutoClassificationWorkflow) diff --git a/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/auto_classification.py b/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/auto_classification.py new file mode 100644 index 000000000000..afb3a7ae176a --- /dev/null +++ b/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/auto_classification.py @@ -0,0 +1,93 @@ +# Copyright 2021 Collate +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
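The DAG callable defined next serializes the pydantic workflow config back into a plain dict before building the workflow. A self-contained sketch of that round trip, with `FakeWorkflowConfig` as a hypothetical stand-in for `OpenMetadataWorkflowConfig`:

```python
# model_dump_json -> json.loads round trip used by the DAG callable below;
# FakeWorkflowConfig is a stand-in model, not the generated class.
import json

from pydantic import BaseModel


class FakeWorkflowConfig(BaseModel):
    loggerLevel: str = "INFO"
    ingestionPipelineFQN: str = "service.auto_classification"


config = json.loads(FakeWorkflowConfig().model_dump_json(exclude_defaults=False))
assert config == {
    "loggerLevel": "INFO",
    "ingestionPipelineFQN": "service.auto_classification",
}
```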
+"""
+Auto Classification DAG function builder
+"""
+import json
+
+from airflow import DAG
+from metadata.workflow.classification import AutoClassificationWorkflow
+
+from openmetadata_managed_apis.utils.logger import set_operator_logger
+from openmetadata_managed_apis.workflows.ingestion.common import build_dag, build_source
+
+from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
+    IngestionPipeline,
+)
+from metadata.generated.schema.metadataIngestion.workflow import (
+    LogLevels,
+    OpenMetadataWorkflowConfig,
+    Processor,
+    Sink,
+    WorkflowConfig,
+)
+
+
+def auto_classification_workflow(workflow_config: OpenMetadataWorkflowConfig):
+    """
+    Task that creates and runs the auto classification workflow.
+
+    The workflow_config gets cooked from the incoming
+    ingestionPipeline.
+
+    This is the callable used to create the PythonOperator
+    """
+
+    set_operator_logger(workflow_config)
+
+    config = json.loads(workflow_config.model_dump_json(exclude_defaults=False))
+    workflow = AutoClassificationWorkflow.create(config)
+
+    workflow.execute()
+    workflow.raise_from_status()
+    workflow.print_status()
+    workflow.stop()
+
+
+def build_auto_classification_workflow_config(
+    ingestion_pipeline: IngestionPipeline,
+) -> OpenMetadataWorkflowConfig:
+    """
+    Given an airflow_pipeline, prepare the workflow config JSON
+    """
+    workflow_config = OpenMetadataWorkflowConfig(
+        source=build_source(ingestion_pipeline),
+        sink=Sink(
+            type="metadata-rest",
+            config={},
+        ),
+        processor=Processor(
+            type="orm-profiler",
+            config={},
+        ),
+        workflowConfig=WorkflowConfig(
+            loggerLevel=ingestion_pipeline.loggerLevel or LogLevels.INFO,
+            openMetadataServerConfig=ingestion_pipeline.openMetadataServerConnection,
+        ),
+        ingestionPipelineFQN=ingestion_pipeline.fullyQualifiedName.root,
+    )
+
+    return workflow_config
+
+
+def build_auto_classification_dag(ingestion_pipeline: IngestionPipeline) -> DAG:
+    """
+    Build a simple auto classification workflow DAG
+    """
+    workflow_config = build_auto_classification_workflow_config(ingestion_pipeline)
+    dag = build_dag(
+        task_name="auto_classification_task",
+        ingestion_pipeline=ingestion_pipeline,
+        workflow_config=workflow_config,
+        workflow_fn=auto_classification_workflow,
+    )
+
+    return dag
diff --git a/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/registry.py b/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/registry.py
index 356470756674..0f5b55b69b42 100644
--- a/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/registry.py
+++ b/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/registry.py
@@ -16,6 +16,7 @@
 from openmetadata_managed_apis.workflows.ingestion.application import (
     build_application_dag,
 )
+from openmetadata_managed_apis.workflows.ingestion.auto_classification import build_auto_classification_dag
 from openmetadata_managed_apis.workflows.ingestion.dbt import build_dbt_dag
 from openmetadata_managed_apis.workflows.ingestion.es_reindex import (
     build_es_reindex_dag,
 )
@@ -40,6 +41,7 @@
 build_registry.add(PipelineType.lineage.value)(build_lineage_dag)
 build_registry.add(PipelineType.dbt.value)(build_dbt_dag)
 build_registry.add(PipelineType.profiler.value)(build_profiler_dag)
+build_registry.add(PipelineType.autoClassification.value)(build_auto_classification_dag)
 build_registry.add(PipelineType.TestSuite.value)(build_test_suite_dag)
 build_registry.add(PipelineType.elasticSearchReindex.value)(build_es_reindex_dag)
build_registry.add(PipelineType.application.value)(build_application_dag) diff --git a/openmetadata-spec/src/main/resources/json/schema/entity/services/ingestionPipelines/ingestionPipeline.json b/openmetadata-spec/src/main/resources/json/schema/entity/services/ingestionPipelines/ingestionPipeline.json index 42e7771d4eb6..314c78b2e555 100644 --- a/openmetadata-spec/src/main/resources/json/schema/entity/services/ingestionPipelines/ingestionPipeline.json +++ b/openmetadata-spec/src/main/resources/json/schema/entity/services/ingestionPipelines/ingestionPipeline.json @@ -11,7 +11,7 @@ "description": "Type of Pipeline - metadata, usage", "type": "string", "javaType": "org.openmetadata.schema.entity.services.ingestionPipelines.PipelineType", - "enum": ["metadata", "usage", "lineage", "profiler", "TestSuite", "dataInsight", "elasticSearchReindex", "dbt", "application"] + "enum": ["metadata", "usage", "lineage", "profiler", "autoClassification", "TestSuite", "dataInsight", "elasticSearchReindex", "dbt", "application"] }, "pipelineStatus": { "type": "object", From fc01ac697be130e7812c64117b51ac922202cd24 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Sat, 16 Nov 2024 11:37:44 +0100 Subject: [PATCH 19/29] migrate profiler pipeline --- .../native/1.6.0/mysql/schemaChanges.sql | 4 ++++ .../native/1.6.0/postgres/schemaChanges.sql | 5 +++++ .../upgrade/upgrade-prerequisites.md | 20 +++++++++++++++++++ .../databaseServiceProfilerPipeline.json | 18 ----------------- 4 files changed, 29 insertions(+), 18 deletions(-) diff --git a/bootstrap/sql/migrations/native/1.6.0/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.6.0/mysql/schemaChanges.sql index 1365f5ed036f..b033d86c8164 100644 --- a/bootstrap/sql/migrations/native/1.6.0/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.6.0/mysql/schemaChanges.sql @@ -1743,3 +1743,7 @@ UPDATE ingestion_pipeline_entity SET json = JSON_REMOVE(json, '$.sourceConfig.config.overrideViewLineage') WHERE JSON_EXTRACT(json, '$.pipelineType') = 'metadata'; +-- classification and sampling configs from the profiler pipelines +UPDATE ingestion_pipeline_entity +SET json = JSON_REMOVE(json, '$.sourceConfig.config.processPiiSensitive', '$.sourceConfig.config.confidence', '$.sourceConfig.config.generateSampleData') +WHERE JSON_EXTRACT(json, '$.pipelineType') = 'profiler'; diff --git a/bootstrap/sql/migrations/native/1.6.0/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.6.0/postgres/schemaChanges.sql index e125f937e9eb..c5711a317417 100644 --- a/bootstrap/sql/migrations/native/1.6.0/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.6.0/postgres/schemaChanges.sql @@ -1729,3 +1729,8 @@ CREATE INDEX idx_event_subscription_id ON successful_sent_change_events (event_s UPDATE ingestion_pipeline_entity SET json = json::jsonb #- '{sourceConfig,config,overrideViewLineage}' WHERE json #>> '{pipelineType}' = 'metadata'; + +-- classification and sampling configs from the profiler pipelines +UPDATE ingestion_pipeline_entity +SET json = json::jsonb #- '{sourceConfig,config,processPiiSensitive}' #- '{sourceConfig,config,confidence}' #- '{sourceConfig,config,generateSampleData}' +WHERE json #>> '{pipelineType}' = 'profiler'; \ No newline at end of file diff --git a/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md b/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md index 24bc4ebc30df..05ae8bf4c51a 100644 --- 
a/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md
+++ b/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md
@@ -124,3 +124,23 @@ workflow as successful. However, any errors when sending the information to Open
 Now, we're changing this behavior to consider the success rate of all the steps
 involved in the workflow. The UI will then show more `Partial Success` statuses
 rather than `Failed`, properly reflecting the real state of the workflow.
+
+### Profiler & Auto Classification Workflow
+
+We are introducing a new `Auto Classification` workflow that takes care of ingesting sample data and running
+the PII classification, both of which were previously handled by the Profiler workflow. This change makes the system more modular and scalable.
+
+The Profiler workflow now focuses solely on computing profile metrics, while the Auto Classification workflow
+takes care of the rest.
+
+This means we are removing the following properties from the `DatabaseServiceProfilerPipeline` schema and
+moving them to the new `DatabaseServiceAutoClassificationPipeline` schema:
+- `generateSampleData`
+- `processPiiSensitive`
+- `confidence`
+
+What you will need to do:
+- If you are using the **EXTERNAL** ingestion for the profiler (YAML configuration), update your configuration
+and remove these properties there as well.
+- If you still want to use the auto PII classification and sampling features, you can create the new workflow
+from the UI.
diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json
index 40a59174bd5c..2dbd5186980d 100644
--- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json
+++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json
@@ -50,12 +50,6 @@
       "default": false,
       "title": "Use FQN For Filtering"
     },
-    "generateSampleData": {
-      "description": "Option to turn on/off generating sample data. If enabled, profiler will ingest sample data for each table.",
-      "type": "boolean",
-      "default": true,
-      "title": "Generate Sample Data"
-    },
     "computeMetrics": {
       "description": "Option to turn on/off computing profiler metrics.",
       "type": "boolean",
@@ -80,18 +74,6 @@
       "default": false,
       "title": "Use Gathered Statistics"
     },
-    "processPiiSensitive": {
-      "description": "Optional configuration to automatically tag columns that might contain sensitive information",
-      "type": "boolean",
-      "default": false,
-      "title": "Auto Tag PII"
-    },
-    "confidence": {
-      "description": "Set the Confidence value for which you want the column to be tagged as PII. Confidence value ranges from 0 to 100. A higher number will yield less false positives but more false negatives. 
A lower number will yield more false positives but less false negatives.", - "type": "number", - "default": 80, - "title": "PII Inference Confidence Level" - }, "profileSampleType": { "$ref": "../entity/data/table.json#/definitions/profileSampleType", "title": "Profile Sample Type" From 7117f1d94b09b58e35b55e856d40b376d6a1bd26 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Sat, 16 Nov 2024 12:04:37 +0100 Subject: [PATCH 20/29] fix --- ingestion/pyproject.toml | 2 +- .../integration/workflow/test_auto_classification_workflow.py | 1 - .../workflows/ingestion/auto_classification.py | 3 +-- .../openmetadata_managed_apis/workflows/ingestion/registry.py | 4 +++- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ingestion/pyproject.toml b/ingestion/pyproject.toml index 9192fa2ff434..ac88a38ee746 100644 --- a/ingestion/pyproject.toml +++ b/ingestion/pyproject.toml @@ -273,7 +273,7 @@ ignore = [ "src/metadata/workflow/metadata.py", "src/metadata/workflow/profiler.py", "src/metadata/workflow/usage.py", - "src/metadata/workflow/sampler.py", + "src/metadata/workflow/classification.py", "src/metadata/workflow/workflow_status_mixin.py", ] diff --git a/ingestion/tests/integration/workflow/test_auto_classification_workflow.py b/ingestion/tests/integration/workflow/test_auto_classification_workflow.py index c15dbd763607..38297a689d9d 100644 --- a/ingestion/tests/integration/workflow/test_auto_classification_workflow.py +++ b/ingestion/tests/integration/workflow/test_auto_classification_workflow.py @@ -16,7 +16,6 @@ import yaml from metadata.profiler.source.metadata import OpenMetadataSource -from metadata.workflow.application import ApplicationWorkflow, AppRunner from metadata.workflow.classification import AutoClassificationWorkflow diff --git a/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/auto_classification.py b/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/auto_classification.py index afb3a7ae176a..4d1f81d223bf 100644 --- a/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/auto_classification.py +++ b/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/auto_classification.py @@ -14,8 +14,6 @@ import json from airflow import DAG -from metadata.workflow.classification import AutoClassificationWorkflow - from openmetadata_managed_apis.utils.logger import set_operator_logger from openmetadata_managed_apis.workflows.ingestion.common import build_dag, build_source @@ -29,6 +27,7 @@ Sink, WorkflowConfig, ) +from metadata.workflow.classification import AutoClassificationWorkflow def auto_classification_workflow(workflow_config: OpenMetadataWorkflowConfig): diff --git a/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/registry.py b/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/registry.py index 0f5b55b69b42..6705c34ec1b1 100644 --- a/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/registry.py +++ b/openmetadata-airflow-apis/openmetadata_managed_apis/workflows/ingestion/registry.py @@ -16,7 +16,9 @@ from openmetadata_managed_apis.workflows.ingestion.application import ( build_application_dag, ) -from openmetadata_managed_apis.workflows.ingestion.auto_classification import build_auto_classification_dag +from openmetadata_managed_apis.workflows.ingestion.auto_classification import ( + build_auto_classification_dag, +) from openmetadata_managed_apis.workflows.ingestion.dbt import build_dbt_dag from 
openmetadata_managed_apis.workflows.ingestion.es_reindex import ( build_es_reindex_dag, From dfcbb78cdb4c17d8cbd799178eaaaa165a376b4d Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Sat, 16 Nov 2024 16:45:35 +0100 Subject: [PATCH 21/29] fix --- .../workflows/bigquery_classifier.yaml | 56 +++++++++++++++++++ .../examples/workflows/bigquery_profiler.yaml | 3 +- .../examples/workflows/db2_profiler.yaml | 3 +- .../examples/workflows/mysql_profiler.yaml | 1 - .../workflows/redshift_classifier.yaml | 38 +++++++++++++ .../examples/workflows/redshift_profiler.yaml | 5 +- .../profiler/pandas/test_custom_metrics.py | 9 +-- .../profiler/pandas/test_datalake_metrics.py | 3 +- .../unit/profiler/pandas/test_profiler.py | 3 +- .../pandas/test_profiler_interface.py | 3 +- .../tests/unit/profiler/pandas/test_sample.py | 15 ++--- 11 files changed, 109 insertions(+), 30 deletions(-) create mode 100644 ingestion/src/metadata/examples/workflows/bigquery_classifier.yaml create mode 100644 ingestion/src/metadata/examples/workflows/redshift_classifier.yaml diff --git a/ingestion/src/metadata/examples/workflows/bigquery_classifier.yaml b/ingestion/src/metadata/examples/workflows/bigquery_classifier.yaml new file mode 100644 index 000000000000..463fd00f7c59 --- /dev/null +++ b/ingestion/src/metadata/examples/workflows/bigquery_classifier.yaml @@ -0,0 +1,56 @@ +source: + type: bigquery + serviceName: local_bigquery + serviceConnection: + config: + type: BigQuery + credentials: + gcpConfig: + type: service_account + projectId: my-project-id-1234 + privateKeyId: privateKeyID + privateKey: "-----BEGIN PRIVATE KEY-----\nmySuperSecurePrivateKey==\n-----END PRIVATE KEY-----\n" + clientEmail: client@email.secure + clientId: "1234567890" + authUri: https://accounts.google.com/o/oauth2/auth + tokenUri: https://oauth2.googleapis.com/token + authProviderX509CertUrl: https://www.googleapis.com/oauth2/v1/certs + clientX509CertUrl: https://www.googleapis.com/oauth2/v1/certs + sourceConfig: + config: + type: AutoClassification + storeSampleData: true + enableAutoClassification: true + databaseFilterPattern: + includes: + - hello-world-1234 + schemaFilterPattern: + includes: + - super_schema + tableFilterPattern: + includes: + - abc + +processor: + type: "orm-profiler" + config: + tableConfig: + - fullyQualifiedName: local_bigquery.hello-world-1234.super_schema.abc + profileSample: 85 + partitionConfig: + partitionQueryDuration: 180 + columnConfig: + excludeColumns: + - a + - b + +sink: + type: metadata-rest + config: {} +workflowConfig: +# loggerLevel: INFO # DEBUG, INFO, WARN or ERROR + openMetadataServerConfig: + hostPort: http://localhost:8585/api + authProvider: openmetadata + securityConfig: + jwtToken: "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" \ No newline at end of file diff --git a/ingestion/src/metadata/examples/workflows/bigquery_profiler.yaml b/ingestion/src/metadata/examples/workflows/bigquery_profiler.yaml index c9eb5f1a85f8..97ccbb8b50df 100644 --- 
a/ingestion/src/metadata/examples/workflows/bigquery_profiler.yaml +++ b/ingestion/src/metadata/examples/workflows/bigquery_profiler.yaml @@ -19,8 +19,7 @@ source: sourceConfig: config: type: Profiler - generateSampleData: true - databaseFilterPattern: + databaseFilterPattern: includes: - hello-world-1234 schemaFilterPattern: diff --git a/ingestion/src/metadata/examples/workflows/db2_profiler.yaml b/ingestion/src/metadata/examples/workflows/db2_profiler.yaml index 23f7989f1d88..2876fbaafa5f 100644 --- a/ingestion/src/metadata/examples/workflows/db2_profiler.yaml +++ b/ingestion/src/metadata/examples/workflows/db2_profiler.yaml @@ -11,8 +11,7 @@ source: sourceConfig: config: type: Profiler - generateSampleData: true - databaseFilterPattern: + databaseFilterPattern: includes: - database schemaFilterPattern: diff --git a/ingestion/src/metadata/examples/workflows/mysql_profiler.yaml b/ingestion/src/metadata/examples/workflows/mysql_profiler.yaml index d69f44f8d06a..9d8ef314225d 100644 --- a/ingestion/src/metadata/examples/workflows/mysql_profiler.yaml +++ b/ingestion/src/metadata/examples/workflows/mysql_profiler.yaml @@ -13,7 +13,6 @@ source: sourceConfig: config: type: Profiler - generateSampleData: true schemaFilterPattern: includes: - openmetadata_db* diff --git a/ingestion/src/metadata/examples/workflows/redshift_classifier.yaml b/ingestion/src/metadata/examples/workflows/redshift_classifier.yaml new file mode 100644 index 000000000000..d3599ce6eab0 --- /dev/null +++ b/ingestion/src/metadata/examples/workflows/redshift_classifier.yaml @@ -0,0 +1,38 @@ +source: + type: redshift + serviceName: local_redshift + serviceConnection: + config: + hostPort: my-host:5439 + username: username + password: strongPassword + database: databaseToConnect + type: Redshift + sourceConfig: + config: + type: AutoClassification + databaseFilterPattern: + includes: + - database + schemaFilterPattern: + includes: + - schema_one + excludes: + - schema_two + tableFilterPattern: + includes: + - orders + - customers + +processor: + type: "orm-profiler" +sink: + type: metadata-rest + config: {} +workflowConfig: +# loggerLevel: INFO # DEBUG, INFO, WARN or ERROR + openMetadataServerConfig: + hostPort: http://localhost:8585/api + authProvider: openmetadata + securityConfig: + jwtToken: "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" \ No newline at end of file diff --git a/ingestion/src/metadata/examples/workflows/redshift_profiler.yaml b/ingestion/src/metadata/examples/workflows/redshift_profiler.yaml index a9506cfa6917..82f7c2d081ba 100644 --- a/ingestion/src/metadata/examples/workflows/redshift_profiler.yaml +++ b/ingestion/src/metadata/examples/workflows/redshift_profiler.yaml @@ -6,13 +6,12 @@ source: hostPort: my-host:5439 username: username password: strongPassword - database: databseToConnect + database: databaseToConnect type: Redshift sourceConfig: config: type: Profiler - generateSampleData: true - databaseFilterPattern: + databaseFilterPattern: includes: - database schemaFilterPattern: 
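The classifier YAMLs above feed their `sourceConfig.config` block into the class generated from the new JSON schema. A quick sanity sketch, assuming the generated model name and using only field names that appear in the examples (`storeSampleData`, `enableAutoClassification`):

```python
# Validate an AutoClassification source config the way the workflow would;
# the import path assumes the code generated from the renamed schema.
from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import (
    DatabaseServiceAutoClassificationPipeline,
)

source_config = DatabaseServiceAutoClassificationPipeline(
    storeSampleData=True,
    enableAutoClassification=True,
)
print(source_config.model_dump_json(exclude_none=True))
```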
diff --git a/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py b/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py index 372416666246..a4d76aa55d80 100644 --- a/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py +++ b/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py @@ -119,9 +119,8 @@ class MetricsTest(TestCase): def setUp(self, *_): with ( patch.object(DatalakeSampler, "table", new_callable=lambda: self.dfs), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() self.sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, @@ -195,9 +194,8 @@ def test_table_custom_metric(self, *_): with ( patch.object(DatalakeSampler, "table", new_callable=lambda: self.dfs), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, @@ -258,9 +256,8 @@ def test_column_custom_metric(self, *_): ) with ( patch.object(DatalakeSampler, "table", new_callable=lambda: self.dfs), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, diff --git a/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py b/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py index db8bdae22fb7..c369496e0ddd 100644 --- a/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py +++ b/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py @@ -168,9 +168,8 @@ def setUpClass(cls, mock_get_connection, mock_sample_get_connection, mocked_dfs) pd.concat([cls.df2, pd.DataFrame(index=cls.df1.index)]), ], ), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, diff --git a/ingestion/tests/unit/profiler/pandas/test_profiler.py b/ingestion/tests/unit/profiler/pandas/test_profiler.py index 75db15a30741..3cbac7585b2c 100644 --- a/ingestion/tests/unit/profiler/pandas/test_profiler.py +++ b/ingestion/tests/unit/profiler/pandas/test_profiler.py @@ -172,9 +172,8 @@ def setUp(cls, mock_get_connection, *_) -> None: pd.concat([cls.df2, pd.DataFrame(index=cls.df1.index)]), ], ), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() cls.sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, diff --git a/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py b/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py index 0672f1b83e40..02bb2941631c 100644 --- a/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py +++ b/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py @@ -167,9 +167,8 @@ def setUp(cls, mock_get_connection, *_) -> None: pd.concat([cls.df2, pd.DataFrame(index=cls.df1.index)]), ], ), - patch.object(DatalakeSampler, "get_client") as mock_client, + 
patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() cls.sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, diff --git a/ingestion/tests/unit/profiler/pandas/test_sample.py b/ingestion/tests/unit/profiler/pandas/test_sample.py index 24f72a5235f3..d621b37a4a0e 100644 --- a/ingestion/tests/unit/profiler/pandas/test_sample.py +++ b/ingestion/tests/unit/profiler/pandas/test_sample.py @@ -152,9 +152,8 @@ def setUpClass(cls, mock_get_connection, mock_sample_get_connection) -> None: patch.object( DatalakeSampler, "table", new_callable=lambda: [cls.df1, cls.df2] ), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, @@ -183,9 +182,8 @@ def test_random_sampler(self, _): patch.object( DatalakeSampler, "table", new_callable=lambda: [self.df1, self.df2] ), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, @@ -216,9 +214,8 @@ def test_sample_property(self, *_): patch.object( DatalakeSampler, "table", new_callable=lambda: [self.df1, self.df2] ), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, @@ -292,9 +289,8 @@ def test_sample_data(self, *_): patch.object( DatalakeSampler, "table", new_callable=lambda: [self.df1, self.df2] ), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, @@ -319,9 +315,8 @@ def test_sample_from_user_query(self, *_): patch.object( DatalakeSampler, "table", new_callable=lambda: [self.df1, self.df2] ), - patch.object(DatalakeSampler, "get_client") as mock_client, + patch.object(DatalakeSampler, "get_client", return_value=Mock()), ): - mock_client.return_value = Mock() sampler = DatalakeSampler( service_connection_config=DatalakeConnection(configSource={}), ometa_client=None, From 40916603cdab3041097ca22a9bd21a0b5486b060 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Sun, 17 Nov 2024 10:21:05 +0100 Subject: [PATCH 22/29] compat --- .../orm_profiler/test_pii_processor.py | 25 ++++++------------- .../profiler/pandas/test_custom_metrics.py | 9 +++++++ .../profiler/pandas/test_datalake_metrics.py | 9 +++++++ .../unit/profiler/pandas/test_profiler.py | 8 ++++++ .../tests/unit/profiler/pandas/test_sample.py | 8 ++++++ .../unit/profiler/sqlalchemy/test_runner.py | 8 ++++++ 6 files changed, 49 insertions(+), 18 deletions(-) diff --git a/ingestion/tests/integration/orm_profiler/test_pii_processor.py b/ingestion/tests/integration/orm_profiler/test_pii_processor.py index 19cd6b4bf239..313e8f003ed1 100644 --- a/ingestion/tests/integration/orm_profiler/test_pii_processor.py +++ b/ingestion/tests/integration/orm_profiler/test_pii_processor.py @@ -20,9 +20,6 @@ CreateDatabaseSchemaRequest, 
) from metadata.generated.schema.api.data.createTable import CreateTableRequest -from metadata.generated.schema.api.data.createTableProfile import ( - CreateTableProfileRequest, -) from metadata.generated.schema.api.services.createDatabaseService import ( CreateDatabaseServiceRequest, ) @@ -31,7 +28,6 @@ ColumnName, DataType, TableData, - TableProfile, ) from metadata.generated.schema.entity.services.connections.database.common.basicAuth import ( BasicAuth, @@ -47,8 +43,8 @@ DatabaseService, DatabaseServiceType, ) -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( - DatabaseServiceProfilerPipeline, +from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import ( + DatabaseServiceAutoClassificationPipeline, ) from metadata.generated.schema.metadataIngestion.workflow import ( OpenMetadataWorkflowConfig, @@ -59,12 +55,12 @@ from metadata.generated.schema.security.client.openMetadataJWTClientConfig import ( OpenMetadataJWTClientConfig, ) -from metadata.generated.schema.type.basic import Timestamp from metadata.generated.schema.type.tagLabel import TagFQN, TagLabel from metadata.ingestion.models.table_metadata import ColumnTag from metadata.ingestion.ometa.ometa_api import OpenMetadata from metadata.pii.processor import PIIProcessor -from metadata.profiler.api.models import ProfilerResponse, SampleData +from metadata.profiler.api.models import ProfilerResponse +from metadata.sampler.models import SampleData, SamplerResponse table_data = TableData( columns=[ @@ -172,9 +168,9 @@ class PiiProcessorTest(TestCase): type="mysql", serviceName="test", sourceConfig=SourceConfig( - config=DatabaseServiceProfilerPipeline( + config=DatabaseServiceAutoClassificationPipeline( confidence=85, - processPiiSensitive=True, + enableAutoClassification=True, ) ), ), @@ -305,15 +301,8 @@ def test_ner_scanner_process(self): test function for ner Scanner """ - record = ProfilerResponse( + record = SamplerResponse( table=self.table_entity, - profile=CreateTableProfileRequest( - tableProfile=TableProfile( - timestamp=Timestamp( - root=int(datetime.datetime.now().timestamp() * 1000) - ) - ) - ), sample_data=SampleData(data=table_data), ) diff --git a/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py b/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py index a4d76aa55d80..75bbf2a8482c 100644 --- a/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py +++ b/ingestion/tests/unit/profiler/pandas/test_custom_metrics.py @@ -13,11 +13,13 @@ Test Metrics behavior """ import os +import sys from unittest import TestCase, mock from unittest.mock import Mock, patch from uuid import uuid4 import pandas as pd +import pytest from metadata.generated.schema.entity.data.table import Column as EntityColumn from metadata.generated.schema.entity.data.table import ColumnName, DataType, Table @@ -40,6 +42,13 @@ REGION = "us-west-1" +if sys.version_info < (3, 9): + pytest.skip( + "requires python 3.9+ due to incompatibility with object patch", + allow_module_level=True, + ) + + class FakeClient: def __init__(self): self._client = None diff --git a/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py b/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py index c369496e0ddd..696f9bdf2549 100644 --- a/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py +++ b/ingestion/tests/unit/profiler/pandas/test_datalake_metrics.py @@ -13,10 +13,12 @@ Test Metrics behavior """ import os +import sys from unittest import TestCase, mock 
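# These test modules now hand return_value straight to patch.object instead of
# the two-step "as mock_client" + "mock_client.return_value = Mock()" dance.
# A minimal, self-contained sketch of that idiom (Sampler and get_client are
# illustrative names, not the real classes):
from unittest.mock import Mock, patch


class Sampler:
    def get_client(self):
        raise NotImplementedError


with patch.object(Sampler, "get_client", return_value=Mock()):
    # The patched method returns the prepared Mock without touching a real client.
    assert isinstance(Sampler().get_client(), Mock)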
from unittest.mock import Mock, patch from uuid import uuid4 +import pytest from sqlalchemy import TEXT, Column, Date, DateTime, Integer, String, Time from sqlalchemy.orm import declarative_base @@ -37,6 +39,13 @@ Base = declarative_base() +if sys.version_info < (3, 9): + pytest.skip( + "requires python 3.9+ due to incompatibility with object patch", + allow_module_level=True, + ) + + class User(Base): __tablename__ = "users" id = Column(Integer, primary_key=True) diff --git a/ingestion/tests/unit/profiler/pandas/test_profiler.py b/ingestion/tests/unit/profiler/pandas/test_profiler.py index 3cbac7585b2c..0c648ee732c5 100644 --- a/ingestion/tests/unit/profiler/pandas/test_profiler.py +++ b/ingestion/tests/unit/profiler/pandas/test_profiler.py @@ -13,6 +13,7 @@ Test Profiler behavior """ import os +import sys from datetime import datetime from unittest import TestCase, mock from unittest.mock import Mock, patch @@ -55,6 +56,13 @@ Base = declarative_base() +if sys.version_info < (3, 9): + pytest.skip( + "requires python 3.9+ due to incompatibility with object patch", + allow_module_level=True, + ) + + class User(Base): __tablename__ = "users" id = Column(Integer, primary_key=True) diff --git a/ingestion/tests/unit/profiler/pandas/test_sample.py b/ingestion/tests/unit/profiler/pandas/test_sample.py index d621b37a4a0e..b5b706c5fa76 100644 --- a/ingestion/tests/unit/profiler/pandas/test_sample.py +++ b/ingestion/tests/unit/profiler/pandas/test_sample.py @@ -13,6 +13,7 @@ Test Sample behavior """ import os +import sys from unittest import TestCase, mock from unittest.mock import Mock, patch from uuid import uuid4 @@ -38,6 +39,13 @@ Base = declarative_base() +if sys.version_info < (3, 9): + pytest.skip( + "requires python 3.9+ due to incompatibility with object patch", + allow_module_level=True, + ) + + class User(Base): __tablename__ = "users" id = Column(Integer, primary_key=True) diff --git a/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py b/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py index 9be685b8fea5..827eda1aef48 100644 --- a/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py +++ b/ingestion/tests/unit/profiler/sqlalchemy/test_runner.py @@ -12,6 +12,7 @@ """ Test Sample behavior """ +import sys import time from unittest import TestCase, mock from unittest.mock import Mock, patch @@ -30,6 +31,13 @@ Base = declarative_base() +if sys.version_info < (3, 9): + pytest.skip( + "requires python 3.9+ due to incompatibility with object patch", + allow_module_level=True, + ) + + class User(Base): __tablename__ = "users" id = Column(Integer, primary_key=True) From 1c5309f41a9639f86acef204fbf1911a220504e2 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Sun, 17 Nov 2024 10:56:02 +0100 Subject: [PATCH 23/29] compat --- .../tests/unit/profiler/pandas/test_profiler_interface.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py b/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py index 02bb2941631c..9766c42e3c88 100644 --- a/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py +++ b/ingestion/tests/unit/profiler/pandas/test_profiler_interface.py @@ -14,11 +14,13 @@ """ import os +import sys from datetime import datetime from unittest import TestCase, mock from unittest.mock import Mock, patch from uuid import uuid4 +import pytest from sqlalchemy import TEXT, Column, Integer, String, inspect from sqlalchemy.orm import declarative_base @@ -52,6 +54,12 @@ from 
metadata.profiler.processor.default import get_default_metrics from metadata.sampler.pandas.sampler import DatalakeSampler +if sys.version_info < (3, 9): + pytest.skip( + "requires python 3.9+ due to incompatibility with object patch", + allow_module_level=True, + ) + class User(declarative_base()): __tablename__ = "users" From 79958b89af69c0af9679baf03813f1b6141532c5 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Mon, 18 Nov 2024 11:22:33 +0100 Subject: [PATCH 24/29] fix tests and comments --- .../runner/base_test_suite_source.py | 17 +- .../source/database/base/profiler_source.py | 17 +- .../src/metadata/sampler/nosql/sampler.py | 4 +- ingestion/src/metadata/sampler/processor.py | 34 +- ingestion/src/metadata/utils/constants.py | 63 + .../tests/integration/integration_base.py | 2 +- .../orm_profiler/test_orm_profiler_e2e.py | 1253 ++++++++--------- .../integration/profiler/test_dynamodb.py | 10 +- .../profiler/test_nosql_profiler.py | 40 + .../test_suite/test_e2e_workflow.py | 2 +- 10 files changed, 757 insertions(+), 685 deletions(-) diff --git a/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py b/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py index 4afd8219beea..8d8583e7dad4 100644 --- a/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py +++ b/ingestion/src/metadata/data_quality/runner/base_test_suite_source.py @@ -21,9 +21,6 @@ from metadata.data_quality.interface.test_suite_interface import TestSuiteInterface from metadata.data_quality.runner.core import DataTestsRunner from metadata.generated.schema.entity.data.table import Table -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) from metadata.generated.schema.entity.services.databaseService import DatabaseConnection from metadata.generated.schema.entity.services.serviceType import ServiceType from metadata.generated.schema.metadataIngestion.testSuitePipeline import ( @@ -37,14 +34,13 @@ from metadata.profiler.orm.converter.base import ometa_to_sqa_orm from metadata.sampler.models import SampleConfig from metadata.sampler.sampler_interface import SamplerInterface +from metadata.utils.constants import NON_SQA_DATABASE_CONNECTIONS from metadata.utils.profiler_utils import get_context_entities from metadata.utils.service_spec.service_spec import ( import_sampler_class, import_test_suite_class, ) -NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,) - class BaseTestSuiteRunner: """Base class for the data quality runner""" @@ -63,7 +59,6 @@ def __init__( config.source.sourceConfig.config ) self.ometa_client = ometa_client - self.sqa_metadata = self._set_sqa_metadata() @property def interface(self) -> Optional[TestSuiteInterface]: @@ -101,16 +96,10 @@ def _copy_service_config( return config_copy - def _set_sqa_metadata(self): - """Set sqlalchemy metadata""" - if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): - return MetaData() - return None - def _build_table_orm(self, entity: Table) -> Optional[DeclarativeMeta]: """Build the ORM table if needed for the sampler and profiler interfaces""" - if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): - return ometa_to_sqa_orm(entity, self.ometa_client, self.sqa_metadata) + if self.service_conn_config.type.value not in NON_SQA_DATABASE_CONNECTIONS: + return ometa_to_sqa_orm(entity, self.ometa_client, MetaData()) return None def create_data_quality_interface(self) -> TestSuiteInterface: diff --git 
a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py index 2f59655cd695..3031004a443e 100644 --- a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py @@ -25,9 +25,6 @@ from metadata.generated.schema.entity.data.database import Database from metadata.generated.schema.entity.data.databaseSchema import DatabaseSchema from metadata.generated.schema.entity.data.table import Table -from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( - DatalakeConnection, -) from metadata.generated.schema.entity.services.databaseService import ( DatabaseConnection, DatabaseService, @@ -54,6 +51,7 @@ ) from metadata.sampler.models import SampleConfig from metadata.sampler.sampler_interface import SamplerInterface +from metadata.utils.constants import NON_SQA_DATABASE_CONNECTIONS from metadata.utils.logger import profiler_logger from metadata.utils.profiler_utils import get_context_entities from metadata.utils.service_spec.service_spec import ( @@ -61,8 +59,6 @@ import_sampler_class, ) -NON_SQA_DATABASE_CONNECTIONS = (DatalakeConnection,) - logger = profiler_logger() @@ -85,7 +81,6 @@ def __init__( ) self.ometa_client = ometa_client self._interface_type: str = config.source.type.lower() - self.sqa_metadata = self._set_sqa_metadata() self._interface = None # We define this in create_profiler_interface to help us reuse # this method for the sampler, which does not have a DatabaseServiceProfilerPipeline @@ -104,16 +99,10 @@ def interface(self, interface): """Set the interface""" self._interface = interface - def _set_sqa_metadata(self): - """Set sqlalchemy metadata""" - if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): - return MetaData() - return None - def _build_table_orm(self, entity: Table) -> Optional[DeclarativeMeta]: """Build the ORM table if needed for the sampler and profiler interfaces""" - if not isinstance(self.service_conn_config, NON_SQA_DATABASE_CONNECTIONS): - return ometa_to_sqa_orm(entity, self.ometa_client, self.sqa_metadata) + if self.service_conn_config.type.value not in NON_SQA_DATABASE_CONNECTIONS: + return ometa_to_sqa_orm(entity, self.ometa_client, MetaData()) return None def _copy_service_config( diff --git a/ingestion/src/metadata/sampler/nosql/sampler.py b/ingestion/src/metadata/sampler/nosql/sampler.py index 6c18d8f884d4..a07f98cb5860 100644 --- a/ingestion/src/metadata/sampler/nosql/sampler.py +++ b/ingestion/src/metadata/sampler/nosql/sampler.py @@ -26,7 +26,7 @@ class NoSQLSampler(SamplerInterface): @property def table(self): - return self.table + return self.entity def get_client(self): return factory.create( @@ -71,7 +71,7 @@ def _fetch_sample_data(self, columns: List[SQALikeColumn]) -> TableData: returns sampled ometa dataframes """ limit = self._get_limit() - records = self.client.scan(self.table, self.table.columns, limit) + records = self.client.scan(self.table, self.table.columns, int(limit)) rows, cols = self.transpose_records(records, columns) return TableData( rows=[list(map(str, row)) for row in rows], diff --git a/ingestion/src/metadata/sampler/processor.py b/ingestion/src/metadata/sampler/processor.py index 5159f2b79006..60b446f5afb9 100644 --- a/ingestion/src/metadata/sampler/processor.py +++ b/ingestion/src/metadata/sampler/processor.py @@ -13,7 +13,6 @@ """ import traceback from copy import deepcopy -from 
functools import lru_cache from typing import Optional, cast from sqlalchemy import MetaData @@ -37,23 +36,16 @@ from metadata.ingestion.api.step import Step from metadata.ingestion.api.steps import Processor from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.profiler.api.models import ProfilerProcessorConfig from metadata.profiler.orm.converter.base import ometa_to_sqa_orm from metadata.profiler.source.metadata import ProfilerSourceAndEntity +from metadata.sampler.config import get_config_for_table from metadata.sampler.models import SampleConfig, SampleData, SamplerResponse from metadata.sampler.sampler_interface import SamplerInterface +from metadata.utils.constants import NON_SQA_DATABASE_CONNECTIONS from metadata.utils.profiler_utils import get_context_entities from metadata.utils.service_spec.service_spec import import_sampler_class -NON_SQA_DATABASE_CONNECTIONS = ("Datalake",) - - -@lru_cache -def _get_sqa_metadata(conn_type: str) -> Optional[MetaData]: - """Set sqlalchemy metadata""" - if conn_type not in NON_SQA_DATABASE_CONNECTIONS: - return MetaData() - return None - class SamplerProcessor(Processor): """Use the profiler interface to fetch the sample data""" @@ -68,6 +60,10 @@ def __init__(self, config: OpenMetadataWorkflowConfig, metadata: OpenMetadata): DatabaseServiceAutoClassificationPipeline, self.config.source.sourceConfig.config, ) # Used to satisfy type checked + # We still rely on the orm-processor. We should decouple this in the future + self.profiler_config = ProfilerProcessorConfig.model_validate( + self.config.processor.model_dump().get("config") + ) self._interface_type: str = config.source.type.lower() self.sampler_class = import_sampler_class( @@ -89,15 +85,17 @@ def _run(self, record: ProfilerSourceAndEntity) -> Either[SamplerResponse]: service_conn_config = self._copy_service_config( self.config, database_entity ) - sqa_metadata = _get_sqa_metadata(str(service_conn_config.type.value)) - _orm = self._build_table_orm(entity, sqa_metadata) + _orm = self._build_table_orm( + entity, conn_type=str(service_conn_config.type.value) + ) sampler_interface: SamplerInterface = self.sampler_class.create( service_connection_config=service_conn_config, ometa_client=self.metadata, entity=entity, schema_entity=schema_entity, database_entity=database_entity, + table_config=get_config_for_table(entity, self.profiler_config), default_sample_config=SampleConfig( profile_sample=self.source_config.profileSample, profile_sample_type=self.source_config.profileSampleType, @@ -138,14 +136,12 @@ def create( return cls(config=config, metadata=metadata) def _build_table_orm( - self, entity: Table, sqa_metadata: Optional[MetaData] + self, entity: Table, conn_type: str ) -> Optional[DeclarativeMeta]: """Build the ORM table if needed for the sampler and profiler interfaces""" - return ( - ometa_to_sqa_orm(entity, self.metadata, sqa_metadata) - if sqa_metadata - else None - ) + if conn_type not in NON_SQA_DATABASE_CONNECTIONS: + return ometa_to_sqa_orm(entity, self.metadata, MetaData()) + return None def _copy_service_config( self, config: OpenMetadataWorkflowConfig, database: Database diff --git a/ingestion/src/metadata/utils/constants.py b/ingestion/src/metadata/utils/constants.py index 2f1c921eb4d1..49a0e7c5eef5 100644 --- a/ingestion/src/metadata/utils/constants.py +++ b/ingestion/src/metadata/utils/constants.py @@ -27,6 +27,51 @@ from metadata.generated.schema.entity.domains.dataProduct import DataProduct from metadata.generated.schema.entity.domains.domain import 
Domain from metadata.generated.schema.entity.services.apiService import ApiService +from metadata.generated.schema.entity.services.connections.database.bigTableConnection import ( + BigtableType, +) +from metadata.generated.schema.entity.services.connections.database.couchbaseConnection import ( + CouchbaseType, +) +from metadata.generated.schema.entity.services.connections.database.datalakeConnection import ( + DatalakeType, +) +from metadata.generated.schema.entity.services.connections.database.deltaLakeConnection import ( + DeltaLakeType, +) +from metadata.generated.schema.entity.services.connections.database.domoDatabaseConnection import ( + DomoDatabaseType, +) +from metadata.generated.schema.entity.services.connections.database.dorisConnection import ( + DorisType, +) +from metadata.generated.schema.entity.services.connections.database.druidConnection import ( + DruidType, +) +from metadata.generated.schema.entity.services.connections.database.dynamoDBConnection import ( + DynamoDBType, +) +from metadata.generated.schema.entity.services.connections.database.glueConnection import ( + GlueType, +) +from metadata.generated.schema.entity.services.connections.database.icebergConnection import ( + IcebergType, +) +from metadata.generated.schema.entity.services.connections.database.mongoDBConnection import ( + MongoDBType, +) +from metadata.generated.schema.entity.services.connections.database.salesforceConnection import ( + SalesforceType, +) +from metadata.generated.schema.entity.services.connections.database.sapErpConnection import ( + SapErpType, +) +from metadata.generated.schema.entity.services.connections.database.sasConnection import ( + SasType, +) +from metadata.generated.schema.entity.services.connections.database.unityCatalogConnection import ( + DatabricksType, +) from metadata.generated.schema.entity.services.dashboardService import DashboardService from metadata.generated.schema.entity.services.databaseService import DatabaseService from metadata.generated.schema.entity.services.messagingService import MessagingService @@ -111,3 +156,21 @@ } CUSTOM_CONNECTOR_PREFIX = "custom" + +NON_SQA_DATABASE_CONNECTIONS = ( + DatalakeType.Datalake.value, + BigtableType.BigTable.value, + CouchbaseType.Couchbase.value, + DatabricksType.UnityCatalog.value, + DeltaLakeType.DeltaLake.value, + DomoDatabaseType.DomoDatabase.value, + DorisType.Doris.value, + DruidType.Druid.value, + DynamoDBType.DynamoDB.value, + GlueType.Glue.value, + IcebergType.Iceberg.value, + MongoDBType.MongoDB.value, + SalesforceType.Salesforce.value, + SapErpType.SapErp.value, + SasType.SAS.value, +) diff --git a/ingestion/tests/integration/integration_base.py b/ingestion/tests/integration/integration_base.py index 6cb82487792b..11fe6ee449aa 100644 --- a/ingestion/tests/integration/integration_base.py +++ b/ingestion/tests/integration/integration_base.py @@ -162,7 +162,7 @@ "serviceConnection": {{ "config": {service_config} }}, - "sourceConfig": {{"config": {{"type":"Profiler", "generateSampleData": true}}}} + "sourceConfig": {{"config": {{"type":"Profiler"}}}} }}, "processor": {{"type": "orm-profiler", "config": {{}}}}, "sink": {{"type": "metadata-rest", "config": {{}}}}, diff --git a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py index 21d08bd95cbb..e9ec11ce550d 100644 --- a/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py +++ b/ingestion/tests/integration/orm_profiler/test_orm_profiler_e2e.py @@ -19,8 +19,9 @@ import 
logging from copy import deepcopy from datetime import datetime, timedelta -from unittest import TestCase +from uuid import uuid4 +import pytest from sqlalchemy import Column, DateTime, Integer, String, create_engine from sqlalchemy.orm import declarative_base @@ -42,6 +43,7 @@ get_beginning_of_day_timestamp_mill, get_end_of_day_timestamp_mill, ) +from metadata.workflow.classification import AutoClassificationWorkflow from metadata.workflow.metadata import MetadataWorkflow from metadata.workflow.profiler import ProfilerWorkflow from metadata.workflow.workflow_output_handler import WorkflowResultStatus @@ -100,16 +102,20 @@ class NewUser(Base): signedup = Column(DateTime) -class ProfilerWorkflowTest(TestCase): - """ - Run the end to end workflow and validate - """ - - engine = create_engine( +@pytest.fixture(scope="session") +def engine(): + return create_engine( f"sqlite+pysqlite:///{sqlite_shared}", ) - session = create_and_bind_session(engine) + +@pytest.fixture(scope="session") +def session(engine): + return create_and_bind_session(engine) + + +@pytest.fixture(scope="session") +def metadata(): server_config = OpenMetadataConnection( hostPort="http://localhost:8585/api", authProvider="openmetadata", @@ -117,656 +123,645 @@ class ProfilerWorkflowTest(TestCase): jwtToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" ), ) - metadata = OpenMetadata(server_config) - - @classmethod - def setUpClass(cls) -> None: - """ - Prepare Ingredients - """ - try: - User.__table__.create(bind=cls.engine) - NewUser.__table__.create(bind=cls.engine) - except: - logger.warning("Table Already exists") - - data = [ - User( - name="John", - fullname="John Doe", - nickname="johnny b goode", - age=30, - signedup=datetime.now() - timedelta(days=10), - ), - User( - name="Jane", - fullname="Jone Doe", - nickname=None, - age=31, - signedup=datetime.now() - timedelta(days=2), - ), - User( - name="Joh", - fullname="Joh Doe", - nickname=None, - age=37, - signedup=datetime.now() - timedelta(days=1), - ), - User( - name="Jae", - fullname="Jae Doe", - nickname=None, - age=38, - signedup=datetime.now() - timedelta(days=1), - ), - ] - cls.session.add_all(data) - cls.session.commit() - - new_user = [ - NewUser( - name="John", - fullname="John Doe", - nickname="johnny b goode", - age=30, - signedup=datetime.now() - timedelta(days=10), - ), - NewUser( - name="Jane", - fullname="Jone Doe", - nickname=None, - age=31, - signedup=datetime.now() - timedelta(days=2), - ), - ] - cls.session.add_all(new_user) - cls.session.commit() - - ingestion_workflow = MetadataWorkflow.create(ingestion_config) - ingestion_workflow.execute() - ingestion_workflow.raise_from_status() - ingestion_workflow.print_status() - ingestion_workflow.stop() - - @classmethod - def tearDownClass(cls) -> None: - """ - Clean up - """ - - service_id = str( - cls.metadata.get_by_name(entity=DatabaseService, fqn="test_sqlite").id.root - ) - - cls.metadata.delete( - entity=DatabaseService, - entity_id=service_id, - recursive=True, - 
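        # recursive=True removes the service's children (databases, schemas,
        # tables); hard_delete below skips the soft-delete state entirely.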
hard_delete=True, - ) - - User.__table__.drop(bind=cls.engine) - NewUser.__table__.drop(bind=cls.engine) - cls.session.close() - - def test_ingestion(self): - """ - Validate that the ingestion ran correctly - """ - - table_entity: Table = self.metadata.get_by_name( - entity=Table, fqn="test_sqlite.main.main.users" - ) - assert table_entity.fullyQualifiedName.root == "test_sqlite.main.main.users" - - def test_profiler_workflow(self): - """ - Prepare and execute the profiler workflow - on top of the Users table - """ - workflow_config = deepcopy(ingestion_config) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - "tableFilterPattern": {"includes": ["users"]}, - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "my_profiler", - "timeout_seconds": 60, - "metrics": ["row_count", "min", "max", "COUNT", "null_count"], - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - "profileSample": 75, - } - ], - }, - } + return OpenMetadata(server_config) - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert not table.tableProfilerConfig - assert profile.profileSample == 75.0 - assert profile.profileSampleType == ProfileSampleType.PERCENTAGE - - workflow_config["processor"]["config"]["tableConfig"][0][ - "profileSampleType" - ] = ProfileSampleType.ROWS - workflow_config["processor"]["config"]["tableConfig"][0]["profileSample"] = 3 - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert not table.tableProfilerConfig - assert profile.profileSample == 3.0 - assert profile.rowCount == 4.0 - assert profile.profileSampleType == ProfileSampleType.ROWS - - def test_workflow_sample_profile(self): - """Test the workflow sample profile gets propagated down to the table profileSample""" - workflow_config = deepcopy(ingestion_config) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - "profileSample": 50, - "tableFilterPattern": {"includes": ["new/users"]}, - } - ) - workflow_config["processor"] = {"type": "orm-profiler", "config": {}} - - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - profiler_workflow.print_status() - profiler_workflow.stop() - - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.new/users", - fields=["tableProfilerConfig"], - ) - # setting sampleProfile from config has been temporarly removed - # up until we split tests and profiling - assert table.tableProfilerConfig is None - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert profile is not None - - def test_workflow_datetime_partition(self): - """test workflow with partition""" - 
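# This patch swaps the unittest.TestCase setUpClass/tearDownClass machinery for
# pytest fixtures. A minimal, self-contained sketch of the yield-fixture
# pattern it adopts (fixture names here are illustrative, not the real ones):
import pytest


@pytest.fixture(scope="session")
def shared_resource():
    # Built once for the whole test session, like the engine/session fixtures.
    return {"ready": True}


@pytest.fixture
def seeded(shared_resource):
    # Setup runs before the test body.
    yield shared_resource
    # Teardown runs after the test: drop tables, delete the service, close sessions.


def test_pattern(seeded):
    assert seeded["ready"]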
workflow_config = deepcopy(ingestion_config) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - "tableFilterPattern": {"includes": ["users"]}, - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "my_profiler", - "timeout_seconds": 60, - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - "profileSample": 100, - "partitionConfig": { - "enablePartitioning": "true", - "partitionColumnName": "signedup", - "partitionIntervalType": "TIME-UNIT", - "partitionIntervalUnit": "DAY", - "partitionInterval": 2, - }, - } - ], - }, - } - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - profiler_workflow.print_status() - profiler_workflow.stop() +@pytest.fixture +def service_name(): + return str(uuid4()) - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile +@pytest.fixture +def create_data(engine, session): + try: + User.__table__.create(bind=engine) + NewUser.__table__.create(bind=engine) + except: + logger.warning("Table Already exists") - assert profile.rowCount == 4.0 + data = [ + User( + name="John", + fullname="John Doe", + nickname="johnny b goode", + age=30, + signedup=datetime.now() - timedelta(days=10), + ), + User( + name="Jane", + fullname="Jone Doe", + nickname=None, + age=31, + signedup=datetime.now() - timedelta(days=2), + ), + User( + name="Joh", + fullname="Joh Doe", + nickname=None, + age=37, + signedup=datetime.now() - timedelta(days=1), + ), + User( + name="Jae", + fullname="Jae Doe", + nickname=None, + age=38, + signedup=datetime.now() - timedelta(days=1), + ), + ] + session.add_all(data) + session.commit() + + new_user = [ + NewUser( + name="John", + fullname="John Doe", + nickname="johnny b goode", + age=30, + signedup=datetime.now() - timedelta(days=10), + ), + NewUser( + name="Jane", + fullname="Jone Doe", + nickname=None, + age=31, + signedup=datetime.now() - timedelta(days=2), + ), + ] + session.add_all(new_user) + session.commit() - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "my_profiler", - "timeout_seconds": 60, - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - "partitionConfig": { - "enablePartitioning": "true", - "partitionColumnName": "signedup", - "partitionIntervalType": "TIME-UNIT", - "partitionIntervalUnit": "DAY", - "partitionInterval": 2, - }, - } - ], + +@pytest.fixture +def ingest(service_name, create_data, metadata, engine, session): + ingestion_config["source"]["serviceName"] = service_name + + ingestion_workflow = MetadataWorkflow.create(ingestion_config) + ingestion_workflow.execute() + ingestion_workflow.raise_from_status() + ingestion_workflow.print_status() + ingestion_workflow.stop() + + yield + + service_id = str( + metadata.get_by_name(entity=DatabaseService, fqn=service_name).id.root + ) + + metadata.delete( + entity=DatabaseService, + entity_id=service_id, + recursive=True, + hard_delete=True, + ) + + User.__table__.drop(bind=engine) + NewUser.__table__.drop(bind=engine) + session.close() + + +def test_ingestion(ingest, metadata, service_name): + """ + Validate that the ingestion ran correctly + """ + + table_entity: Table = metadata.get_by_name( + entity=Table, fqn=f"{service_name}.main.main.users" + ) + assert 
table_entity.fullyQualifiedName.root == f"{service_name}.main.main.users" + + +def test_profiler_workflow(ingest, metadata, service_name): + """ + Prepare and execute the profiler workflow + on top of the Users table + """ + workflow_config = deepcopy(ingestion_config) + workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + "tableFilterPattern": {"includes": ["users"]}, + } + ) + workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "my_profiler", + "timeout_seconds": 60, + "metrics": ["row_count", "min", "max", "COUNT", "null_count"], }, + "tableConfig": [ + { + "fullyQualifiedName": f"{service_name}.main.main.users", + "profileSample": 75, + } + ], + }, + } + + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert not table.tableProfilerConfig + assert profile.profileSample == 75.0 + assert profile.profileSampleType == ProfileSampleType.PERCENTAGE + + workflow_config["processor"]["config"]["tableConfig"][0][ + "profileSampleType" + ] = ProfileSampleType.ROWS + workflow_config["processor"]["config"]["tableConfig"][0]["profileSample"] = 3 + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert not table.tableProfilerConfig + assert profile.profileSample == 3.0 + assert profile.rowCount == 4.0 + assert profile.profileSampleType == ProfileSampleType.ROWS + + +def test_workflow_sample_profile(ingest, metadata, service_name): + """Test the workflow sample profile gets propagated down to the table profileSample""" + workflow_config = deepcopy(ingestion_config) + workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + "profileSample": 50, + "tableFilterPattern": {"includes": ["new/users"]}, } + ) + workflow_config["processor"] = {"type": "orm-profiler", "config": {}} - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - profiler_workflow.print_status() - profiler_workflow.stop() - - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert profile.rowCount == 4.0 - - def test_workflow_integer_range_partition(self): - """test workflow with partition""" - workflow_config = deepcopy(ingestion_config) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - "tableFilterPattern": {"includes": ["users"]}, - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "my_profiler", - "timeout_seconds": 60, - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - "partitionConfig": { - "enablePartitioning": "true", - 
"partitionColumnName": "age", - "partitionIntervalType": "INTEGER-RANGE", - "partitionIntegerRangeStart": 37, - "partitionIntegerRangeEnd": 38, - }, - } - ], - }, + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + profiler_workflow.print_status() + profiler_workflow.stop() + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.new/users", + fields=["tableProfilerConfig"], + ) + # setting sampleProfile from config has been temporarly removed + # up until we split tests and profiling + assert table.tableProfilerConfig is None + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert profile is not None + + +def test_workflow_datetime_partition(ingest, metadata, service_name): + """test workflow with partition""" + workflow_config = deepcopy(ingestion_config) + workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + "tableFilterPattern": {"includes": ["users"]}, } + ) + workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "my_profiler", + "timeout_seconds": 60, + }, + "tableConfig": [ + { + "fullyQualifiedName": f"{service_name}.main.main.users", + "profileSample": 100, + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "signedup", + "partitionIntervalType": "TIME-UNIT", + "partitionIntervalUnit": "DAY", + "partitionInterval": 2, + }, + } + ], + }, + } - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - profiler_workflow.print_status() - profiler_workflow.stop() + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + profiler_workflow.print_status() + profiler_workflow.stop() - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile - assert profile.rowCount == 4.0 + assert profile.rowCount == 4.0 - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "my_profiler", - "timeout_seconds": 60, - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - "profileSample": 100, - "partitionConfig": { - "enablePartitioning": "true", - "partitionColumnName": "age", - "partitionIntervalType": "INTEGER-RANGE", - "partitionIntegerRangeStart": 37, - "partitionIntegerRangeEnd": 38, - }, - } - ], + workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "my_profiler", + "timeout_seconds": 60, }, + "tableConfig": [ + { + "fullyQualifiedName": f"{service_name}.main.main.users", + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "signedup", + "partitionIntervalType": "TIME-UNIT", + "partitionIntervalUnit": "DAY", + "partitionInterval": 2, + }, + } + ], + }, + } + + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + profiler_workflow.print_status() + profiler_workflow.stop() + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) + + profile = 
metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert profile.rowCount == 4.0 + + +def test_workflow_integer_range_partition(ingest, metadata, service_name): + """test workflow with partition""" + workflow_config = deepcopy(ingestion_config) + workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + "tableFilterPattern": {"includes": ["users"]}, } + ) + workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "my_profiler", + "timeout_seconds": 60, + }, + "tableConfig": [ + { + "fullyQualifiedName": f"{service_name}.main.main.users", + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "age", + "partitionIntervalType": "INTEGER-RANGE", + "partitionIntegerRangeStart": 37, + "partitionIntegerRangeEnd": 38, + }, + } + ], + }, + } - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - profiler_workflow.print_status() - profiler_workflow.stop() - - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert profile.rowCount == 4.0 - - def test_workflow_values_partition(self): - """test workflow with partition""" - workflow_config = deepcopy(ingestion_config) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - "tableFilterPattern": {"includes": ["users"]}, - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "my_profiler", - "timeout_seconds": 60, - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - "partitionConfig": { - "enablePartitioning": "true", - "partitionColumnName": "name", - "partitionIntervalType": "COLUMN-VALUE", - "partitionValues": ["John"], - }, - } - ], + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + profiler_workflow.print_status() + profiler_workflow.stop() + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert profile.rowCount == 4.0 + + workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "my_profiler", + "timeout_seconds": 60, }, + "tableConfig": [ + { + "fullyQualifiedName": f"{service_name}.main.main.users", + "profileSample": 100, + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "age", + "partitionIntervalType": "INTEGER-RANGE", + "partitionIntegerRangeStart": 37, + "partitionIntegerRangeEnd": 38, + }, + } + ], + }, + } + + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + profiler_workflow.print_status() + profiler_workflow.stop() + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert profile.rowCount == 4.0 + + +def test_workflow_values_partition(ingest, metadata, service_name): + """test workflow with partition""" + workflow_config = deepcopy(ingestion_config) + workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + "tableFilterPattern": {"includes": ["users"]}, } + ) + 
workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "my_profiler", + "timeout_seconds": 60, + }, + "tableConfig": [ + { + "fullyQualifiedName": f"{service_name}.main.main.users", + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "name", + "partitionIntervalType": "COLUMN-VALUE", + "partitionValues": ["John"], + }, + } + ], + }, + } - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - profiler_workflow.print_status() - profiler_workflow.stop() + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + profiler_workflow.print_status() + profiler_workflow.stop() - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile - assert profile.rowCount == 4.0 - assert profile.profileSample is None + assert profile.rowCount == 4.0 + # If we don't have any sample, default to 100 + assert profile.profileSample == 100.0 - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "my_profiler", - "timeout_seconds": 60, - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - "profileSample": 100, - "partitionConfig": { - "enablePartitioning": "true", - "partitionColumnName": "name", - "partitionIntervalType": "COLUMN-VALUE", - "partitionValues": ["John"], - }, - } - ], + workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "my_profiler", + "timeout_seconds": 60, }, - } + "tableConfig": [ + { + "fullyQualifiedName": f"{service_name}.main.main.users", + "profileSample": 100, + "partitionConfig": { + "enablePartitioning": "true", + "partitionColumnName": "name", + "partitionIntervalType": "COLUMN-VALUE", + "partitionValues": ["John"], + }, + } + ], + }, + } - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - profiler_workflow.print_status() - profiler_workflow.stop() - - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) - - profile = self.metadata.get_latest_table_profile( - table.fullyQualifiedName - ).profile - - assert profile.rowCount == 4.0 - - def test_datalake_profiler_workflow_with_custom_profiler_config(self): - """Test custom profiler config return expected sample and metric computation""" - profiler_metrics = [ - "MIN", - "MAX", - "MEAN", - "MEDIAN", - ] - id_metrics = ["MIN", "MAX"] - non_metric_values = ["name", "timestamp"] - - workflow_config = deepcopy(ingestion_config) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "ingestion_profiler", - "metrics": profiler_metrics, - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - "columnConfig": { - "includeColumns": [ - {"columnName": "id", "metrics": id_metrics}, - {"columnName": "age"}, - ] - }, - } - ], + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + 
profiler_workflow.print_status() + profiler_workflow.stop() + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) + + profile = metadata.get_latest_table_profile(table.fullyQualifiedName).profile + + assert profile.rowCount == 4.0 + + +def test_profiler_workflow_with_custom_profiler_config(ingest, metadata, service_name): + """Test custom profiler config return expected sample and metric computation""" + profiler_metrics = [ + "MIN", + "MAX", + "MEAN", + "MEDIAN", + ] + id_metrics = ["MIN", "MAX"] + non_metric_values = ["name", "timestamp"] + + workflow_config = deepcopy(ingestion_config) + workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "Profiler", + } + ) + workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "profiler": { + "name": "ingestion_profiler", + "metrics": profiler_metrics, }, + "tableConfig": [ + { + "fullyQualifiedName": f"{service_name}.main.main.users", + "columnConfig": { + "includeColumns": [ + {"columnName": "id", "metrics": id_metrics}, + {"columnName": "age"}, + ] + }, + } + ], + }, + } + + profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + fields=["tableProfilerConfig"], + ) + + id_profile = metadata.get_profile_data( + f"{service_name}.main.main.users.id", + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + profile_type=ColumnProfile, + ).entities + + latest_id_profile = max(id_profile, key=lambda o: o.timestamp.root) + + id_metric_ln = 0 + for metric_name, metric in latest_id_profile: + if metric_name.upper() in id_metrics: + assert metric is not None + id_metric_ln += 1 + else: + assert metric is None if metric_name not in non_metric_values else True + + assert id_metric_ln == len(id_metrics) + + age_profile = metadata.get_profile_data( + f"{service_name}.main.main.users.age", + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + profile_type=ColumnProfile, + ).entities + + latest_age_profile = max(age_profile, key=lambda o: o.timestamp.root) + + age_metric_ln = 0 + for metric_name, metric in latest_age_profile: + if metric_name.upper() in profiler_metrics: + assert metric is not None + age_metric_ln += 1 + else: + assert metric is None if metric_name not in non_metric_values else True + + assert age_metric_ln == len(profiler_metrics) + + latest_exc_timestamp = latest_age_profile.timestamp.root + fullname_profile = metadata.get_profile_data( + f"{service_name}.main.main.users.fullname", + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + profile_type=ColumnProfile, + ).entities + + assert not [p for p in fullname_profile if p.timestamp.root == latest_exc_timestamp] + + workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "AutoClassification", } + ) - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - fields=["tableProfilerConfig"], - ) - - id_profile = self.metadata.get_profile_data( - "test_sqlite.main.main.users.id", - 
get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - profile_type=ColumnProfile, - ).entities - - latest_id_profile = max(id_profile, key=lambda o: o.timestamp.root) - - id_metric_ln = 0 - for metric_name, metric in latest_id_profile: - if metric_name.upper() in id_metrics: - assert metric is not None - id_metric_ln += 1 - else: - assert metric is None if metric_name not in non_metric_values else True - - assert id_metric_ln == len(id_metrics) - - age_profile = self.metadata.get_profile_data( - "test_sqlite.main.main.users.age", - get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - profile_type=ColumnProfile, - ).entities - - latest_age_profile = max(age_profile, key=lambda o: o.timestamp.root) - - age_metric_ln = 0 - for metric_name, metric in latest_age_profile: - if metric_name.upper() in profiler_metrics: - assert metric is not None - age_metric_ln += 1 - else: - assert metric is None if metric_name not in non_metric_values else True - - assert age_metric_ln == len(profiler_metrics) - - latest_exc_timestamp = latest_age_profile.timestamp.root - fullname_profile = self.metadata.get_profile_data( - "test_sqlite.main.main.users.fullname", - get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - profile_type=ColumnProfile, - ).entities - - assert not [ - p for p in fullname_profile if p.timestamp.root == latest_exc_timestamp - ] - - sample_data = self.metadata.get_sample_data(table) - assert sorted([c.root for c in sample_data.sampleData.columns]) == sorted( - ["id", "age"] - ) - - def test_sample_data_ingestion(self): - """test the rows of the sample data are what we expect""" - workflow_config = deepcopy(ingestion_config) - workflow_config["source"]["sourceConfig"]["config"].update( - { - "type": "Profiler", - "tableFilterPattern": {"includes": ["users"]}, - } - ) - workflow_config["processor"] = { - "type": "orm-profiler", - "config": { - "profiler": { - "name": "my_profiler", - "timeout_seconds": 60, - "metrics": ["row_count", "min", "max", "COUNT", "null_count"], - }, - "tableConfig": [ - { - "fullyQualifiedName": "test_sqlite.main.main.users", - } - ], - }, + profiler_workflow = AutoClassificationWorkflow.create(workflow_config) + profiler_workflow.execute() + profiler_workflow.stop() + + sample_data = metadata.get_sample_data(table) + assert sorted([c.root for c in sample_data.sampleData.columns]) == sorted( + ["id", "age"] + ) + + +def test_sample_data_ingestion(ingest, metadata, service_name): + """test the rows of the sample data are what we expect""" + workflow_config = deepcopy(ingestion_config) + workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "AutoClassification", + "tableFilterPattern": {"includes": ["users"]}, } + ) + workflow_config["processor"] = { + "type": "orm-profiler", + "config": {}, + } - profiler_workflow = ProfilerWorkflow.create(workflow_config) - profiler_workflow.execute() - status = profiler_workflow.result_status() - profiler_workflow.stop() - - assert status == WorkflowResultStatus.SUCCESS - - table = self.metadata.get_by_name( - entity=Table, - fqn="test_sqlite.main.main.users", - ) - - # Test we are getting the expected sample data - expected_sample_data = [ - [ - 1, - "John", - "John Doe", - "johnny b goode", - 30, - ], - [ - 2, - "Jane", - "Jone Doe", - None, - 31, - ], - [ - 3, - "Joh", - "Joh Doe", - None, - 37, - ], - [ - 4, - "Jae", - "Jae Doe", - None, - 38, - ], - ] - sample_data = self.metadata.get_sample_data(table).sampleData.rows - sample_data = 
[data[:-1] for data in sample_data] # remove timestamp as dynamic - self.assertListEqual( - sorted(sample_data), - sorted(expected_sample_data), - ) + profiler_workflow = AutoClassificationWorkflow.create(workflow_config) + profiler_workflow.execute() + status = profiler_workflow.result_status() + profiler_workflow.stop() + + assert status == WorkflowResultStatus.SUCCESS + + table = metadata.get_by_name( + entity=Table, + fqn=f"{service_name}.main.main.users", + ) + + # Test we are getting the expected sample data + expected_sample_data = [ + [ + 1, + "John", + "John Doe", + "johnny b goode", + 30, + ], + [ + 2, + "Jane", + "Jone Doe", + None, + 31, + ], + [ + 3, + "Joh", + "Joh Doe", + None, + 37, + ], + [ + 4, + "Jae", + "Jae Doe", + None, + 38, + ], + ] + sample_data = metadata.get_sample_data(table).sampleData.rows + sample_data = [data[:-1] for data in sample_data] # remove timestamp as dynamic + assert sorted(sample_data) == sorted(expected_sample_data) diff --git a/ingestion/tests/integration/profiler/test_dynamodb.py b/ingestion/tests/integration/profiler/test_dynamodb.py index d6c8e6df2ea3..8ba8f0bfb8e9 100644 --- a/ingestion/tests/integration/profiler/test_dynamodb.py +++ b/ingestion/tests/integration/profiler/test_dynamodb.py @@ -2,8 +2,8 @@ from metadata.generated.schema.entity.data.table import Table from metadata.generated.schema.entity.services.databaseService import DatabaseService -from metadata.generated.schema.metadataIngestion.databaseServiceProfilerPipeline import ( - ProfilerConfigType, +from metadata.generated.schema.metadataIngestion.databaseServiceAutoClassificationPipeline import ( + AutoClassificationConfigType, ) from metadata.generated.schema.metadataIngestion.workflow import ( LogLevels, @@ -14,8 +14,8 @@ WorkflowConfig, ) from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.workflow.classification import AutoClassificationWorkflow from metadata.workflow.metadata import MetadataWorkflow -from metadata.workflow.profiler import ProfilerWorkflow @pytest.fixture(autouse=True, scope="module") @@ -59,7 +59,7 @@ def test_sample_data(db_service, db_fqn, metadata): "serviceName": db_service.fullyQualifiedName.root, "sourceConfig": { "config": { - "type": ProfilerConfigType.Profiler.value, + "type": AutoClassificationConfigType.AutoClassification.value, }, }, }, @@ -76,7 +76,7 @@ def test_sample_data(db_service, db_fqn, metadata): "openMetadataServerConfig": metadata.config.model_dump(), }, } - profiler_workflow = ProfilerWorkflow.create(workflow_config) + profiler_workflow = AutoClassificationWorkflow.create(workflow_config) profiler_workflow.execute() profiler_workflow.raise_from_status() table = metadata.list_entities( diff --git a/ingestion/tests/integration/profiler/test_nosql_profiler.py b/ingestion/tests/integration/profiler/test_nosql_profiler.py index 08cea47bc782..a016e3a284b1 100644 --- a/ingestion/tests/integration/profiler/test_nosql_profiler.py +++ b/ingestion/tests/integration/profiler/test_nosql_profiler.py @@ -43,6 +43,7 @@ from metadata.utils.helpers import datetime_to_ts from metadata.utils.test_utils import accumulate_errors from metadata.utils.time_utils import get_end_of_day_timestamp_mill +from metadata.workflow.classification import AutoClassificationWorkflow from metadata.workflow.metadata import MetadataWorkflow from metadata.workflow.profiler import ProfilerWorkflow from metadata.workflow.workflow_output_handler import WorkflowResultStatus @@ -178,6 +179,13 @@ def run_profiler_workflow(self, config): 
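# A condensed sketch of the create -> execute -> assert pattern that the
# auto-classification helper added in this hunk follows, generalized over the
# workflow class (the name run_and_check is illustrative):
from metadata.workflow.workflow_output_handler import WorkflowResultStatus


def run_and_check(workflow_cls, config):
    workflow = workflow_cls.create(config)
    workflow.execute()
    status = workflow.result_status()
    workflow.stop()
    assert status == WorkflowResultStatus.SUCCESS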
profiler_workflow.stop() assert status == WorkflowResultStatus.SUCCESS + def run_auto_classification_workflow(self, config): + auto_classification_workflow = AutoClassificationWorkflow.create(config) + auto_classification_workflow.execute() + status = auto_classification_workflow.result_status() + auto_classification_workflow.stop() + assert status == WorkflowResultStatus.SUCCESS + def test_simple(self): workflow_config = deepcopy(self.ingestion_config) workflow_config["source"]["sourceConfig"]["config"].update( @@ -240,6 +248,18 @@ def test_simple(self): assert c1.max == c2.max assert c1.min == c2.min + auto_workflow_config = deepcopy(self.ingestion_config) + auto_workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "AutoClassification", + } + ) + auto_workflow_config["processor"] = { + "type": "orm-profiler", + "config": {}, + } + self.run_auto_classification_workflow(auto_workflow_config) + table = self.metadata.get_by_name( Table, f"{SERVICE_NAME}.default.{TEST_DATABASE}.{TEST_COLLECTION}" ) @@ -320,6 +340,26 @@ def test_custom_query(self): assert (len(column_profile.entities) > 0) == ( len(tc["expected"]["columns"]) > 0 ) + + auto_workflow_config = deepcopy(self.ingestion_config) + auto_workflow_config["source"]["sourceConfig"]["config"].update( + { + "type": "AutoClassification", + } + ) + auto_workflow_config["processor"] = { + "type": "orm-profiler", + "config": { + "tableConfig": [ + { + "fullyQualifiedName": f"{SERVICE_NAME}.default.{TEST_DATABASE}.{TEST_COLLECTION}", + "profileQuery": '{"age": %s}' % query_age, + } + ], + }, + } + self.run_auto_classification_workflow(auto_workflow_config) + table = self.metadata.get_by_name( Table, f"{SERVICE_NAME}.default.{TEST_DATABASE}.{TEST_COLLECTION}" ) diff --git a/ingestion/tests/integration/test_suite/test_e2e_workflow.py b/ingestion/tests/integration/test_suite/test_e2e_workflow.py index 0ca06822f794..eb09b7235ddc 100644 --- a/ingestion/tests/integration/test_suite/test_e2e_workflow.py +++ b/ingestion/tests/integration/test_suite/test_e2e_workflow.py @@ -49,7 +49,7 @@ test_suite_config = { "source": { - "type": "custom-database", + "type": "sqlite", "serviceName": "test_suite_service_test", "sourceConfig": { "config": { From 9c0f78b3204b4f0b0b557df1b10c584cd944d47d Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Mon, 18 Nov 2024 13:53:30 +0100 Subject: [PATCH 25/29] tests --- .../src/metadata/sampler/pandas/sampler.py | 2 +- .../src/metadata/sampler/sampler_interface.py | 8 ++++--- ingestion/tests/integration/conftest.py | 1 - .../data_quality/test_data_diff.py | 2 +- .../data_quality/test_data_quality.py | 2 +- .../tests/integration/datalake/conftest.py | 24 +++++++++++++++++++ .../datalake/test_datalake_profiler_e2e.py | 14 +++++++++++ .../integration/datalake/test_ingestion.py | 2 +- .../integration/mysql/test_data_quality.py | 2 +- .../integration/postgres/test_data_quality.py | 4 ++-- 10 files changed, 50 insertions(+), 11 deletions(-) diff --git a/ingestion/src/metadata/sampler/pandas/sampler.py b/ingestion/src/metadata/sampler/pandas/sampler.py index b14633e6eea5..673f537b1c37 100644 --- a/ingestion/src/metadata/sampler/pandas/sampler.py +++ b/ingestion/src/metadata/sampler/pandas/sampler.py @@ -172,7 +172,7 @@ def random_sample(self, is_sampled: bool = False, **__): return self._rdn_sample_from_user_query() if self.partition_details: - self.table = self._partitioned_table() + self._table = self._partitioned_table() if not self.sample_config.profile_sample or is_sampled: return self.table diff --git 
diff --git a/ingestion/src/metadata/sampler/sampler_interface.py b/ingestion/src/metadata/sampler/sampler_interface.py
index 5b7334f0ab45..8d52ade31714 100644
--- a/ingestion/src/metadata/sampler/sampler_interface.py
+++ b/ingestion/src/metadata/sampler/sampler_interface.py
@@ -120,9 +120,11 @@ def create(
             default_sample_config=default_sample_config,
         )
         sample_query = get_sample_query(entity=entity, entity_config=table_config)
-        partition_details = get_partition_details(entity=entity)
-        include_columns = get_include_columns(entity, table_config)
-        exclude_columns = get_exclude_columns(entity, table_config)
+        partition_details = get_partition_details(
+            entity=entity, entity_config=table_config
+        )
+        include_columns = get_include_columns(entity, entity_config=table_config)
+        exclude_columns = get_exclude_columns(entity, entity_config=table_config)
 
         return cls(
             service_connection_config=service_connection_config,
diff --git a/ingestion/tests/integration/conftest.py b/ingestion/tests/integration/conftest.py
index 2dbecb98984d..e1b3a1c12dc2 100644
--- a/ingestion/tests/integration/conftest.py
+++ b/ingestion/tests/integration/conftest.py
@@ -72,7 +72,6 @@ def profiler_config(db_service, workflow_config, sink_config):
         "sourceConfig": {
             "config": {
                 "type": "Profiler",
-                "generateSampleData": True,
                 "timeoutSeconds": 600,
                 "threadCount": 1,  # easier for debugging
             }
diff --git a/ingestion/tests/integration/data_quality/test_data_diff.py b/ingestion/tests/integration/data_quality/test_data_diff.py
index 890e279000df..3a786ce22c58 100644
--- a/ingestion/tests/integration/data_quality/test_data_diff.py
+++ b/ingestion/tests/integration/data_quality/test_data_diff.py
@@ -392,7 +392,7 @@ def test_happy_paths(
     )
     workflow_config = {
         "source": {
-            "type": TestSuiteConfigType.TestSuite.value,
+            "type": "postgres",
             "serviceName": "MyTestSuite",
             "sourceConfig": {
                 "config": {
diff --git a/ingestion/tests/integration/data_quality/test_data_quality.py b/ingestion/tests/integration/data_quality/test_data_quality.py
index cdc608cca249..ea1f07bd391b 100644
--- a/ingestion/tests/integration/data_quality/test_data_quality.py
+++ b/ingestion/tests/integration/data_quality/test_data_quality.py
@@ -38,7 +38,7 @@ def test_empty_test_suite(
     )
     workflow_config = {
         "source": {
-            "type": TestSuiteConfigType.TestSuite.value,
+            "type": "postgres",
            "serviceName": "MyTestSuite",
             "sourceConfig": {
                 "config": {
diff --git a/ingestion/tests/integration/datalake/conftest.py b/ingestion/tests/integration/datalake/conftest.py
index 337bea1081af..f6415b28ea18 100644
--- a/ingestion/tests/integration/datalake/conftest.py
+++ b/ingestion/tests/integration/datalake/conftest.py
@@ -17,6 +17,7 @@
 import pytest
 
 from metadata.generated.schema.entity.services.databaseService import DatabaseService
+from metadata.workflow.classification import AutoClassificationWorkflow
 from metadata.workflow.data_quality import TestSuiteWorkflow
 from metadata.workflow.metadata import MetadataWorkflow
 from metadata.workflow.profiler import ProfilerWorkflow
@@ -211,7 +212,30 @@ def profiler_workflow_config(ingestion_config, workflow_config):
     return ingestion_config
 
 
+@pytest.fixture(scope="class")
+def auto_classification_workflow_config(ingestion_config, workflow_config):
+    ingestion_config["source"]["sourceConfig"]["config"].update(
+        {
+            "type": "AutoClassification",
+        }
+    )
+    ingestion_config["processor"] = {
+        "type": "orm-profiler",
+        "config": {},
+    }
+    ingestion_config["workflowConfig"] = workflow_config
+    return ingestion_config
+
+
 @pytest.fixture()
 def run_profiler(run_ingestion, run_workflow, profiler_workflow_config):
     """Test profiler ingestion"""
     run_workflow(ProfilerWorkflow, profiler_workflow_config)
+
+
+@pytest.fixture()
+def run_auto_classification(
+    run_ingestion, run_workflow, auto_classification_workflow_config
+):
+    """Run the auto classification workflow"""
+    run_workflow(AutoClassificationWorkflow, auto_classification_workflow_config)
diff --git a/ingestion/tests/integration/datalake/test_datalake_profiler_e2e.py b/ingestion/tests/integration/datalake/test_datalake_profiler_e2e.py
index 1a8244348a43..b8a12ab7e2c1 100644
--- a/ingestion/tests/integration/datalake/test_datalake_profiler_e2e.py
+++ b/ingestion/tests/integration/datalake/test_datalake_profiler_e2e.py
@@ -24,6 +24,7 @@
     get_beginning_of_day_timestamp_mill,
     get_end_of_day_timestamp_mill,
 )
+from metadata.workflow.classification import AutoClassificationWorkflow
 from metadata.workflow.profiler import ProfilerWorkflow
 from metadata.workflow.workflow_output_handler import WorkflowResultStatus
 
@@ -305,6 +306,19 @@ def test_datalake_profiler_workflow_with_custom_profiler_config(
         p for p in first_name_profile if p.timestamp.root == latest_exc_timestamp
     ]
 
+    ingestion_config["source"]["sourceConfig"]["config"].update(
+        {
+            "type": "AutoClassification",
+        }
+    )
+
+    auto_workflow = AutoClassificationWorkflow.create(ingestion_config)
+    auto_workflow.execute()
+    status = auto_workflow.result_status()
+    auto_workflow.stop()
+
+    assert status == WorkflowResultStatus.SUCCESS
+
     sample_data = metadata.get_sample_data(table)
     assert sorted([c.root for c in sample_data.sampleData.columns]) == sorted(
         ["id", "age"]
diff --git a/ingestion/tests/integration/datalake/test_ingestion.py b/ingestion/tests/integration/datalake/test_ingestion.py
index 1bfff44f1c7c..e8b2dd38247a 100644
--- a/ingestion/tests/integration/datalake/test_ingestion.py
+++ b/ingestion/tests/integration/datalake/test_ingestion.py
@@ -54,7 +54,7 @@ def test_ingestion(self, run_ingestion):
             if column.dataType == DataType.JSON:
                 assert column.children
 
-    def test_profiler(self, run_profiler):
+    def test_auto_classification(self, run_auto_classification):
         """Also excluding the test for parquet files until the above is fixed"""
         csv_ = self.metadata.get_by_name(
             entity=Table,
diff --git a/ingestion/tests/integration/mysql/test_data_quality.py b/ingestion/tests/integration/mysql/test_data_quality.py
index 6d2e3c88da95..108ba2669d4b 100644
--- a/ingestion/tests/integration/mysql/test_data_quality.py
+++ b/ingestion/tests/integration/mysql/test_data_quality.py
@@ -27,7 +27,7 @@ def get_test_suite_config(workflow_config, sink_config):
     def inner(entity_fqn: str, test_case_definitions: List[TestCaseDefinition]):
         return {
             "source": {
-                "type": TestSuiteConfigType.TestSuite.value,
+                "type": "mysql",
                 "serviceName": "MyTestSuite",
                 "sourceConfig": {
                     "config": TestSuitePipeline(
diff --git a/ingestion/tests/integration/postgres/test_data_quality.py b/ingestion/tests/integration/postgres/test_data_quality.py
index 737d049a4165..9b3b325af913 100644
--- a/ingestion/tests/integration/postgres/test_data_quality.py
+++ b/ingestion/tests/integration/postgres/test_data_quality.py
@@ -49,7 +49,7 @@ def run_data_quality_workflow(
     run_workflow(MetadataWorkflow, ingestion_config)
     test_suite_config = OpenMetadataWorkflowConfig(
         source=Source(
-            type=TestSuiteConfigType.TestSuite.value,
+            type="postgres",
             serviceName="MyTestSuite",
             sourceConfig=SourceConfig(
                 config=TestSuitePipeline(
@@ -323,7 +323,7 @@ def get_incompatible_column_type_config(workflow_config, sink_config):
     def inner(entity_fqn: str, incompatible_test_case: TestCaseDefinition):
         return {
             "source": {
-                "type": "TestSuite",
+                "type": "postgres",
                 "serviceName": "MyTestSuite",
                 "sourceConfig": {
                     "config": {

From d5a5bf46ea855f0a60a85c5253d85fb437f29246 Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull
Date: Mon, 18 Nov 2024 16:53:02 +0100
Subject: [PATCH 26/29] tests

---
 ingestion/tests/integration/data_quality/test_data_diff.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ingestion/tests/integration/data_quality/test_data_diff.py b/ingestion/tests/integration/data_quality/test_data_diff.py
index 3a786ce22c58..4f5c93281c94 100644
--- a/ingestion/tests/integration/data_quality/test_data_diff.py
+++ b/ingestion/tests/integration/data_quality/test_data_diff.py
@@ -518,7 +518,7 @@ def test_error_paths(
     )
     workflow_config = {
         "source": {
-            "type": TestSuiteConfigType.TestSuite.value,
+            "type": "postgres",
             "serviceName": "MyTestSuite",
             "sourceConfig": {
                 "config": {

From d4042a1499014f69be210b74895d59a99b2eaa35 Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull
Date: Mon, 18 Nov 2024 17:24:48 +0100
Subject: [PATCH 27/29] SampleData

---
 .../src/metadata/ingestion/source/database/sample_data.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ingestion/src/metadata/ingestion/source/database/sample_data.py b/ingestion/src/metadata/ingestion/source/database/sample_data.py
index 064a5cacd248..e6d2fb8bde36 100644
--- a/ingestion/src/metadata/ingestion/source/database/sample_data.py
+++ b/ingestion/src/metadata/ingestion/source/database/sample_data.py
@@ -136,7 +136,8 @@
     InvalidSchemaTypeException,
     schema_parser_config_registry,
 )
-from metadata.profiler.api.models import ProfilerResponse, SampleData
+from metadata.profiler.api.models import ProfilerResponse
+from metadata.sampler.models import SampleData
 from metadata.utils import entity_link, fqn
 from metadata.utils.constants import UTF_8
 from metadata.utils.fqn import FQN_SEPARATOR

From 1983af661741739092244d598fb077199dc9fe49 Mon Sep 17 00:00:00 2001
From: Pere Miquel Brull
Date: Mon, 18 Nov 2024 17:46:01 +0100
Subject: [PATCH 28/29] SampleData

---
 .../ingestion/source/database/sample_data.py | 24 ++++---------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/ingestion/src/metadata/ingestion/source/database/sample_data.py b/ingestion/src/metadata/ingestion/source/database/sample_data.py
index e6d2fb8bde36..b4d6246bad53 100644
--- a/ingestion/src/metadata/ingestion/source/database/sample_data.py
+++ b/ingestion/src/metadata/ingestion/source/database/sample_data.py
@@ -136,8 +136,6 @@
     InvalidSchemaTypeException,
     schema_parser_config_registry,
 )
-from metadata.profiler.api.models import ProfilerResponse
-from metadata.sampler.models import SampleData
 from metadata.utils import entity_link, fqn
 from metadata.utils.constants import UTF_8
 from metadata.utils.fqn import FQN_SEPARATOR
@@ -900,24 +898,10 @@ def ingest_tables(self) -> Iterable[Either[Entity]]:
 
                 self.metadata.ingest_table_sample_data(
                     table_entity,
-                    ProfilerResponse(
-                        table=table_entity,
-                        profile=CreateTableProfileRequest(
-                            tableProfile=TableProfile(
-                                timestamp=Timestamp(
-                                    int(datetime.now().timestamp() * 1000)
-                                ),
-                                columnCount=1.0,
-                                rowCount=3.0,
-                            )
-                        ),
-                        sample_data=SampleData(
-                            data=TableData(
-                                rows=table["sampleData"]["rows"],
-                                columns=table["sampleData"]["columns"],
-                            )
-                        ),
-                    ).sample_data.data,
+                    TableData(
+                        rows=table["sampleData"]["rows"],
+                        columns=table["sampleData"]["columns"],
+                    ),
                 )
 
table.get("customMetrics"): From 74ae0070fed8def73b2691bbce71b1a1b8caa303 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Mon, 18 Nov 2024 18:51:11 +0100 Subject: [PATCH 29/29] clean test --- .../test_auto_classification_workflow.py | 67 ------------------- 1 file changed, 67 deletions(-) delete mode 100644 ingestion/tests/integration/workflow/test_auto_classification_workflow.py diff --git a/ingestion/tests/integration/workflow/test_auto_classification_workflow.py b/ingestion/tests/integration/workflow/test_auto_classification_workflow.py deleted file mode 100644 index 38297a689d9d..000000000000 --- a/ingestion/tests/integration/workflow/test_auto_classification_workflow.py +++ /dev/null @@ -1,67 +0,0 @@ -# Copyright 2021 Collate -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" -Validate the initialization of the Auto Classification Workflow -""" -from unittest.mock import patch - -import yaml - -from metadata.profiler.source.metadata import OpenMetadataSource -from metadata.workflow.classification import AutoClassificationWorkflow - - -@patch.object( - OpenMetadataSource, - "_validate_service_name", - return_value=True, -) -def test_init_auto_classification(*_) -> None: - """We can properly instantiate the app""" - - config = """ - source: - type: mysql - serviceName: mysql - serviceConnection: - config: - type: Mysql - username: openmetadata_user - authType: - password: openmetadata_password - hostPort: localhost:3306 - databaseSchema: openmetadata_db - sourceConfig: - config: - type: AutoClassification - storeSampleData: true - enableAutoClassification: true - tableFilterPattern: - includes: - - entity - processor: - type: orm-profiler - config: {} - sink: - type: metadata-rest - config: {} - workflowConfig: - loggerLevel: DEBUG - openMetadataServerConfig: - enableVersionValidation: false - hostPort: 'http://localhost:8585/api' - authProvider: openmetadata - securityConfig: - jwtToken: "eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" - """ - - workflow = AutoClassificationWorkflow.create(yaml.safe_load(config)) - assert isinstance(workflow, AutoClassificationWorkflow)