From c2830303ed09d5d473811762979e38ce06965dfe Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 08:53:53 +0200 Subject: [PATCH 01/22] fix: updated playwright test structure --- ingestion/tests/e2e/configs/connectors/db2.py | 36 +++++ .../tests/e2e/configs/connectors/druid.py | 21 +++ .../tests/e2e/configs/connectors/hive.py | 22 +++ .../tests/e2e/configs/connectors/interface.py | 152 +++++++++++++----- .../tests/e2e/configs/connectors/model.py | 32 ++++ ingestion/tests/e2e/conftest.py | 57 +++++-- .../entity/{redshift => database}/__init__.py | 0 .../e2e/entity/database/common_assertions.py | 73 +++++++++ .../tests/e2e/entity/database/test_db2.py | 36 +++++ .../tests/e2e/entity/database/test_druid.py | 89 ++++++++++ .../tests/e2e/entity/database/test_hive.py | 71 ++++++++ .../e2e/entity/database/test_redshift.py | 89 ++++++++++ .../tests/e2e/entity/redshift/conftest.py | 15 -- .../e2e/entity/redshift/metadata/__init__.py | 0 .../redshift/metadata/test_entity_page.py | 67 -------- .../e2e/entity/redshift/profiler/__init__.py | 0 .../redshift/profiler/test_profiler_page.py | 36 ----- 17 files changed, 625 insertions(+), 171 deletions(-) create mode 100644 ingestion/tests/e2e/configs/connectors/db2.py create mode 100644 ingestion/tests/e2e/configs/connectors/druid.py create mode 100644 ingestion/tests/e2e/configs/connectors/hive.py create mode 100644 ingestion/tests/e2e/configs/connectors/model.py rename ingestion/tests/e2e/entity/{redshift => database}/__init__.py (100%) create mode 100644 ingestion/tests/e2e/entity/database/common_assertions.py create mode 100644 ingestion/tests/e2e/entity/database/test_db2.py create mode 100644 ingestion/tests/e2e/entity/database/test_druid.py create mode 100644 ingestion/tests/e2e/entity/database/test_hive.py create mode 100644 ingestion/tests/e2e/entity/database/test_redshift.py delete mode 100644 ingestion/tests/e2e/entity/redshift/conftest.py delete mode 100644 ingestion/tests/e2e/entity/redshift/metadata/__init__.py delete mode 100644 ingestion/tests/e2e/entity/redshift/metadata/test_entity_page.py delete mode 100644 ingestion/tests/e2e/entity/redshift/profiler/__init__.py delete mode 100644 ingestion/tests/e2e/entity/redshift/profiler/test_profiler_page.py diff --git a/ingestion/tests/e2e/configs/connectors/db2.py b/ingestion/tests/e2e/configs/connectors/db2.py new file mode 100644 index 000000000000..b6202f2b6a11 --- /dev/null +++ b/ingestion/tests/e2e/configs/connectors/db2.py @@ -0,0 +1,36 @@ +"""DB2 connector for e2e tests""" + +import os + +from playwright.sync_api import Page, expect + +from .interface import DataBaseConnectorInterface + + +class Db2Connector(DataBaseConnectorInterface): + """db2 connector""" + def get_service(self, page: Page): + """get service from the service page""" + page.get_by_test_id("Db2").click() + + def set_connection(self, page): + """Set connection for DB2 service""" + page.get_by_label("Username*").fill(os.environ["E2E_DB2_USERNAME"]) + expect(page.get_by_label("Username*")).to_have_value( + os.environ["E2E_DB2_USERNAME"] + ) + + page.get_by_label("Password").fill(os.environ["E2E_DB2_PASSWORD"]) + expect(page.get_by_label("Password")).to_have_value( + os.environ["E2E_DB2_PASSWORD"] + ) + + page.get_by_label("Host and Port*").fill(os.environ["E2E_DB2_HOST_PORT"]) + expect(page.get_by_label("Host and Port*")).to_have_value( + os.environ["E2E_DB2_HOST_PORT"] + ) + + page.get_by_label("database*").fill(os.environ["E2E_DB2_DATABASE"]) + expect(page.get_by_label("database*")).to_have_value( 
+ os.environ["E2E_DB2_DATABASE"] + ) \ No newline at end of file diff --git a/ingestion/tests/e2e/configs/connectors/druid.py b/ingestion/tests/e2e/configs/connectors/druid.py new file mode 100644 index 000000000000..31a60e5361a6 --- /dev/null +++ b/ingestion/tests/e2e/configs/connectors/druid.py @@ -0,0 +1,21 @@ +"""Redshift connector for e2e tests""" + +import os + +from playwright.sync_api import Page, expect + +from .interface import DataBaseConnectorInterface + + +class DruidConnector(DataBaseConnectorInterface): + """db2 connector""" + def get_service(self, page: Page): + """get service from the service page""" + page.get_by_test_id("Druid").click() + + def set_connection(self, page): + """Set connection for redshift service""" + page.get_by_label("Host and Port*").fill(os.environ["E2E_DRUID_HOST_PORT"]) + expect(page.get_by_label("Host and Port*")).to_have_value( + os.environ["E2E_DRUID_HOST_PORT"] + ) diff --git a/ingestion/tests/e2e/configs/connectors/hive.py b/ingestion/tests/e2e/configs/connectors/hive.py new file mode 100644 index 000000000000..143957b57f08 --- /dev/null +++ b/ingestion/tests/e2e/configs/connectors/hive.py @@ -0,0 +1,22 @@ +"""MySQL connector for e2e tests""" + +import os + +from playwright.sync_api import Page, expect + +from .interface import DataBaseConnectorInterface + + +class HiveConnector(DataBaseConnectorInterface): + def get_service(self, page: Page): + """get service from the service page""" + page.get_by_test_id("Hive").click() + + def set_connection(self, page): + """Set connection for redshift service""" + page.locator("[id=\"root\\/hostPort\"]").fill(os.environ["E2E_HIVE_HOST_PORT"]) + expect(page.locator("[id=\"root\\/hostPort\"]")).to_have_value( + os.environ["E2E_HIVE_HOST_PORT"] + ) + + page.locator("[id=\"root\\/metastoreConnection__oneof_select\"]").select_option("2") \ No newline at end of file diff --git a/ingestion/tests/e2e/configs/connectors/interface.py b/ingestion/tests/e2e/configs/connectors/interface.py index e4ec2c50366f..31a006d40a94 100644 --- a/ingestion/tests/e2e/configs/connectors/interface.py +++ b/ingestion/tests/e2e/configs/connectors/interface.py @@ -1,14 +1,25 @@ """connectors interface""" import random -import re +from time import sleep +import time import string from abc import ABC, abstractmethod from typing import List -from playwright.sync_api import Page, expect +from playwright.sync_api import Page, expect, TimeoutError +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState +from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, IngestionFilterConfig, ValidationTestConfig from ingestion.tests.e2e.configs.users.admin import Admin +from metadata.utils.time_utils import get_beginning_of_day_timestamp_mill, get_end_of_day_timestamp_mill +from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( + OpenMetadataConnection, +) +from metadata.generated.schema.security.client.openMetadataJWTClientConfig import ( + OpenMetadataJWTClientConfig, +) BASE_URL = "http://localhost:8585" @@ -16,30 +27,54 @@ class DataBaseConnectorInterface(ABC): """Interface for connectors class for e2e tests""" - def __init__(self, schema_filters: List[str] = [], table_filters: List[str] = []): + def __init__(self, config: ConnectorTestConfig): """Initialize the connector""" - self.schema_filters = list(schema_filters) - self.table_filters = list(table_filters) + 
self.supports_profiler_ingestion = True + self.profiler_summary_card_count = 5 + + self.ingestion_config = config.ingestion + self.validation_config = config.validation + self.service_type = "Databases" self.service_name = None + self.metadata_ingestion_pipeline_fqn = None + self.profiler_ingestion_pipeline_fqn = None + self.ometa = OpenMetadata( + OpenMetadataConnection( + hostPort=f"{BASE_URL}/api", + authProvider="openmetadata", + securityConfig=OpenMetadataJWTClientConfig( + jwtToken="eyJraWQiOiJHYjM4OWEtOWY3Ni1nZGpzLWE5MmotMDI0MmJrOTQzNTYiLCJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiJhZG1pbiIsImlzQm90IjpmYWxzZSwiaXNzIjoib3Blbi1tZXRhZGF0YS5vcmciLCJpYXQiOjE2NjM5Mzg0NjIsImVtYWlsIjoiYWRtaW5Ab3Blbm1ldGFkYXRhLm9yZyJ9.tS8um_5DKu7HgzGBzS1VTA5uUjKWOCU0B_j08WXBiEC0mr0zNREkqVfwFDD-d24HlNEbrqioLsBuFRiwIWKc1m_ZlVQbG7P36RUxhuv2vbSp80FKyNM-Tj93FDzq91jsyNmsQhyNv_fNr3TXfzzSPjHt8Go0FMMP66weoKMgW2PbXlhVKwEuXUHyakLLzewm9UMeQaEiRzhiTMU3UkLXcKbYEJJvfNFcLwSl9W8JCO_l0Yj3ud-qt_nQYEZwqW6u5nfdQllN133iikV4fM5QZsMCnm8Rq1mvLR0y9bmJiD7fwM1tmJ791TUWqmKaTnP49U493VanKpUAfzIiOiIbhg" + ), + ) + ) - def _check_and_handle_workflow(self, page: Page, type_: str): - try: - expect( - page.get_by_role( - "row", name=re.compile(f"{self.service_name}_{type_}_.*") - ).get_by_test_id("re-deploy-btn") - ).to_be_visible(timeout=1000) - except (TimeoutError, AssertionError): - page.get_by_role( - "row", name=re.compile(f"{self.service_name}_{type_}_.*") - ).get_by_test_id("deploy").click() - finally: - expect( - page.get_by_role( - "row", name=re.compile(f"{self.service_name}_{type_}_.*") - ).get_by_test_id("re-deploy-btn") - ).to_be_visible() + def _check_and_handle_workflow(self, page: Page, ingestion_pipeline_fqn: str): + pipeline_status = None + try_ = 0 + sleep(1) + # we'll iterate until we get a pipeline status + while not pipeline_status: + pipeline_status = self.ometa.get_pipeline_status_between_ts( + f"{self.service_name}.{ingestion_pipeline_fqn}", + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + ) + if not pipeline_status and try_ > 10: + # if we don't get a pipeline status after trying 10 times + # we need to deploy the workflow + try: + page.get_by_role( + "row", name=f"{ingestion_pipeline_fqn}" + ).get_by_test_id("re-deploy").click() + except TimeoutError: + page.get_by_role( + "row", name=f"{ingestion_pipeline_fqn}" + ).get_by_test_id("deploy").click() + if try_ > 20: + # if we've tried 20 times, we'll raise an exception + raise TimeoutError("Pipeline status not found") + try_ += 1 @abstractmethod def get_service(self, page: Page): @@ -63,22 +98,61 @@ def generate_service_name(): + "_-1" ) - def _set_schema_filter(self, page: Page): + def _set_ingestion_filter(self, type_: str, page: Page): """Set schema filter for redshift service""" - for schema in self.schema_filters: - page.locator('xpath=//*[@id="root/schemaFilterPattern/includes"]').fill( - schema + filter_config: IngestionFilterConfig = getattr(self.ingestion_config, type_) + if not filter_config: + return + + for container_type, value in filter_config: + if not value: + continue + if container_type == "schema_": + container_type = "schema" + for filter_type, filter_elements in value: + if not filter_elements: + continue + for element in filter_elements: + page.locator(f'xpath=//*[@id="root/{container_type}FilterPattern/{filter_type}"]').fill( + element + ) + + + def get_sorted_ingestion_pipeline_statues(self, ingestion_pipeline_fqn: str, desc=True): + statuses = self.ometa.get_pipeline_status_between_ts( + ingestion_pipeline_fqn, + 
get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), ) + return sorted(statuses, key=lambda x: x.startDate.__root__, reverse=True if desc else False) - def _set_table_filter(self, page: Page): - """Set schema filter for redshift service""" - for table in self.table_filters: - page.locator('[id="root\\/tableFilterPattern\\/includes"]').fill(table) + + def get_pipeline_status(self, ingestion_pipeline_fqn: str): + # Not best practice. Should use `expect`, though playwright does not have a `wait_until` function + # we'll make a call to the API to get the pipeline status and check if it's success + status = None + timeout = time.time() + 60*5 # 5 minutes from now + + while not status or status == PipelineState.running: + if time.time() > timeout: + raise TimeoutError("Pipeline with status {status} has been running for more than 5 minutes") + statuses = self.get_sorted_ingestion_pipeline_statues( + ingestion_pipeline_fqn, + ) + # we'll get the state of the most recent pipeline run + status = statuses[0].pipelineState + if status != PipelineState.running: + break + + return status def create_service_ingest_metadata(self, page: Page): - """Ingest redshift service data""" - page.goto(f"{BASE_URL}/") - Admin().login(page) + """Ingest redshift service data + + Args: + page (Page): playwright page. Should be logged in and pointing to the home page + e.g. page.goto(f"{BASE_URL}/") + """ page.get_by_test_id("app-bar-item-settings").click() page.get_by_text(self.service_type).click() page.get_by_test_id("add-service-button").click() @@ -91,18 +165,17 @@ def create_service_ingest_metadata(self, page: Page): self.set_connection(page) page.get_by_test_id("submit-btn").click() page.get_by_test_id("add-ingestion-button").click() - self._set_schema_filter(page) + self._set_ingestion_filter("metadata", page) + self.metadata_ingestion_pipeline_fqn = page.get_by_label("name*").input_value() page.get_by_test_id("submit-btn").click() page.get_by_test_id("deploy-button").click() page.get_by_test_id("view-service-button").click() page.get_by_test_id("ingestions").click() - self._check_and_handle_workflow(page, "metadata") + self._check_and_handle_workflow(page, self.metadata_ingestion_pipeline_fqn) return self.service_name def create_profiler_workflow(self, page: Page): """create profiler workflow""" - page.goto(f"{BASE_URL}/") - Admin().login(page) page.get_by_test_id("app-bar-item-settings").click() page.get_by_text("Databases").click() page.get_by_test_id(f"service-name-{self.service_name}").click() @@ -112,13 +185,14 @@ def create_profiler_workflow(self, page: Page): page.locator( "div:nth-child(5) > div > div:nth-child(2) > .form-group > .ant-row > div:nth-child(2) > .ant-select > .ant-select-selector > .ant-select-selection-overflow" ).click() - self._set_table_filter(page) + self._set_ingestion_filter("profiler", page) page.locator('[id="root\\/processPiiSensitive"]').click() + self.profiler_ingestion_pipeline_fqn = page.get_by_label("name*").input_value() page.get_by_test_id("submit-btn").click() page.get_by_test_id("deploy-button").click() page.get_by_test_id("view-service-button").click() page.get_by_test_id("ingestions").click() - self._check_and_handle_workflow(page, "profiler") + self._check_and_handle_workflow(page, self.profiler_ingestion_pipeline_fqn) def delete_service(self, page: Page): """Delete service""" diff --git a/ingestion/tests/e2e/configs/connectors/model.py b/ingestion/tests/e2e/configs/connectors/model.py new file mode 100644 index 000000000000..f82f04da3900 --- 
/dev/null +++ b/ingestion/tests/e2e/configs/connectors/model.py @@ -0,0 +1,32 @@ +"""Connector model config for testing.""" + +from typing import Optional +from pydantic import BaseModel + + +class IngestionFilterConfig(BaseModel): + includes: Optional[list[str]] = [] + excludes: Optional[list[str]] = [] + +class IngestionTestConfig(BaseModel): + database: Optional[IngestionFilterConfig] + schema_: Optional[IngestionFilterConfig] + table: Optional[IngestionFilterConfig] + +class ConnectorIngestionTestConfig(BaseModel): + metadata: Optional[IngestionTestConfig] + profiler: Optional[IngestionTestConfig] + +class ValidationTestConfig(BaseModel): + service: Optional[str] + database: Optional[str] + schema_: Optional[str] + table: Optional[str] + +class ConnectorValidationTestConfig(BaseModel): + metadata: Optional[ValidationTestConfig] + profiler: Optional[ValidationTestConfig] + +class ConnectorTestConfig(BaseModel): + ingestion: Optional[ConnectorIngestionTestConfig] + validation: Optional[ConnectorValidationTestConfig] \ No newline at end of file diff --git a/ingestion/tests/e2e/conftest.py b/ingestion/tests/e2e/conftest.py index 341bda4f0e4d..f9d0fd4547a3 100644 --- a/ingestion/tests/e2e/conftest.py +++ b/ingestion/tests/e2e/conftest.py @@ -2,9 +2,9 @@ import pytest -from playwright.sync_api import Browser, expect +from playwright.sync_api import Browser, expect, Page -from ingestion.tests.e2e.configs.common import create_user +from ingestion.tests.e2e.configs.common import create_user, go_to_service from ingestion.tests.e2e.configs.users.admin import Admin TIMEOUT = 60000 @@ -29,19 +29,48 @@ def browser_context_args(browser_context_args): } -@pytest.fixture(scope="session") -def create_data_consumer_user(browser: Browser): - """Create a data consumer user""" - context_ = browser.new_context( - base_url=BASE_URL, - java_script_enabled=True, - ) - page = context_.new_page() +@pytest.fixture(scope="function") +def admin_page_context(page: Page): page.goto("/") Admin().login(page) - data_consumer = create_user( - page, "data-consumer@example.com", "Data Consumer User", "Data Consumer" + yield page + page.close() + + +@pytest.fixture(scope="class") +def setUpClass(browser: Browser, request): # pylint: disable=invalid-name + """set up class for ingestion pipelines""" + context_ = browser.new_context(base_url=BASE_URL) + page = context_.new_page() + page.goto(f"{BASE_URL}/") + Admin().login(page) + + connector_obj = request.param["connector_obj"] + request.cls.connector_obj = connector_obj + + # create service and ingest metadata + connector_obj.create_service_ingest_metadata(page) + request.cls.service_name = connector_obj.service_name + page.get_by_text("Ingestions").click() + # Not best practice. Should use `expect`, though playwright does not have a `wait_until` function + # we'll make a call to the API to get the pipeline status and check if it's success + request.cls.metadata_ingestion_status = connector_obj.get_pipeline_status( + f"{connector_obj.service_name}.{connector_obj.metadata_ingestion_pipeline_fqn}" ) - yield data_consumer - data_consumer.delete(page) + + if connector_obj.supports_profiler_ingestion: + connector_obj.create_profiler_workflow(page) + go_to_service("Databases", page, connector_obj.service_name) + page.get_by_text("Ingestions").click() + + # Not best practice. 
Should use `expect`, though playwright does not have a `wait_until` function + # we'll make a call to the API to get the pipeline status and check if it's success + request.cls.profiler_ingestion_status = connector_obj.get_pipeline_status( + f"{connector_obj.service_name}.{connector_obj.profiler_ingestion_pipeline_fqn}" + ) + else: + request.cls.profiler_ingestion_status = None + + yield + connector_obj.delete_service(page) context_.close() diff --git a/ingestion/tests/e2e/entity/redshift/__init__.py b/ingestion/tests/e2e/entity/database/__init__.py similarity index 100% rename from ingestion/tests/e2e/entity/redshift/__init__.py rename to ingestion/tests/e2e/entity/database/__init__.py diff --git a/ingestion/tests/e2e/entity/database/common_assertions.py b/ingestion/tests/e2e/entity/database/common_assertions.py new file mode 100644 index 000000000000..b1875a4ece9d --- /dev/null +++ b/ingestion/tests/e2e/entity/database/common_assertions.py @@ -0,0 +1,73 @@ +"""common database assertions""" + +from playwright.sync_api import Page, expect + +from ingestion.tests.e2e.configs.common import go_to_service + + +def assert_change_database_owner(page_context: Page, service_name: str): + """assert database owner can be changed as expected""" + go_to_service("Databases", page_context, service_name) + page_context.get_by_test_id("edit-owner").click() + page_context.get_by_test_id("owner-select-users-search-bar").click() + page_context.get_by_test_id("owner-select-users-search-bar").fill("created-user") + page_context.get_by_text("created-user").click() + expect( + page_context.get_by_test_id("owner-label").get_by_test_id("owner-link") + ).to_have_text("created-user") + + +def assert_profile_data( + page_context: Page, + service_name: str, + database: str, + schema: str, + table: str, + connector_obj, + ): + """Assert profile data have been computed correctly""" + go_to_service("Databases", page_context, service_name) + page_context.get_by_role("link", name=database).click() + page_context.get_by_role("link", name=schema).click() + page_context.get_by_role("link", name=table, exact=True).click() + page_context.get_by_text("Profiler & Data Quality").click() + for card in range(connector_obj.profiler_summary_card_count): + summary_card = page_context.get_by_test_id("summary-card-container").nth(card) + description = summary_card.get_by_test_id("summary-card-description").inner_text() + assert description not in {"0"} + + +def assert_sample_data_ingestion( + page_context: Page, + service_name: str, + database: str, + schema: str, + table: str, + ): + """assert sample data are ingested as expected""" + go_to_service("Databases", page_context, service_name) + page_context.get_by_role("link", name=database).click() + page_context.get_by_role("link", name=schema).click() + page_context.get_by_role("link", name=table, exact=True).click() + page_context.get_by_text("Sample Data").click() + + expect(page_context.get_by_test_id("sample-data")).to_be_visible() + +def assert_pii_column_auto_tagging( + page_context: Page, + service_name: str, + database: str, + schema: str, + table: str, + column: str, + ): + """assert pii column auto tagging tagged as expected""" + go_to_service("Databases", page_context, service_name) + page_context.get_by_role("link", name=database).click() + page_context.get_by_role("link", name=schema).click() + page_context.get_by_role("link", name=table, exact=True).click() + + table_row = page_context.locator(f'tr:has-text("{column}")') + tag = table_row.locator('td:nth-child(4)') + 
expect(tag).to_be_visible() + assert tag.text_content() in {"Sensitive", "NonSensitive"} \ No newline at end of file diff --git a/ingestion/tests/e2e/entity/database/test_db2.py b/ingestion/tests/e2e/entity/database/test_db2.py new file mode 100644 index 000000000000..9d50377ce7df --- /dev/null +++ b/ingestion/tests/e2e/entity/database/test_db2.py @@ -0,0 +1,36 @@ +"""Test Hive database ingestion.""" + +import pytest +from playwright.sync_api import Page + +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState +from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, ConnectorValidationTestConfig, IngestionTestConfig, ConnectorIngestionTestConfig, IngestionFilterConfig, ValidationTestConfig +from ingestion.tests.e2e.configs.connectors.db2 import Db2Connector +from ingestion.tests.e2e.entity.database.common_assertions import assert_change_database_owner, assert_profile_data, assert_sample_data_ingestion + + +@pytest.mark.parametrize( + "setUpClass", + [{"connector_obj":Db2Connector( + ConnectorTestConfig( + ingestion=ConnectorIngestionTestConfig( + metadata=IngestionTestConfig( + database=IngestionFilterConfig( + includes=["testdb"] + ), + ), # type: ignore + ), + validation=ConnectorValidationTestConfig( + profiler=ValidationTestConfig( + database="testdb", + schema_="sampledata", + table="customer" + ) # type: ignore + ) + ) + )}], + indirect=True +) +@pytest.mark.usefixtures("setUpClass") +class TestHiveConnector: + """We need to validate dependency can be installed in the test env.""" \ No newline at end of file diff --git a/ingestion/tests/e2e/entity/database/test_druid.py b/ingestion/tests/e2e/entity/database/test_druid.py new file mode 100644 index 000000000000..f228adc54449 --- /dev/null +++ b/ingestion/tests/e2e/entity/database/test_druid.py @@ -0,0 +1,89 @@ +"""Test default database ingestion (Redshift).""" + + +from playwright.sync_api import Page +import pytest + +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState +from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, ConnectorValidationTestConfig, IngestionTestConfig, ConnectorIngestionTestConfig, IngestionFilterConfig, ValidationTestConfig +from ingestion.tests.e2e.configs.connectors.druid import DruidConnector +from ingestion.tests.e2e.entity.database.common_assertions import assert_change_database_owner, assert_pii_column_auto_tagging, assert_profile_data, assert_sample_data_ingestion + + +@pytest.mark.parametrize( + "setUpClass", + [{"connector_obj":DruidConnector( + ConnectorTestConfig( + ingestion=ConnectorIngestionTestConfig( + metadata=IngestionTestConfig( + schema_=IngestionFilterConfig( + includes=["druid"] + ), + ), # type: ignore + profiler=IngestionTestConfig( + schema_=IngestionFilterConfig( + includes=["druid"] + ), + ) # type: ignore + ), + validation=ConnectorValidationTestConfig( + profiler=ValidationTestConfig( + database="default", + schema_="druid", + table="inline_data" + ) # type: ignore + ) + ) + )}], + indirect=True +) +@pytest.mark.usefixtures("setUpClass") +class TestRedshiftConnector: + """Redshift connector test case""" + + def test_pipelines_statuses(self): + """check ingestion pipelines ran successfully""" + assert self.metadata_ingestion_status == PipelineState.success + # if the connector does not support profiler ingestion return None as status + assert self.profiler_ingestion_status in {PipelineState.success, None} + + + 
@pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_change_database_owner(self, admin_page_context: Page): + """test change database owner""" + assert_change_database_owner(admin_page_context, self.service_name) + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_check_profile_data(self, admin_page_context: Page): + """check profile data are visible""" + assert_profile_data( + admin_page_context, + self.service_name, + self.connector_obj.validation_config.profiler.database, + self.connector_obj.validation_config.profiler.schema_, + self.connector_obj.validation_config.profiler.table, + self.connector_obj, + ) + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_sample_data_ingestion(self, admin_page_context: Page): + """test sample dta is ingested as expected for the table""" + assert_sample_data_ingestion( + admin_page_context, + self.service_name, + self.connector_obj.validation_config.profiler.database, + self.connector_obj.validation_config.profiler.schema_, + self.connector_obj.validation_config.profiler.table, + ) + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_pii_colum_auto_tagging(self, admin_page_context: Page): + """check pii column auto tagging tagged as expected""" + assert_pii_column_auto_tagging( + admin_page_context, + self.service_name, + self.connector_obj.validation_config.profiler.database, + self.connector_obj.validation_config.profiler.schema_, + self.connector_obj.validation_config.profiler.table, + "cityName", + ) diff --git a/ingestion/tests/e2e/entity/database/test_hive.py b/ingestion/tests/e2e/entity/database/test_hive.py new file mode 100644 index 000000000000..f17698df170a --- /dev/null +++ b/ingestion/tests/e2e/entity/database/test_hive.py @@ -0,0 +1,71 @@ +"""Test Hive database ingestion.""" + +import pytest +from playwright.sync_api import Page + +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState +from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, ConnectorValidationTestConfig, IngestionTestConfig, ConnectorIngestionTestConfig, IngestionFilterConfig, ValidationTestConfig +from ingestion.tests.e2e.configs.connectors.hive import HiveConnector +from ingestion.tests.e2e.entity.database.common_assertions import assert_change_database_owner, assert_profile_data, assert_sample_data_ingestion + + +@pytest.mark.parametrize( + "setUpClass", + [{"connector_obj":HiveConnector( + ConnectorTestConfig( + ingestion=ConnectorIngestionTestConfig( + metadata=IngestionTestConfig( + database=IngestionFilterConfig( + includes=["default"] + ), + ), # type: ignore + ), + validation=ConnectorValidationTestConfig( + profiler=ValidationTestConfig( + database="default", + schema_="default", + table="t1" + ) # type: ignore + ) + ) + )}], + indirect=True +) +@pytest.mark.usefixtures("setUpClass") +class TestHiveConnector: + """Hive connector test case""" + + def test_pipelines_statuses(self): + """check ingestion pipelines ran successfully""" + assert self.metadata_ingestion_status == PipelineState.success + # if the connector does not support profiler ingestion return None as status + assert self.profiler_ingestion_status in {PipelineState.success, None} + + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_change_database_owner(self, admin_page_context: Page): + """test change database owner""" + assert_change_database_owner(admin_page_context, self.service_name) + + 
@pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_check_profile_data(self, admin_page_context: Page): + """check profile data are visible""" + assert_profile_data( + admin_page_context, + self.service_name, + self.connector_obj.validation_config.profiler.database, + self.connector_obj.validation_config.profiler.schema_, + self.connector_obj.validation_config.profiler.table, + self.connector_obj, + ) + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_sample_data_ingestion(self, admin_page_context: Page): + """test sample dta is ingested as expected for the table""" + assert_sample_data_ingestion( + admin_page_context, + self.service_name, + self.connector_obj.validation_config.profiler.database, + self.connector_obj.validation_config.profiler.schema_, + self.connector_obj.validation_config.profiler.table, + ) diff --git a/ingestion/tests/e2e/entity/database/test_redshift.py b/ingestion/tests/e2e/entity/database/test_redshift.py new file mode 100644 index 000000000000..7575d9a5ed7c --- /dev/null +++ b/ingestion/tests/e2e/entity/database/test_redshift.py @@ -0,0 +1,89 @@ +"""Test default database ingestion (Redshift).""" + + +from playwright.sync_api import Page +import pytest + +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState +from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, ConnectorValidationTestConfig, IngestionTestConfig, ConnectorIngestionTestConfig, IngestionFilterConfig, ValidationTestConfig +from ingestion.tests.e2e.configs.connectors.redshift import RedshiftConnector +from ingestion.tests.e2e.entity.database.common_assertions import assert_change_database_owner, assert_pii_column_auto_tagging, assert_profile_data, assert_sample_data_ingestion + + +@pytest.mark.parametrize( + "setUpClass", + [{"connector_obj":RedshiftConnector( + ConnectorTestConfig( + ingestion=ConnectorIngestionTestConfig( + metadata=IngestionTestConfig( + schema_=IngestionFilterConfig( + includes=["dbt_jaffle"] + ), + ), # type: ignore + profiler=IngestionTestConfig( + table=IngestionFilterConfig( + includes=["customers"] + ), + ) # type: ignore + ), + validation=ConnectorValidationTestConfig( + profiler=ValidationTestConfig( + database="dev", + schema_="dbt_jaffle", + table="customers" + ) # type: ignore + ) + ) + )}], + indirect=True +) +@pytest.mark.usefixtures("setUpClass") +class TestRedshiftConnector: + """Redshift connector test case""" + + def test_pipelines_statuses(self): + """check ingestion pipelines ran successfully""" + assert self.metadata_ingestion_status == PipelineState.success + # if the connector does not support profiler ingestion return None as status + assert self.profiler_ingestion_status in {PipelineState.success, None} + + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_change_database_owner(self, admin_page_context: Page): + """test change database owner""" + assert_change_database_owner(admin_page_context, self.service_name) + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_check_profile_data(self, admin_page_context: Page): + """check profile data are visible""" + assert_profile_data( + admin_page_context, + self.service_name, + self.connector_obj.validation_config.profiler.database, + self.connector_obj.validation_config.profiler.schema_, + self.connector_obj.validation_config.profiler.table, + self.connector_obj, + ) + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def 
test_sample_data_ingestion(self, admin_page_context: Page): + """test sample dta is ingested as expected for the table""" + assert_sample_data_ingestion( + admin_page_context, + self.service_name, + self.connector_obj.validation_config.profiler.database, + self.connector_obj.validation_config.profiler.schema_, + self.connector_obj.validation_config.profiler.table, + ) + + @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + def test_pii_colum_auto_tagging(self, admin_page_context: Page): + """check pii column auto tagging tagged as expected""" + assert_pii_column_auto_tagging( + admin_page_context, + self.service_name, + self.connector_obj.validation_config.profiler.database, + self.connector_obj.validation_config.profiler.schema_, + self.connector_obj.validation_config.profiler.table, + "first_name" + ) diff --git a/ingestion/tests/e2e/entity/redshift/conftest.py b/ingestion/tests/e2e/entity/redshift/conftest.py deleted file mode 100644 index 12e292f8d3b0..000000000000 --- a/ingestion/tests/e2e/entity/redshift/conftest.py +++ /dev/null @@ -1,15 +0,0 @@ -"""Module fixture for data quality e2e tests""" - - -import pytest - -from ingestion.tests.e2e.configs.connectors.redshift import RedshiftConnector - -BASE_URL = "http://localhost:8585" - - -@pytest.fixture(scope="session") -def redshift_connector(): - """Create a redshift connector""" - redshift = RedshiftConnector(["dbt_jaffle"], ["customers"]) - yield redshift diff --git a/ingestion/tests/e2e/entity/redshift/metadata/__init__.py b/ingestion/tests/e2e/entity/redshift/metadata/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/ingestion/tests/e2e/entity/redshift/metadata/test_entity_page.py b/ingestion/tests/e2e/entity/redshift/metadata/test_entity_page.py deleted file mode 100644 index eaa577029ea8..000000000000 --- a/ingestion/tests/e2e/entity/redshift/metadata/test_entity_page.py +++ /dev/null @@ -1,67 +0,0 @@ -""" -Entity metadata tests. Scenarios tested: - - -""" -import re - -import pytest -from playwright.sync_api import Page, expect - -from ingestion.tests.e2e.configs.common import go_to_service -from ingestion.tests.e2e.configs.users.admin import Admin - - -@pytest.mark.order(1) -def test_assert_metadata_ingestion_status_success(redshift_connector, page: Page): - """Assert that the ingestion status is success""" - - redshift_connector.create_service_ingest_metadata(page) - service_name = redshift_connector.service_name - go_to_service("Databases", page, service_name) - page.get_by_text("Ingestions").click() - - # Not best practice. 
Should use `expect`, though playwright does not have a `wait_until` function - status = ( - page.get_by_role("row", name=re.compile(f"^{service_name}_metadata_.*")) - .get_by_test_id("pipeline-status") - .text_content() - ) - while status in ("--", "Running"): - page.reload() - status = ( - page.get_by_role("row", name=re.compile(f"^{service_name}_metadata_.*")) - .get_by_test_id("pipeline-status") - .text_content() - ) - - assert status == "Success" - - -def test_change_database_owner(redshift_connector, page: Page): - """Test changing the database owner works as expected""" - - service_name = redshift_connector.service_name - page.goto("/") - Admin().login(page) - go_to_service("Databases", page, service_name) - page.get_by_test_id("edit-owner").click() - # page.get_by_role("tab", name="Users.*").click() - page.get_by_test_id("owner-select-users-search-bar").click() - page.get_by_test_id("owner-select-users-search-bar").fill("created-user") - page.get_by_text("created-user").click() - expect( - page.get_by_test_id("owner-label").get_by_test_id("owner-link") - ).to_have_text("created-user") - - -def test_data_consumer(redshift_connector, create_data_consumer_user, page: Page): - """...""" - - service_name = redshift_connector.service_name - user = create_data_consumer_user - page.goto("/") - user.login(page) - go_to_service("Databases", page, service_name) - expect(page.get_by_test_id("ingestions")).not_to_be_visible() - expect(page.get_by_test_id("data-testid")).not_to_be_visible() - expect(page.get_by_test_id("databases")).to_be_visible() diff --git a/ingestion/tests/e2e/entity/redshift/profiler/__init__.py b/ingestion/tests/e2e/entity/redshift/profiler/__init__.py deleted file mode 100644 index e69de29bb2d1..000000000000 diff --git a/ingestion/tests/e2e/entity/redshift/profiler/test_profiler_page.py b/ingestion/tests/e2e/entity/redshift/profiler/test_profiler_page.py deleted file mode 100644 index 88b124b3521b..000000000000 --- a/ingestion/tests/e2e/entity/redshift/profiler/test_profiler_page.py +++ /dev/null @@ -1,36 +0,0 @@ -""" -Entity profiler tests. Scenarios tested: - - -""" -import re -import time - -from playwright.sync_api import Page - -from ingestion.tests.e2e.configs.common import go_to_service - - -def test_assert_profiler_ingestion_status_success(redshift_connector, page: Page): - """test profiler ingestion status""" - - service_name = redshift_connector.service_name - redshift_connector.create_profiler_workflow(page) - go_to_service("Databases", page, service_name) - page.get_by_text("Ingestions").click() - - # Not best practice. 
Should use `expect`, though playwright does not have a `wait_until` function - status = ( - page.get_by_role("row", name=re.compile(f"^{service_name}_profiler_.*")) - .get_by_test_id("pipeline-status") - .text_content() - ) - while status in ("--", "Running"): - time.sleep(2) - page.reload() - status = ( - page.get_by_role("row", name=re.compile(f"{service_name}_profiler_.*")) - .get_by_test_id("pipeline-status") - .text_content() - ) - - assert status == "Success" From 94a20e53dcdbf78ee502aee8489040195cc9241d Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 08:59:48 +0200 Subject: [PATCH 02/22] fix: druid profiler queries --- .../sqlalchemy/profiler_interface.py | 8 ++++++- .../profiler/metrics/static/stddev.py | 7 ++++++ .../metadata/profiler/orm/functions/length.py | 1 + .../metadata/profiler/orm/functions/median.py | 6 +++++ .../processor/sampler/sqlalchemy/sampler.py | 23 ++++++++++++++----- 5 files changed, 38 insertions(+), 7 deletions(-) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index feff51c9cb24..cce910034c99 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -23,8 +23,9 @@ from typing import Dict, List from sqlalchemy import Column, inspect -from sqlalchemy.exc import ProgrammingError +from sqlalchemy.exc import ProgrammingError, ResourceClosedError from sqlalchemy.orm import scoped_session +from metadata.profiler.orm.registry import Dialects from metadata.generated.schema.entity.data.table import TableData from metadata.ingestion.connections.session import create_and_bind_thread_safe_session @@ -258,6 +259,11 @@ def _compute_query_metrics( row = runner.select_first_from_query(metric_query) return dict(row) + except ResourceClosedError as exc: + # if the query returns no results, we will get a ResourceClosedError from Druid + if not runner._session.get_bind().dialect.name == Dialects.Druid: + msg = f"Error trying to compute profile for {runner.table.__tablename__}.{column.name}: {exc}" + handle_query_exception(msg, exc, session) except Exception as exc: msg = f"Error trying to compute profile for {runner.table.__tablename__}.{column.name}: {exc}" handle_query_exception(msg, exc, session) diff --git a/ingestion/src/metadata/profiler/metrics/static/stddev.py b/ingestion/src/metadata/profiler/metrics/static/stddev.py index e95fe2185743..54f50e51ea9f 100644 --- a/ingestion/src/metadata/profiler/metrics/static/stddev.py +++ b/ingestion/src/metadata/profiler/metrics/static/stddev.py @@ -69,6 +69,13 @@ def _(element, compiler, **kw): proc = compiler.process(element.clauses, **kw) return "if(isNaN(stddevPop(%s)), null, stddevPop(%s))" % ((proc,) * 2) +@compiles(StdDevFn, Dialects.Druid) +def _(element, compiler, **kw): + """returns stdv for druid. 
Could not validate with our cluster + we might need to look into installing the druid-stats module + https://druid.apache.org/docs/latest/configuration/extensions/#loading-extensions + """ + return "NULL" class StdDev(StaticMetric): """ diff --git a/ingestion/src/metadata/profiler/orm/functions/length.py b/ingestion/src/metadata/profiler/orm/functions/length.py index cbe7181cbbe2..ded2893ca6d9 100644 --- a/ingestion/src/metadata/profiler/orm/functions/length.py +++ b/ingestion/src/metadata/profiler/orm/functions/length.py @@ -49,6 +49,7 @@ def _(element, compiler, **kw): @compiles(LenFn, Dialects.IbmDbSa) @compiles(LenFn, Dialects.Db2) @compiles(LenFn, Dialects.Hana) +@compiles(LenFn, Dialects.Druid) def _(element, compiler, **kw): return "LENGTH(%s)" % compiler.process(element.clauses, **kw) diff --git a/ingestion/src/metadata/profiler/orm/functions/median.py b/ingestion/src/metadata/profiler/orm/functions/median.py index 455b9a71bd42..e862bf0acdda 100644 --- a/ingestion/src/metadata/profiler/orm/functions/median.py +++ b/ingestion/src/metadata/profiler/orm/functions/median.py @@ -55,6 +55,12 @@ def _(elements, compiler, **kwargs): ) return f"if({null_check}({quantile_str}), null, {quantile_str})" +@compiles(MedianFn, Dialects.Druid) +def _(elements, compiler, **kwargs): + col, _, percentile = [ + compiler.process(element, **kwargs) for element in elements.clauses + ] + return f"APPROX_QUANTILE({col}, {percentile})" # pylint: disable=unused-argument @compiles(MedianFn, Dialects.Athena) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py index ac544e9878e9..edf19c7d98e3 100644 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py +++ b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py @@ -12,6 +12,7 @@ Helper module to handle data sampling for the profiler """ +import traceback from typing import List, Optional, Union, cast from sqlalchemy import Column, inspect, text @@ -37,6 +38,9 @@ get_partition_col_type, get_value_filter, ) +from metadata.utils.logger import profiler_interface_registry_logger + +logger = profiler_interface_registry_logger() RANDOM_LABEL = "random" @@ -143,12 +147,19 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData if col.name != RANDOM_LABEL and col.name in names ] - sqa_sample = ( - self.client.query(*sqa_columns) - .select_from(rnd) - .limit(self.sample_limit) - .all() - ) + try: + sqa_sample = ( + self.client.query(*sqa_columns) + .select_from(rnd) + .limit(self.sample_limit) + .all() + ) + except Exception: + logger.debug("Cannot fetch sample data with random sampling. 
Falling back to 100 rows.") + logger.debug(traceback.format_exc()) + sqa_columns = [col for col in inspect(self.table).c] + sqa_sample = self.client.query(*sqa_columns).select_from(self.table).limit(100).all() + return TableData( columns=[column.name for column in sqa_columns], rows=[list(row) for row in sqa_sample], From 56c582a1aacaeb2807f2ff639a465d4d773b2fc4 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 09:06:09 +0200 Subject: [PATCH 03/22] fix: python linting --- .../sqlalchemy/profiler_interface.py | 4 +- .../profiler/metrics/static/stddev.py | 4 +- .../metadata/profiler/orm/functions/median.py | 2 + .../processor/sampler/sqlalchemy/sampler.py | 12 ++-- ingestion/tests/e2e/configs/connectors/db2.py | 3 +- .../tests/e2e/configs/connectors/druid.py | 1 + .../tests/e2e/configs/connectors/hive.py | 8 ++- .../tests/e2e/configs/connectors/interface.py | 58 ++++++++++------- .../tests/e2e/configs/connectors/model.py | 8 ++- ingestion/tests/e2e/conftest.py | 6 +- .../e2e/entity/database/common_assertions.py | 49 +++++++------- .../tests/e2e/entity/database/test_db2.py | 48 +++++++------- .../tests/e2e/entity/database/test_druid.py | 63 ++++++++++-------- .../tests/e2e/entity/database/test_hive.py | 54 +++++++++------ .../e2e/entity/database/test_redshift.py | 65 +++++++++++-------- 15 files changed, 228 insertions(+), 157 deletions(-) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index cce910034c99..7c6d01c77c5d 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -25,7 +25,6 @@ from sqlalchemy import Column, inspect from sqlalchemy.exc import ProgrammingError, ResourceClosedError from sqlalchemy.orm import scoped_session -from metadata.profiler.orm.registry import Dialects from metadata.generated.schema.entity.data.table import TableData from metadata.ingestion.connections.session import create_and_bind_thread_safe_session @@ -39,6 +38,7 @@ from metadata.profiler.orm.functions.table_metric_construct import ( table_metric_construct_factory, ) +from metadata.profiler.orm.registry import Dialects from metadata.profiler.processor.runner import QueryRunner from metadata.profiler.processor.sampler.sampler_factory import sampler_factory_ from metadata.utils.custom_thread_pool import CustomThreadPoolExecutor @@ -261,7 +261,7 @@ def _compute_query_metrics( return dict(row) except ResourceClosedError as exc: # if the query returns no results, we will get a ResourceClosedError from Druid - if not runner._session.get_bind().dialect.name == Dialects.Druid: + if not runner._session.get_bind().dialect.name == Dialects.Druid: # pylint: disable=protected-access msg = f"Error trying to compute profile for {runner.table.__tablename__}.{column.name}: {exc}" handle_query_exception(msg, exc, session) except Exception as exc: diff --git a/ingestion/src/metadata/profiler/metrics/static/stddev.py b/ingestion/src/metadata/profiler/metrics/static/stddev.py index 54f50e51ea9f..e2654df12160 100644 --- a/ingestion/src/metadata/profiler/metrics/static/stddev.py +++ b/ingestion/src/metadata/profiler/metrics/static/stddev.py @@ -69,14 +69,16 @@ def _(element, compiler, **kw): proc = compiler.process(element.clauses, **kw) return "if(isNaN(stddevPop(%s)), null, stddevPop(%s))" % ((proc,) * 2) + @compiles(StdDevFn, Dialects.Druid) -def _(element, compiler, **kw): +def 
_(element, compiler, **kw): # pylint: disable=unused-argument """returns stdv for druid. Could not validate with our cluster we might need to look into installing the druid-stats module https://druid.apache.org/docs/latest/configuration/extensions/#loading-extensions """ return "NULL" + class StdDev(StaticMetric): """ STD Metric diff --git a/ingestion/src/metadata/profiler/orm/functions/median.py b/ingestion/src/metadata/profiler/orm/functions/median.py index e862bf0acdda..a02104477d1f 100644 --- a/ingestion/src/metadata/profiler/orm/functions/median.py +++ b/ingestion/src/metadata/profiler/orm/functions/median.py @@ -55,6 +55,7 @@ def _(elements, compiler, **kwargs): ) return f"if({null_check}({quantile_str}), null, {quantile_str})" + @compiles(MedianFn, Dialects.Druid) def _(elements, compiler, **kwargs): col, _, percentile = [ @@ -62,6 +63,7 @@ def _(elements, compiler, **kwargs): ] return f"APPROX_QUANTILE({col}, {percentile})" + # pylint: disable=unused-argument @compiles(MedianFn, Dialects.Athena) @compiles(MedianFn, Dialects.Presto) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py index edf19c7d98e3..b9216643ca66 100644 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py +++ b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py @@ -31,6 +31,7 @@ from metadata.profiler.orm.registry import Dialects from metadata.profiler.processor.handle_partition import partition_filter_handler from metadata.profiler.processor.sampler.sampler_interface import SamplerInterface +from metadata.utils.logger import profiler_interface_registry_logger from metadata.utils.sqa_utils import ( build_query_filter, dispatch_to_date_or_datetime, @@ -38,7 +39,6 @@ get_partition_col_type, get_value_filter, ) -from metadata.utils.logger import profiler_interface_registry_logger logger = profiler_interface_registry_logger() @@ -155,10 +155,14 @@ def fetch_sample_data(self, columns: Optional[List[Column]] = None) -> TableData .all() ) except Exception: - logger.debug("Cannot fetch sample data with random sampling. Falling back to 100 rows.") + logger.debug( + "Cannot fetch sample data with random sampling. Falling back to 100 rows." 
+ ) logger.debug(traceback.format_exc()) - sqa_columns = [col for col in inspect(self.table).c] - sqa_sample = self.client.query(*sqa_columns).select_from(self.table).limit(100).all() + sqa_columns = list(inspect(self.table).c) + sqa_sample = ( + self.client.query(*sqa_columns).select_from(self.table).limit(100).all() + ) return TableData( columns=[column.name for column in sqa_columns], diff --git a/ingestion/tests/e2e/configs/connectors/db2.py b/ingestion/tests/e2e/configs/connectors/db2.py index b6202f2b6a11..5841d767886e 100644 --- a/ingestion/tests/e2e/configs/connectors/db2.py +++ b/ingestion/tests/e2e/configs/connectors/db2.py @@ -9,6 +9,7 @@ class Db2Connector(DataBaseConnectorInterface): """db2 connector""" + def get_service(self, page: Page): """get service from the service page""" page.get_by_test_id("Db2").click() @@ -33,4 +34,4 @@ def set_connection(self, page): page.get_by_label("database*").fill(os.environ["E2E_DB2_DATABASE"]) expect(page.get_by_label("database*")).to_have_value( os.environ["E2E_DB2_DATABASE"] - ) \ No newline at end of file + ) diff --git a/ingestion/tests/e2e/configs/connectors/druid.py b/ingestion/tests/e2e/configs/connectors/druid.py index 31a60e5361a6..17360131956d 100644 --- a/ingestion/tests/e2e/configs/connectors/druid.py +++ b/ingestion/tests/e2e/configs/connectors/druid.py @@ -9,6 +9,7 @@ class DruidConnector(DataBaseConnectorInterface): """db2 connector""" + def get_service(self, page: Page): """get service from the service page""" page.get_by_test_id("Druid").click() diff --git a/ingestion/tests/e2e/configs/connectors/hive.py b/ingestion/tests/e2e/configs/connectors/hive.py index 143957b57f08..f767d554faa6 100644 --- a/ingestion/tests/e2e/configs/connectors/hive.py +++ b/ingestion/tests/e2e/configs/connectors/hive.py @@ -14,9 +14,11 @@ def get_service(self, page: Page): def set_connection(self, page): """Set connection for redshift service""" - page.locator("[id=\"root\\/hostPort\"]").fill(os.environ["E2E_HIVE_HOST_PORT"]) - expect(page.locator("[id=\"root\\/hostPort\"]")).to_have_value( + page.locator('[id="root\\/hostPort"]').fill(os.environ["E2E_HIVE_HOST_PORT"]) + expect(page.locator('[id="root\\/hostPort"]')).to_have_value( os.environ["E2E_HIVE_HOST_PORT"] ) - page.locator("[id=\"root\\/metastoreConnection__oneof_select\"]").select_option("2") \ No newline at end of file + page.locator('[id="root\\/metastoreConnection__oneof_select"]').select_option( + "2" + ) diff --git a/ingestion/tests/e2e/configs/connectors/interface.py b/ingestion/tests/e2e/configs/connectors/interface.py index 31a006d40a94..3e4335b53633 100644 --- a/ingestion/tests/e2e/configs/connectors/interface.py +++ b/ingestion/tests/e2e/configs/connectors/interface.py @@ -1,25 +1,31 @@ """connectors interface""" import random -from time import sleep -import time import string +import time from abc import ABC, abstractmethod -from typing import List +from time import sleep -from playwright.sync_api import Page, expect, TimeoutError +from playwright.sync_api import Page, TimeoutError, expect -from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState -from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, IngestionFilterConfig, ValidationTestConfig -from ingestion.tests.e2e.configs.users.admin import Admin -from metadata.utils.time_utils import get_beginning_of_day_timestamp_mill, get_end_of_day_timestamp_mill -from metadata.ingestion.ometa.ometa_api import OpenMetadata +from 
ingestion.tests.e2e.configs.connectors.model import ( + ConnectorTestConfig, + IngestionFilterConfig, +) from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( OpenMetadataConnection, ) +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import ( + PipelineState, +) from metadata.generated.schema.security.client.openMetadataJWTClientConfig import ( OpenMetadataJWTClientConfig, ) +from metadata.ingestion.ometa.ometa_api import OpenMetadata +from metadata.utils.time_utils import ( + get_beginning_of_day_timestamp_mill, + get_end_of_day_timestamp_mill, +) BASE_URL = "http://localhost:8585" @@ -113,29 +119,35 @@ def _set_ingestion_filter(self, type_: str, page: Page): if not filter_elements: continue for element in filter_elements: - page.locator(f'xpath=//*[@id="root/{container_type}FilterPattern/{filter_type}"]').fill( - element - ) + page.locator( + f'xpath=//*[@id="root/{container_type}FilterPattern/{filter_type}"]' + ).fill(element) - - def get_sorted_ingestion_pipeline_statues(self, ingestion_pipeline_fqn: str, desc=True): + def get_sorted_ingestion_pipeline_statues( + self, ingestion_pipeline_fqn: str, desc=True + ): statuses = self.ometa.get_pipeline_status_between_ts( - ingestion_pipeline_fqn, - get_beginning_of_day_timestamp_mill(), - get_end_of_day_timestamp_mill(), - ) - return sorted(statuses, key=lambda x: x.startDate.__root__, reverse=True if desc else False) - + ingestion_pipeline_fqn, + get_beginning_of_day_timestamp_mill(), + get_end_of_day_timestamp_mill(), + ) + return sorted( + statuses, + key=lambda x: x.startDate.__root__, + reverse=True if desc else False, + ) def get_pipeline_status(self, ingestion_pipeline_fqn: str): # Not best practice. Should use `expect`, though playwright does not have a `wait_until` function # we'll make a call to the API to get the pipeline status and check if it's success status = None - timeout = time.time() + 60*5 # 5 minutes from now + timeout = time.time() + 60 * 5 # 5 minutes from now while not status or status == PipelineState.running: if time.time() > timeout: - raise TimeoutError("Pipeline with status {status} has been running for more than 5 minutes") + raise TimeoutError( + "Pipeline with status {status} has been running for more than 5 minutes" + ) statuses = self.get_sorted_ingestion_pipeline_statues( ingestion_pipeline_fqn, ) @@ -148,7 +160,7 @@ def get_pipeline_status(self, ingestion_pipeline_fqn: str): def create_service_ingest_metadata(self, page: Page): """Ingest redshift service data - + Args: page (Page): playwright page. Should be logged in and pointing to the home page e.g. 
page.goto(f"{BASE_URL}/") diff --git a/ingestion/tests/e2e/configs/connectors/model.py b/ingestion/tests/e2e/configs/connectors/model.py index f82f04da3900..02b567c4d6ce 100644 --- a/ingestion/tests/e2e/configs/connectors/model.py +++ b/ingestion/tests/e2e/configs/connectors/model.py @@ -1,6 +1,7 @@ """Connector model config for testing.""" from typing import Optional + from pydantic import BaseModel @@ -8,25 +9,30 @@ class IngestionFilterConfig(BaseModel): includes: Optional[list[str]] = [] excludes: Optional[list[str]] = [] + class IngestionTestConfig(BaseModel): database: Optional[IngestionFilterConfig] schema_: Optional[IngestionFilterConfig] table: Optional[IngestionFilterConfig] + class ConnectorIngestionTestConfig(BaseModel): metadata: Optional[IngestionTestConfig] profiler: Optional[IngestionTestConfig] + class ValidationTestConfig(BaseModel): service: Optional[str] database: Optional[str] schema_: Optional[str] table: Optional[str] + class ConnectorValidationTestConfig(BaseModel): metadata: Optional[ValidationTestConfig] profiler: Optional[ValidationTestConfig] + class ConnectorTestConfig(BaseModel): ingestion: Optional[ConnectorIngestionTestConfig] - validation: Optional[ConnectorValidationTestConfig] \ No newline at end of file + validation: Optional[ConnectorValidationTestConfig] diff --git a/ingestion/tests/e2e/conftest.py b/ingestion/tests/e2e/conftest.py index f9d0fd4547a3..f8500505b4a5 100644 --- a/ingestion/tests/e2e/conftest.py +++ b/ingestion/tests/e2e/conftest.py @@ -2,9 +2,9 @@ import pytest -from playwright.sync_api import Browser, expect, Page +from playwright.sync_api import Browser, Page, expect -from ingestion.tests.e2e.configs.common import create_user, go_to_service +from ingestion.tests.e2e.configs.common import go_to_service from ingestion.tests.e2e.configs.users.admin import Admin TIMEOUT = 60000 @@ -38,7 +38,7 @@ def admin_page_context(page: Page): @pytest.fixture(scope="class") -def setUpClass(browser: Browser, request): # pylint: disable=invalid-name +def setUpClass(browser: Browser, request): # pylint: disable=invalid-name """set up class for ingestion pipelines""" context_ = browser.new_context(base_url=BASE_URL) page = context_.new_page() diff --git a/ingestion/tests/e2e/entity/database/common_assertions.py b/ingestion/tests/e2e/entity/database/common_assertions.py index b1875a4ece9d..f2803879fac3 100644 --- a/ingestion/tests/e2e/entity/database/common_assertions.py +++ b/ingestion/tests/e2e/entity/database/common_assertions.py @@ -18,13 +18,13 @@ def assert_change_database_owner(page_context: Page, service_name: str): def assert_profile_data( - page_context: Page, - service_name: str, - database: str, - schema: str, - table: str, - connector_obj, - ): + page_context: Page, + service_name: str, + database: str, + schema: str, + table: str, + connector_obj, +): """Assert profile data have been computed correctly""" go_to_service("Databases", page_context, service_name) page_context.get_by_role("link", name=database).click() @@ -33,17 +33,19 @@ def assert_profile_data( page_context.get_by_text("Profiler & Data Quality").click() for card in range(connector_obj.profiler_summary_card_count): summary_card = page_context.get_by_test_id("summary-card-container").nth(card) - description = summary_card.get_by_test_id("summary-card-description").inner_text() + description = summary_card.get_by_test_id( + "summary-card-description" + ).inner_text() assert description not in {"0"} def assert_sample_data_ingestion( - page_context: Page, - service_name: str, - 
database: str, - schema: str, - table: str, - ): + page_context: Page, + service_name: str, + database: str, + schema: str, + table: str, +): """assert sample data are ingested as expected""" go_to_service("Databases", page_context, service_name) page_context.get_by_role("link", name=database).click() @@ -53,14 +55,15 @@ def assert_sample_data_ingestion( expect(page_context.get_by_test_id("sample-data")).to_be_visible() + def assert_pii_column_auto_tagging( - page_context: Page, - service_name: str, - database: str, - schema: str, - table: str, - column: str, - ): + page_context: Page, + service_name: str, + database: str, + schema: str, + table: str, + column: str, +): """assert pii column auto tagging tagged as expected""" go_to_service("Databases", page_context, service_name) page_context.get_by_role("link", name=database).click() @@ -68,6 +71,6 @@ def assert_pii_column_auto_tagging( page_context.get_by_role("link", name=table, exact=True).click() table_row = page_context.locator(f'tr:has-text("{column}")') - tag = table_row.locator('td:nth-child(4)') + tag = table_row.locator("td:nth-child(4)") expect(tag).to_be_visible() - assert tag.text_content() in {"Sensitive", "NonSensitive"} \ No newline at end of file + assert tag.text_content() in {"Sensitive", "NonSensitive"} diff --git a/ingestion/tests/e2e/entity/database/test_db2.py b/ingestion/tests/e2e/entity/database/test_db2.py index 9d50377ce7df..7648db2c3ea5 100644 --- a/ingestion/tests/e2e/entity/database/test_db2.py +++ b/ingestion/tests/e2e/entity/database/test_db2.py @@ -1,36 +1,40 @@ """Test Hive database ingestion.""" import pytest -from playwright.sync_api import Page -from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState -from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, ConnectorValidationTestConfig, IngestionTestConfig, ConnectorIngestionTestConfig, IngestionFilterConfig, ValidationTestConfig from ingestion.tests.e2e.configs.connectors.db2 import Db2Connector -from ingestion.tests.e2e.entity.database.common_assertions import assert_change_database_owner, assert_profile_data, assert_sample_data_ingestion +from ingestion.tests.e2e.configs.connectors.model import ( + ConnectorIngestionTestConfig, + ConnectorTestConfig, + ConnectorValidationTestConfig, + IngestionFilterConfig, + IngestionTestConfig, + ValidationTestConfig, +) @pytest.mark.parametrize( "setUpClass", - [{"connector_obj":Db2Connector( - ConnectorTestConfig( - ingestion=ConnectorIngestionTestConfig( - metadata=IngestionTestConfig( - database=IngestionFilterConfig( - includes=["testdb"] + [ + { + "connector_obj": Db2Connector( + ConnectorTestConfig( + ingestion=ConnectorIngestionTestConfig( + metadata=IngestionTestConfig( + database=IngestionFilterConfig(includes=["testdb"]), + ), # type: ignore + ), + validation=ConnectorValidationTestConfig( + profiler=ValidationTestConfig( + database="testdb", schema_="sampledata", table="customer" + ) # type: ignore ), - ), # type: ignore - ), - validation=ConnectorValidationTestConfig( - profiler=ValidationTestConfig( - database="testdb", - schema_="sampledata", - table="customer" - ) # type: ignore + ) ) - ) - )}], - indirect=True + } + ], + indirect=True, ) @pytest.mark.usefixtures("setUpClass") class TestHiveConnector: - """We need to validate dependency can be installed in the test env.""" \ No newline at end of file + """We need to validate dependency can be installed in the test env.""" diff --git 
a/ingestion/tests/e2e/entity/database/test_druid.py b/ingestion/tests/e2e/entity/database/test_druid.py index f228adc54449..a199f7dbc257 100644 --- a/ingestion/tests/e2e/entity/database/test_druid.py +++ b/ingestion/tests/e2e/entity/database/test_druid.py @@ -1,41 +1,53 @@ """Test default database ingestion (Redshift).""" -from playwright.sync_api import Page import pytest +from playwright.sync_api import Page -from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState -from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, ConnectorValidationTestConfig, IngestionTestConfig, ConnectorIngestionTestConfig, IngestionFilterConfig, ValidationTestConfig from ingestion.tests.e2e.configs.connectors.druid import DruidConnector -from ingestion.tests.e2e.entity.database.common_assertions import assert_change_database_owner, assert_pii_column_auto_tagging, assert_profile_data, assert_sample_data_ingestion +from ingestion.tests.e2e.configs.connectors.model import ( + ConnectorIngestionTestConfig, + ConnectorTestConfig, + ConnectorValidationTestConfig, + IngestionFilterConfig, + IngestionTestConfig, + ValidationTestConfig, +) +from ingestion.tests.e2e.entity.database.common_assertions import ( + assert_change_database_owner, + assert_pii_column_auto_tagging, + assert_profile_data, + assert_sample_data_ingestion, +) +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import ( + PipelineState, +) @pytest.mark.parametrize( "setUpClass", - [{"connector_obj":DruidConnector( - ConnectorTestConfig( - ingestion=ConnectorIngestionTestConfig( - metadata=IngestionTestConfig( - schema_=IngestionFilterConfig( - includes=["druid"] + [ + { + "connector_obj": DruidConnector( + ConnectorTestConfig( + ingestion=ConnectorIngestionTestConfig( + metadata=IngestionTestConfig( + schema_=IngestionFilterConfig(includes=["druid"]), + ), # type: ignore + profiler=IngestionTestConfig( + schema_=IngestionFilterConfig(includes=["druid"]), + ), # type: ignore ), - ), # type: ignore - profiler=IngestionTestConfig( - schema_=IngestionFilterConfig( - includes=["druid"] + validation=ConnectorValidationTestConfig( + profiler=ValidationTestConfig( + database="default", schema_="druid", table="inline_data" + ) # type: ignore ), - ) # type: ignore - ), - validation=ConnectorValidationTestConfig( - profiler=ValidationTestConfig( - database="default", - schema_="druid", - table="inline_data" - ) # type: ignore + ) ) - ) - )}], - indirect=True + } + ], + indirect=True, ) @pytest.mark.usefixtures("setUpClass") class TestRedshiftConnector: @@ -47,7 +59,6 @@ def test_pipelines_statuses(self): # if the connector does not support profiler ingestion return None as status assert self.profiler_ingestion_status in {PipelineState.success, None} - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) def test_change_database_owner(self, admin_page_context: Page): """test change database owner""" diff --git a/ingestion/tests/e2e/entity/database/test_hive.py b/ingestion/tests/e2e/entity/database/test_hive.py index f17698df170a..127812a802e8 100644 --- a/ingestion/tests/e2e/entity/database/test_hive.py +++ b/ingestion/tests/e2e/entity/database/test_hive.py @@ -3,33 +3,46 @@ import pytest from playwright.sync_api import Page -from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState -from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, ConnectorValidationTestConfig, 
IngestionTestConfig, ConnectorIngestionTestConfig, IngestionFilterConfig, ValidationTestConfig from ingestion.tests.e2e.configs.connectors.hive import HiveConnector -from ingestion.tests.e2e.entity.database.common_assertions import assert_change_database_owner, assert_profile_data, assert_sample_data_ingestion +from ingestion.tests.e2e.configs.connectors.model import ( + ConnectorIngestionTestConfig, + ConnectorTestConfig, + ConnectorValidationTestConfig, + IngestionFilterConfig, + IngestionTestConfig, + ValidationTestConfig, +) +from ingestion.tests.e2e.entity.database.common_assertions import ( + assert_change_database_owner, + assert_profile_data, + assert_sample_data_ingestion, +) +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import ( + PipelineState, +) @pytest.mark.parametrize( "setUpClass", - [{"connector_obj":HiveConnector( - ConnectorTestConfig( - ingestion=ConnectorIngestionTestConfig( - metadata=IngestionTestConfig( - database=IngestionFilterConfig( - includes=["default"] + [ + { + "connector_obj": HiveConnector( + ConnectorTestConfig( + ingestion=ConnectorIngestionTestConfig( + metadata=IngestionTestConfig( + database=IngestionFilterConfig(includes=["default"]), + ), # type: ignore ), - ), # type: ignore - ), - validation=ConnectorValidationTestConfig( - profiler=ValidationTestConfig( - database="default", - schema_="default", - table="t1" - ) # type: ignore + validation=ConnectorValidationTestConfig( + profiler=ValidationTestConfig( + database="default", schema_="default", table="t1" + ) # type: ignore + ), + ) ) - ) - )}], - indirect=True + } + ], + indirect=True, ) @pytest.mark.usefixtures("setUpClass") class TestHiveConnector: @@ -41,7 +54,6 @@ def test_pipelines_statuses(self): # if the connector does not support profiler ingestion return None as status assert self.profiler_ingestion_status in {PipelineState.success, None} - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) def test_change_database_owner(self, admin_page_context: Page): """test change database owner""" diff --git a/ingestion/tests/e2e/entity/database/test_redshift.py b/ingestion/tests/e2e/entity/database/test_redshift.py index 7575d9a5ed7c..4725c72238dd 100644 --- a/ingestion/tests/e2e/entity/database/test_redshift.py +++ b/ingestion/tests/e2e/entity/database/test_redshift.py @@ -1,41 +1,53 @@ """Test default database ingestion (Redshift).""" -from playwright.sync_api import Page import pytest +from playwright.sync_api import Page -from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import PipelineState -from ingestion.tests.e2e.configs.connectors.model import ConnectorTestConfig, ConnectorValidationTestConfig, IngestionTestConfig, ConnectorIngestionTestConfig, IngestionFilterConfig, ValidationTestConfig +from ingestion.tests.e2e.configs.connectors.model import ( + ConnectorIngestionTestConfig, + ConnectorTestConfig, + ConnectorValidationTestConfig, + IngestionFilterConfig, + IngestionTestConfig, + ValidationTestConfig, +) from ingestion.tests.e2e.configs.connectors.redshift import RedshiftConnector -from ingestion.tests.e2e.entity.database.common_assertions import assert_change_database_owner, assert_pii_column_auto_tagging, assert_profile_data, assert_sample_data_ingestion +from ingestion.tests.e2e.entity.database.common_assertions import ( + assert_change_database_owner, + assert_pii_column_auto_tagging, + assert_profile_data, + assert_sample_data_ingestion, +) +from 
metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import ( + PipelineState, +) @pytest.mark.parametrize( "setUpClass", - [{"connector_obj":RedshiftConnector( - ConnectorTestConfig( - ingestion=ConnectorIngestionTestConfig( - metadata=IngestionTestConfig( - schema_=IngestionFilterConfig( - includes=["dbt_jaffle"] + [ + { + "connector_obj": RedshiftConnector( + ConnectorTestConfig( + ingestion=ConnectorIngestionTestConfig( + metadata=IngestionTestConfig( + schema_=IngestionFilterConfig(includes=["dbt_jaffle"]), + ), # type: ignore + profiler=IngestionTestConfig( + table=IngestionFilterConfig(includes=["customers"]), + ), # type: ignore ), - ), # type: ignore - profiler=IngestionTestConfig( - table=IngestionFilterConfig( - includes=["customers"] + validation=ConnectorValidationTestConfig( + profiler=ValidationTestConfig( + database="dev", schema_="dbt_jaffle", table="customers" + ) # type: ignore ), - ) # type: ignore - ), - validation=ConnectorValidationTestConfig( - profiler=ValidationTestConfig( - database="dev", - schema_="dbt_jaffle", - table="customers" - ) # type: ignore + ) ) - ) - )}], - indirect=True + } + ], + indirect=True, ) @pytest.mark.usefixtures("setUpClass") class TestRedshiftConnector: @@ -47,7 +59,6 @@ def test_pipelines_statuses(self): # if the connector does not support profiler ingestion return None as status assert self.profiler_ingestion_status in {PipelineState.success, None} - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) def test_change_database_owner(self, admin_page_context: Page): """test change database owner""" @@ -85,5 +96,5 @@ def test_pii_colum_auto_tagging(self, admin_page_context: Page): self.connector_obj.validation_config.profiler.database, self.connector_obj.validation_config.profiler.schema_, self.connector_obj.validation_config.profiler.table, - "first_name" + "first_name", ) From e5045828812b05d24f07e21b48a66bab007cbde3 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 09:10:50 +0200 Subject: [PATCH 04/22] fix: python linting --- .../profiler/interface/sqlalchemy/profiler_interface.py | 6 +++++- ingestion/src/metadata/profiler/metrics/static/stddev.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index 7c6d01c77c5d..58b266157baa 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -261,7 +261,11 @@ def _compute_query_metrics( return dict(row) except ResourceClosedError as exc: # if the query returns no results, we will get a ResourceClosedError from Druid - if not runner._session.get_bind().dialect.name == Dialects.Druid: # pylint: disable=protected-access + if ( + # pylint: disable=protected-access + not runner._session.get_bind().dialect.name + == Dialects.Druid + ): msg = f"Error trying to compute profile for {runner.table.__tablename__}.{column.name}: {exc}" handle_query_exception(msg, exc, session) except Exception as exc: diff --git a/ingestion/src/metadata/profiler/metrics/static/stddev.py b/ingestion/src/metadata/profiler/metrics/static/stddev.py index e2654df12160..e55c843ec112 100644 --- a/ingestion/src/metadata/profiler/metrics/static/stddev.py +++ b/ingestion/src/metadata/profiler/metrics/static/stddev.py @@ -71,7 +71,7 @@ def _(element, compiler, **kw): @compiles(StdDevFn, 
Dialects.Druid) -def _(element, compiler, **kw): # pylint: disable=unused-argument +def _(element, compiler, **kw): # pylint: disable=unused-argument """returns stdv for druid. Could not validate with our cluster we might need to look into installing the druid-stats module https://druid.apache.org/docs/latest/configuration/extensions/#loading-extensions From eb2ea1610d7a1fd53cf9dda9b7d91d7ba5126375 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 09:25:04 +0200 Subject: [PATCH 05/22] fix: do not compute random sample if profile sample is 100 --- .../metadata/profiler/processor/sampler/sqlalchemy/sampler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py index b9216643ca66..925c4e2dbe24 100644 --- a/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py +++ b/ingestion/src/metadata/profiler/processor/sampler/sqlalchemy/sampler.py @@ -109,7 +109,7 @@ def random_sample(self) -> Union[DeclarativeMeta, AliasedClass]: if self._profile_sample_query: return self._rdn_sample_from_user_query() - if not self.profile_sample: + if not self.profile_sample or int(self.profile_sample) == 100: if self._partition_details: return self._partitioned_table() From ecfe2d94bccba8ba4af15b7b8ab6d6d5cc29313f Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 09:30:27 +0200 Subject: [PATCH 06/22] fix: updated workflow to test on push --- .github/workflows/playwright-integration-tests-mysql.yml | 9 +++++---- .../workflows/playwright-integration-tests-postgres.yml | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/.github/workflows/playwright-integration-tests-mysql.yml b/.github/workflows/playwright-integration-tests-mysql.yml index bac78430015a..9fc9226efc6d 100644 --- a/.github/workflows/playwright-integration-tests-mysql.yml +++ b/.github/workflows/playwright-integration-tests-mysql.yml @@ -14,6 +14,7 @@ name: MySQL Playwright Integration Tests on: + pull_request: push: branches: - main @@ -21,10 +22,10 @@ on: - '1.[0-9]+[0-9]+' paths-ignore: - 'openmetadata-docs/**' - pull_request_target: - types: [labeled, opened, synchronize, reopened] - paths-ignore: - - 'openmetadata-docs/**' + # pull_request_target: + # types: [labeled, opened, synchronize, reopened] + # paths-ignore: + # - 'openmetadata-docs/**' jobs: playwright-mysql: diff --git a/.github/workflows/playwright-integration-tests-postgres.yml b/.github/workflows/playwright-integration-tests-postgres.yml index 619b8548137a..c854c305eb1c 100644 --- a/.github/workflows/playwright-integration-tests-postgres.yml +++ b/.github/workflows/playwright-integration-tests-postgres.yml @@ -14,6 +14,7 @@ name: Postgres Playwright Integration Tests on: + pull_request: push: branches: - main @@ -21,10 +22,10 @@ on: - '1.[0-9]+[0-9]+' paths-ignore: - 'openmetadata-docs/**' - pull_request_target: - types: [labeled, opened, synchronize, reopened] - paths-ignore: - - 'openmetadata-docs/**' + # pull_request_target: + # types: [labeled, opened, synchronize, reopened] + # paths-ignore: + # - 'openmetadata-docs/**' jobs: playwright-postgresql: From bcf0cd611a02ef74566663c5d5b49e1174d552e0 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 10:46:41 +0200 Subject: [PATCH 07/22] fix: move connector config to category folder --- ingestion/tests/e2e/configs/connectors/database/__init__.py | 0 
ingestion/tests/e2e/configs/connectors/{ => database}/db2.py | 0 ingestion/tests/e2e/configs/connectors/{ => database}/druid.py | 2 +- ingestion/tests/e2e/configs/connectors/{ => database}/hive.py | 0 .../tests/e2e/configs/connectors/{ => database}/interface.py | 0 .../tests/e2e/configs/connectors/{ => database}/redshift.py | 0 6 files changed, 1 insertion(+), 1 deletion(-) create mode 100644 ingestion/tests/e2e/configs/connectors/database/__init__.py rename ingestion/tests/e2e/configs/connectors/{ => database}/db2.py (100%) rename ingestion/tests/e2e/configs/connectors/{ => database}/druid.py (96%) rename ingestion/tests/e2e/configs/connectors/{ => database}/hive.py (100%) rename ingestion/tests/e2e/configs/connectors/{ => database}/interface.py (100%) rename ingestion/tests/e2e/configs/connectors/{ => database}/redshift.py (100%) diff --git a/ingestion/tests/e2e/configs/connectors/database/__init__.py b/ingestion/tests/e2e/configs/connectors/database/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/ingestion/tests/e2e/configs/connectors/db2.py b/ingestion/tests/e2e/configs/connectors/database/db2.py similarity index 100% rename from ingestion/tests/e2e/configs/connectors/db2.py rename to ingestion/tests/e2e/configs/connectors/database/db2.py diff --git a/ingestion/tests/e2e/configs/connectors/druid.py b/ingestion/tests/e2e/configs/connectors/database/druid.py similarity index 96% rename from ingestion/tests/e2e/configs/connectors/druid.py rename to ingestion/tests/e2e/configs/connectors/database/druid.py index 17360131956d..4b862fd6c8e1 100644 --- a/ingestion/tests/e2e/configs/connectors/druid.py +++ b/ingestion/tests/e2e/configs/connectors/database/druid.py @@ -8,7 +8,7 @@ class DruidConnector(DataBaseConnectorInterface): - """db2 connector""" + """druid connector""" def get_service(self, page: Page): """get service from the service page""" diff --git a/ingestion/tests/e2e/configs/connectors/hive.py b/ingestion/tests/e2e/configs/connectors/database/hive.py similarity index 100% rename from ingestion/tests/e2e/configs/connectors/hive.py rename to ingestion/tests/e2e/configs/connectors/database/hive.py diff --git a/ingestion/tests/e2e/configs/connectors/interface.py b/ingestion/tests/e2e/configs/connectors/database/interface.py similarity index 100% rename from ingestion/tests/e2e/configs/connectors/interface.py rename to ingestion/tests/e2e/configs/connectors/database/interface.py diff --git a/ingestion/tests/e2e/configs/connectors/redshift.py b/ingestion/tests/e2e/configs/connectors/database/redshift.py similarity index 100% rename from ingestion/tests/e2e/configs/connectors/redshift.py rename to ingestion/tests/e2e/configs/connectors/database/redshift.py From d9144059f5a49cb69988879e81fc063315402254 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 10:46:56 +0200 Subject: [PATCH 08/22] fix: updated imports --- ingestion/tests/e2e/entity/database/test_db2.py | 6 +++--- .../tests/e2e/entity/database/test_druid.py | 17 +++++++++-------- .../tests/e2e/entity/database/test_hive.py | 9 +++++---- .../tests/e2e/entity/database/test_redshift.py | 11 ++++++----- 4 files changed, 23 insertions(+), 20 deletions(-) diff --git a/ingestion/tests/e2e/entity/database/test_db2.py b/ingestion/tests/e2e/entity/database/test_db2.py index 7648db2c3ea5..91a8b351d648 100644 --- a/ingestion/tests/e2e/entity/database/test_db2.py +++ b/ingestion/tests/e2e/entity/database/test_db2.py @@ -1,8 +1,8 @@ -"""Test Hive database ingestion.""" +"""Test Db2 database 
ingestion.""" import pytest -from ingestion.tests.e2e.configs.connectors.db2 import Db2Connector +from ingestion.tests.e2e.configs.connectors.database.db2 import Db2Connector from ingestion.tests.e2e.configs.connectors.model import ( ConnectorIngestionTestConfig, ConnectorTestConfig, @@ -36,5 +36,5 @@ indirect=True, ) @pytest.mark.usefixtures("setUpClass") -class TestHiveConnector: +class TestDb2Connector: """We need to validate dependency can be installed in the test env.""" diff --git a/ingestion/tests/e2e/entity/database/test_druid.py b/ingestion/tests/e2e/entity/database/test_druid.py index a199f7dbc257..c1e73e99aff0 100644 --- a/ingestion/tests/e2e/entity/database/test_druid.py +++ b/ingestion/tests/e2e/entity/database/test_druid.py @@ -1,10 +1,10 @@ -"""Test default database ingestion (Redshift).""" +"""Test default database ingestion (Druid).""" import pytest from playwright.sync_api import Page -from ingestion.tests.e2e.configs.connectors.druid import DruidConnector +from ingestion.tests.e2e.configs.connectors.database.druid import DruidConnector from ingestion.tests.e2e.configs.connectors.model import ( ConnectorIngestionTestConfig, ConnectorTestConfig, @@ -50,21 +50,22 @@ indirect=True, ) @pytest.mark.usefixtures("setUpClass") -class TestRedshiftConnector: - """Redshift connector test case""" +class TestDruidConnector: + """Druid connector test case""" + @pytest.mark.dependency() def test_pipelines_statuses(self): """check ingestion pipelines ran successfully""" assert self.metadata_ingestion_status == PipelineState.success # if the connector does not support profiler ingestion return None as status assert self.profiler_ingestion_status in {PipelineState.success, None} - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestDruidConnector::test_pipelines_statuses"]) def test_change_database_owner(self, admin_page_context: Page): """test change database owner""" assert_change_database_owner(admin_page_context, self.service_name) - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestDruidConnector::test_pipelines_statuses"]) def test_check_profile_data(self, admin_page_context: Page): """check profile data are visible""" assert_profile_data( @@ -76,7 +77,7 @@ def test_check_profile_data(self, admin_page_context: Page): self.connector_obj, ) - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestDruidConnector::test_pipelines_statuses"]) def test_sample_data_ingestion(self, admin_page_context: Page): """test sample dta is ingested as expected for the table""" assert_sample_data_ingestion( @@ -87,7 +88,7 @@ def test_sample_data_ingestion(self, admin_page_context: Page): self.connector_obj.validation_config.profiler.table, ) - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestDruidConnector::test_pipelines_statuses"]) def test_pii_colum_auto_tagging(self, admin_page_context: Page): """check pii column auto tagging tagged as expected""" assert_pii_column_auto_tagging( diff --git a/ingestion/tests/e2e/entity/database/test_hive.py b/ingestion/tests/e2e/entity/database/test_hive.py index 127812a802e8..a6a61a22d595 100644 --- a/ingestion/tests/e2e/entity/database/test_hive.py +++ b/ingestion/tests/e2e/entity/database/test_hive.py @@ -3,7 +3,7 @@ import pytest from playwright.sync_api import Page -from ingestion.tests.e2e.configs.connectors.hive import HiveConnector +from 
ingestion.tests.e2e.configs.connectors.database.hive import HiveConnector from ingestion.tests.e2e.configs.connectors.model import ( ConnectorIngestionTestConfig, ConnectorTestConfig, @@ -48,18 +48,19 @@ class TestHiveConnector: """Hive connector test case""" + @pytest.mark.dependency() def test_pipelines_statuses(self): """check ingestion pipelines ran successfully""" assert self.metadata_ingestion_status == PipelineState.success # if the connector does not support profiler ingestion return None as status assert self.profiler_ingestion_status in {PipelineState.success, None} - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(name="TestHiveConnector::test_pipelines_statuses") def test_change_database_owner(self, admin_page_context: Page): """test change database owner""" assert_change_database_owner(admin_page_context, self.service_name) - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestHiveConnector::test_pipelines_statuses"]) def test_check_profile_data(self, admin_page_context: Page): """check profile data are visible""" assert_profile_data( @@ -71,7 +72,7 @@ def test_check_profile_data(self, admin_page_context: Page): self.connector_obj, ) - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestHiveConnector::test_pipelines_statuses"]) def test_sample_data_ingestion(self, admin_page_context: Page): """test sample dta is ingested as expected for the table""" assert_sample_data_ingestion( diff --git a/ingestion/tests/e2e/entity/database/test_redshift.py b/ingestion/tests/e2e/entity/database/test_redshift.py index 4725c72238dd..4aaa88475c7b 100644 --- a/ingestion/tests/e2e/entity/database/test_redshift.py +++ b/ingestion/tests/e2e/entity/database/test_redshift.py @@ -12,7 +12,7 @@ IngestionTestConfig, ValidationTestConfig, ) -from ingestion.tests.e2e.configs.connectors.redshift import RedshiftConnector +from ingestion.tests.e2e.configs.connectors.database.redshift import RedshiftConnector from ingestion.tests.e2e.entity.database.common_assertions import ( assert_change_database_owner, assert_pii_column_auto_tagging, @@ -53,18 +53,19 @@ class TestRedshiftConnector: """Redshift connector test case""" + @pytest.mark.dependency() def test_pipelines_statuses(self): """check ingestion pipelines ran successfully""" assert self.metadata_ingestion_status == PipelineState.success # if the connector does not support profiler ingestion return None as status assert self.profiler_ingestion_status in {PipelineState.success, None} - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestRedshiftConnector::test_pipelines_statuses"]) def test_change_database_owner(self, admin_page_context: Page): """test change database owner""" assert_change_database_owner(admin_page_context, self.service_name) - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestRedshiftConnector::test_pipelines_statuses"]) def test_check_profile_data(self, admin_page_context: Page): """check profile data are visible""" assert_profile_data( @@ -76,7 +77,7 @@ def test_check_profile_data(self, admin_page_context: Page): self.connector_obj, ) - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestRedshiftConnector::test_pipelines_statuses"]) def test_sample_data_ingestion(self, admin_page_context: Page): """test sample dta is ingested as expected for the table""" 
assert_sample_data_ingestion( @@ -87,7 +88,7 @@ def test_sample_data_ingestion(self, admin_page_context: Page): self.connector_obj.validation_config.profiler.table, ) - @pytest.mark.dependency(depends=["test_pipelines_statuses"]) + @pytest.mark.dependency(depends=["TestRedshiftConnector::test_pipelines_statuses"]) def test_pii_colum_auto_tagging(self, admin_page_context: Page): """check pii column auto tagging tagged as expected""" assert_pii_column_auto_tagging( From 1b075dd1a4a7de86b14ba4cff26e954d60c144c3 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 10:47:09 +0200 Subject: [PATCH 09/22] fix: added pytest-dependency package --- ingestion/setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/ingestion/setup.py b/ingestion/setup.py index 3485f33aff17..a4362aceeb37 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -279,6 +279,7 @@ def get_long_description(): "pytest==7.0.0", "pytest-cov", "pytest-order", + "pytest-dependency", # install dbt dependency "dbt-artifacts-parser", VERSIONS["sqlalchemy-databricks"], From f7a356549b0dd4864ae8e2a885dc0948ec09160a Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 10:47:21 +0200 Subject: [PATCH 10/22] fix: updated readme.md --- ingestion/tests/e2e/README.md | 67 ++++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 1 deletion(-) diff --git a/ingestion/tests/e2e/README.md b/ingestion/tests/e2e/README.md index 117503fca028..42757a413f59 100644 --- a/ingestion/tests/e2e/README.md +++ b/ingestion/tests/e2e/README.md @@ -5,8 +5,73 @@ https://playwright.dev/python/docs/intro In the `e2e` folder you will find 2 folders and 1 file: - `conftest.py`: defines some module scope fixture (module here is the `e2e` folder). All tests will use `init_with_redshift` by default -- ingestin metadata from a redshift service. The ingestion will only happens on the first test execution. The `create_data_consumer_user` allows tests to login as a Data Consumer and perform some actions - `configs`: holds all the shared configuration. So far we have 2 main classes families (User and Connector) and common functions -- `entity`: holds entity related tests. It contains a subfolder per source. +- `entity`: holds entity related tests. It contains a subfolder per asset category. In the asset category folder you will find the `common_assertions.py`. This file contains all the common assertions to be ran for that specific asset. ## Install Dependencies and Run Tests run `make install_e2e_tests`. Run `make run_e2e_tests`, you can also pass arguments such as `make run_e2e_tests ARGS="--browser webkit"` to run tests against webkit browser or `make run_e2e_tests ARGS="--headed --slowmo 100"` to run the tests in slowmo mode and head full. +## Adding a new test +The first step is to define the connector config for your source. this happens in `configs/connectors/` folder. For a database connector, you will must ensure your class inherits from `DataBaseConnectorInterface`. You will then need to implement the `get_service()` and `set_connection()`. `get_service` specifies which service to choose from the `/add-service` page of the webside and `set_connection` the different elements to configure on the connector connection config page. If you are unsure how an element can be accessed on the page you can run `playwright codegen http://localhost:8585/` -- more info [here](https://playwright.dev/python/docs/codegen). 
By default `DataBaseConnectorInterface` sets `self.supports_profiler_ingestion=True`, which will result in the profiler ingestion running when the test class is executed. You can set `self.supports_profiler_ingestion=False` in your specific connector to override this behavior.
+
+e.g.
+
+```python
+class DruidConnector(DataBaseConnectorInterface):
+    """druid connector"""
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.supports_profiler_ingestion=False
+
+    def set_connection():
+        ...
+
+    def get_service():
+        ...
+```
+
+
+Once your connector config has been created you will need to add a new test. Simply create a new file in the asset category of your choice (e.g. `entity/database/test_druid.py`). In this file create a new test class and mark this class with `@pytest.mark.usefixtures("setUpClass")` and `@pytest.mark.parametrize("setUpClass", ...)`. The first mark will make sure the `setUpClass` fixture is run before running your tests (this manages the ingestion of metadata and profiler as of Oct-25 2023) and `@pytest.mark.parametrize` will pass the right connector class to the `setUpClass` fixture. The second argument of `@pytest.mark.parametrize` should be as below:
+```python
+[
+    {
+        "connector_obj": (
+            ConnectorTestConfig(...)
+        )
+    }
+]
+```
+
+`ConnectorTestConfig` defines the configuration to use for the test. It has 2 arguments:
+- `ingestion`: This allows you to define the different filtering when performing the ingestion. It expects a `ConnectorIngestionTestConfig` which takes 2 arguments:
+    - `metadata`: this allows you to define metadata ingestion filters. It takes an `IngestionTestConfig` which takes 3 arguments:
+        - `database`: it expects an `IngestionFilterConfig` class which takes 2 arguments:
+            - `includes`: a list of str
+            - `excludes`: a list of str
+        - `schema_`: see `database`
+        - `table`: see `database`
+    - `profiler`: see `metadata`
+- `validation`: this config can be used when we need to validate expectations against specific entities. As of Oct-25 2023 it is only used in the `assert_profile_data`, `assert_sample_data_ingestion` and `assert_pii_column_auto_tagging` test functions of the profiler.
+
+Once you have set up your class you can create your test. There are currently (as of Oct-25 2023) 5 assertions that can be performed:
+- assert pipeline statuses are `success`. You can refer to the implementation in the existing tests
+- `assert_change_database_owner`: assert the owner of a database can be changed
+- `assert_profile_data`: assert table profile data summary is visible
+- `assert_sample_data_ingestion`: assert sample data is ingested and visible
+- `assert_pii_column_auto_tagging`: assert auto PII tagging from the profiler has been performed
+
+Note that in every test method you define, the following class attributes are accessible:
+- `connector_obj`: the connector class passed to `setUpClass` in `@pytest.mark.parametrize`
+- `service_name`: `str` the name of the service that was created for the test
+- `metadata_ingestion_status`: `PipelineState` the ingestion status of the metadata pipeline
+- `profiler_ingestion_status`: `PipelineState` the ingestion status of the profiler pipeline.
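+
+Putting it all together, a minimal test module might look like the sketch below. This is only an illustration: the schema, table, and class names (`my_schema`, `my_table`, `TestMyDruidConnector`) are placeholders, and it only wires up the pipeline status and owner assertions; add the other common assertions as your source requires.
+
+```python
+"""Illustrative e2e test module for a Druid source (names are placeholders)."""
+
+import pytest
+from playwright.sync_api import Page
+
+from ingestion.tests.e2e.configs.connectors.database.druid import DruidConnector
+from ingestion.tests.e2e.configs.connectors.model import (
+    ConnectorIngestionTestConfig,
+    ConnectorTestConfig,
+    ConnectorValidationTestConfig,
+    IngestionFilterConfig,
+    IngestionTestConfig,
+    ValidationTestConfig,
+)
+from ingestion.tests.e2e.entity.database.common_assertions import (
+    assert_change_database_owner,
+)
+from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
+    PipelineState,
+)
+
+
+@pytest.mark.parametrize(
+    "setUpClass",
+    [
+        {
+            "connector_obj": DruidConnector(
+                ConnectorTestConfig(
+                    # only ingest the schema we care about for this test
+                    ingestion=ConnectorIngestionTestConfig(
+                        metadata=IngestionTestConfig(
+                            schema_=IngestionFilterConfig(includes=["my_schema"]),
+                        ),
+                    ),
+                    # entity used by the profiler/sample data assertions
+                    validation=ConnectorValidationTestConfig(
+                        profiler=ValidationTestConfig(
+                            database="default", schema_="my_schema", table="my_table"
+                        )
+                    ),
+                )
+            )
+        }
+    ],
+    indirect=True,
+)
+@pytest.mark.usefixtures("setUpClass")
+class TestMyDruidConnector:
+    """Example connector test case."""
+
+    def test_pipelines_statuses(self):
+        """check ingestion pipelines ran successfully"""
+        assert self.metadata_ingestion_status == PipelineState.success
+        # connectors that do not support profiler ingestion report None here
+        assert self.profiler_ingestion_status in {PipelineState.success, None}
+
+    def test_change_database_owner(self, admin_page_context: Page):
+        """check the database owner can be changed from the UI"""
+        assert_change_database_owner(admin_page_context, self.service_name)
+```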
+ +## Test Coverage +| **tests** | redshift | druid | hive | +|-----------------------------|:--------:|:-----:|:----:| +| metadata ingestion | ✅ | ✅ | ✅ | +| profiler ingestion | ✅ | ✅ | ✅ | +| change DB owner | ✅ | ✅ | ✅ | +| Table Profiler Summary Data | ✅ | ✅ | ✅ | +| Sample data visible | ✅ | ✅ | ✅ | +| Profiler PII auto Tag | ✅ | ✅ | ❌ | \ No newline at end of file From eaaeb2745fca62feb939637a9b0108eaa3bed4fe Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 10:48:03 +0200 Subject: [PATCH 11/22] fix: python linting --- ingestion/tests/e2e/entity/database/test_redshift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingestion/tests/e2e/entity/database/test_redshift.py b/ingestion/tests/e2e/entity/database/test_redshift.py index 4aaa88475c7b..4a7dec6ddac4 100644 --- a/ingestion/tests/e2e/entity/database/test_redshift.py +++ b/ingestion/tests/e2e/entity/database/test_redshift.py @@ -4,6 +4,7 @@ import pytest from playwright.sync_api import Page +from ingestion.tests.e2e.configs.connectors.database.redshift import RedshiftConnector from ingestion.tests.e2e.configs.connectors.model import ( ConnectorIngestionTestConfig, ConnectorTestConfig, @@ -12,7 +13,6 @@ IngestionTestConfig, ValidationTestConfig, ) -from ingestion.tests.e2e.configs.connectors.database.redshift import RedshiftConnector from ingestion.tests.e2e.entity.database.common_assertions import ( assert_change_database_owner, assert_pii_column_auto_tagging, From 02716229945ee872ca78a5d948c777ee254cecbc Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 10:53:45 +0200 Subject: [PATCH 12/22] fix: updated profile doc for Druid sampling --- .../connectors/ingestion/workflows/profiler/index.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md b/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md index fa71d85f2ab9..507edda88903 100644 --- a/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md +++ b/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md @@ -67,6 +67,8 @@ Set the sample to be use by the profiler for the specific table. - `Percentage`: Value must be between 0 and 100 exclusive (0 < percentage < 100). This will sample the table based on a percentage - `Row Count`: The table will be sampled based on a number of rows (i.e. `1,000`, `2,000`), etc. +⚠️ This option is currently not support for Druid. Sampling leverage `RANDOOM` functions in most database (some have specific sampling functions) and Druid provides neither of these option. We recommend using the partitionning or sample query option if you need to limit the amount of data scanned. + **Auto PII Tagging (Optional)** Configuration to automatically tag columns that might contain sensitive information. @@ -107,6 +109,8 @@ Set the sample to be use by the profiler for the specific table. - `Percentage`: Value must be between 0 and 100 exclusive (0 < percentage < 100). This will sample the table based on a percentage - `Row Count`: The table will be sampled based on a number of rows (i.e. `1,000`, `2,000`), etc. +⚠️ This option is currently not support for Druid. Sampling leverage `RANDOOM` functions in most database (some have specific sampling functions) and Druid provides neither of these option. We recommend using the partitionning or sample query option if you need to limit the amount of data scanned. 
+ **Profile Sample Query** Use a query to sample data for the profiler. This will overwrite any profle sample set. From 9297ea13828b5f9f81df728c7a34045d43d4f051 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 11:30:36 +0200 Subject: [PATCH 13/22] fix: empty commit for CI From 2c907da004735240b55be52f3d427b2143bc7cb6 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 12:01:39 +0200 Subject: [PATCH 14/22] fix: added workflow constrain back --- .github/workflows/playwright-integration-tests-mysql.yml | 9 ++++----- .../workflows/playwright-integration-tests-postgres.yml | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/.github/workflows/playwright-integration-tests-mysql.yml b/.github/workflows/playwright-integration-tests-mysql.yml index 9fc9226efc6d..bac78430015a 100644 --- a/.github/workflows/playwright-integration-tests-mysql.yml +++ b/.github/workflows/playwright-integration-tests-mysql.yml @@ -14,7 +14,6 @@ name: MySQL Playwright Integration Tests on: - pull_request: push: branches: - main @@ -22,10 +21,10 @@ on: - '1.[0-9]+[0-9]+' paths-ignore: - 'openmetadata-docs/**' - # pull_request_target: - # types: [labeled, opened, synchronize, reopened] - # paths-ignore: - # - 'openmetadata-docs/**' + pull_request_target: + types: [labeled, opened, synchronize, reopened] + paths-ignore: + - 'openmetadata-docs/**' jobs: playwright-mysql: diff --git a/.github/workflows/playwright-integration-tests-postgres.yml b/.github/workflows/playwright-integration-tests-postgres.yml index c854c305eb1c..619b8548137a 100644 --- a/.github/workflows/playwright-integration-tests-postgres.yml +++ b/.github/workflows/playwright-integration-tests-postgres.yml @@ -14,7 +14,6 @@ name: Postgres Playwright Integration Tests on: - pull_request: push: branches: - main @@ -22,10 +21,10 @@ on: - '1.[0-9]+[0-9]+' paths-ignore: - 'openmetadata-docs/**' - # pull_request_target: - # types: [labeled, opened, synchronize, reopened] - # paths-ignore: - # - 'openmetadata-docs/**' + pull_request_target: + types: [labeled, opened, synchronize, reopened] + paths-ignore: + - 'openmetadata-docs/**' jobs: playwright-postgresql: From 444d170e21d5bb03255fb33b4bd473d434118100 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 12:11:03 +0200 Subject: [PATCH 15/22] fix: sonar code smell --- .../profiler/interface/sqlalchemy/profiler_interface.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py index 58b266157baa..7a02eb806da0 100644 --- a/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py +++ b/ingestion/src/metadata/profiler/interface/sqlalchemy/profiler_interface.py @@ -263,8 +263,8 @@ def _compute_query_metrics( # if the query returns no results, we will get a ResourceClosedError from Druid if ( # pylint: disable=protected-access - not runner._session.get_bind().dialect.name - == Dialects.Druid + runner._session.get_bind().dialect.name + != Dialects.Druid ): msg = f"Error trying to compute profile for {runner.table.__tablename__}.{column.name}: {exc}" handle_query_exception(msg, exc, session) From d441400aaa23d15cf326341695859c97ee288455 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 12:16:20 +0200 Subject: [PATCH 16/22] fix: added secrets to container --- .github/workflows/playwright-integration-tests-mysql.yml | 2 ++ 
.github/workflows/playwright-integration-tests-postgres.yml | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.github/workflows/playwright-integration-tests-mysql.yml b/.github/workflows/playwright-integration-tests-mysql.yml index bac78430015a..1dea36327615 100644 --- a/.github/workflows/playwright-integration-tests-mysql.yml +++ b/.github/workflows/playwright-integration-tests-mysql.yml @@ -117,6 +117,8 @@ jobs: E2E_REDSHIFT_USERNAME: ${{ secrets.E2E_REDSHIFT_USERNAME }} E2E_REDSHIFT_PASSWORD: ${{ secrets.E2E_REDSHIFT_PASSWORD }} E2E_REDSHIFT_DATABASE: ${{ secrets.E2E_REDSHIFT_DATABASE }} + E2E_DRUID_HOST_PORT: ${{ secrets.E2E_DRUID_HOST_PORT }} + E2E_HIVE_HOST_PORT: ${{ secrets.E2E_HIVE_HOST_PORT }} run: | source env/bin/activate make install_e2e_tests diff --git a/.github/workflows/playwright-integration-tests-postgres.yml b/.github/workflows/playwright-integration-tests-postgres.yml index 619b8548137a..65721671ac3a 100644 --- a/.github/workflows/playwright-integration-tests-postgres.yml +++ b/.github/workflows/playwright-integration-tests-postgres.yml @@ -117,6 +117,8 @@ jobs: E2E_REDSHIFT_USERNAME: ${{ secrets.E2E_REDSHIFT_USERNAME }} E2E_REDSHIFT_PASSWORD: ${{ secrets.E2E_REDSHIFT_PASSWORD }} E2E_REDSHIFT_DATABASE: ${{ secrets.E2E_REDSHIFT_DATABASE }} + E2E_DRUID_HOST_PORT: ${{ secrets.E2E_DRUID_HOST_PORT }} + E2E_HIVE_HOST_PORT: ${{ secrets.E2E_HIVE_HOST_PORT }} run: | source env/bin/activate make install_e2e_tests From 686b1b2fe69398f876b6874687e8d2928a50bf63 Mon Sep 17 00:00:00 2001 From: Teddy Date: Wed, 25 Oct 2023 12:16:42 +0200 Subject: [PATCH 17/22] Update openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md Co-authored-by: Pere Miquel Brull --- .../connectors/ingestion/workflows/profiler/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md b/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md index 507edda88903..5062bbb72041 100644 --- a/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md +++ b/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md @@ -67,7 +67,7 @@ Set the sample to be use by the profiler for the specific table. - `Percentage`: Value must be between 0 and 100 exclusive (0 < percentage < 100). This will sample the table based on a percentage - `Row Count`: The table will be sampled based on a number of rows (i.e. `1,000`, `2,000`), etc. -⚠️ This option is currently not support for Druid. Sampling leverage `RANDOOM` functions in most database (some have specific sampling functions) and Druid provides neither of these option. We recommend using the partitionning or sample query option if you need to limit the amount of data scanned. +⚠️ This option is currently not support for Druid. Sampling leverage `RANDOM` functions in most database (some have specific sampling functions) and Druid provides neither of these option. We recommend using the partitionning or sample query option if you need to limit the amount of data scanned. **Auto PII Tagging (Optional)** Configuration to automatically tag columns that might contain sensitive information. 
From fb03810822fddff7b47a804094353084a75dc93f Mon Sep 17 00:00:00 2001 From: Teddy Date: Wed, 25 Oct 2023 12:16:49 +0200 Subject: [PATCH 18/22] Update openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md Co-authored-by: Pere Miquel Brull --- .../connectors/ingestion/workflows/profiler/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md b/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md index 5062bbb72041..932185d2c269 100644 --- a/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md +++ b/openmetadata-docs/content/v1.2.x-SNAPSHOT/connectors/ingestion/workflows/profiler/index.md @@ -109,7 +109,7 @@ Set the sample to be use by the profiler for the specific table. - `Percentage`: Value must be between 0 and 100 exclusive (0 < percentage < 100). This will sample the table based on a percentage - `Row Count`: The table will be sampled based on a number of rows (i.e. `1,000`, `2,000`), etc. -⚠️ This option is currently not support for Druid. Sampling leverage `RANDOOM` functions in most database (some have specific sampling functions) and Druid provides neither of these option. We recommend using the partitionning or sample query option if you need to limit the amount of data scanned. +⚠️ This option is currently not support for Druid. Sampling leverage `RANDOM` functions in most database (some have specific sampling functions) and Druid provides neither of these option. We recommend using the partitionning or sample query option if you need to limit the amount of data scanned. **Profile Sample Query** Use a query to sample data for the profiler. This will overwrite any profle sample set. 
From 72870b690678d64acdd549c43cecec8c2ba7acf7 Mon Sep 17 00:00:00 2001 From: Teddy Date: Wed, 25 Oct 2023 13:32:13 +0200 Subject: [PATCH 19/22] Update ingestion/tests/e2e/entity/database/test_redshift.py --- ingestion/tests/e2e/entity/database/test_redshift.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingestion/tests/e2e/entity/database/test_redshift.py b/ingestion/tests/e2e/entity/database/test_redshift.py index 4a7dec6ddac4..1e4662661646 100644 --- a/ingestion/tests/e2e/entity/database/test_redshift.py +++ b/ingestion/tests/e2e/entity/database/test_redshift.py @@ -40,7 +40,7 @@ ), validation=ConnectorValidationTestConfig( profiler=ValidationTestConfig( - database="dev", schema_="dbt_jaffle", table="customers" + database="e2e_cli_tests", schema_="dbt_jaffle", table="customers" ) # type: ignore ), ) From 1824ad88d18c1957d0b424056c687d5809d51864 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 13:41:59 +0200 Subject: [PATCH 20/22] fix: ran pylint --- ingestion/tests/e2e/entity/database/test_redshift.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ingestion/tests/e2e/entity/database/test_redshift.py b/ingestion/tests/e2e/entity/database/test_redshift.py index 1e4662661646..ec1c8addf67e 100644 --- a/ingestion/tests/e2e/entity/database/test_redshift.py +++ b/ingestion/tests/e2e/entity/database/test_redshift.py @@ -40,7 +40,9 @@ ), validation=ConnectorValidationTestConfig( profiler=ValidationTestConfig( - database="e2e_cli_tests", schema_="dbt_jaffle", table="customers" + database="e2e_cli_tests", + schema_="dbt_jaffle", + table="customers", ) # type: ignore ), ) From e525ffcd1712b72ae7669aad381ee082bbdcade9 Mon Sep 17 00:00:00 2001 From: Teddy Crepineau Date: Wed, 25 Oct 2023 18:43:03 +0200 Subject: [PATCH 21/22] fix: updated redshift env var. 
--- .../playwright-integration-tests-mysql.yml | 2 +- .../playwright-integration-tests-postgres.yml | 2 +- Makefile | 2 +- ingestion/setup.py | 1 - ingestion/tests/e2e/configs/common.py | 3 +-- .../e2e/configs/connectors/database/interface.py | 6 +++--- .../e2e/configs/connectors/database/redshift.py | 4 ++-- ingestion/tests/e2e/conftest.py | 3 +-- .../e2e/entity/database/common_assertions.py | 11 +++++++---- ingestion/tests/e2e/entity/database/test_db2.py | 1 - ingestion/tests/e2e/entity/database/test_druid.py | 9 ++------- ingestion/tests/e2e/entity/database/test_hive.py | 8 ++------ .../tests/e2e/entity/database/test_redshift.py | 15 +++++---------- 13 files changed, 26 insertions(+), 41 deletions(-) diff --git a/.github/workflows/playwright-integration-tests-mysql.yml b/.github/workflows/playwright-integration-tests-mysql.yml index 1dea36327615..ef56279d5562 100644 --- a/.github/workflows/playwright-integration-tests-mysql.yml +++ b/.github/workflows/playwright-integration-tests-mysql.yml @@ -116,7 +116,7 @@ jobs: E2E_REDSHIFT_HOST_PORT: ${{ secrets.E2E_REDSHIFT_HOST_PORT }} E2E_REDSHIFT_USERNAME: ${{ secrets.E2E_REDSHIFT_USERNAME }} E2E_REDSHIFT_PASSWORD: ${{ secrets.E2E_REDSHIFT_PASSWORD }} - E2E_REDSHIFT_DATABASE: ${{ secrets.E2E_REDSHIFT_DATABASE }} + E2E_REDSHIFT_DB: ${{ secrets.E2E_REDSHIFT_DB }} E2E_DRUID_HOST_PORT: ${{ secrets.E2E_DRUID_HOST_PORT }} E2E_HIVE_HOST_PORT: ${{ secrets.E2E_HIVE_HOST_PORT }} run: | diff --git a/.github/workflows/playwright-integration-tests-postgres.yml b/.github/workflows/playwright-integration-tests-postgres.yml index 65721671ac3a..e91b61b5cbb8 100644 --- a/.github/workflows/playwright-integration-tests-postgres.yml +++ b/.github/workflows/playwright-integration-tests-postgres.yml @@ -116,7 +116,7 @@ jobs: E2E_REDSHIFT_HOST_PORT: ${{ secrets.E2E_REDSHIFT_HOST_PORT }} E2E_REDSHIFT_USERNAME: ${{ secrets.E2E_REDSHIFT_USERNAME }} E2E_REDSHIFT_PASSWORD: ${{ secrets.E2E_REDSHIFT_PASSWORD }} - E2E_REDSHIFT_DATABASE: ${{ secrets.E2E_REDSHIFT_DATABASE }} + E2E_REDSHIFT_DB: ${{ secrets.E2E_REDSHIFT_DB }} E2E_DRUID_HOST_PORT: ${{ secrets.E2E_DRUID_HOST_PORT }} E2E_HIVE_HOST_PORT: ${{ secrets.E2E_HIVE_HOST_PORT }} run: | diff --git a/Makefile b/Makefile index 5f9d85010848..3ba6ab348d75 100644 --- a/Makefile +++ b/Makefile @@ -75,7 +75,7 @@ unit_ingestion: ## Run Python unit tests .PHONY: run_e2e_tests run_e2e_tests: ## Run e2e tests - pytest --screenshot=only-on-failure --output="ingestion/tests/e2e/artifacts" $(ARGS) --junitxml=ingestion/junit/test-results-e2e.xml ingestion/tests/e2e + pytest --screenshot=only-on-failure --output="ingestion/tests/e2e/artifacts" $(ARGS) --slowmo 5 --junitxml=ingestion/junit/test-results-e2e.xml ingestion/tests/e2e .PHONY: run_python_tests run_python_tests: ## Run all Python tests with coverage diff --git a/ingestion/setup.py b/ingestion/setup.py index a4362aceeb37..3485f33aff17 100644 --- a/ingestion/setup.py +++ b/ingestion/setup.py @@ -279,7 +279,6 @@ def get_long_description(): "pytest==7.0.0", "pytest-cov", "pytest-order", - "pytest-dependency", # install dbt dependency "dbt-artifacts-parser", VERSIONS["sqlalchemy-databricks"], diff --git a/ingestion/tests/e2e/configs/common.py b/ingestion/tests/e2e/configs/common.py index 7befaf72e30b..91d4606bf569 100644 --- a/ingestion/tests/e2e/configs/common.py +++ b/ingestion/tests/e2e/configs/common.py @@ -3,9 +3,8 @@ import random import string -from playwright.sync_api import Page, expect - from ingestion.tests.e2e.configs.users.user import User +from playwright.sync_api import 
Page, expect BASE_URL = "http://localhost:8585" diff --git a/ingestion/tests/e2e/configs/connectors/database/interface.py b/ingestion/tests/e2e/configs/connectors/database/interface.py index 3e4335b53633..b1393c2481f0 100644 --- a/ingestion/tests/e2e/configs/connectors/database/interface.py +++ b/ingestion/tests/e2e/configs/connectors/database/interface.py @@ -6,12 +6,12 @@ from abc import ABC, abstractmethod from time import sleep -from playwright.sync_api import Page, TimeoutError, expect - from ingestion.tests.e2e.configs.connectors.model import ( ConnectorTestConfig, IngestionFilterConfig, ) +from playwright.sync_api import Page, TimeoutError, expect + from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import ( OpenMetadataConnection, ) @@ -36,7 +36,7 @@ class DataBaseConnectorInterface(ABC): def __init__(self, config: ConnectorTestConfig): """Initialize the connector""" self.supports_profiler_ingestion = True - self.profiler_summary_card_count = 5 + self.profiler_summary_card_count = 4 self.ingestion_config = config.ingestion self.validation_config = config.validation diff --git a/ingestion/tests/e2e/configs/connectors/database/redshift.py b/ingestion/tests/e2e/configs/connectors/database/redshift.py index 746060d796ff..e70d144314bc 100644 --- a/ingestion/tests/e2e/configs/connectors/database/redshift.py +++ b/ingestion/tests/e2e/configs/connectors/database/redshift.py @@ -26,7 +26,7 @@ def set_connection(self, page): expect(page.get_by_label("Host and Port")).to_have_value( os.environ["E2E_REDSHIFT_HOST_PORT"] ) - page.get_by_label("Database*").fill(os.environ["E2E_REDSHIFT_DATABASE"]) + page.get_by_label("Database*").fill(os.environ["E2E_REDSHIFT_DB"]) expect(page.get_by_label("Database*")).to_have_value( - os.environ["E2E_REDSHIFT_DATABASE"] + os.environ["E2E_REDSHIFT_DB"] ) diff --git a/ingestion/tests/e2e/conftest.py b/ingestion/tests/e2e/conftest.py index f8500505b4a5..dcd3b1d121a4 100644 --- a/ingestion/tests/e2e/conftest.py +++ b/ingestion/tests/e2e/conftest.py @@ -2,10 +2,9 @@ import pytest -from playwright.sync_api import Browser, Page, expect - from ingestion.tests.e2e.configs.common import go_to_service from ingestion.tests.e2e.configs.users.admin import Admin +from playwright.sync_api import Browser, Page, expect TIMEOUT = 60000 BASE_URL = "http://localhost:8585" diff --git a/ingestion/tests/e2e/entity/database/common_assertions.py b/ingestion/tests/e2e/entity/database/common_assertions.py index f2803879fac3..193409da9900 100644 --- a/ingestion/tests/e2e/entity/database/common_assertions.py +++ b/ingestion/tests/e2e/entity/database/common_assertions.py @@ -1,8 +1,9 @@ """common database assertions""" -from playwright.sync_api import Page, expect +import time from ingestion.tests.e2e.configs.common import go_to_service +from playwright.sync_api import Page, expect def assert_change_database_owner(page_context: Page, service_name: str): @@ -10,11 +11,11 @@ def assert_change_database_owner(page_context: Page, service_name: str): go_to_service("Databases", page_context, service_name) page_context.get_by_test_id("edit-owner").click() page_context.get_by_test_id("owner-select-users-search-bar").click() - page_context.get_by_test_id("owner-select-users-search-bar").fill("created-user") - page_context.get_by_text("created-user").click() + page_context.get_by_test_id("owner-select-users-search-bar").fill("Aaron Johnson") + page_context.get_by_text("Aaron Johnson").click() expect( 
page_context.get_by_test_id("owner-label").get_by_test_id("owner-link") - ).to_have_text("created-user") + ).to_have_text("Aaron Johnson") def assert_profile_data( @@ -31,6 +32,7 @@ def assert_profile_data( page_context.get_by_role("link", name=schema).click() page_context.get_by_role("link", name=table, exact=True).click() page_context.get_by_text("Profiler & Data Quality").click() + time.sleep(0.05) for card in range(connector_obj.profiler_summary_card_count): summary_card = page_context.get_by_test_id("summary-card-container").nth(card) description = summary_card.get_by_test_id( @@ -70,6 +72,7 @@ def assert_pii_column_auto_tagging( page_context.get_by_role("link", name=schema).click() page_context.get_by_role("link", name=table, exact=True).click() + time.sleep(0.05) table_row = page_context.locator(f'tr:has-text("{column}")') tag = table_row.locator("td:nth-child(4)") expect(tag).to_be_visible() diff --git a/ingestion/tests/e2e/entity/database/test_db2.py b/ingestion/tests/e2e/entity/database/test_db2.py index 91a8b351d648..c9b3dec62546 100644 --- a/ingestion/tests/e2e/entity/database/test_db2.py +++ b/ingestion/tests/e2e/entity/database/test_db2.py @@ -1,7 +1,6 @@ """Test Db2 database ingestion.""" import pytest - from ingestion.tests.e2e.configs.connectors.database.db2 import Db2Connector from ingestion.tests.e2e.configs.connectors.model import ( ConnectorIngestionTestConfig, diff --git a/ingestion/tests/e2e/entity/database/test_druid.py b/ingestion/tests/e2e/entity/database/test_druid.py index c1e73e99aff0..173619078f15 100644 --- a/ingestion/tests/e2e/entity/database/test_druid.py +++ b/ingestion/tests/e2e/entity/database/test_druid.py @@ -2,8 +2,6 @@ import pytest -from playwright.sync_api import Page - from ingestion.tests.e2e.configs.connectors.database.druid import DruidConnector from ingestion.tests.e2e.configs.connectors.model import ( ConnectorIngestionTestConfig, @@ -19,6 +17,8 @@ assert_profile_data, assert_sample_data_ingestion, ) +from playwright.sync_api import Page + from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import ( PipelineState, ) @@ -53,19 +53,16 @@ class TestDruidConnector: """Druid connector test case""" - @pytest.mark.dependency() def test_pipelines_statuses(self): """check ingestion pipelines ran successfully""" assert self.metadata_ingestion_status == PipelineState.success # if the connector does not support profiler ingestion return None as status assert self.profiler_ingestion_status in {PipelineState.success, None} - @pytest.mark.dependency(depends=["TestDruidConnector::test_pipelines_statuses"]) def test_change_database_owner(self, admin_page_context: Page): """test change database owner""" assert_change_database_owner(admin_page_context, self.service_name) - @pytest.mark.dependency(depends=["TestDruidConnector::test_pipelines_statuses"]) def test_check_profile_data(self, admin_page_context: Page): """check profile data are visible""" assert_profile_data( @@ -77,7 +74,6 @@ def test_check_profile_data(self, admin_page_context: Page): self.connector_obj, ) - @pytest.mark.dependency(depends=["TestDruidConnector::test_pipelines_statuses"]) def test_sample_data_ingestion(self, admin_page_context: Page): """test sample dta is ingested as expected for the table""" assert_sample_data_ingestion( @@ -88,7 +84,6 @@ def test_sample_data_ingestion(self, admin_page_context: Page): self.connector_obj.validation_config.profiler.table, ) - @pytest.mark.dependency(depends=["TestDruidConnector::test_pipelines_statuses"]) def 
     def test_pii_colum_auto_tagging(self, admin_page_context: Page):
         """check pii column auto tagging tagged as expected"""
         assert_pii_column_auto_tagging(
diff --git a/ingestion/tests/e2e/entity/database/test_hive.py b/ingestion/tests/e2e/entity/database/test_hive.py
index a6a61a22d595..aa90d5a7bfd4 100644
--- a/ingestion/tests/e2e/entity/database/test_hive.py
+++ b/ingestion/tests/e2e/entity/database/test_hive.py
@@ -1,8 +1,6 @@
 """Test Hive database ingestion."""

 import pytest
-from playwright.sync_api import Page
-
 from ingestion.tests.e2e.configs.connectors.database.hive import HiveConnector
 from ingestion.tests.e2e.configs.connectors.model import (
     ConnectorIngestionTestConfig,
@@ -17,6 +15,8 @@
     assert_profile_data,
     assert_sample_data_ingestion,
 )
+from playwright.sync_api import Page
+
 from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
     PipelineState,
 )
@@ -48,19 +48,16 @@
 class TestHiveConnector:
     """Hive connector test case"""

-    @pytest.mark.dependency()
     def test_pipelines_statuses(self):
         """check ingestion pipelines ran successfully"""
         assert self.metadata_ingestion_status == PipelineState.success
         # if the connector does not support profiler ingestion return None as status
         assert self.profiler_ingestion_status in {PipelineState.success, None}

-    @pytest.mark.dependency(name="TestHiveConnector::test_pipelines_statuses")
     def test_change_database_owner(self, admin_page_context: Page):
         """test change database owner"""
         assert_change_database_owner(admin_page_context, self.service_name)

-    @pytest.mark.dependency(depends=["TestHiveConnector::test_pipelines_statuses"])
     def test_check_profile_data(self, admin_page_context: Page):
         """check profile data are visible"""
         assert_profile_data(
@@ -72,7 +69,6 @@ def test_check_profile_data(self, admin_page_context: Page):
             self.connector_obj,
         )

-    @pytest.mark.dependency(depends=["TestHiveConnector::test_pipelines_statuses"])
     def test_sample_data_ingestion(self, admin_page_context: Page):
         """test sample dta is ingested as expected for the table"""
         assert_sample_data_ingestion(
diff --git a/ingestion/tests/e2e/entity/database/test_redshift.py b/ingestion/tests/e2e/entity/database/test_redshift.py
index ec1c8addf67e..32ff398f6354 100644
--- a/ingestion/tests/e2e/entity/database/test_redshift.py
+++ b/ingestion/tests/e2e/entity/database/test_redshift.py
@@ -2,8 +2,6 @@

 import pytest

-from playwright.sync_api import Page
-
 from ingestion.tests.e2e.configs.connectors.database.redshift import RedshiftConnector
 from ingestion.tests.e2e.configs.connectors.model import (
     ConnectorIngestionTestConfig,
@@ -19,6 +17,8 @@
     assert_profile_data,
     assert_sample_data_ingestion,
 )
+from playwright.sync_api import Page
+
 from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
     PipelineState,
 )
@@ -35,14 +35,14 @@
             schema_=IngestionFilterConfig(includes=["dbt_jaffle"]),
         ),  # type: ignore
         profiler=IngestionTestConfig(
-            table=IngestionFilterConfig(includes=["customers"]),
+            table=IngestionFilterConfig(includes=["customer"]),
         ),  # type: ignore
     ),
     validation=ConnectorValidationTestConfig(
         profiler=ValidationTestConfig(
             database="e2e_cli_tests",
             schema_="dbt_jaffle",
-            table="customers",
+            table="customer",
         )  # type: ignore
     ),
 )
@@ -55,19 +55,16 @@
 class TestRedshiftConnector:
     """Redshift connector test case"""

-    @pytest.mark.dependency()
     def test_pipelines_statuses(self):
         """check ingestion pipelines ran successfully"""
         assert self.metadata_ingestion_status == PipelineState.success
         # if the connector does not support profiler ingestion return None as status
         assert self.profiler_ingestion_status in {PipelineState.success, None}

-    @pytest.mark.dependency(depends=["TestRedshiftConnector::test_pipelines_statuses"])
     def test_change_database_owner(self, admin_page_context: Page):
         """test change database owner"""
         assert_change_database_owner(admin_page_context, self.service_name)

-    @pytest.mark.dependency(depends=["TestRedshiftConnector::test_pipelines_statuses"])
     def test_check_profile_data(self, admin_page_context: Page):
         """check profile data are visible"""
         assert_profile_data(
@@ -79,7 +76,6 @@ def test_check_profile_data(self, admin_page_context: Page):
             self.connector_obj,
         )

-    @pytest.mark.dependency(depends=["TestRedshiftConnector::test_pipelines_statuses"])
     def test_sample_data_ingestion(self, admin_page_context: Page):
         """test sample dta is ingested as expected for the table"""
         assert_sample_data_ingestion(
@@ -90,7 +86,6 @@ def test_sample_data_ingestion(self, admin_page_context: Page):
             self.connector_obj.validation_config.profiler.table,
         )

-    @pytest.mark.dependency(depends=["TestRedshiftConnector::test_pipelines_statuses"])
     def test_pii_colum_auto_tagging(self, admin_page_context: Page):
         """check pii column auto tagging tagged as expected"""
         assert_pii_column_auto_tagging(
@@ -99,5 +94,5 @@ def test_pii_colum_auto_tagging(self, admin_page_context: Page):
             self.connector_obj.validation_config.profiler.database,
             self.connector_obj.validation_config.profiler.schema_,
             self.connector_obj.validation_config.profiler.table,
-            "first_name",
+            "c_name",
         )

From b296d9db7a267227540d5cf6bb784744c4ca44ec Mon Sep 17 00:00:00 2001
From: Teddy Crepineau
Date: Wed, 25 Oct 2023 19:10:09 +0200
Subject: [PATCH 22/22] fix: import linting

---
 ingestion/tests/e2e/configs/common.py            |  3 ++-
 .../configs/connectors/database/interface.py     |  6 ++----
 ingestion/tests/e2e/configs/users/admin.py       |  2 +-
 ingestion/tests/e2e/conftest.py                  |  5 +++--
 .../e2e/entity/database/common_assertions.py     |  3 ++-
 ingestion/tests/e2e/entity/database/test_db2.py  |  5 +++--
 .../tests/e2e/entity/database/test_druid.py      | 17 +++++++++--------
 .../tests/e2e/entity/database/test_hive.py       | 17 +++++++++--------
 .../tests/e2e/entity/database/test_redshift.py   | 17 +++++++++--------
 9 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/ingestion/tests/e2e/configs/common.py b/ingestion/tests/e2e/configs/common.py
index 91d4606bf569..ddf6e06a6e81 100644
--- a/ingestion/tests/e2e/configs/common.py
+++ b/ingestion/tests/e2e/configs/common.py
@@ -3,9 +3,10 @@
 import random
 import string

-from ingestion.tests.e2e.configs.users.user import User
 from playwright.sync_api import Page, expect

+from .users.user import User
+
 BASE_URL = "http://localhost:8585"
diff --git a/ingestion/tests/e2e/configs/connectors/database/interface.py b/ingestion/tests/e2e/configs/connectors/database/interface.py
index b1393c2481f0..f7b233f43dc0 100644
--- a/ingestion/tests/e2e/configs/connectors/database/interface.py
+++ b/ingestion/tests/e2e/configs/connectors/database/interface.py
@@ -6,10 +6,6 @@
 from abc import ABC, abstractmethod
 from time import sleep

-from ingestion.tests.e2e.configs.connectors.model import (
-    ConnectorTestConfig,
-    IngestionFilterConfig,
-)
 from playwright.sync_api import Page, TimeoutError, expect

 from metadata.generated.schema.entity.services.connections.metadata.openMetadataConnection import (
     OpenMetadataConnection,
 )
@@ -27,6 +23,8 @@
     get_end_of_day_timestamp_mill,
 )

+from ...connectors.model import ConnectorTestConfig, IngestionFilterConfig
+

 BASE_URL = "http://localhost:8585"
"http://localhost:8585" diff --git a/ingestion/tests/e2e/configs/users/admin.py b/ingestion/tests/e2e/configs/users/admin.py index 6bc9dee27b40..1c88a53d83de 100644 --- a/ingestion/tests/e2e/configs/users/admin.py +++ b/ingestion/tests/e2e/configs/users/admin.py @@ -1,6 +1,6 @@ """Admin user configuration for e2e tests.""" -from ingestion.tests.e2e.configs.users.user import User +from ...configs.users.user import User class Admin(User): diff --git a/ingestion/tests/e2e/conftest.py b/ingestion/tests/e2e/conftest.py index dcd3b1d121a4..864e293a5763 100644 --- a/ingestion/tests/e2e/conftest.py +++ b/ingestion/tests/e2e/conftest.py @@ -2,10 +2,11 @@ import pytest -from ingestion.tests.e2e.configs.common import go_to_service -from ingestion.tests.e2e.configs.users.admin import Admin from playwright.sync_api import Browser, Page, expect +from .configs.common import go_to_service +from .configs.users.admin import Admin + TIMEOUT = 60000 BASE_URL = "http://localhost:8585" expect.set_options(timeout=TIMEOUT) diff --git a/ingestion/tests/e2e/entity/database/common_assertions.py b/ingestion/tests/e2e/entity/database/common_assertions.py index 193409da9900..692d15124f78 100644 --- a/ingestion/tests/e2e/entity/database/common_assertions.py +++ b/ingestion/tests/e2e/entity/database/common_assertions.py @@ -2,9 +2,10 @@ import time -from ingestion.tests.e2e.configs.common import go_to_service from playwright.sync_api import Page, expect +from ...configs.common import go_to_service + def assert_change_database_owner(page_context: Page, service_name: str): """assert database owner can be changed as expected""" diff --git a/ingestion/tests/e2e/entity/database/test_db2.py b/ingestion/tests/e2e/entity/database/test_db2.py index c9b3dec62546..aaa0f5b0aa83 100644 --- a/ingestion/tests/e2e/entity/database/test_db2.py +++ b/ingestion/tests/e2e/entity/database/test_db2.py @@ -1,8 +1,9 @@ """Test Db2 database ingestion.""" import pytest -from ingestion.tests.e2e.configs.connectors.database.db2 import Db2Connector -from ingestion.tests.e2e.configs.connectors.model import ( + +from ...configs.connectors.database.db2 import Db2Connector +from ...configs.connectors.model import ( ConnectorIngestionTestConfig, ConnectorTestConfig, ConnectorValidationTestConfig, diff --git a/ingestion/tests/e2e/entity/database/test_druid.py b/ingestion/tests/e2e/entity/database/test_druid.py index 173619078f15..315dec9ff9c1 100644 --- a/ingestion/tests/e2e/entity/database/test_druid.py +++ b/ingestion/tests/e2e/entity/database/test_druid.py @@ -2,8 +2,14 @@ import pytest -from ingestion.tests.e2e.configs.connectors.database.druid import DruidConnector -from ingestion.tests.e2e.configs.connectors.model import ( +from playwright.sync_api import Page + +from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import ( + PipelineState, +) + +from ...configs.connectors.database.druid import DruidConnector +from ...configs.connectors.model import ( ConnectorIngestionTestConfig, ConnectorTestConfig, ConnectorValidationTestConfig, @@ -11,17 +17,12 @@ IngestionTestConfig, ValidationTestConfig, ) -from ingestion.tests.e2e.entity.database.common_assertions import ( +from ...entity.database.common_assertions import ( assert_change_database_owner, assert_pii_column_auto_tagging, assert_profile_data, assert_sample_data_ingestion, ) -from playwright.sync_api import Page - -from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import ( - PipelineState, -) @pytest.mark.parametrize( diff --git 
diff --git a/ingestion/tests/e2e/entity/database/test_hive.py b/ingestion/tests/e2e/entity/database/test_hive.py
index aa90d5a7bfd4..7bec8310708b 100644
--- a/ingestion/tests/e2e/entity/database/test_hive.py
+++ b/ingestion/tests/e2e/entity/database/test_hive.py
@@ -1,8 +1,14 @@
 """Test Hive database ingestion."""

 import pytest
-from ingestion.tests.e2e.configs.connectors.database.hive import HiveConnector
-from ingestion.tests.e2e.configs.connectors.model import (
+from playwright.sync_api import Page
+
+from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
+    PipelineState,
+)
+
+from ...configs.connectors.database.hive import HiveConnector
+from ...configs.connectors.model import (
     ConnectorIngestionTestConfig,
     ConnectorTestConfig,
     ConnectorValidationTestConfig,
@@ -10,16 +16,11 @@
     IngestionTestConfig,
     ValidationTestConfig,
 )
-from ingestion.tests.e2e.entity.database.common_assertions import (
+from ...entity.database.common_assertions import (
     assert_change_database_owner,
     assert_profile_data,
     assert_sample_data_ingestion,
 )
-from playwright.sync_api import Page
-
-from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
-    PipelineState,
-)


 @pytest.mark.parametrize(
diff --git a/ingestion/tests/e2e/entity/database/test_redshift.py b/ingestion/tests/e2e/entity/database/test_redshift.py
index 32ff398f6354..96583959b4c2 100644
--- a/ingestion/tests/e2e/entity/database/test_redshift.py
+++ b/ingestion/tests/e2e/entity/database/test_redshift.py
@@ -2,8 +2,14 @@

 import pytest

-from ingestion.tests.e2e.configs.connectors.database.redshift import RedshiftConnector
-from ingestion.tests.e2e.configs.connectors.model import (
+from playwright.sync_api import Page
+
+from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
+    PipelineState,
+)
+
+from ...configs.connectors.database.redshift import RedshiftConnector
+from ...configs.connectors.model import (
     ConnectorIngestionTestConfig,
     ConnectorTestConfig,
     ConnectorValidationTestConfig,
@@ -11,17 +17,12 @@
     IngestionTestConfig,
     ValidationTestConfig,
 )
-from ingestion.tests.e2e.entity.database.common_assertions import (
+from ...entity.database.common_assertions import (
     assert_change_database_owner,
     assert_pii_column_auto_tagging,
     assert_profile_data,
     assert_sample_data_ingestion,
 )
-from playwright.sync_api import Page
-
-from metadata.generated.schema.entity.services.ingestionPipelines.ingestionPipeline import (
-    PipelineState,
-)


 @pytest.mark.parametrize(