From 512a1c2bb5d82825220af3eaf5eb2e13a6cb0e3a Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Tue, 3 Dec 2024 17:47:15 +0100 Subject: [PATCH 1/2] MINOR - Clean up configs & add auto classification docs --- .../native/1.6.0/mysql/schemaChanges.sql | 2 +- .../native/1.6.0/postgres/schemaChanges.sql | 2 +- ingestion/src/metadata/clients/aws_client.py | 4 +- .../source/database/base/profiler_source.py | 1 - ingestion/src/metadata/sampler/processor.py | 6 +- .../connectors/yaml/auto-classification.md | 154 ++++++++++++++++++ .../v1.6/connectors/yaml/data-profiler.md | 34 +--- .../partials/v1.6/connectors/yaml/lineage.md | 2 +- .../v1.6/connectors/yaml/query-usage.md | 2 +- .../connectors/database/athena/yaml.md | 2 + .../connectors/database/azuresql/yaml.md | 2 + .../connectors/database/bigquery/yaml.md | 2 + .../connectors/database/clickhouse/yaml.md | 2 + .../connectors/database/databricks/yaml.md | 2 + .../connectors/database/db2/yaml.md | 2 + .../connectors/database/doris/yaml.md | 2 + .../connectors/database/druid/yaml.md | 2 + .../connectors/database/greenplum/yaml.md | 2 + .../connectors/database/hive/yaml.md | 2 + .../connectors/database/impala/yaml.md | 2 + .../connectors/database/mariadb/yaml.md | 2 + .../connectors/database/mssql/yaml.md | 2 + .../connectors/database/mysql/yaml.md | 2 + .../connectors/database/oracle/yaml.md | 2 + .../connectors/database/pinotdb/yaml.md | 2 + .../connectors/database/postgres/yaml.md | 2 + .../connectors/database/presto/yaml.md | 2 + .../connectors/database/redshift/yaml.md | 2 + .../connectors/database/sap-hana/yaml.md | 2 + .../connectors/database/singlestore/yaml.md | 2 + .../connectors/database/snowflake/yaml.md | 2 + .../connectors/database/sqlite/yaml.md | 2 + .../connectors/database/synapse/yaml.md | 2 + .../connectors/database/teradata/yaml.md | 2 + .../connectors/database/trino/yaml.md | 2 + .../connectors/database/vertica/yaml.md | 2 + ...baseServiceAutoClassificationPipeline.json | 14 -- 
.../databaseServiceProfilerPipeline.json | 6 - .../Database/workflows/autoClassification.md | 25 +-- .../en-US/Database/workflows/profiler.md | 5 - 40 files changed, 218 insertions(+), 93 deletions(-) create mode 100644 openmetadata-docs/content/partials/v1.6/connectors/yaml/auto-classification.md diff --git a/bootstrap/sql/migrations/native/1.6.0/mysql/schemaChanges.sql b/bootstrap/sql/migrations/native/1.6.0/mysql/schemaChanges.sql index aaa61aa8d720..75b2ca0a1c63 100644 --- a/bootstrap/sql/migrations/native/1.6.0/mysql/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.6.0/mysql/schemaChanges.sql @@ -1745,7 +1745,7 @@ WHERE JSON_EXTRACT(json, '$.pipelineType') = 'metadata'; -- classification and sampling configs from the profiler pipelines UPDATE ingestion_pipeline_entity -SET json = JSON_REMOVE(json, '$.sourceConfig.config.processPiiSensitive', '$.sourceConfig.config.confidence', '$.sourceConfig.config.generateSampleData') +SET json = JSON_REMOVE(json, '$.sourceConfig.config.processPiiSensitive', '$.sourceConfig.config.confidence', '$.sourceConfig.config.generateSampleData', '$.sourceConfig.config.sampleDataCount') WHERE JSON_EXTRACT(json, '$.pipelineType') = 'profiler'; -- Rename 'jobId' to 'jobIds', set 'jobId' as type array in 'jobIds' , add 'projectIds' for dbt cloud diff --git a/bootstrap/sql/migrations/native/1.6.0/postgres/schemaChanges.sql b/bootstrap/sql/migrations/native/1.6.0/postgres/schemaChanges.sql index 461d746d5504..66f713f99522 100644 --- a/bootstrap/sql/migrations/native/1.6.0/postgres/schemaChanges.sql +++ b/bootstrap/sql/migrations/native/1.6.0/postgres/schemaChanges.sql @@ -1732,7 +1732,7 @@ WHERE json #>> '{pipelineType}' = 'metadata'; -- classification and sampling configs from the profiler pipelines UPDATE ingestion_pipeline_entity -SET json = json::jsonb #- '{sourceConfig,config,processPiiSensitive}' #- '{sourceConfig,config,confidence}' #- '{sourceConfig,config,generateSampleData}' +SET json = json::jsonb #- 
'{sourceConfig,config,processPiiSensitive}' #- '{sourceConfig,config,confidence}' #- '{sourceConfig,config,generateSampleData}' #- '{sourceConfig,config,sampleDataCount}' WHERE json #>> '{pipelineType}' = 'profiler'; -- set value of 'jobId' as an array into 'jobIds' for dbt cloud diff --git a/ingestion/src/metadata/clients/aws_client.py b/ingestion/src/metadata/clients/aws_client.py index 44d646a8000f..8e87c600f96c 100644 --- a/ingestion/src/metadata/clients/aws_client.py +++ b/ingestion/src/metadata/clients/aws_client.py @@ -144,7 +144,7 @@ def create_session(self) -> Session: def get_client(self, service_name: str) -> Any: # initialize the client depending on the AWSCredentials passed if self.config is not None: - logger.info(f"Getting AWS client for service [{service_name}]") + logger.debug(f"Getting AWS client for service [{service_name}]") session = self.create_session() if self.config.endPointURL is not None: return session.client( @@ -152,7 +152,7 @@ def get_client(self, service_name: str) -> Any: ) return session.client(service_name=service_name) - logger.info(f"Getting AWS default client for service [{service_name}]") + logger.debug(f"Getting AWS default client for service [{service_name}]") # initialized with the credentials loaded from running machine return boto3.client(service_name=service_name) diff --git a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py index 3013f53f0d97..f14dfb206a46 100644 --- a/ingestion/src/metadata/profiler/source/database/base/profiler_source.py +++ b/ingestion/src/metadata/profiler/source/database/base/profiler_source.py @@ -154,7 +154,6 @@ def create_profiler_interface( profile_sample_type=self.source_config.profileSampleType, sampling_method_type=self.source_config.samplingMethodType, ), - default_sample_data_count=self.source_config.sampleDataCount, ) profiler_interface: ProfilerInterface = profiler_class.create( diff --git 
a/ingestion/src/metadata/sampler/processor.py b/ingestion/src/metadata/sampler/processor.py index 87883277de34..d22f403479b4 100644 --- a/ingestion/src/metadata/sampler/processor.py +++ b/ingestion/src/metadata/sampler/processor.py @@ -88,11 +88,7 @@ def _run(self, record: ProfilerSourceAndEntity) -> Either[SamplerResponse]: schema_entity=schema_entity, database_entity=database_entity, table_config=get_config_for_table(entity, self.profiler_config), - default_sample_config=SampleConfig( - profile_sample=self.source_config.profileSample, - profile_sample_type=self.source_config.profileSampleType, - sampling_method_type=self.source_config.samplingMethodType, - ), + default_sample_config=SampleConfig(), default_sample_data_count=self.source_config.sampleDataCount, ) sample_data = SampleData( diff --git a/openmetadata-docs/content/partials/v1.6/connectors/yaml/auto-classification.md b/openmetadata-docs/content/partials/v1.6/connectors/yaml/auto-classification.md new file mode 100644 index 000000000000..42d7263a154a --- /dev/null +++ b/openmetadata-docs/content/partials/v1.6/connectors/yaml/auto-classification.md @@ -0,0 +1,154 @@ +## Auto Classification + +The Auto Classification workflow will be using the `orm-profiler` processor. + +After running a Metadata Ingestion workflow, we can run the Auto Classification workflow. +The `serviceName` must be the same as the one used in Metadata Ingestion, so the ingestion bot can get the `serviceConnection` details from the server. + + +### 1. Define the YAML Config + +This is a sample config for the Auto Classification Workflow: + +{% codePreview %} + +{% codeInfoContainer %} + +#### Source Configuration - Source Config + +You can find all the definitions and types for the `sourceConfig` [here](https://github.com/open-metadata/OpenMetadata/blob/main/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json). 
+ +{% codeInfo srNumber=14 %} + +**storeSampleData**: Option to turn on/off storing sample data. If enabled, we will ingest sample data for each table. + +{% /codeInfo %} + +{% codeInfo srNumber=15 %} + +**enableAutoClassification**: Optional configuration to automatically tag columns that might contain sensitive information. + +{% /codeInfo %} + +{% codeInfo srNumber=18 %} + +**confidence**: Set the Confidence value for which you want the column to be tagged as PII. Confidence value ranges from 0 to 100. A higher number will yield fewer false positives but more false negatives. A lower number will yield more false positives but fewer false negatives. + +{% /codeInfo %} + +{% codeInfo srNumber=19 %} + +**databaseFilterPattern**: Regex to only fetch databases that match the pattern. + +{% /codeInfo %} + +{% codeInfo srNumber=20 %} + +**schemaFilterPattern**: Regex to only fetch schemas that match the pattern. + +{% /codeInfo %} + +{% codeInfo srNumber=21 %} + +**tableFilterPattern**: Regex to only fetch tables that match the pattern. + +{% /codeInfo %} + +{% codeInfo srNumber=22 %} + +#### Processor Configuration + +Choose the `orm-profiler`. Its config can also be updated to define tests from the YAML itself instead of the UI: + +**tableConfig**: `tableConfig` allows you to set up some configuration at the table level. +{% /codeInfo %} + + +{% codeInfo srNumber=23 %} + +#### Sink Configuration + +To send the metadata to OpenMetadata, it needs to be specified as `type: metadata-rest`. 
+{% /codeInfo %} + + +{% partial file="/v1.5/connectors/yaml/workflow-config-def.md" /%} + +{% /codeInfoContainer %} + +{% codeBlock fileName="filename.yaml" %} + + +```yaml {% isCodeBlock=true %} +source: + type: {% $connector %} + serviceName: {% $connector %} + sourceConfig: + config: + type: AutoClassification +``` +```yaml {% srNumber=14 %} + # storeSampleData: true +``` +```yaml {% srNumber=15 %} + # enableAutoClassification: true +``` +```yaml {% srNumber=18 %} + # confidence: 80 +``` +```yaml {% srNumber=19 %} + # databaseFilterPattern: + # includes: + # - database1 + # - database2 + # excludes: + # - database3 + # - database4 +``` +```yaml {% srNumber=20 %} + # schemaFilterPattern: + # includes: + # - schema1 + # - schema2 + # excludes: + # - schema3 + # - schema4 +``` +```yaml {% srNumber=21 %} + # tableFilterPattern: + # includes: + # - table1 + # - table2 + # excludes: + # - table3 + # - table4 +``` + +```yaml {% srNumber=22 %} +processor: + type: orm-profiler + config: {} +``` + +```yaml {% srNumber=23 %} +sink: + type: metadata-rest + config: {} +``` + +{% partial file="/v1.5/connectors/yaml/workflow-config.md" /%} + +{% /codeBlock %} + +{% /codePreview %} + + +### 2. Run with the CLI + +After saving the YAML config, we will run the command the same way we did for the metadata ingestion: + +```bash +metadata classify -c +``` + +Note now instead of running `ingest`, we are using the `classify` command to select the Auto Classification workflow. diff --git a/openmetadata-docs/content/partials/v1.6/connectors/yaml/data-profiler.md b/openmetadata-docs/content/partials/v1.6/connectors/yaml/data-profiler.md index 66aada0ec687..e83b0587e2b4 100644 --- a/openmetadata-docs/content/partials/v1.6/connectors/yaml/data-profiler.md +++ b/openmetadata-docs/content/partials/v1.6/connectors/yaml/data-profiler.md @@ -2,7 +2,7 @@ The Data Profiler workflow will be using the `orm-profiler` processor. 
-After running a Metadata Ingestion workflow, we can run Data Profiler workflow. +After running a Metadata Ingestion workflow, we can run the Data Profiler workflow. While the `serviceName` will be the same to that was used in Metadata Ingestion, so the ingestion bot can get the `serviceConnection` details from the server. @@ -14,15 +14,10 @@ This is a sample config for the profiler: {% codeInfoContainer %} -{% codeInfo srNumber=13 %} #### Source Configuration - Source Config You can find all the definitions and types for the `sourceConfig` [here](https://github.com/open-metadata/OpenMetadata/blob/main/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json). -**generateSampleData**: Option to turn on/off generating sample data. - -{% /codeInfo %} - {% codeInfo srNumber=14 %} **profileSample**: Percentage of data or no. of rows we want to execute the profiler and tests on. @@ -35,19 +30,6 @@ You can find all the definitions and types for the `sourceConfig` [here](https: {% /codeInfo %} -{% codeInfo srNumber=16 %} - -**processPiiSensitive**: Optional configuration to automatically tag columns that might contain sensitive information. 
- -{% /codeInfo %} - -{% codeInfo srNumber=17 %} - -**confidence**: Set the Confidence value for which you want the column to be marked - -{% /codeInfo %} - - {% codeInfo srNumber=18 %} **timeoutSeconds**: Profiler Timeout in Seconds @@ -100,27 +82,17 @@ To send the metadata to OpenMetadata, it needs to be specified as `type: metadat ```yaml {% isCodeBlock=true %} source: type: {% $connector %} - serviceName: local_athena + serviceName: {% $connector %} sourceConfig: config: type: Profiler ``` - -```yaml {% srNumber=13 %} - generateSampleData: true -``` ```yaml {% srNumber=14 %} # profileSample: 85 ``` ```yaml {% srNumber=15 %} # threadCount: 5 ``` -```yaml {% srNumber=16 %} - processPiiSensitive: false -``` -```yaml {% srNumber=17 %} - # confidence: 80 -``` ```yaml {% srNumber=18 %} # timeoutSeconds: 43200 ``` @@ -158,8 +130,6 @@ processor: config: {} # Remove braces if adding properties # tableConfig: # - fullyQualifiedName: - # profileSample: # default - # profileSample: # default will be 100 if omitted # profileQuery: # columnConfig: diff --git a/openmetadata-docs/content/partials/v1.6/connectors/yaml/lineage.md b/openmetadata-docs/content/partials/v1.6/connectors/yaml/lineage.md index 546c96688b10..2830ab1b632b 100644 --- a/openmetadata-docs/content/partials/v1.6/connectors/yaml/lineage.md +++ b/openmetadata-docs/content/partials/v1.6/connectors/yaml/lineage.md @@ -93,7 +93,7 @@ For a simple, local installation using our docker containers, this looks like: ```yaml {% srNumber=40 %} source: type: {% $connector %}-lineage - serviceName: + serviceName: {% $connector %} sourceConfig: config: type: DatabaseLineage diff --git a/openmetadata-docs/content/partials/v1.6/connectors/yaml/query-usage.md b/openmetadata-docs/content/partials/v1.6/connectors/yaml/query-usage.md index 12b0923f1b0d..73ac8958a58e 100644 --- a/openmetadata-docs/content/partials/v1.6/connectors/yaml/query-usage.md +++ b/openmetadata-docs/content/partials/v1.6/connectors/yaml/query-usage.md @@ 
-62,7 +62,7 @@ Note that the location is a directory that will be cleaned at the end of the ing ```yaml {% isCodeBlock=true %} source: type: {% $connector %}-usage - serviceName: + serviceName: {% $connector %} sourceConfig: config: type: DatabaseUsage diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/athena/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/athena/yaml.md index bfade366ddeb..a99cc4c708ab 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/athena/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/athena/yaml.md @@ -363,6 +363,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "athena"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "athena"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/azuresql/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/azuresql/yaml.md index e3be11509f7d..30b6b9c236a0 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/azuresql/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/azuresql/yaml.md @@ -204,6 +204,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "azuresql"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "azuresql"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/bigquery/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/bigquery/yaml.md index 9480d31c4c46..ad8c9da11b6b 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/bigquery/yaml.md +++ 
b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/bigquery/yaml.md @@ -235,6 +235,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "bigquery"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "bigquery"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/clickhouse/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/clickhouse/yaml.md index 7891e941d3ec..ca086c98c61a 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/clickhouse/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/clickhouse/yaml.md @@ -240,6 +240,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "clickhouse"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "clickhouse"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/databricks/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/databricks/yaml.md index fbfe5ccf0c90..ef77b76b7feb 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/databricks/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/databricks/yaml.md @@ -181,6 +181,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "databricks"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "databricks"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/db2/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/db2/yaml.md index e82954c74d37..8bac1efeaa96 100644 --- 
a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/db2/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/db2/yaml.md @@ -178,6 +178,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "db2"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "db2"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/doris/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/doris/yaml.md index f24444f09b13..5525bde5f7e7 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/doris/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/doris/yaml.md @@ -157,6 +157,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "doris"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "doris"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## Securing Doris Connection with SSL in OpenMetadata diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/druid/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/druid/yaml.md index d1fe4e65576d..69c048214b9c 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/druid/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/druid/yaml.md @@ -153,6 +153,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "druid"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "druid"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/greenplum/yaml.md 
b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/greenplum/yaml.md index df45c4f1acb4..e4f801cbe68c 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/greenplum/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/greenplum/yaml.md @@ -265,6 +265,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "greenplum"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "greenplum"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## Securing Greenplum Connection with SSL in OpenMetadata diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/hive/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/hive/yaml.md index ac72e013e195..effd8b469f82 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/hive/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/hive/yaml.md @@ -200,6 +200,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "hive"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "hive"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## Securing Hive Connection with SSL in OpenMetadata diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/impala/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/impala/yaml.md index 64152996eb0d..a48e1f251f7f 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/impala/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/impala/yaml.md @@ -162,6 +162,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "impala"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "impala"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} 
## Securing Impala Connection with SSL in OpenMetadata diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mariadb/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mariadb/yaml.md index 07249a928950..f8af8b52ea7f 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mariadb/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mariadb/yaml.md @@ -157,6 +157,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "mariadb"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "mariadb"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mssql/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mssql/yaml.md index ff97a9ed0bd1..84df0016e4b9 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mssql/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mssql/yaml.md @@ -187,6 +187,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "mssql"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "mssql"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mysql/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mysql/yaml.md index 2a43ff9464fe..5f95b40cab71 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mysql/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/mysql/yaml.md @@ -269,6 +269,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "mysql"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "mysql"} /%} 
+ {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## Securing MySQL Connection with SSL in OpenMetadata diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/oracle/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/oracle/yaml.md index 090bd1b842e7..5804c9277320 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/oracle/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/oracle/yaml.md @@ -233,6 +233,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "oracle"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "oracle"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## Lineage diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/pinotdb/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/pinotdb/yaml.md index 902b801f933e..c6189ba6f9d6 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/pinotdb/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/pinotdb/yaml.md @@ -151,6 +151,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "pinotdb"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "pinotdb"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/postgres/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/postgres/yaml.md index dff574be8ace..ebd9cc5c02fb 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/postgres/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/postgres/yaml.md @@ -292,6 +292,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "postgres"} /%} +{% partial 
file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "postgres"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## Securing Postgres Connection with SSL in OpenMetadata diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/presto/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/presto/yaml.md index c157d2aab190..54886fdd244f 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/presto/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/presto/yaml.md @@ -160,6 +160,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "presto"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "presto"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/redshift/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/redshift/yaml.md index e86dc9eb358a..13d1ae99cd25 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/redshift/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/redshift/yaml.md @@ -210,6 +210,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "redshift"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "redshift"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## Securing Redshift Connection with SSL in OpenMetadata diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/sap-hana/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/sap-hana/yaml.md index 7fed2269fa14..25f77cd4ce5c 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/sap-hana/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/sap-hana/yaml.md @@ 
-223,6 +223,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "sapHana"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "sapHana"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/singlestore/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/singlestore/yaml.md index 65296b55447e..be58d45c6111 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/singlestore/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/singlestore/yaml.md @@ -153,6 +153,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "singlestore"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "singlestore"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/snowflake/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/snowflake/yaml.md index a28418ff115e..125f49b62aa0 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/snowflake/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/snowflake/yaml.md @@ -275,6 +275,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "snowflake"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "snowflake"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/sqlite/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/sqlite/yaml.md index 2b003351bd93..5fbbc7cb1da4 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/sqlite/yaml.md +++ 
b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/sqlite/yaml.md @@ -160,6 +160,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "sqlite"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "sqlite"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## Lineage diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/synapse/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/synapse/yaml.md index 34d97cdcc7bb..bae5c613db8b 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/synapse/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/synapse/yaml.md @@ -205,6 +205,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "synapse"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "synapse"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/teradata/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/teradata/yaml.md index 2c179713a6fb..164778d26369 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/teradata/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/teradata/yaml.md @@ -114,4 +114,6 @@ source: {% partial file="/v1.6/connectors/yaml/data-profiler.md" variables={connector: "teradata"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "teradata"} /%} + {% partial file="/v1.6/connectors/yaml/data-quality.md" /%} diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/trino/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/trino/yaml.md index 335ab627ae60..2a7e50579a08 100644 --- 
a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/trino/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/trino/yaml.md @@ -224,6 +224,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "trino"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "trino"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## SSL Configuration diff --git a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/vertica/yaml.md b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/vertica/yaml.md index 9948f47fd75f..192da520fe2c 100644 --- a/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/vertica/yaml.md +++ b/openmetadata-docs/content/v1.6.x-SNAPSHOT/connectors/database/vertica/yaml.md @@ -197,6 +197,8 @@ source: {% partial file="/v1.5/connectors/yaml/data-profiler.md" variables={connector: "vertica"} /%} +{% partial file="/v1.5/connectors/yaml/auto-classification.md" variables={connector: "vertica"} /%} + {% partial file="/v1.5/connectors/yaml/data-quality.md" /%} ## dbt Integration diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json index bc803a2f0f56..2b8b52cc6c37 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceAutoClassificationPipeline.json @@ -68,20 +68,6 @@ "default": 80, "title": "Auto Classification Inference Confidence Level" }, - "profileSampleType": { - "$ref": "../entity/data/table.json#/definitions/profileSampleType", - "title": "Profile Sample Type" - }, - "profileSample": { - "description": "Percentage of data or no. 
of rows used to compute the profiler metrics and run data quality tests", - "type": "number", - "default": null, - "title": "Profile Sample" - }, - "samplingMethodType": { - "$ref": "../entity/data/table.json#/definitions/samplingMethodType", - "title": "Sampling Method Type" - }, "sampleDataCount": { "description": "Number of sample rows to ingest when 'Generate Sample Data' is enabled", "type": "integer", diff --git a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json index 2dbd5186980d..26c9aa600231 100644 --- a/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json +++ b/openmetadata-spec/src/main/resources/json/schema/metadataIngestion/databaseServiceProfilerPipeline.json @@ -88,12 +88,6 @@ "$ref": "../entity/data/table.json#/definitions/samplingMethodType", "title": "Sampling Method Type" }, - "sampleDataCount": { - "description": "Number of sample rows to ingest when 'Generate Sample Data' is enabled", - "type": "integer", - "default": 50, - "title": "Sample Data Rows Count" - }, "threadCount": { "description": "Number of threads to use during metric computations", "type": "number", diff --git a/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/autoClassification.md b/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/autoClassification.md index fc83c0ff3514..905546681538 100644 --- a/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/autoClassification.md +++ b/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/autoClassification.md @@ -79,7 +79,7 @@ $$ $$section ### Store Sample Data $(id="storeSampleData") -Set the Store Sample Data toggle to control whether to store sample data as part of Auto Classification workflow. 
If this is enabled, 100 rows will be ingested by default. You can update the number of rows in the "DatabaseServiceProfilerPipeline Advanced Config" section (i.e. `Sample Data Rows Count` setting). +Set the Store Sample Data toggle to control whether to store sample data as part of the Auto Classification workflow. If this is enabled, 50 rows will be ingested by default. You can update the number of rows in the "DatabaseServiceAutoClassificationPipeline Advanced Config" section (i.e. `Sample Data Rows Count` setting). If disabled, OpenMetadata will not store any sample data, but will still use it on-the-fly to compute the Auto Classification. $$ @@ -97,29 +97,6 @@ $$section Confidence level to use when inferring whether a column should be applied the classification or not (between 0 and 100). A number closer to 100 will yield less false positive but potentially more false negative. $$ -$$section -### Profile Sample Type $(id="profileSampleType") -The sample type can be set to either: - -* **Percentage**: this will use a percentage to sample the table (e.g. if table has 100 rows, and we set sample percentage tp 50%, the profiler will use 50 random rows to compute the metrics). -* **Row Count**: this will use a number of rows to sample the table (e.g. if table has 100 rows, and we set row count to 10, the profiler will use 10 random rows to compute the metrics). - $$ - -$$section -### Profile Sample $(id="profileSample") -Percentage of data or number of rows to use when sampling tables to compute the profiler metrics. By default (i.e. if left blank), the profiler will run against the entire table. -$$ - -$$section -### Sampling Method Type $(id="samplingMethodType") - -**This parameter is effective for Snowflake only** - -The sampling method type can be set to **BERNOULLI** or **SYSTEM**. You can find the difference of two values in the document of the Snowflake. 
When you choice **BERNOULLI**, it will scan full rows in the table even though small value is set at the **Profile Sample**. However, it has less restlictions than **SYSTEM**. - -If no option is chosen, the default is **BERNOULLI**. -$$ - $$section ### Sample Data Rows Count $(id="sampleDataCount") Set the number of rows to ingest when `Ingest Sample Data` toggle is on. Defaults to 50. diff --git a/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/profiler.md b/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/profiler.md index 4f716d3e44f9..ad121e0cf251 100644 --- a/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/profiler.md +++ b/openmetadata-ui/src/main/resources/ui/public/locales/en-US/Database/workflows/profiler.md @@ -103,11 +103,6 @@ The sampling method type can be set to **BERNOULLI** or **SYSTEM**. You can find If no option is chosen, the default is **BERNOULLI**. $$ -$$section -### Sample Data Rows Count $(id="sampleDataCount") -Set the number of rows to ingest when `Ingest Sample Data` toggle is on. Defaults to 50. -$$ - $$section ### Thread Count $(id="threadCount") Number of threads that will be used when computing the profiler metrics. A high number can have negative performance effect. 
From a5f1d80ab6eabc1d6754d8e755685bbc54aa0714 Mon Sep 17 00:00:00 2001 From: Pere Miquel Brull Date: Tue, 3 Dec 2024 17:58:50 +0100 Subject: [PATCH 2/2] deprecation notice --- .../upgrade/upgrade-prerequisites.md | 34 ++++--------------- 1 file changed, 7 insertions(+), 27 deletions(-) diff --git a/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md b/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md index eae2e9fcd8ab..3cdc5aecb60a 100644 --- a/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md +++ b/openmetadata-docs/content/partials/v1.6/deployment/upgrade/upgrade-prerequisites.md @@ -84,33 +84,6 @@ during the migration after bumping this value, you can increase them further. After the migration is finished, you can revert this changes. -# New Versioning System for Ingestion Docker Image - -We are excited to announce a recent change in our version tagging system for our Ingestion Docker images. This update aims to improve consistency and clarity in our versioning, aligning our Docker image tags with our Python PyPi package versions. - -### Ingestion Docker Image Tags - -To maintain consistency, our Docker images will now follow the same 4-digit versioning system as of Python Package versions. For example, a Docker image version might look like `1.0.0.0`. - -Additionally, we will continue to provide a 3-digit version tag (e.g., `1.0.0`) that will always point to the latest corresponding 4-digit image tag. This ensures ease of use for those who prefer a simpler version tag while still having access to the most recent updates. - -### Benefits - -**Consistency**: Both Python applications and Docker images will have the same versioning format, making it easier to track and manage versions. -**Clarity**: The 4-digit system provides a clear and detailed versioning structure, helping users understand the nature and scope of changes. 
-**Non-Breaking Change**: This update is designed to be non-disruptive. Existing Ingestions and dependencies will remain unaffected. - -#### Example - -Here’s an example of how the new versioning works: - -**Python Application Version**: `1.5.0.0` -**Docker Image Tags**: -- `1.5.0.0` (specific version) -- `1.5.0` (latest version in the 1.5.0.x series) - -We believe this update will bring greater consistency and clarity to our versioning system. As always, we value your feedback and welcome any questions or comments you may have. - # Backward Incompatible Changes ## 1.6.0 @@ -145,6 +118,13 @@ removing these properties as well. - If you still want to use the Auto PII Classification and sampling features, you can create the new workflow from the UI. +### Collate - Metadata Actions for ML Tagging - Deprecation Notice + +Since we are introducing the `Auto Classification` workflow, **we are going to remove in 1.7 the `ML Tagging` action** +from the Metadata Actions. That feature will be covered already by the `Auto Classification` workflow, which even brings +more flexibility by allowing the on-the-fly usage of the sample data for classification purposes without having to store +it in the database. + ### Service Spec for the Ingestion Framework This impacts users who maintain their own connectors for the ingestion framework that are **NOT** part of the