-
Notifications
You must be signed in to change notification settings - Fork 2.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(ingest/unity): GE Profiling #8951
Changes from 5 commits
38f216a
142b359
0baffb5
7222a43
05990ed
f1cd4b6
3de31ba
ff8f72d
807ab59
ebefe1c
3e71c04
5d83435
2cb0a7b
56ae954
0f38f95
8d450f3
5f93e9d
9aaf271
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -67,7 +67,6 @@ def generate_profiles( | |||||
self, | ||||||
requests: List[TableProfilerRequest], | ||||||
max_workers: int, | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Review comment: We can force these to be keyword-only arguments (kwargs) for clarity.
Suggested change
|
||||||
db_name: Optional[str] = None, | ||||||
platform: Optional[str] = None, | ||||||
profiler_args: Optional[Dict] = None, | ||||||
) -> Iterable[Tuple[GEProfilerRequest, Optional[DatasetProfileClass]]]: | ||||||
|
@@ -92,7 +91,7 @@ def generate_profiles( | |||||
return | ||||||
|
||||||
# Otherwise, if column level profiling is enabled, use GE profiler. | ||||||
ge_profiler = self.get_profiler_instance(db_name) | ||||||
ge_profiler = self.get_profiler_instance() | ||||||
yield from ge_profiler.generate_profiles( | ||||||
ge_profile_requests, max_workers, platform, profiler_args | ||||||
) | ||||||
|
@@ -108,9 +107,7 @@ def get_inspectors(self) -> Iterable[Inspector]: | |||||
inspector = inspect(conn) | ||||||
yield inspector | ||||||
|
||||||
def get_profiler_instance( | ||||||
self, db_name: Optional[str] = None | ||||||
asikowitz marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
) -> "DatahubGEProfiler": | ||||||
def get_profiler_instance(self) -> "DatahubGEProfiler": | ||||||
logger.debug(f"Getting profiler instance from {self.platform}") | ||||||
url = self.config.get_sql_alchemy_url() | ||||||
|
||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,19 @@ | ||
import os | ||
from datetime import datetime, timedelta, timezone | ||
from typing import Any, Dict, Optional | ||
from typing import Any, Dict, Optional, Union | ||
from urllib.parse import urlparse | ||
|
||
import pydantic | ||
from pydantic import Field | ||
from typing_extensions import Literal | ||
|
||
from datahub.configuration.common import AllowDenyPattern, ConfigModel | ||
from datahub.configuration.source_common import DatasetSourceConfigMixin | ||
from datahub.configuration.validate_field_removal import pydantic_removed_field | ||
from datahub.configuration.validate_field_rename import pydantic_renamed_field | ||
from datahub.ingestion.source.ge_data_profiler import DATABRICKS | ||
from datahub.ingestion.source.ge_profiling_config import GEProfilingConfig | ||
from datahub.ingestion.source.sql.sql_config import SQLCommonConfig, make_sqlalchemy_uri | ||
from datahub.ingestion.source.state.stale_entity_removal_handler import ( | ||
StatefulStaleMetadataRemovalConfig, | ||
) | ||
|
@@ -23,25 +28,13 @@ | |
) | ||
|
||
|
||
class UnityCatalogProfilerConfig(ConfigModel): | ||
# TODO: Reduce duplicate code with DataLakeProfilerConfig, GEProfilingConfig, SQLAlchemyConfig | ||
enabled: bool = Field( | ||
default=False, description="Whether profiling should be done." | ||
) | ||
operation_config: OperationConfig = Field( | ||
default_factory=OperationConfig, | ||
description="Experimental feature. To specify operation configs.", | ||
) | ||
class UnityCatalogConfig(ConfigModel): | ||
method: str | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Review comment: How does this show up in the docs? Does this need a `Field(description="docs")`?
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Reply: Looks like it doesn't show up at all. I'll add a description, but in general our docs support for discriminated unions is not very good -- we don't show which type supports which options. I'll update the example recipes to help here. |
||
|
||
warehouse_id: Optional[str] = Field( | ||
default=None, description="SQL Warehouse id, for running profiling queries." | ||
) | ||
|
||
profile_table_level_only: bool = Field( | ||
default=False, | ||
description="Whether to perform profiling at table-level only or include column-level profiling as well.", | ||
) | ||
|
||
pattern: AllowDenyPattern = Field( | ||
default=AllowDenyPattern.allow_all(), | ||
description=( | ||
|
@@ -51,6 +44,24 @@ class UnityCatalogProfilerConfig(ConfigModel): | |
), | ||
) | ||
|
||
|
||
class UnityCatalogAnalyzeProfilerConfig(UnityCatalogConfig): | ||
method: Literal["analyze"] = "analyze" | ||
|
||
# TODO: Reduce duplicate code with DataLakeProfilerConfig, GEProfilingConfig, SQLAlchemyConfig | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Review comment: Yes, please. |
||
enabled: bool = Field( | ||
default=False, description="Whether profiling should be done." | ||
) | ||
operation_config: OperationConfig = Field( | ||
default_factory=OperationConfig, | ||
description="Experimental feature. To specify operation configs.", | ||
) | ||
|
||
profile_table_level_only: bool = Field( | ||
default=False, | ||
description="Whether to perform profiling at table-level only or include column-level profiling as well.", | ||
) | ||
|
||
call_analyze: bool = Field( | ||
default=True, | ||
description=( | ||
|
@@ -82,7 +93,12 @@ def include_columns(self): | |
return not self.profile_table_level_only | ||
|
||
|
||
class UnityCatalogGEProfilerConfig(UnityCatalogConfig, GEProfilingConfig): | ||
method: Literal["ge"] = "ge" | ||
|
||
|
||
class UnityCatalogSourceConfig( | ||
SQLCommonConfig, | ||
StatefulIngestionConfigBase, | ||
BaseUsageConfig, | ||
DatasetSourceConfigMixin, | ||
|
@@ -122,10 +138,6 @@ class UnityCatalogSourceConfig( | |
default=AllowDenyPattern.allow_all(), | ||
description="Regex patterns for tables to filter in ingestion. Specify regex to match the entire table name in `catalog.schema.table` format. e.g. to match all tables starting with customer in Customer catalog and public schema, use the regex `Customer\\.public\\.customer.*`.", | ||
) | ||
domain: Dict[str, AllowDenyPattern] = Field( | ||
default=dict(), | ||
description='Attach domains to catalogs, schemas or tables during ingestion using regex patterns. Domain key can be a guid like *urn:li:domain:ec428203-ce86-4db3-985d-5a8ee6df32ba* or a string like "Marketing".) If you provide strings, then datahub will attempt to resolve this name to a guid, and will error out if this fails. There can be multiple domain keys specified.', | ||
) | ||
|
||
include_table_lineage: bool = pydantic.Field( | ||
default=True, | ||
|
@@ -156,15 +168,34 @@ class UnityCatalogSourceConfig( | |
description="Generate usage statistics.", | ||
) | ||
|
||
profiling: UnityCatalogProfilerConfig = Field( | ||
default=UnityCatalogProfilerConfig(), description="Data profiling configuration" | ||
profiling: Union[UnityCatalogGEProfilerConfig, UnityCatalogAnalyzeProfilerConfig] = Field( # type: ignore | ||
default=UnityCatalogGEProfilerConfig(), | ||
description="Data profiling configuration", | ||
discriminator="method", | ||
) | ||
|
||
scheme: str = DATABRICKS | ||
|
||
def get_sql_alchemy_url(self): | ||
return make_sqlalchemy_uri( | ||
scheme=self.scheme, | ||
username="token", | ||
password=self.token, | ||
at=urlparse(self.workspace_url).netloc, | ||
db=None, | ||
uri_opts={ | ||
"http_path": f"/sql/1.0/warehouses/{self.profiling.warehouse_id}" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Review comment: This assumes use of a SQL warehouse. It looks like `http_path` may take different formats.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Reply: Hmm, good point. It's going to be a bit annoying to support both profilers :|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
Reply: Going to hold off on this... I want to get this in before I never get to it again. |
||
}, | ||
) | ||
|
||
def is_profiling_enabled(self) -> bool: | ||
return self.profiling.enabled and is_profiling_enabled( | ||
self.profiling.operation_config | ||
) | ||
|
||
def is_ge_profiling(self) -> bool: | ||
return self.profiling.method == "ge" | ||
|
||
stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field( | ||
default=None, description="Unity Catalog Stateful Ingestion Config." | ||
) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
let's remove this line