feat(integration/fivetran): Fivetran source ingestion integration #14
Changes from 6 commits
abe52ae
b04ff8c
3f392f4
0c295e2
b6897d1
7dddda3
81cf867
389c16f
561d2b5
d1a13f0
19234ee
2bba626
2e045d2
f17aaf2
af11bcc
5c2afd9
1958c3e
@@ -0,0 +1 @@
from datahub.ingestion.source.fivetran.fivetran import FivetranSource
@@ -0,0 +1,135 @@
import logging
from dataclasses import dataclass, field as dataclass_field
from typing import Dict, List, Optional

import pydantic
from pydantic import Field
from pydantic.class_validators import root_validator

from datahub.configuration.common import AllowDenyPattern, ConfigModel
from datahub.configuration.source_common import DEFAULT_ENV, DatasetSourceConfigMixin
from datahub.ingestion.source.state.stale_entity_removal_handler import (
    StaleEntityRemovalSourceReport,
    StatefulStaleMetadataRemovalConfig,
)
from datahub.ingestion.source.state.stateful_ingestion_base import (
    StatefulIngestionConfigBase,
)
from datahub.ingestion.source_config.sql.snowflake import BaseSnowflakeConfig

logger = logging.getLogger(__name__)


class Constant:
    """
    Keys used in the fivetran plugin.
    """

    ORCHESTRATOR = "fivetran"
    # Table column names
    SOURCE_SCHEMA_NAME = "source_schema_name"
    SOURCE_TABLE_NAME = "source_table_name"
    DESTINATION_SCHEMA_NAME = "destination_schema_name"
    DESTINATION_TABLE_NAME = "destination_table_name"
    SYNC_ID = "sync_id"
    MESSAGE_DATA = "message_data"
    TIME_STAMP = "time_stamp"
    STATUS = "status"
    USER_ID = "user_id"
    GIVEN_NAME = "given_name"
    FAMILY_NAME = "family_name"
    CONNECTOR_ID = "connector_id"
    CONNECTOR_NAME = "connector_name"
    CONNECTOR_TYPE_ID = "connector_type_id"
    PAUSED = "paused"
    SYNC_FREQUENCY = "sync_frequency"
    DESTINATION_ID = "destination_id"
    CONNECTING_USER_ID = "connecting_user_id"
    # Job status constants
    SUCCESSFUL = "SUCCESSFUL"
    FAILURE_WITH_TASK = "FAILURE_WITH_TASK"
    CANCELED = "CANCELED"


SUPPORTED_DATA_PLATFORM_MAPPING = {
    "postgres": "postgres",
    "snowflake": "snowflake",
    "mysql": "mysql",
}


class SnowflakeDestinationConfig(BaseSnowflakeConfig):
    database: str = Field(description="The fivetran connector log database.")
    log_schema: str = Field(description="The fivetran connector log schema.")


class FivetranLogConfig(ConfigModel):
    destination_platform: str = pydantic.Field(
        default="snowflake",
        description="The destination platform where fivetran connector log tables are dumped.",
    )
    snowflake_destination_config: Optional[SnowflakeDestinationConfig] = pydantic.Field(
        default=None,
        description="If destination platform is 'snowflake', provide snowflake configuration.",
    )

Review comment: let's rename it to
Reply: But this config is specific to snowflake destination only.
Reply: type is sufficient to understand

    @root_validator(pre=True)
    def validate_destination_platform_and_config(cls, values: Dict) -> Dict:
        # pre=True validators run before field defaults are applied, so use
        # .get() with the field's default to avoid a KeyError when the key is omitted.
        destination_platform = values.get("destination_platform", "snowflake")
        if destination_platform == "snowflake":
            if "snowflake_destination_config" not in values:
                raise ValueError(
                    "If destination platform is 'snowflake', user must provide snowflake destination configuration in the recipe."
                )
        else:
            raise ValueError(
                f"Destination platform '{destination_platform}' is not yet supported."
            )
        return values
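A quick sketch of how this root validator behaves (hypothetical usage, not part of this diff; the import path matches the one used elsewhere in this PR, and pydantic's ValidationError subclasses ValueError, so both failure modes surface as shown):

```python
# Hypothetical usage sketch of FivetranLogConfig.
from datahub.ingestion.source.fivetran.config import FivetranLogConfig

# Unsupported destination platform -> rejected by the root validator.
try:
    FivetranLogConfig.parse_obj({"destination_platform": "bigquery"})
except ValueError as e:
    print(e)  # Destination platform 'bigquery' is not yet supported.

# Snowflake destination without its nested config -> also rejected.
try:
    FivetranLogConfig.parse_obj({"destination_platform": "snowflake"})
except ValueError as e:
    print(e)
```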

@dataclass
class FivetranSourceReport(StaleEntityRemovalSourceReport):
    connectors_scanned: int = 0
    filtered_connectors: List[str] = dataclass_field(default_factory=list)

    def report_connectors_scanned(self, count: int = 1) -> None:
        self.connectors_scanned += count

    def report_connectors_dropped(self, model: str) -> None:
        self.filtered_connectors.append(model)


class PlatformDetail(ConfigModel):
    platform_instance: Optional[str] = pydantic.Field(
        default=None,
        description="The instance of the platform that all assets produced by this recipe belong to.",
    )
    env: str = pydantic.Field(
        default=DEFAULT_ENV,
        description="The environment that all assets produced by this DataHub ingestion source belong to.",
    )


class FivetranSourceConfig(StatefulIngestionConfigBase, DatasetSourceConfigMixin):
    fivetran_log_config: FivetranLogConfig = pydantic.Field(
        description="Fivetran log connector destination server configurations.",
    )
    connector_patterns: AllowDenyPattern = Field(
        default=AllowDenyPattern.allow_all(),
        description="Regex patterns for connectors to filter in ingestion.",
    )
    # Configuration for stateful ingestion
    stateful_ingestion: Optional[StatefulStaleMetadataRemovalConfig] = pydantic.Field(
        default=None, description="Fivetran Stateful Ingestion Config."
    )
    # Mapping from a Fivetran connector's source datasets to platform instance
    sources_to_platform_instance: Dict[str, PlatformDetail] = pydantic.Field(
        default={},
        description="A mapping from the connector's source datasets to platform instance. Use connector id as key.",
    )
    # Mapping from a Fivetran destination to platform instance
    destination_to_platform_instance: Dict[str, PlatformDetail] = pydantic.Field(
        default={},
        description="A mapping of destination dataset to platform instance. Use destination id as key.",
    )
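To make the overall shape concrete, here is a hedged sketch of a payload this model should accept (all ids, patterns, and credentials are invented, and the nested Snowflake connection fields are assumed from BaseSnowflakeConfig):

```python
# Illustrative only: every id and credential below is made up.
config = FivetranSourceConfig.parse_obj(
    {
        "fivetran_log_config": {
            "destination_platform": "snowflake",
            "snowflake_destination_config": {
                # Assumed BaseSnowflakeConfig fields; check that class for
                # the authoritative list.
                "account_id": "example-account",
                "username": "example_user",
                "password": "example_password",
                "warehouse": "example_wh",
                "role": "example_role",
                "database": "FIVETRAN_LOG_DATABASE",
                "log_schema": "FIVETRAN_LOG",
            },
        },
        "connector_patterns": {"allow": ["postgres_connector.*"]},
        "sources_to_platform_instance": {
            "connector_id_1": {"platform_instance": "my_postgres", "env": "DEV"}
        },
        "destination_to_platform_instance": {
            "destination_id_1": {"platform_instance": "my_snowflake", "env": "PROD"}
        },
    }
)
```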
@@ -0,0 +1,165 @@
import json
import logging
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple

from sqlalchemy import create_engine

from datahub.ingestion.source.fivetran.config import (
    Constant,
    FivetranSourceConfig,
    FivetranSourceReport,
)
from datahub.ingestion.source.fivetran.fivetran_query import FivetranLogQuery

logger: logging.Logger = logging.getLogger(__name__)


@dataclass
class Connector:
    connector_id: str
    connector_name: str
    connector_type: str
    paused: bool
    sync_frequency: int
    destination_id: str
    user_name: str
    source_tables: List[str]
    destination_tables: List[str]
    jobs: List["Job"]


@dataclass
class Job:
    job_id: str
    start_time: int
    end_time: int
    status: str
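For reference, a hand-built instance showing the shape these dataclasses capture (every value below is invented):

```python
# Invented sample data illustrating the Connector/Job shapes.
job = Job(
    job_id="sync_1",
    start_time=1_700_000_000,  # epoch seconds
    end_time=1_700_000_060,
    status="SUCCESSFUL",
)
connector = Connector(
    connector_id="connector_id_1",
    connector_name="postgres_connector",
    connector_type="postgres",
    paused=False,
    sync_frequency=1440,  # Fivetran reports sync frequency in minutes
    destination_id="destination_id_1",
    user_name="Jane Doe",
    source_tables=["public.employee"],
    destination_tables=["postgres_public.employee"],
    jobs=[job],
)
```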

class FivetranLogDataDictionary:
    def __init__(
        self, config: FivetranSourceConfig, report: FivetranSourceReport
    ) -> None:
        self.logger = logger
        self.config = config
        self.report = report
        self.engine = self._get_log_destination_engine()

    def _get_log_destination_engine(self) -> Any:
        destination_platform = self.config.fivetran_log_config.destination_platform
        engine = None
        if destination_platform == "snowflake":
            snowflake_destination_config = (
                self.config.fivetran_log_config.snowflake_destination_config
            )
            if snowflake_destination_config is not None:
                engine = create_engine(
                    snowflake_destination_config.get_sql_alchemy_url(),
                    **snowflake_destination_config.get_options(),
                )
                engine.execute(
                    FivetranLogQuery.use_schema(
                        snowflake_destination_config.database,
                        snowflake_destination_config.log_schema,
                    )
                )
        return engine
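FivetranLogQuery is defined in fivetran_query.py, which is not part of this hunk; a plausible sketch of the use_schema helper called above, assuming it simply emits a USE SCHEMA statement:

```python
# Hypothetical stand-in for the real helper in fivetran_query.py.
class FivetranLogQuery:
    @staticmethod
    def use_schema(db_name: str, schema_name: str) -> str:
        # Scope all subsequent queries on this engine to the Fivetran log schema.
        return f'USE SCHEMA "{db_name}"."{schema_name}"'
```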
    def _query(self, query: str) -> List[Dict]:
        logger.debug(f"Query : {query}")
        resp = self.engine.execute(query)
        return [row for row in resp]

    def _get_table_lineage(self, connector_id: str) -> Tuple[List[str], List[str]]:
        table_lineage = self._query(
            FivetranLogQuery.get_table_lineage_query(connector_id=connector_id)
        )
        source_tables: List[str] = []
        destination_tables: List[str] = []
        for each in table_lineage:
            source_tables.append(
                f"{each[Constant.SOURCE_SCHEMA_NAME]}.{each[Constant.SOURCE_TABLE_NAME]}"
            )
            destination_tables.append(
                f"{each[Constant.DESTINATION_SCHEMA_NAME]}.{each[Constant.DESTINATION_TABLE_NAME]}"
            )
        return source_tables, destination_tables

    def _get_jobs_list(self, connector_id: str) -> List[Job]:
        jobs: List[Job] = []
        sync_start_logs = {
            row[Constant.SYNC_ID]: row
            for row in self._query(
                FivetranLogQuery.get_sync_start_logs_query(connector_id=connector_id)
            )
        }
        sync_end_logs = {
            row[Constant.SYNC_ID]: row
            for row in self._query(
                FivetranLogQuery.get_sync_end_logs_query(connector_id=connector_id)
            )
        }
        for sync_id in sync_start_logs.keys():
            if sync_end_logs.get(sync_id) is None:
                # If there is no sync-end event log for this sync id, the sync is still in progress.
                continue

            message_data = json.loads(sync_end_logs[sync_id][Constant.MESSAGE_DATA])
            if isinstance(message_data, str):
                # Sometimes message_data contains a json string inside a string,
                # e.g. '"{\"status\":\"SUCCESSFUL\"}"',
                # so json loads needs to be applied twice.
                message_data = json.loads(message_data)

            jobs.append(
                Job(
                    job_id=sync_id,
                    start_time=round(
                        sync_start_logs[sync_id][Constant.TIME_STAMP].timestamp()
                    ),
                    end_time=round(
                        sync_end_logs[sync_id][Constant.TIME_STAMP].timestamp()
                    ),
                    status=message_data[Constant.STATUS],
                )
            )
        return jobs
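The double json.loads is easy to verify in isolation:

```python
import json

# A JSON string whose payload is itself JSON, as in the comment above.
raw = '"{\\"status\\":\\"SUCCESSFUL\\"}"'
once = json.loads(raw)    # -> '{"status":"SUCCESSFUL"}' (still a str)
twice = json.loads(once)  # -> {'status': 'SUCCESSFUL'}
assert twice["status"] == "SUCCESSFUL"
```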

    def _get_user_name(self, user_id: str) -> str:
        user_details = self._query(FivetranLogQuery.get_user_query(user_id=user_id))[0]
        return (
            f"{user_details[Constant.GIVEN_NAME]} {user_details[Constant.FAMILY_NAME]}"
        )

    def get_connectors_list(self) -> List[Connector]:
        connectors: List[Connector] = []
        connector_list = self._query(FivetranLogQuery.get_connectors_query())
        for connector in connector_list:
            if not self.config.connector_patterns.allowed(
                connector[Constant.CONNECTOR_NAME]
            ):
                self.report.report_connectors_dropped(connector[Constant.CONNECTOR_ID])
                continue

            source_tables, destination_tables = self._get_table_lineage(
                connector[Constant.CONNECTOR_ID]
            )

            connectors.append(
                Connector(
                    connector_id=connector[Constant.CONNECTOR_ID],
                    connector_name=connector[Constant.CONNECTOR_NAME],
                    connector_type=connector[Constant.CONNECTOR_TYPE_ID],
                    paused=connector[Constant.PAUSED],
                    sync_frequency=connector[Constant.SYNC_FREQUENCY],
                    destination_id=connector[Constant.DESTINATION_ID],
                    user_name=self._get_user_name(
                        connector[Constant.CONNECTING_USER_ID]
                    ),
                    source_tables=source_tables,
                    destination_tables=destination_tables,
                    jobs=self._get_jobs_list(connector[Constant.CONNECTOR_ID]),
                )
            )
        return connectors
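And a hypothetical wiring of the class end to end, assuming a valid FivetranSourceConfig like the earlier sketch (this mirrors how FivetranSource would presumably drive it, which is not shown in this hunk):

```python
# Hypothetical usage; `config` is a FivetranSourceConfig instance.
report = FivetranSourceReport()
data_dictionary = FivetranLogDataDictionary(config=config, report=report)
for connector in data_dictionary.get_connectors_list():
    report.report_connectors_scanned()
    print(connector.connector_name, connector.connector_type, len(connector.jobs))
```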
Review comment: why is this needed?
Reply: We have now added a unit test case for the fivetran source.