From 3a5e0dedb94b9f83a3fd9331061fd927422d13f8 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 11:50:38 +0200 Subject: [PATCH 01/32] Initialization of dataset module --- backend/dataall/modules/datasets/__init__.py | 18 ++++++++++++++++++ .../dataall/modules/datasets/api/__init__.py | 1 + 2 files changed, 19 insertions(+) create mode 100644 backend/dataall/modules/datasets/__init__.py create mode 100644 backend/dataall/modules/datasets/api/__init__.py diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py new file mode 100644 index 000000000..67298a06e --- /dev/null +++ b/backend/dataall/modules/datasets/__init__.py @@ -0,0 +1,18 @@ +"""Contains the code related to datasets""" +import logging +from dataall.modules.loader import ModuleInterface, ImportMode + +log = logging.getLogger(__name__) + + +class DatasetApiModuleInterface(ModuleInterface): + """Implements ModuleInterface for the datasets GraphQL lambda""" + + @classmethod + def is_supported(cls, modes): + return ImportMode.API in modes + + def __init__(self): + import dataall.modules.datasets.api + log.info("API of datasets has been imported") + diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py new file mode 100644 index 000000000..13cf9331d --- /dev/null +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -0,0 +1 @@ +"""The GraphQL schema of datasets and related functionality""" From a50a02f39b05d64d1e6bc410afe956a66eeb5602 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 11:55:30 +0200 Subject: [PATCH 02/32] Refactoring of datasets Moved dataset table column to modules --- backend/dataall/api/Objects/DatasetTable/schema.py | 2 +- backend/dataall/api/Objects/__init__.py | 1 - .../datasets/api}/DatasetTableColumn/__init__.py | 2 +- .../datasets/api}/DatasetTableColumn/input_types.py | 2 +- .../datasets/api}/DatasetTableColumn/mutations.py | 7 
+++++-- .../datasets/api}/DatasetTableColumn/queries.py | 4 ++-- .../datasets/api}/DatasetTableColumn/resolvers.py | 10 +++++----- .../datasets/api}/DatasetTableColumn/schema.py | 4 ++-- backend/dataall/modules/datasets/api/__init__.py | 5 +++++ 9 files changed, 22 insertions(+), 15 deletions(-) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/__init__.py (70%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/input_types.py (94%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/mutations.py (78%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/queries.py (74%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/resolvers.py (94%) rename backend/dataall/{api/Objects => modules/datasets/api}/DatasetTableColumn/schema.py (93%) diff --git a/backend/dataall/api/Objects/DatasetTable/schema.py b/backend/dataall/api/Objects/DatasetTable/schema.py index dc1cffcb4..0436c0f2e 100644 --- a/backend/dataall/api/Objects/DatasetTable/schema.py +++ b/backend/dataall/api/Objects/DatasetTable/schema.py @@ -1,4 +1,4 @@ -from ..DatasetTableColumn.resolvers import list_table_columns +from dataall.modules.datasets.api.DatasetTableColumn.resolvers import list_table_columns from ... 
import gql from .resolvers import * from ...constants import GraphQLEnumMapper diff --git a/backend/dataall/api/Objects/__init__.py b/backend/dataall/api/Objects/__init__.py index 060f2ba6e..43d5e0833 100644 --- a/backend/dataall/api/Objects/__init__.py +++ b/backend/dataall/api/Objects/__init__.py @@ -18,7 +18,6 @@ Environment, Activity, DatasetTable, - DatasetTableColumn, Dataset, Group, Principal, diff --git a/backend/dataall/api/Objects/DatasetTableColumn/__init__.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py similarity index 70% rename from backend/dataall/api/Objects/DatasetTableColumn/__init__.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py index dfa46b264..691b10331 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/__init__.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py @@ -1,4 +1,4 @@ -from . import ( +from dataall.modules.datasets.api.DatasetTableColumn import ( input_types, mutations, queries, diff --git a/backend/dataall/api/Objects/DatasetTableColumn/input_types.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py similarity index 94% rename from backend/dataall/api/Objects/DatasetTableColumn/input_types.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py index 24fbbdbca..745e7f271 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/input_types.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py @@ -1,4 +1,4 @@ -from ... 
import gql +from dataall.api import gql DatasetTableColumnFilter = gql.InputType( name='DatasetTableColumnFilter', diff --git a/backend/dataall/api/Objects/DatasetTableColumn/mutations.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py similarity index 78% rename from backend/dataall/api/Objects/DatasetTableColumn/mutations.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py index 012d83ea7..f7b682e3d 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/mutations.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py @@ -1,5 +1,8 @@ -from ... import gql -from .resolvers import * +from dataall.api import gql +from dataall.modules.datasets.api.DatasetTableColumn.resolvers import ( + sync_table_columns, + update_table_column +) syncDatasetTableColumns = gql.MutationField( name='syncDatasetTableColumns', diff --git a/backend/dataall/api/Objects/DatasetTableColumn/queries.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py similarity index 74% rename from backend/dataall/api/Objects/DatasetTableColumn/queries.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py index 4f5f05646..0a08e37b6 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/queries.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py @@ -1,5 +1,5 @@ -from ... 
import gql -from .resolvers import * +from dataall.api import gql +from dataall.modules.datasets.api.DatasetTableColumn.resolvers import list_table_columns listDatasetTableColumns = gql.QueryField( name='listDatasetTableColumns', diff --git a/backend/dataall/api/Objects/DatasetTableColumn/resolvers.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py similarity index 94% rename from backend/dataall/api/Objects/DatasetTableColumn/resolvers.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py index 88bf2c728..a7a1bf5f4 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/resolvers.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py @@ -1,10 +1,10 @@ from sqlalchemy import or_ -from .... import db -from ....api.context import Context -from ....aws.handlers.service_handlers import Worker -from ....db import paginate, permissions, models -from ....db.api import ResourcePolicy +from dataall import db +from dataall.api.context import Context +from dataall.aws.handlers.service_handlers import Worker +from dataall.db import paginate, permissions, models +from dataall.db.api import ResourcePolicy def list_table_columns( diff --git a/backend/dataall/api/Objects/DatasetTableColumn/schema.py b/backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py similarity index 93% rename from backend/dataall/api/Objects/DatasetTableColumn/schema.py rename to backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py index d571fc9a6..8772e99b7 100644 --- a/backend/dataall/api/Objects/DatasetTableColumn/schema.py +++ b/backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py @@ -1,5 +1,5 @@ -from ... 
import gql -from .resolvers import * +from dataall.api import gql +from dataall.modules.datasets.api.DatasetTableColumn.resolvers import resolve_terms DatasetTableColumn = gql.ObjectType( diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index 13cf9331d..f79e9a30c 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -1 +1,6 @@ """The GraphQL schema of datasets and related functionality""" +from dataall.modules.datasets.api import ( + DatasetTableColumn +) + +__all__ = ["DatasetTableColumn"] From be1498689d48aa63aae3eba763635f8af800148b Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 12:09:05 +0200 Subject: [PATCH 03/32] Refactoring of datasets Renamed table column to the python's convention format --- backend/dataall/api/Objects/DatasetTable/schema.py | 2 +- backend/dataall/modules/datasets/api/__init__.py | 4 ++-- .../api/{DatasetTableColumn => table_column}/__init__.py | 2 +- .../api/{DatasetTableColumn => table_column}/input_types.py | 0 .../api/{DatasetTableColumn => table_column}/mutations.py | 2 +- .../api/{DatasetTableColumn => table_column}/queries.py | 2 +- .../api/{DatasetTableColumn => table_column}/resolvers.py | 0 .../api/{DatasetTableColumn => table_column}/schema.py | 2 +- 8 files changed, 7 insertions(+), 7 deletions(-) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/__init__.py (70%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/input_types.py (100%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/mutations.py (89%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/queries.py (80%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => table_column}/resolvers.py (100%) rename backend/dataall/modules/datasets/api/{DatasetTableColumn => 
table_column}/schema.py (95%) diff --git a/backend/dataall/api/Objects/DatasetTable/schema.py b/backend/dataall/api/Objects/DatasetTable/schema.py index 0436c0f2e..74d413818 100644 --- a/backend/dataall/api/Objects/DatasetTable/schema.py +++ b/backend/dataall/api/Objects/DatasetTable/schema.py @@ -1,4 +1,4 @@ -from dataall.modules.datasets.api.DatasetTableColumn.resolvers import list_table_columns +from dataall.modules.datasets.api.table_column.resolvers import list_table_columns from ... import gql from .resolvers import * from ...constants import GraphQLEnumMapper diff --git a/backend/dataall/modules/datasets/api/__init__.py b/backend/dataall/modules/datasets/api/__init__.py index f79e9a30c..538df0734 100644 --- a/backend/dataall/modules/datasets/api/__init__.py +++ b/backend/dataall/modules/datasets/api/__init__.py @@ -1,6 +1,6 @@ """The GraphQL schema of datasets and related functionality""" from dataall.modules.datasets.api import ( - DatasetTableColumn + table_column ) -__all__ = ["DatasetTableColumn"] +__all__ = ["table_column"] diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py b/backend/dataall/modules/datasets/api/table_column/__init__.py similarity index 70% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py rename to backend/dataall/modules/datasets/api/table_column/__init__.py index 691b10331..070301f58 100644 --- a/backend/dataall/modules/datasets/api/DatasetTableColumn/__init__.py +++ b/backend/dataall/modules/datasets/api/table_column/__init__.py @@ -1,4 +1,4 @@ -from dataall.modules.datasets.api.DatasetTableColumn import ( +from dataall.modules.datasets.api.table_column import ( input_types, mutations, queries, diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py b/backend/dataall/modules/datasets/api/table_column/input_types.py similarity index 100% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/input_types.py rename to 
backend/dataall/modules/datasets/api/table_column/input_types.py diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py b/backend/dataall/modules/datasets/api/table_column/mutations.py similarity index 89% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py rename to backend/dataall/modules/datasets/api/table_column/mutations.py index f7b682e3d..0fc5a7d87 100644 --- a/backend/dataall/modules/datasets/api/DatasetTableColumn/mutations.py +++ b/backend/dataall/modules/datasets/api/table_column/mutations.py @@ -1,5 +1,5 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetTableColumn.resolvers import ( +from dataall.modules.datasets.api.table_column.resolvers import ( sync_table_columns, update_table_column ) diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py b/backend/dataall/modules/datasets/api/table_column/queries.py similarity index 80% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py rename to backend/dataall/modules/datasets/api/table_column/queries.py index 0a08e37b6..2c29e94b7 100644 --- a/backend/dataall/modules/datasets/api/DatasetTableColumn/queries.py +++ b/backend/dataall/modules/datasets/api/table_column/queries.py @@ -1,5 +1,5 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetTableColumn.resolvers import list_table_columns +from dataall.modules.datasets.api.table_column.resolvers import list_table_columns listDatasetTableColumns = gql.QueryField( name='listDatasetTableColumns', diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py similarity index 100% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/resolvers.py rename to backend/dataall/modules/datasets/api/table_column/resolvers.py diff --git a/backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py 
b/backend/dataall/modules/datasets/api/table_column/schema.py similarity index 95% rename from backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py rename to backend/dataall/modules/datasets/api/table_column/schema.py index 8772e99b7..9730b70b9 100644 --- a/backend/dataall/modules/datasets/api/DatasetTableColumn/schema.py +++ b/backend/dataall/modules/datasets/api/table_column/schema.py @@ -1,5 +1,5 @@ from dataall.api import gql -from dataall.modules.datasets.api.DatasetTableColumn.resolvers import resolve_terms +from dataall.modules.datasets.api.table_column.resolvers import resolve_terms DatasetTableColumn = gql.ObjectType( From 06f82ad80386f53780af735571f3b7eb66497594 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 12:16:34 +0200 Subject: [PATCH 04/32] Refactoring of datasets Added dataset module to config.json --- config.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config.json b/config.json index e0a9d85d0..4aed5ef87 100644 --- a/config.json +++ b/config.json @@ -2,6 +2,9 @@ "modules": { "notebooks": { "active": true + }, + "datasets": { + "active": true } } } \ No newline at end of file From 38145ae637a2084c1bb198a1fa76a395bd1c39b5 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 12:32:00 +0200 Subject: [PATCH 05/32] Fixed leftover in loader --- backend/dataall/modules/loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backend/dataall/modules/loader.py b/backend/dataall/modules/loader.py index 95aa2083a..aa4a656d4 100644 --- a/backend/dataall/modules/loader.py +++ b/backend/dataall/modules/loader.py @@ -57,7 +57,7 @@ def load_modules(modes: List[ImportMode]) -> None: log.info(f"Module {name} is not active. 
Skipping...") continue - if active.lower() == "true" and not _import_module(name): + if not _import_module(name): raise ValueError(f"Couldn't find module {name} under modules directory") log.info(f"Module {name} is loaded") From f0e146aa2bf5cfb2156a369f9690adb9cfda9c09 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 13:56:04 +0200 Subject: [PATCH 06/32] Dataset refactoring Moved database table service --- .../api/Objects/DatasetProfiling/resolvers.py | 3 ++- .../api/Objects/DatasetTable/resolvers.py | 25 ++++++++++--------- backend/dataall/aws/handlers/glue.py | 5 ++-- backend/dataall/aws/handlers/redshift.py | 3 ++- backend/dataall/db/api/__init__.py | 1 - .../datasets/api/table_column/resolvers.py | 7 +++--- .../modules/datasets/services/__init__.py | 1 + .../datasets/services}/dataset_table.py | 9 +++---- .../subscriptions/subscription_service.py | 5 ++-- backend/dataall/tasks/tables_syncer.py | 3 ++- tests/api/test_dataset_table.py | 5 ++-- 11 files changed, 36 insertions(+), 31 deletions(-) create mode 100644 backend/dataall/modules/datasets/services/__init__.py rename backend/dataall/{db/api => modules/datasets/services}/dataset_table.py (98%) diff --git a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py b/backend/dataall/api/Objects/DatasetProfiling/resolvers.py index 678a8cba6..c391a1a8c 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py +++ b/backend/dataall/api/Objects/DatasetProfiling/resolvers.py @@ -6,6 +6,7 @@ from ....aws.handlers.sts import SessionHelper from ....db import api, permissions, models from ....db.api import ResourcePolicy +from dataall.modules.datasets.services.dataset_table import DatasetTable log = logging.getLogger(__name__) @@ -97,7 +98,7 @@ def get_last_table_profiling_run(context: Context, source, tableUri=None): if run: if not run.results: - table = api.DatasetTable.get_dataset_table_by_uri(session, tableUri) + table = DatasetTable.get_dataset_table_by_uri(session, tableUri) 
dataset = api.Dataset.get_dataset_by_uri(session, table.datasetUri) environment = api.Environment.get_environment_by_uri( session, dataset.environmentUri diff --git a/backend/dataall/api/Objects/DatasetTable/resolvers.py b/backend/dataall/api/Objects/DatasetTable/resolvers.py index 9ea811411..854b99de9 100644 --- a/backend/dataall/api/Objects/DatasetTable/resolvers.py +++ b/backend/dataall/api/Objects/DatasetTable/resolvers.py @@ -13,13 +13,14 @@ from ....db.api import ResourcePolicy, Glossary from ....searchproxy import indexers from ....utils import json_utils +from dataall.modules.datasets.services.dataset_table import DatasetTable log = logging.getLogger(__name__) def create_table(context, source, datasetUri: str = None, input: dict = None): with context.engine.scoped_session() as session: - table = db.api.DatasetTable.create_dataset_table( + table = DatasetTable.create_dataset_table( session=session, username=context.username, groups=context.groups, @@ -37,7 +38,7 @@ def list_dataset_tables(context, source, filter: dict = None): if not filter: filter = {} with context.engine.scoped_session() as session: - return db.api.DatasetTable.list_dataset_tables( + return DatasetTable.list_dataset_tables( session=session, username=context.username, groups=context.groups, @@ -49,8 +50,8 @@ def list_dataset_tables(context, source, filter: dict = None): def get_table(context, source: models.Dataset, tableUri: str = None): with context.engine.scoped_session() as session: - table = db.api.DatasetTable.get_dataset_table_by_uri(session, tableUri) - return db.api.DatasetTable.get_dataset_table( + table = DatasetTable.get_dataset_table_by_uri(session, tableUri) + return DatasetTable.get_dataset_table( session=session, username=context.username, groups=context.groups, @@ -64,14 +65,14 @@ def get_table(context, source: models.Dataset, tableUri: str = None): def update_table(context, source, tableUri: str = None, input: dict = None): with context.engine.scoped_session() as session: 
- table = db.api.DatasetTable.get_dataset_table_by_uri(session, tableUri) + table = DatasetTable.get_dataset_table_by_uri(session, tableUri) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) input['table'] = table input['tableUri'] = table.tableUri - db.api.DatasetTable.update_dataset_table( + DatasetTable.update_dataset_table( session=session, username=context.username, groups=context.groups, @@ -85,8 +86,8 @@ def update_table(context, source, tableUri: str = None, input: dict = None): def delete_table(context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table = db.api.DatasetTable.get_dataset_table_by_uri(session, tableUri) - db.api.DatasetTable.delete_dataset_table( + table = DatasetTable.get_dataset_table_by_uri(session, tableUri) + DatasetTable.delete_dataset_table( session=session, username=context.username, groups=context.groups, @@ -102,7 +103,7 @@ def delete_table(context, source, tableUri: str = None): def preview(context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, tableUri ) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) @@ -157,7 +158,7 @@ def get_glue_table_properties(context: Context, source: models.DatasetTable, **k if not source: return None with context.engine.scoped_session() as session: - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, source.tableUri ) return json_utils.to_string(table.GlueTableProperties).replace('\\', ' ') @@ -186,7 +187,7 @@ def resolve_glossary_terms(context: Context, source: models.DatasetTable, **kwar def publish_table_update(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: 
models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -235,7 +236,7 @@ def resolve_redshift_copy_location( def list_shared_tables_by_env_dataset(context: Context, source, datasetUri: str, envUri: str, filter: dict = None): with context.engine.scoped_session() as session: - return db.api.DatasetTable.get_dataset_tables_shared_with_env( + return DatasetTable.get_dataset_tables_shared_with_env( session, envUri, datasetUri diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index ebeb1fca6..468268640 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,6 +6,7 @@ from .sts import SessionHelper from ... import db from ...db import models +from dataall.modules.datasets.services.dataset_table import DatasetTable log = logging.getLogger('aws:glue') @@ -84,7 +85,7 @@ def list_tables(engine, task: models.Task): tables = Glue.list_glue_database_tables( accountid, dataset.GlueDatabaseName, region ) - db.api.DatasetTable.sync(session, dataset.datasetUri, glue_tables=tables) + DatasetTable.sync(session, dataset.datasetUri, glue_tables=tables) return tables @staticmethod @@ -642,7 +643,7 @@ def get_table_columns(engine, task: models.Task): f'//{dataset_table.name} due to: ' f'{e}' ) - db.api.DatasetTable.sync_table_columns( + DatasetTable.sync_table_columns( session, dataset_table, glue_table['Table'] ) return True diff --git a/backend/dataall/aws/handlers/redshift.py b/backend/dataall/aws/handlers/redshift.py index a6a02f9e7..a1f479417 100644 --- a/backend/dataall/aws/handlers/redshift.py +++ b/backend/dataall/aws/handlers/redshift.py @@ -9,6 +9,7 @@ from .sts import SessionHelper from ... 
import db from ...db import models +from dataall.modules.datasets.services.dataset_table import DatasetTable log = logging.getLogger(__name__) @@ -446,7 +447,7 @@ def copy_data(engine, task: models.Task): session, task.payload['datasetUri'] ) - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, task.payload['tableUri'] ) diff --git a/backend/dataall/db/api/__init__.py b/backend/dataall/db/api/__init__.py index 01647c81b..19138f7d7 100644 --- a/backend/dataall/db/api/__init__.py +++ b/backend/dataall/db/api/__init__.py @@ -14,7 +14,6 @@ from .dataset import Dataset from .dataset_location import DatasetStorageLocation from .dataset_profiling_run import DatasetProfilingRun -from .dataset_table import DatasetTable from .notification import Notification from .redshift_cluster import RedshiftCluster from .vpc import Vpc diff --git a/backend/dataall/modules/datasets/api/table_column/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py index a7a1bf5f4..8d977403a 100644 --- a/backend/dataall/modules/datasets/api/table_column/resolvers.py +++ b/backend/dataall/modules/datasets/api/table_column/resolvers.py @@ -5,6 +5,7 @@ from dataall.aws.handlers.service_handlers import Worker from dataall.db import paginate, permissions, models from dataall.db.api import ResourcePolicy +from dataall.modules.datasets.services.dataset_table import DatasetTable def list_table_columns( @@ -19,7 +20,7 @@ def list_table_columns( filter = {} with context.engine.scoped_session() as session: if not source: - source = db.api.DatasetTable.get_dataset_table_by_uri(session, tableUri) + source = DatasetTable.get_dataset_table_by_uri(session, tableUri) q = ( session.query(models.DatasetTableColumn) .filter( @@ -44,7 +45,7 @@ def list_table_columns( def sync_table_columns(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: 
models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -79,7 +80,7 @@ def update_table_column( ).get(columnUri) if not column: raise db.exceptions.ObjectNotFound('Column', columnUri) - table: models.DatasetTable = db.api.DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( session, column.tableUri ) ResourcePolicy.check_user_resource_permission( diff --git a/backend/dataall/modules/datasets/services/__init__.py b/backend/dataall/modules/datasets/services/__init__.py new file mode 100644 index 000000000..03ef29863 --- /dev/null +++ b/backend/dataall/modules/datasets/services/__init__.py @@ -0,0 +1 @@ +"""Contains business logic for datasets""" diff --git a/backend/dataall/db/api/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py similarity index 98% rename from backend/dataall/db/api/dataset_table.py rename to backend/dataall/modules/datasets/services/dataset_table.py index 77ee515e3..7c46120c1 100644 --- a/backend/dataall/db/api/dataset_table.py +++ b/backend/dataall/modules/datasets/services/dataset_table.py @@ -1,12 +1,11 @@ import logging -from typing import List from sqlalchemy.sql import and_ -from .. import models, api, permissions, exceptions, paginate -from . 
import has_tenant_perm, has_resource_perm, Glossary, ResourcePolicy, Environment -from ..models import Dataset -from ...utils import json_utils +from dataall.db import models, api, permissions, exceptions, paginate +from dataall.db.api import has_tenant_perm, has_resource_perm, Glossary, ResourcePolicy, Environment +from dataall.db.models import Dataset +from dataall.utils import json_utils logger = logging.getLogger(__name__) diff --git a/backend/dataall/tasks/subscriptions/subscription_service.py b/backend/dataall/tasks/subscriptions/subscription_service.py index 52aeb4e40..7b2a6d461 100644 --- a/backend/dataall/tasks/subscriptions/subscription_service.py +++ b/backend/dataall/tasks/subscriptions/subscription_service.py @@ -14,6 +14,7 @@ from ...db import models from ...tasks.subscriptions import poll_queues from ...utils import json_utils +from dataall.modules.datasets.services.dataset_table import DatasetTable root = logging.getLogger() root.setLevel(logging.INFO) @@ -64,7 +65,7 @@ def notify_consumers(engine, messages): @staticmethod def publish_table_update_message(engine, message): with engine.scoped_session() as session: - table: models.DatasetTable = db.api.DatasetTable.get_table_by_s3_prefix( + table: models.DatasetTable = DatasetTable.get_table_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), @@ -135,7 +136,7 @@ def publish_location_update_message(session, message): @staticmethod def store_dataquality_results(session, message): - table: models.DatasetTable = db.api.DatasetTable.get_table_by_s3_prefix( + table: models.DatasetTable = DatasetTable.get_table_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), diff --git a/backend/dataall/tasks/tables_syncer.py b/backend/dataall/tasks/tables_syncer.py index 04bafdfa5..5e6e8d48e 100644 --- a/backend/dataall/tasks/tables_syncer.py +++ b/backend/dataall/tasks/tables_syncer.py @@ -13,6 +13,7 @@ connect, ) from ..utils.alarm_service import AlarmService +from 
dataall.modules.datasets.services.dataset_table import DatasetTable root = logging.getLogger() root.setLevel(logging.INFO) @@ -63,7 +64,7 @@ def sync_tables(engine, es=None): f'Found {len(tables)} tables on Glue database {dataset.GlueDatabaseName}' ) - db.api.DatasetTable.sync( + DatasetTable.sync( session, dataset.datasetUri, glue_tables=tables ) diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index 6c30e77ea..af27529d5 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -3,6 +3,7 @@ import pytest import dataall +from dataall.modules.datasets.services.dataset_table import DatasetTable @pytest.fixture(scope='module', autouse=True) @@ -289,9 +290,7 @@ def test_sync_tables_and_columns(client, table, dataset1, db): }, ] - assert dataall.db.api.DatasetTable.sync( - session, dataset1.datasetUri, glue_tables - ) + assert DatasetTable.sync(session, dataset1.datasetUri, glue_tables) new_table: dataall.db.models.DatasetTable = ( session.query(dataall.db.models.DatasetTable) .filter(dataall.db.models.DatasetTable.name == 'new_table') From b039163449245ef6c079f3c277ce52e2a5cda579 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 14:02:44 +0200 Subject: [PATCH 07/32] Dataset refactoring Renamed DatasetTable to DatasetTableService to avoid collisions with models.DatasetTable --- .../api/Objects/DatasetProfiling/resolvers.py | 4 +-- .../api/Objects/DatasetTable/resolvers.py | 26 +++++++++---------- backend/dataall/aws/handlers/glue.py | 6 ++--- backend/dataall/aws/handlers/redshift.py | 4 +-- .../datasets/api/table_column/resolvers.py | 8 +++--- .../datasets/services/dataset_table.py | 16 ++++++------ .../subscriptions/subscription_service.py | 6 ++--- backend/dataall/tasks/tables_syncer.py | 4 +-- tests/api/test_dataset_table.py | 4 +-- 9 files changed, 39 insertions(+), 39 deletions(-) diff --git a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py 
b/backend/dataall/api/Objects/DatasetProfiling/resolvers.py index c391a1a8c..4b4684019 100644 --- a/backend/dataall/api/Objects/DatasetProfiling/resolvers.py +++ b/backend/dataall/api/Objects/DatasetProfiling/resolvers.py @@ -6,7 +6,7 @@ from ....aws.handlers.sts import SessionHelper from ....db import api, permissions, models from ....db.api import ResourcePolicy -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -98,7 +98,7 @@ def get_last_table_profiling_run(context: Context, source, tableUri=None): if run: if not run.results: - table = DatasetTable.get_dataset_table_by_uri(session, tableUri) + table = DatasetTableService.get_dataset_table_by_uri(session, tableUri) dataset = api.Dataset.get_dataset_by_uri(session, table.datasetUri) environment = api.Environment.get_environment_by_uri( session, dataset.environmentUri diff --git a/backend/dataall/api/Objects/DatasetTable/resolvers.py b/backend/dataall/api/Objects/DatasetTable/resolvers.py index 854b99de9..3e2b833e3 100644 --- a/backend/dataall/api/Objects/DatasetTable/resolvers.py +++ b/backend/dataall/api/Objects/DatasetTable/resolvers.py @@ -13,14 +13,14 @@ from ....db.api import ResourcePolicy, Glossary from ....searchproxy import indexers from ....utils import json_utils -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) def create_table(context, source, datasetUri: str = None, input: dict = None): with context.engine.scoped_session() as session: - table = DatasetTable.create_dataset_table( + table = DatasetTableService.create_dataset_table( session=session, username=context.username, groups=context.groups, @@ -38,7 +38,7 @@ def list_dataset_tables(context, source, filter: dict = None): if not filter: filter = {} with 
context.engine.scoped_session() as session: - return DatasetTable.list_dataset_tables( + return DatasetTableService.list_dataset_tables( session=session, username=context.username, groups=context.groups, @@ -50,8 +50,8 @@ def list_dataset_tables(context, source, filter: dict = None): def get_table(context, source: models.Dataset, tableUri: str = None): with context.engine.scoped_session() as session: - table = DatasetTable.get_dataset_table_by_uri(session, tableUri) - return DatasetTable.get_dataset_table( + table = DatasetTableService.get_dataset_table_by_uri(session, tableUri) + return DatasetTableService.get_dataset_table( session=session, username=context.username, groups=context.groups, @@ -65,14 +65,14 @@ def get_table(context, source: models.Dataset, tableUri: str = None): def update_table(context, source, tableUri: str = None, input: dict = None): with context.engine.scoped_session() as session: - table = DatasetTable.get_dataset_table_by_uri(session, tableUri) + table = DatasetTableService.get_dataset_table_by_uri(session, tableUri) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) input['table'] = table input['tableUri'] = table.tableUri - DatasetTable.update_dataset_table( + DatasetTableService.update_dataset_table( session=session, username=context.username, groups=context.groups, @@ -86,8 +86,8 @@ def update_table(context, source, tableUri: str = None, input: dict = None): def delete_table(context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table = DatasetTable.get_dataset_table_by_uri(session, tableUri) - DatasetTable.delete_dataset_table( + table = DatasetTableService.get_dataset_table_by_uri(session, tableUri) + DatasetTableService.delete_dataset_table( session=session, username=context.username, groups=context.groups, @@ -103,7 +103,7 @@ def delete_table(context, source, tableUri: str = None): def preview(context, source, tableUri: str = None): with context.engine.scoped_session() as 
session: - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) dataset = db.api.Dataset.get_dataset_by_uri(session, table.datasetUri) @@ -158,7 +158,7 @@ def get_glue_table_properties(context: Context, source: models.DatasetTable, **k if not source: return None with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, source.tableUri ) return json_utils.to_string(table.GlueTableProperties).replace('\\', ' ') @@ -187,7 +187,7 @@ def resolve_glossary_terms(context: Context, source: models.DatasetTable, **kwar def publish_table_update(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -236,7 +236,7 @@ def resolve_redshift_copy_location( def list_shared_tables_by_env_dataset(context: Context, source, datasetUri: str, envUri: str, filter: dict = None): with context.engine.scoped_session() as session: - return DatasetTable.get_dataset_tables_shared_with_env( + return DatasetTableService.get_dataset_tables_shared_with_env( session, envUri, datasetUri diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index 468268640..88f68fc84 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,7 +6,7 @@ from .sts import SessionHelper from ... 
import db from ...db import models -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger('aws:glue') @@ -85,7 +85,7 @@ def list_tables(engine, task: models.Task): tables = Glue.list_glue_database_tables( accountid, dataset.GlueDatabaseName, region ) - DatasetTable.sync(session, dataset.datasetUri, glue_tables=tables) + DatasetTableService.sync(session, dataset.datasetUri, glue_tables=tables) return tables @staticmethod @@ -643,7 +643,7 @@ def get_table_columns(engine, task: models.Task): f'//{dataset_table.name} due to: ' f'{e}' ) - DatasetTable.sync_table_columns( + DatasetTableService.sync_table_columns( session, dataset_table, glue_table['Table'] ) return True diff --git a/backend/dataall/aws/handlers/redshift.py b/backend/dataall/aws/handlers/redshift.py index a1f479417..4d2591520 100644 --- a/backend/dataall/aws/handlers/redshift.py +++ b/backend/dataall/aws/handlers/redshift.py @@ -9,7 +9,7 @@ from .sts import SessionHelper from ... 
import db from ...db import models -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -447,7 +447,7 @@ def copy_data(engine, task: models.Task): session, task.payload['datasetUri'] ) - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, task.payload['tableUri'] ) diff --git a/backend/dataall/modules/datasets/api/table_column/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py index 8d977403a..e5e7fd60c 100644 --- a/backend/dataall/modules/datasets/api/table_column/resolvers.py +++ b/backend/dataall/modules/datasets/api/table_column/resolvers.py @@ -5,7 +5,7 @@ from dataall.aws.handlers.service_handlers import Worker from dataall.db import paginate, permissions, models from dataall.db.api import ResourcePolicy -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService def list_table_columns( @@ -20,7 +20,7 @@ def list_table_columns( filter = {} with context.engine.scoped_session() as session: if not source: - source = DatasetTable.get_dataset_table_by_uri(session, tableUri) + source = DatasetTableService.get_dataset_table_by_uri(session, tableUri) q = ( session.query(models.DatasetTableColumn) .filter( @@ -45,7 +45,7 @@ def list_table_columns( def sync_table_columns(context: Context, source, tableUri: str = None): with context.engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, tableUri ) ResourcePolicy.check_user_resource_permission( @@ -80,7 +80,7 @@ def update_table_column( ).get(columnUri) if not column: raise db.exceptions.ObjectNotFound('Column', columnUri) - 
table: models.DatasetTable = DatasetTable.get_dataset_table_by_uri( + table: models.DatasetTable = DatasetTableService.get_dataset_table_by_uri( session, column.tableUri ) ResourcePolicy.check_user_resource_permission( diff --git a/backend/dataall/modules/datasets/services/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py index 7c46120c1..eeeb99f1d 100644 --- a/backend/dataall/modules/datasets/services/dataset_table.py +++ b/backend/dataall/modules/datasets/services/dataset_table.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__name__) -class DatasetTable: +class DatasetTableService: @staticmethod @has_tenant_perm(permissions.MANAGE_DATASETS) @has_resource_perm(permissions.CREATE_DATASET_TABLE) @@ -107,7 +107,7 @@ def get_dataset_table( data: dict = None, check_perm: bool = False, ) -> models.DatasetTable: - return DatasetTable.get_dataset_table_by_uri(session, data['tableUri']) + return DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) @staticmethod @has_tenant_perm(permissions.MANAGE_DATASETS) @@ -122,7 +122,7 @@ def update_dataset_table( ): table = data.get( 'table', - DatasetTable.get_dataset_table_by_uri(session, data['tableUri']), + DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']), ) for k in [attr for attr in data.keys() if attr != 'term']: @@ -146,7 +146,7 @@ def delete_dataset_table( data: dict = None, check_perm: bool = False, ): - table = DatasetTable.get_dataset_table_by_uri(session, data['tableUri']) + table = DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) share_item_shared_states = api.ShareItemSM.get_share_item_shared_states() share_item = ( session.query(models.ShareObjectItem) @@ -210,7 +210,7 @@ def get_dataset_tables_shared_with_env( ): return [ {"tableUri": t.tableUri, "GlueTableName": t.GlueTableName} - for t in DatasetTable.query_dataset_tables_shared_with_env( + for t in DatasetTableService.query_dataset_tables_shared_with_env( session, 
environment_uri, dataset_uri ) ] @@ -235,7 +235,7 @@ def sync(session, datasetUri, glue_tables=None): existing_table_names = [e.GlueTableName for e in existing_tables] existing_dataset_tables_map = {t.GlueTableName: t for t in existing_tables} - DatasetTable.update_existing_tables_status(existing_tables, glue_tables) + DatasetTableService.update_existing_tables_status(existing_tables, glue_tables) logger.info( f'existing_tables={glue_tables}' ) @@ -284,7 +284,7 @@ def sync(session, datasetUri, glue_tables=None): table.get('Parameters', {}) ) - DatasetTable.sync_table_columns(session, updated_table, table) + DatasetTableService.sync_table_columns(session, updated_table, table) return True @@ -300,7 +300,7 @@ def update_existing_tables_status(existing_tables, glue_tables): @staticmethod def sync_table_columns(session, dataset_table, glue_table): - DatasetTable.delete_all_table_columns(session, dataset_table) + DatasetTableService.delete_all_table_columns(session, dataset_table) columns = [ {**item, **{'columnType': 'column'}} diff --git a/backend/dataall/tasks/subscriptions/subscription_service.py b/backend/dataall/tasks/subscriptions/subscription_service.py index 7b2a6d461..bf7eded35 100644 --- a/backend/dataall/tasks/subscriptions/subscription_service.py +++ b/backend/dataall/tasks/subscriptions/subscription_service.py @@ -14,7 +14,7 @@ from ...db import models from ...tasks.subscriptions import poll_queues from ...utils import json_utils -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService root = logging.getLogger() root.setLevel(logging.INFO) @@ -65,7 +65,7 @@ def notify_consumers(engine, messages): @staticmethod def publish_table_update_message(engine, message): with engine.scoped_session() as session: - table: models.DatasetTable = DatasetTable.get_table_by_s3_prefix( + table: models.DatasetTable = DatasetTableService.get_table_by_s3_prefix( session, 
message.get('prefix'), message.get('accountid'), @@ -136,7 +136,7 @@ def publish_location_update_message(session, message): @staticmethod def store_dataquality_results(session, message): - table: models.DatasetTable = DatasetTable.get_table_by_s3_prefix( + table: models.DatasetTable = DatasetTableService.get_table_by_s3_prefix( session, message.get('prefix'), message.get('accountid'), diff --git a/backend/dataall/tasks/tables_syncer.py b/backend/dataall/tasks/tables_syncer.py index 5e6e8d48e..7d2781ccf 100644 --- a/backend/dataall/tasks/tables_syncer.py +++ b/backend/dataall/tasks/tables_syncer.py @@ -13,7 +13,7 @@ connect, ) from ..utils.alarm_service import AlarmService -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService root = logging.getLogger() root.setLevel(logging.INFO) @@ -64,7 +64,7 @@ def sync_tables(engine, es=None): f'Found {len(tables)} tables on Glue database {dataset.GlueDatabaseName}' ) - DatasetTable.sync( + DatasetTableService.sync( session, dataset.datasetUri, glue_tables=tables ) diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index af27529d5..82548252b 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -3,7 +3,7 @@ import pytest import dataall -from dataall.modules.datasets.services.dataset_table import DatasetTable +from dataall.modules.datasets.services.dataset_table import DatasetTableService @pytest.fixture(scope='module', autouse=True) @@ -290,7 +290,7 @@ def test_sync_tables_and_columns(client, table, dataset1, db): }, ] - assert DatasetTable.sync(session, dataset1.datasetUri, glue_tables) + assert DatasetTableService.sync(session, dataset1.datasetUri, glue_tables) new_table: dataall.db.models.DatasetTable = ( session.query(dataall.db.models.DatasetTable) .filter(dataall.db.models.DatasetTable.name == 'new_table') From b7922ed51c674e210c66a0bd298dae099c9eface Mon Sep 17 
00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 14:19:02 +0200 Subject: [PATCH 08/32] Dataset refactoring Moved DatasetTableColumn to modules --- backend/dataall/api/Objects/Feed/resolvers.py | 5 ++-- .../dataall/api/Objects/Glossary/resolvers.py | 5 ++-- backend/dataall/aws/handlers/glue.py | 5 ++-- backend/dataall/db/api/glossary.py | 11 +++---- backend/dataall/db/models/__init__.py | 1 - .../datasets/api/table_column/resolvers.py | 21 +++++++------- .../dataall/modules/datasets/db/__init__.py | 1 + .../datasets/db/table_column_model.py} | 4 +-- .../datasets/services/dataset_table.py | 9 +++--- tests/api/test_dataset_table.py | 29 ++++++++++--------- tests/api/test_glossary.py | 5 ++-- 11 files changed, 52 insertions(+), 44 deletions(-) create mode 100644 backend/dataall/modules/datasets/db/__init__.py rename backend/dataall/{db/models/DatasetTableColumn.py => modules/datasets/db/table_column_model.py} (91%) diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index a6c0535de..cbde23f0d 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -2,6 +2,7 @@ from ....api.context import Context from ....db import paginate, models +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn class Feed: @@ -19,7 +20,7 @@ def targetType(self): def resolve_feed_target_type(obj, *_): - if isinstance(obj, models.DatasetTableColumn): + if isinstance(obj, DatasetTableColumn): return 'DatasetTableColumn' elif isinstance(obj, models.Worksheet): return 'Worksheet' @@ -44,7 +45,7 @@ def resolve_target(context: Context, source: Feed, **kwargs): model = { 'Dataset': models.Dataset, 'DatasetTable': models.DatasetTable, - 'DatasetTableColumn': models.DatasetTableColumn, + 'DatasetTableColumn': DatasetTableColumn, 'DatasetStorageLocation': models.DatasetStorageLocation, 'Dashboard': models.Dashboard, 'DataPipeline': models.DataPipeline, diff 
--git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index 801bd27dc..847f16ac6 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -11,6 +11,7 @@ from ....api.constants import ( GlossaryRole ) +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn def resolve_glossary_node(obj: models.GlossaryNode, *_): @@ -322,7 +323,7 @@ def get_link(context: Context, source, linkUri: str = None): def target_union_resolver(obj, *_): - if isinstance(obj, models.DatasetTableColumn): + if isinstance(obj, DatasetTableColumn): return 'DatasetTableColumn' elif isinstance(obj, models.DatasetTable): return 'DatasetTable' @@ -341,7 +342,7 @@ def resolve_link_target(context, source, **kwargs): model = { 'Dataset': models.Dataset, 'DatasetTable': models.DatasetTable, - 'Column': models.DatasetTableColumn, + 'Column': DatasetTableColumn, 'DatasetStorageLocation': models.DatasetStorageLocation, 'Dashboard': models.Dashboard, }[source.targetType] diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index 88f68fc84..ca00a81f5 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -7,6 +7,7 @@ from ... 
import db from ...db import models from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn log = logging.getLogger('aws:glue') @@ -525,8 +526,8 @@ def _update_existing_crawler(glue, accountid, crawler_name, targets, database): @Worker.handler('glue.table.update_column') def update_table_columns(engine, task: models.Task): with engine.scoped_session() as session: - column: models.DatasetTableColumn = session.query( - models.DatasetTableColumn + column: DatasetTableColumn = session.query( + DatasetTableColumn ).get(task.targetUri) table: models.DatasetTable = session.query(models.DatasetTable).get( column.tableUri diff --git a/backend/dataall/db/api/glossary.py b/backend/dataall/db/api/glossary.py index 1616141c8..c6313e007 100644 --- a/backend/dataall/db/api/glossary.py +++ b/backend/dataall/db/api/glossary.py @@ -9,6 +9,7 @@ has_tenant_perm, ) from ..models.Glossary import GlossaryNodeStatus +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn logger = logging.getLogger(__name__) @@ -133,7 +134,7 @@ def link_term(session, username, groups, uri, data=None, check_perm=None): elif targetType == 'Folder': target = session.query(models.DatasetStorageLocation).get(targetUri) elif targetType == 'Column': - target = session.query(models.DatasetTableColumn).get(targetUri) + target = session.query(DatasetTableColumn).get(targetUri) elif targetType == 'Dashboard': target = session.query(models.Dashboard).get(targetUri) else: @@ -361,11 +362,11 @@ def list_term_associations( models.DatasetTable.description.label('description'), ) columns = session.query( - models.DatasetTableColumn.columnUri.label('targetUri'), + DatasetTableColumn.columnUri.label('targetUri'), literal('column').label('targetType'), - models.DatasetTableColumn.label.label('label'), - models.DatasetTableColumn.name.label('name'), - 
models.DatasetTableColumn.description.label('description'), + DatasetTableColumn.label.label('label'), + DatasetTableColumn.name.label('name'), + DatasetTableColumn.description.label('description'), ) folders = session.query( models.DatasetStorageLocation.locationUri.label('targetUri'), diff --git a/backend/dataall/db/models/__init__.py b/backend/dataall/db/models/__init__.py index 1ce567c87..1ab4134b3 100644 --- a/backend/dataall/db/models/__init__.py +++ b/backend/dataall/db/models/__init__.py @@ -9,7 +9,6 @@ from .DatasetQualityRule import DatasetQualityRule from .DatasetStorageLocation import DatasetStorageLocation from .DatasetTable import DatasetTable -from .DatasetTableColumn import DatasetTableColumn from .DatasetTableProfilingJob import DatasetTableProfilingJob from .Environment import Environment from .EnvironmentGroup import EnvironmentGroup diff --git a/backend/dataall/modules/datasets/api/table_column/resolvers.py b/backend/dataall/modules/datasets/api/table_column/resolvers.py index e5e7fd60c..b958f2f7a 100644 --- a/backend/dataall/modules/datasets/api/table_column/resolvers.py +++ b/backend/dataall/modules/datasets/api/table_column/resolvers.py @@ -6,6 +6,7 @@ from dataall.db import paginate, permissions, models from dataall.db.api import ResourcePolicy from dataall.modules.datasets.services.dataset_table import DatasetTableService +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn def list_table_columns( @@ -22,21 +23,21 @@ def list_table_columns( if not source: source = DatasetTableService.get_dataset_table_by_uri(session, tableUri) q = ( - session.query(models.DatasetTableColumn) + session.query(DatasetTableColumn) .filter( - models.DatasetTableColumn.tableUri == tableUri, - models.DatasetTableColumn.deleted.is_(None), + DatasetTableColumn.tableUri == tableUri, + DatasetTableColumn.deleted.is_(None), ) - .order_by(models.DatasetTableColumn.columnType.asc()) + .order_by(DatasetTableColumn.columnType.asc()) ) term = 
filter.get('term') if term: q = q.filter( or_( - models.DatasetTableColumn.label.ilike('%' + term + '%'), - models.DatasetTableColumn.description.ilike('%' + term + '%'), + DatasetTableColumn.label.ilike('%' + term + '%'), + DatasetTableColumn.description.ilike('%' + term + '%'), ) - ).order_by(models.DatasetTableColumn.columnType.asc()) + ).order_by(DatasetTableColumn.columnType.asc()) return paginate( q, page=filter.get('page', 1), page_size=filter.get('pageSize', 65) @@ -61,7 +62,7 @@ def sync_table_columns(context: Context, source, tableUri: str = None): return list_table_columns(context, source=table, tableUri=tableUri) -def resolve_terms(context, source: models.DatasetTableColumn, **kwargs): +def resolve_terms(context, source: DatasetTableColumn, **kwargs): if not source: return None with context.engine.scoped_session() as session: @@ -75,8 +76,8 @@ def update_table_column( context: Context, source, columnUri: str = None, input: dict = None ): with context.engine.scoped_session() as session: - column: models.DatasetTableColumn = session.query( - models.DatasetTableColumn + column: DatasetTableColumn = session.query( + DatasetTableColumn ).get(columnUri) if not column: raise db.exceptions.ObjectNotFound('Column', columnUri) diff --git a/backend/dataall/modules/datasets/db/__init__.py b/backend/dataall/modules/datasets/db/__init__.py new file mode 100644 index 000000000..104b49a42 --- /dev/null +++ b/backend/dataall/modules/datasets/db/__init__.py @@ -0,0 +1 @@ +"""Database logic for datasets""" diff --git a/backend/dataall/db/models/DatasetTableColumn.py b/backend/dataall/modules/datasets/db/table_column_model.py similarity index 91% rename from backend/dataall/db/models/DatasetTableColumn.py rename to backend/dataall/modules/datasets/db/table_column_model.py index f4fe1f7d6..4d3d7e009 100644 --- a/backend/dataall/db/models/DatasetTableColumn.py +++ b/backend/dataall/modules/datasets/db/table_column_model.py @@ -1,7 +1,7 @@ from sqlalchemy import Column, 
String -from .. import Base -from .. import Resource, utils +from dataall.db import Base +from dataall.db import Resource, utils class DatasetTableColumn(Resource, Base): diff --git a/backend/dataall/modules/datasets/services/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py index eeeb99f1d..873cbe01e 100644 --- a/backend/dataall/modules/datasets/services/dataset_table.py +++ b/backend/dataall/modules/datasets/services/dataset_table.py @@ -6,6 +6,7 @@ from dataall.db.api import has_tenant_perm, has_resource_perm, Glossary, ResourcePolicy, Environment from dataall.db.models import Dataset from dataall.utils import json_utils +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn logger = logging.getLogger(__name__) @@ -315,7 +316,7 @@ def sync_table_columns(session, dataset_table, glue_table): logger.debug(f'Found partitions {partitions} for table {dataset_table}') for col in columns + partitions: - table_col = models.DatasetTableColumn( + table_col = DatasetTableColumn( name=col['Name'], description=col.get('Comment', 'No description provided'), label=col['Name'], @@ -333,11 +334,11 @@ def sync_table_columns(session, dataset_table, glue_table): @staticmethod def delete_all_table_columns(session, dataset_table): - session.query(models.DatasetTableColumn).filter( + session.query(DatasetTableColumn).filter( and_( - models.DatasetTableColumn.GlueDatabaseName + DatasetTableColumn.GlueDatabaseName == dataset_table.GlueDatabaseName, - models.DatasetTableColumn.GlueTableName == dataset_table.GlueTableName, + DatasetTableColumn.GlueTableName == dataset_table.GlueTableName, ) ).delete() session.commit() diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index 82548252b..a2fcb2add 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -4,6 +4,7 @@ import dataall from dataall.modules.datasets.services.dataset_table import DatasetTableService +from 
dataall.modules.datasets.db.table_column_model import DatasetTableColumn @pytest.fixture(scope='module', autouse=True) @@ -112,7 +113,7 @@ def test_add_columns(table, dataset1, db): .filter(dataall.db.models.DatasetTable.name == 'table1') .first() ) - table_col = dataall.db.models.DatasetTableColumn( + table_col = DatasetTableColumn( name='col1', description='None', label='col1', @@ -186,8 +187,8 @@ def test_update_dataset_table_column(client, table, dataset1, db): .first() ) column = ( - session.query(dataall.db.models.DatasetTableColumn) - .filter(dataall.db.models.DatasetTableColumn.tableUri == table.tableUri) + session.query(DatasetTableColumn) + .filter(DatasetTableColumn.tableUri == table.tableUri) .first() ) response = client.query( @@ -208,7 +209,7 @@ def test_update_dataset_table_column(client, table, dataset1, db): response.data.updateDatasetTableColumn.description == 'My new description' ) - column = session.query(dataall.db.models.DatasetTableColumn).get( + column = session.query(DatasetTableColumn).get( column.columnUri ) assert column.description == 'My new description' @@ -235,8 +236,8 @@ def test_sync_tables_and_columns(client, table, dataset1, db): .first() ) column = ( - session.query(dataall.db.models.DatasetTableColumn) - .filter(dataall.db.models.DatasetTableColumn.tableUri == table.tableUri) + session.query(DatasetTableColumn) + .filter(DatasetTableColumn.tableUri == table.tableUri) .first() ) glue_tables = [ @@ -298,10 +299,10 @@ def test_sync_tables_and_columns(client, table, dataset1, db): ) assert new_table assert new_table.GlueTableName == 'new_table' - columns: [dataall.db.models.DatasetTableColumn] = ( - session.query(dataall.db.models.DatasetTableColumn) - .filter(dataall.db.models.DatasetTableColumn.tableUri == new_table.tableUri) - .order_by(dataall.db.models.DatasetTableColumn.columnType.asc()) + columns: [DatasetTableColumn] = ( + session.query(DatasetTableColumn) + .filter(DatasetTableColumn.tableUri == new_table.tableUri) + 
.order_by(DatasetTableColumn.columnType.asc()) .all() ) assert len(columns) == 2 @@ -315,10 +316,10 @@ def test_sync_tables_and_columns(client, table, dataset1, db): ) assert existing_table assert existing_table.GlueTableName == 'table1' - columns: [dataall.db.models.DatasetTableColumn] = ( - session.query(dataall.db.models.DatasetTableColumn) - .filter(dataall.db.models.DatasetTableColumn.tableUri == new_table.tableUri) - .order_by(dataall.db.models.DatasetTableColumn.columnType.asc()) + columns: [DatasetTableColumn] = ( + session.query(DatasetTableColumn) + .filter(DatasetTableColumn.tableUri == new_table.tableUri) + .order_by(DatasetTableColumn.columnType.asc()) .all() ) assert len(columns) == 2 diff --git a/tests/api/test_glossary.py b/tests/api/test_glossary.py index 157c6cd2c..8276dca8c 100644 --- a/tests/api/test_glossary.py +++ b/tests/api/test_glossary.py @@ -1,5 +1,6 @@ from typing import List from dataall.db import models +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn import pytest @@ -48,11 +49,11 @@ def _table(db, _dataset) -> models.DatasetTable: @pytest.fixture(scope='module', autouse=True) -def _columns(db, _dataset, _table) -> List[models.DatasetTableColumn]: +def _columns(db, _dataset, _table) -> List[DatasetTableColumn]: with db.scoped_session() as session: cols = [] for i in range(0, 10): - c = models.DatasetTableColumn( + c = DatasetTableColumn( datasetUri=_dataset.datasetUri, tableUri=_table.tableUri, label=f'c{i+1}', From 1771bcaa4fc590eda8ca01537e544b2e1f5fa317 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 14:28:27 +0200 Subject: [PATCH 09/32] Notebooks doesn't require tasks --- backend/dataall/modules/notebooks/tasks/__init__.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 backend/dataall/modules/notebooks/tasks/__init__.py diff --git a/backend/dataall/modules/notebooks/tasks/__init__.py b/backend/dataall/modules/notebooks/tasks/__init__.py deleted file mode 100644 index 
7da194e3b..000000000 --- a/backend/dataall/modules/notebooks/tasks/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Currently notebooks don't have tasks, but this module needed for correct loading""" From 3d1603f21806bbaa099d70f9af072c5f2d40a5e4 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 14:31:07 +0200 Subject: [PATCH 10/32] Renamed tasks to handlers Currently, only async handlers require dedicated loading. Long-running tasks (scheduled tasks) might not need to have a dedicated loading mode --- backend/aws_handler.py | 2 +- backend/dataall/modules/loader.py | 8 ++++---- backend/local_graphql_server.py | 2 +- tests/conftest.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/backend/aws_handler.py b/backend/aws_handler.py index 872c8f433..ec5382b6f 100644 --- a/backend/aws_handler.py +++ b/backend/aws_handler.py @@ -14,7 +14,7 @@ engine = get_engine(envname=ENVNAME) -load_modules(modes=[ImportMode.TASKS]) +load_modules(modes=[ImportMode.HANDLERS]) def handler(event, context=None): diff --git a/backend/dataall/modules/loader.py b/backend/dataall/modules/loader.py index aa4a656d4..9fa3c69bf 100644 --- a/backend/dataall/modules/loader.py +++ b/backend/dataall/modules/loader.py @@ -2,7 +2,7 @@ import importlib import logging from abc import ABC, abstractmethod -from enum import Enum +from enum import Enum, auto from typing import List from dataall.core.config import config @@ -19,9 +19,9 @@ class ImportMode(Enum): of functionality to be loaded, there should be different loading modes """ - API = "api" - CDK = "cdk" - TASKS = "tasks" + API = auto() + CDK = auto() + HANDLERS = auto() class ModuleInterface(ABC): diff --git a/backend/local_graphql_server.py b/backend/local_graphql_server.py index 3783ba0a3..44f79a087 100644 --- a/backend/local_graphql_server.py +++ b/backend/local_graphql_server.py @@ -30,7 +30,7 @@ es = connect(envname=ENVNAME) logger.info('Connected') # create_schema_and_tables(engine, envname=ENVNAME) 
-load_modules(modes=[ImportMode.API, ImportMode.TASKS]) +load_modules(modes=[ImportMode.API, ImportMode.HANDLERS]) Base.metadata.create_all(engine.engine) CDKPROXY_URL = ( 'http://cdkproxy:2805' if ENVNAME == 'dkrcompose' else 'http://localhost:2805' diff --git a/tests/conftest.py b/tests/conftest.py index a67d6bd41..2767a66a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,7 @@ import dataall from dataall.modules.loader import load_modules, ImportMode -load_modules(modes=[ImportMode.TASKS, ImportMode.API, ImportMode.CDK]) +load_modules(modes=[ImportMode.HANDLERS, ImportMode.API, ImportMode.CDK]) ENVNAME = os.environ.get('envname', 'pytest') From fb6b515103927e80f96d8248ddb95568a08b7f7c Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 16:00:18 +0200 Subject: [PATCH 11/32] Dataset refactoring Extracted code from glue to glue_column_handler Added handlers importing for datasets --- backend/dataall/aws/handlers/glue.py | 99 --------------- backend/dataall/db/api/redshift_cluster.py | 6 +- backend/dataall/modules/datasets/__init__.py | 15 ++- .../modules/datasets/handlers/__init__.py | 8 ++ .../datasets/handlers/glue_column_handler.py | 113 ++++++++++++++++++ 5 files changed, 138 insertions(+), 103 deletions(-) create mode 100644 backend/dataall/modules/datasets/handlers/__init__.py create mode 100644 backend/dataall/modules/datasets/handlers/glue_column_handler.py diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index ca00a81f5..4bfda7ce3 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -7,7 +7,6 @@ from ... 
import db from ...db import models from dataall.modules.datasets.services.dataset_table import DatasetTableService -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn log = logging.getLogger('aws:glue') @@ -522,104 +521,6 @@ def _update_existing_crawler(glue, accountid, crawler_name, targets, database): else: raise e - @staticmethod - @Worker.handler('glue.table.update_column') - def update_table_columns(engine, task: models.Task): - with engine.scoped_session() as session: - column: DatasetTableColumn = session.query( - DatasetTableColumn - ).get(task.targetUri) - table: models.DatasetTable = session.query(models.DatasetTable).get( - column.tableUri - ) - try: - aws_session = SessionHelper.remote_session(table.AWSAccountId) - - Glue.grant_pivot_role_all_table_permissions(aws_session, table) - - glue_client = aws_session.client('glue', region_name=table.region) - - original_table = glue_client.get_table( - CatalogId=table.AWSAccountId, - DatabaseName=table.GlueDatabaseName, - Name=table.name, - ) - updated_table = { - k: v - for k, v in original_table['Table'].items() - if k - not in [ - 'CatalogId', - 'VersionId', - 'DatabaseName', - 'CreateTime', - 'UpdateTime', - 'CreatedBy', - 'IsRegisteredWithLakeFormation', - ] - } - all_columns = updated_table.get('StorageDescriptor', {}).get( - 'Columns', [] - ) + updated_table.get('PartitionKeys', []) - for col in all_columns: - if col['Name'] == column.name: - col['Comment'] = column.description - log.info( - f'Found column {column.name} adding description {column.description}' - ) - response = glue_client.update_table( - DatabaseName=table.GlueDatabaseName, - TableInput=updated_table, - ) - log.info( - f'Column {column.name} updated successfully: {response}' - ) - return True - - except ClientError as e: - log.error( - f'Failed to update table column {column.name} description: {e}' - ) - raise e - - @staticmethod - def grant_pivot_role_all_table_permissions(aws_session, table): - """ - Pivot 
role needs to have all permissions - for tables managed inside dataall - :param aws_session: - :param table: - :return: - """ - try: - lf_client = aws_session.client('lakeformation', region_name=table.region) - grant_dict = dict( - Principal={ - 'DataLakePrincipalIdentifier': SessionHelper.get_delegation_role_arn( - table.AWSAccountId - ) - }, - Resource={ - 'Table': { - 'DatabaseName': table.GlueDatabaseName, - 'Name': table.name, - } - }, - Permissions=['SELECT', 'ALTER', 'DROP', 'INSERT'], - ) - response = lf_client.grant_permissions(**grant_dict) - log.error( - f'Successfully granted pivot role all table ' - f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' - f'access: {response}' - ) - except ClientError as e: - log.error( - f'Failed to grant pivot role all table ' - f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' - f'access: {e}' - ) - raise e @staticmethod @Worker.handler('glue.table.columns') diff --git a/backend/dataall/db/api/redshift_cluster.py b/backend/dataall/db/api/redshift_cluster.py index 91e687d2b..4167a555a 100644 --- a/backend/dataall/db/api/redshift_cluster.py +++ b/backend/dataall/db/api/redshift_cluster.py @@ -3,13 +3,13 @@ from sqlalchemy import and_, or_, literal from .. import models, api, exceptions, paginate, permissions -from . import has_resource_perm, ResourcePolicy, DatasetTable, Environment, Dataset -from ..models.Enums import ShareItemStatus +from . 
import has_resource_perm, ResourcePolicy, Environment, Dataset from ...utils.naming_convention import ( NamingConventionService, NamingConventionPattern, ) from ...utils.slugify import slugify +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -495,7 +495,7 @@ def enable_copy_table( session, username, groups, uri, data=None, check_perm=True ) -> models.RedshiftClusterDatasetTable: cluster = RedshiftCluster.get_redshift_cluster_by_uri(session, uri) - table = DatasetTable.get_dataset_table_by_uri(session, data['tableUri']) + table = DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) table = models.RedshiftClusterDatasetTable( clusterUri=uri, datasetUri=data['datasetUri'], diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 67298a06e..cd52bc207 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -1,12 +1,14 @@ """Contains the code related to datasets""" import logging +from typing import List + from dataall.modules.loader import ModuleInterface, ImportMode log = logging.getLogger(__name__) class DatasetApiModuleInterface(ModuleInterface): - """Implements ModuleInterface for notebook GraphQl lambda""" + """Implements ModuleInterface for dataset GraphQl lambda""" @classmethod def is_supported(cls, modes): @@ -16,3 +18,14 @@ def __init__(self): import dataall.modules.datasets.api log.info("API of datasets has been imported") + +class DatasetAsyncHandlersModuleInterface(ModuleInterface): + """Implements ModuleInterface for dataset async lambda""" + + @classmethod + def is_supported(cls, modes: List[ImportMode]): + return ImportMode.HANDLERS in modes + + def __init__(self): + import dataall.modules.datasets.handlers + log.info("Dataset handlers have been imported") diff --git a/backend/dataall/modules/datasets/handlers/__init__.py 
b/backend/dataall/modules/datasets/handlers/__init__.py new file mode 100644 index 000000000..7ed90c729 --- /dev/null +++ b/backend/dataall/modules/datasets/handlers/__init__.py @@ -0,0 +1,8 @@ +""" +Contains code with the handlers that are need for async +processing in a separate lambda function +""" +from dataall.modules.datasets.handlers import glue_column_handler + +__all__ = ["glue_column_handler"] + diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py new file mode 100644 index 000000000..e7f8d358b --- /dev/null +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -0,0 +1,113 @@ +import logging + +from botocore.exceptions import ClientError + +from dataall.aws.handlers.sts import SessionHelper +from dataall.db import models +from dataall.aws.handlers.service_handlers import Worker +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn + +log = logging.getLogger(__name__) + + +class DatasetColumnGlueHandler: + """A handler for dataset table columns""" + + @staticmethod + @Worker.handler('glue.table.update_column') + def update_table_columns(engine, task: models.Task): + with engine.scoped_session() as session: + column: DatasetTableColumn = session.query( + DatasetTableColumn + ).get(task.targetUri) + table: models.DatasetTable = session.query(models.DatasetTable).get( + column.tableUri + ) + try: + aws_session = SessionHelper.remote_session(table.AWSAccountId) + + DatasetColumnGlueHandler.grant_pivot_role_all_table_permissions(aws_session, table) + + glue_client = aws_session.client('glue', region_name=table.region) + + original_table = glue_client.get_table( + CatalogId=table.AWSAccountId, + DatabaseName=table.GlueDatabaseName, + Name=table.name, + ) + updated_table = { + k: v + for k, v in original_table['Table'].items() + if k + not in [ + 'CatalogId', + 'VersionId', + 'DatabaseName', + 'CreateTime', + 'UpdateTime', + 
'CreatedBy', + 'IsRegisteredWithLakeFormation', + ] + } + all_columns = updated_table.get('StorageDescriptor', {}).get( + 'Columns', [] + ) + updated_table.get('PartitionKeys', []) + for col in all_columns: + if col['Name'] == column.name: + col['Comment'] = column.description + log.info( + f'Found column {column.name} adding description {column.description}' + ) + response = glue_client.update_table( + DatabaseName=table.GlueDatabaseName, + TableInput=updated_table, + ) + log.info( + f'Column {column.name} updated successfully: {response}' + ) + return True + + except ClientError as e: + log.error( + f'Failed to update table column {column.name} description: {e}' + ) + raise e + + @staticmethod + def grant_pivot_role_all_table_permissions(aws_session, table): + """ + Pivot role needs to have all permissions + for tables managed inside dataall + :param aws_session: + :param table: + :return: + """ + try: + lf_client = aws_session.client('lakeformation', region_name=table.region) + grant_dict = dict( + Principal={ + 'DataLakePrincipalIdentifier': SessionHelper.get_delegation_role_arn( + table.AWSAccountId + ) + }, + Resource={ + 'Table': { + 'DatabaseName': table.GlueDatabaseName, + 'Name': table.name, + } + }, + Permissions=['SELECT', 'ALTER', 'DROP', 'INSERT'], + ) + response = lf_client.grant_permissions(**grant_dict) + log.error( + f'Successfully granted pivot role all table ' + f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' + f'access: {response}' + ) + except ClientError as e: + log.error( + f'Failed to grant pivot role all table ' + f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' + f'access: {e}' + ) + raise e From e3596a553a42d42baddcb3d36bd2b71b60f101a2 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 17:18:46 +0200 Subject: [PATCH 12/32] Dataset refactoring Extracted the code for dataset table handler --- backend/dataall/aws/handlers/glue.py | 15 ---------- 
.../modules/datasets/handlers/__init__.py | 7 +++-- .../datasets/handlers/glue_table_handler.py | 30 +++++++++++++++++++ 3 files changed, 35 insertions(+), 17 deletions(-) create mode 100644 backend/dataall/modules/datasets/handlers/glue_table_handler.py diff --git a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index 4bfda7ce3..cc8c5cfc7 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -73,21 +73,6 @@ def database_exists(**data): log.info(f'Database {database} does not exist on account {accountid}...') return False - @staticmethod - @Worker.handler(path='glue.dataset.database.tables') - def list_tables(engine, task: models.Task): - with engine.scoped_session() as session: - dataset: models.Dataset = db.api.Dataset.get_dataset_by_uri( - session, task.targetUri - ) - accountid = dataset.AwsAccountId - region = dataset.region - tables = Glue.list_glue_database_tables( - accountid, dataset.GlueDatabaseName, region - ) - DatasetTableService.sync(session, dataset.datasetUri, glue_tables=tables) - return tables - @staticmethod def list_glue_database_tables(accountid, database, region): aws_session = SessionHelper.remote_session(accountid=accountid) diff --git a/backend/dataall/modules/datasets/handlers/__init__.py b/backend/dataall/modules/datasets/handlers/__init__.py index 7ed90c729..19bd47297 100644 --- a/backend/dataall/modules/datasets/handlers/__init__.py +++ b/backend/dataall/modules/datasets/handlers/__init__.py @@ -2,7 +2,10 @@ Contains code with the handlers that are need for async processing in a separate lambda function """ -from dataall.modules.datasets.handlers import glue_column_handler +from dataall.modules.datasets.handlers import ( + glue_column_handler, + glue_table_handler +) -__all__ = ["glue_column_handler"] +__all__ = ["glue_column_handler", "glue_table_handler"] diff --git a/backend/dataall/modules/datasets/handlers/glue_table_handler.py 
b/backend/dataall/modules/datasets/handlers/glue_table_handler.py new file mode 100644 index 000000000..9bb50c501 --- /dev/null +++ b/backend/dataall/modules/datasets/handlers/glue_table_handler.py @@ -0,0 +1,30 @@ +import logging + +from botocore.exceptions import ClientError + +from dataall.aws.handlers.glue import Glue +from dataall.aws.handlers.service_handlers import Worker +from dataall.db import models +from dataall.db.api import Dataset +from dataall.modules.datasets.services.dataset_table import DatasetTableService + +log = logging.getLogger(__name__) + + +class DatasetColumnGlueHandler: + """A handler for dataset table""" + + @staticmethod + @Worker.handler(path='glue.dataset.database.tables') + def list_tables(engine, task: models.Task): + with engine.scoped_session() as session: + dataset: models.Dataset = Dataset.get_dataset_by_uri( + session, task.targetUri + ) + account_id = dataset.AwsAccountId + region = dataset.region + tables = Glue.list_glue_database_tables( + account_id, dataset.GlueDatabaseName, region + ) + DatasetTableService.sync(session, dataset.datasetUri, glue_tables=tables) + return tables From 3af2ecfb0f24342eea970b4f9fd4263dbb81fc2e Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 17:28:46 +0200 Subject: [PATCH 13/32] Dataset refactoring Extracted the long-running task for datasets --- .../dataall/modules/datasets/tasks/__init__.py | 1 + .../datasets}/tasks/tables_syncer.py | 18 ++++++++---------- backend/dataall/tasks/__init__.py | 1 - deploy/stacks/container.py | 2 +- tests/tasks/test_tables_sync.py | 4 ++-- 5 files changed, 12 insertions(+), 14 deletions(-) create mode 100644 backend/dataall/modules/datasets/tasks/__init__.py rename backend/dataall/{ => modules/datasets}/tasks/tables_syncer.py (92%) diff --git a/backend/dataall/modules/datasets/tasks/__init__.py b/backend/dataall/modules/datasets/tasks/__init__.py new file mode 100644 index 000000000..da597f309 --- /dev/null +++ 
b/backend/dataall/modules/datasets/tasks/__init__.py @@ -0,0 +1 @@ +"""Code of the long-running tasks that run in ECS""" diff --git a/backend/dataall/tasks/tables_syncer.py b/backend/dataall/modules/datasets/tasks/tables_syncer.py similarity index 92% rename from backend/dataall/tasks/tables_syncer.py rename to backend/dataall/modules/datasets/tasks/tables_syncer.py index 7d2781ccf..27a870d60 100644 --- a/backend/dataall/tasks/tables_syncer.py +++ b/backend/dataall/modules/datasets/tasks/tables_syncer.py @@ -3,16 +3,14 @@ import sys from operator import and_ -from .. import db -from ..aws.handlers.glue import Glue -from ..aws.handlers.sts import SessionHelper -from ..db import get_engine -from ..db import models -from ..searchproxy import indexers -from ..searchproxy.connect import ( - connect, -) -from ..utils.alarm_service import AlarmService +from dataall import db +from dataall.aws.handlers.glue import Glue +from dataall.aws.handlers.sts import SessionHelper +from dataall.db import get_engine +from dataall.db import models +from dataall.searchproxy import indexers +from dataall.searchproxy.connect import connect +from dataall.utils.alarm_service import AlarmService from dataall.modules.datasets.services.dataset_table import DatasetTableService root = logging.getLogger() diff --git a/backend/dataall/tasks/__init__.py b/backend/dataall/tasks/__init__.py index 02ccaaa8b..89cb28e27 100644 --- a/backend/dataall/tasks/__init__.py +++ b/backend/dataall/tasks/__init__.py @@ -1,2 +1 @@ -from .tables_syncer import sync_tables from .catalog_indexer import index_objects diff --git a/deploy/stacks/container.py b/deploy/stacks/container.py index c313df82e..47fc3d114 100644 --- a/deploy/stacks/container.py +++ b/deploy/stacks/container.py @@ -94,7 +94,7 @@ def __init__( sync_tables_task = self.set_scheduled_task( cluster=cluster, - command=['python3.8', '-m', 'dataall.tasks.tables_syncer'], + command=['python3.8', '-m', 'dataall.modules.datasets.tasks.tables_syncer'], 
container_id=f'container', ecr_repository=ecr_repository, environment=self._create_env('INFO'), diff --git a/tests/tasks/test_tables_sync.py b/tests/tasks/test_tables_sync.py index 812dda1bd..d4e86b83f 100644 --- a/tests/tasks/test_tables_sync.py +++ b/tests/tasks/test_tables_sync.py @@ -147,14 +147,14 @@ def _test_tables_sync(db, org, env, sync_dataset, table, mocker): ], ) mocker.patch( - 'dataall.tasks.tables_syncer.is_assumable_pivot_role', return_value=True + 'dataall.modules.datasets.tables_syncer.is_assumable_pivot_role', return_value=True ) mocker.patch( 'dataall.aws.handlers.glue.Glue.grant_principals_all_table_permissions', return_value=True, ) - processed_tables = dataall.tasks.tables_syncer.sync_tables(engine=db) + processed_tables = dataall.modules.datasets.tasks.tables_syncer.sync_tables(engine=db) assert len(processed_tables) == 2 with db.scoped_session() as session: saved_table: dataall.db.models.DatasetTable = ( From 1a063b2cbe020ba5ffe72587902f0872ed788db4 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 18:34:12 +0200 Subject: [PATCH 14/32] Dataset refactoring Extracted the subscription service into datasets --- .../datasets/tasks}/subscription_service.py | 16 ++++++++-------- backend/dataall/tasks/subscriptions/__init__.py | 1 - deploy/stacks/container.py | 2 +- 3 files changed, 9 insertions(+), 10 deletions(-) rename backend/dataall/{tasks/subscriptions => modules/datasets/tasks}/subscription_service.py (97%) diff --git a/backend/dataall/tasks/subscriptions/subscription_service.py b/backend/dataall/modules/datasets/tasks/subscription_service.py similarity index 97% rename from backend/dataall/tasks/subscriptions/subscription_service.py rename to backend/dataall/modules/datasets/tasks/subscription_service.py index bf7eded35..8674f903a 100644 --- a/backend/dataall/tasks/subscriptions/subscription_service.py +++ b/backend/dataall/modules/datasets/tasks/subscription_service.py @@ -6,14 +6,14 @@ from botocore.exceptions 
import ClientError from sqlalchemy import and_ -from ... import db -from ...aws.handlers.service_handlers import Worker -from ...aws.handlers.sts import SessionHelper -from ...aws.handlers.sqs import SqsQueue -from ...db import get_engine -from ...db import models -from ...tasks.subscriptions import poll_queues -from ...utils import json_utils +from dataall import db +from dataall.aws.handlers.service_handlers import Worker +from dataall.aws.handlers.sts import SessionHelper +from dataall.aws.handlers.sqs import SqsQueue +from dataall.db import get_engine +from dataall.db import models +from dataall.tasks.subscriptions import poll_queues +from dataall.utils import json_utils from dataall.modules.datasets.services.dataset_table import DatasetTableService root = logging.getLogger() diff --git a/backend/dataall/tasks/subscriptions/__init__.py b/backend/dataall/tasks/subscriptions/__init__.py index f60ca5310..fa0214e42 100644 --- a/backend/dataall/tasks/subscriptions/__init__.py +++ b/backend/dataall/tasks/subscriptions/__init__.py @@ -1,2 +1 @@ from .sqs_poller import poll_queues -from .subscription_service import SubscriptionService diff --git a/deploy/stacks/container.py b/deploy/stacks/container.py index 47fc3d114..d3c761519 100644 --- a/deploy/stacks/container.py +++ b/deploy/stacks/container.py @@ -179,7 +179,7 @@ def __init__( command=[ 'python3.8', '-m', - 'dataall.tasks.subscriptions.subscription_service', + 'dataall.modules.datasets.tasks.subscription_service', ], container_id=f'container', ecr_repository=ecr_repository, From b7337142064d4b14a1e247eae5ff412f7db81448 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 18:39:54 +0200 Subject: [PATCH 15/32] Dataset refactoring Extracted the handler to get table columns --- backend/dataall/aws/handlers/glue.py | 30 ------------------- .../datasets/handlers/glue_column_handler.py | 29 ++++++++++++++++++ 2 files changed, 29 insertions(+), 30 deletions(-) diff --git 
a/backend/dataall/aws/handlers/glue.py b/backend/dataall/aws/handlers/glue.py index cc8c5cfc7..e05ce4c54 100644 --- a/backend/dataall/aws/handlers/glue.py +++ b/backend/dataall/aws/handlers/glue.py @@ -6,7 +6,6 @@ from .sts import SessionHelper from ... import db from ...db import models -from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger('aws:glue') @@ -506,35 +505,6 @@ def _update_existing_crawler(glue, accountid, crawler_name, targets, database): else: raise e - - @staticmethod - @Worker.handler('glue.table.columns') - def get_table_columns(engine, task: models.Task): - with engine.scoped_session() as session: - dataset_table: models.DatasetTable = session.query(models.DatasetTable).get( - task.targetUri - ) - aws = SessionHelper.remote_session(dataset_table.AWSAccountId) - glue_client = aws.client('glue', region_name=dataset_table.region) - glue_table = {} - try: - glue_table = glue_client.get_table( - CatalogId=dataset_table.AWSAccountId, - DatabaseName=dataset_table.GlueDatabaseName, - Name=dataset_table.name, - ) - except glue_client.exceptions.ClientError as e: - log.error( - f'Failed to get table aws://{dataset_table.AWSAccountId}' - f'//{dataset_table.GlueDatabaseName}' - f'//{dataset_table.name} due to: ' - f'{e}' - ) - DatasetTableService.sync_table_columns( - session, dataset_table, glue_table['Table'] - ) - return True - @staticmethod @Worker.handler(path='glue.job.runs') def get_job_runs(engine, task: models.Task): diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py index e7f8d358b..02003eea2 100644 --- a/backend/dataall/modules/datasets/handlers/glue_column_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -6,6 +6,7 @@ from dataall.db import models from dataall.aws.handlers.service_handlers import Worker from dataall.modules.datasets.db.table_column_model import 
DatasetTableColumn +from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -13,6 +14,34 @@ class DatasetColumnGlueHandler: """A handler for dataset table columns""" + @staticmethod + @Worker.handler('glue.table.columns') + def get_table_columns(engine, task: models.Task): + with engine.scoped_session() as session: + dataset_table: models.DatasetTable = session.query(models.DatasetTable).get( + task.targetUri + ) + aws = SessionHelper.remote_session(dataset_table.AWSAccountId) + glue_client = aws.client('glue', region_name=dataset_table.region) + glue_table = {} + try: + glue_table = glue_client.get_table( + CatalogId=dataset_table.AWSAccountId, + DatabaseName=dataset_table.GlueDatabaseName, + Name=dataset_table.name, + ) + except glue_client.exceptions.ClientError as e: + log.error( + f'Failed to get table aws://{dataset_table.AWSAccountId}' + f'//{dataset_table.GlueDatabaseName}' + f'//{dataset_table.name} due to: ' + f'{e}' + ) + DatasetTableService.sync_table_columns( + session, dataset_table, glue_table['Table'] + ) + return True + @staticmethod @Worker.handler('glue.table.update_column') def update_table_columns(engine, task: models.Task): From 2a4e2e09caf2e520118baa1ef22f1fb262541239 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 19:06:39 +0200 Subject: [PATCH 16/32] Extracted feed registry Needed for migration for modules --- backend/dataall/api/Objects/Feed/resolvers.py | 29 ++------------- backend/dataall/core/feed/__init__.py | 1 + .../dataall/core/feed/services/__init__.py | 1 + .../core/feed/services/feed_registry.py | 36 +++++++++++++++++++ backend/dataall/modules/datasets/__init__.py | 3 ++ 5 files changed, 44 insertions(+), 26 deletions(-) create mode 100644 backend/dataall/core/feed/__init__.py create mode 100644 backend/dataall/core/feed/services/__init__.py create mode 100644 backend/dataall/core/feed/services/feed_registry.py diff --git 
a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index cbde23f0d..0fff09053 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -2,7 +2,7 @@ from ....api.context import Context from ....db import paginate, models -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.core.feed.services.feed_registry import FeedRegistry class Feed: @@ -20,37 +20,14 @@ def targetType(self): def resolve_feed_target_type(obj, *_): - if isinstance(obj, DatasetTableColumn): - return 'DatasetTableColumn' - elif isinstance(obj, models.Worksheet): - return 'Worksheet' - elif isinstance(obj, models.DataPipeline): - return 'DataPipeline' - elif isinstance(obj, models.DatasetTable): - return 'DatasetTable' - elif isinstance(obj, models.Dataset): - return 'Dataset' - elif isinstance(obj, models.DatasetStorageLocation): - return 'DatasetStorageLocation' - elif isinstance(obj, models.Dashboard): - return 'Dashboard' - else: - return None + return FeedRegistry.find_by_model(obj) def resolve_target(context: Context, source: Feed, **kwargs): if not source: return None with context.engine.scoped_session() as session: - model = { - 'Dataset': models.Dataset, - 'DatasetTable': models.DatasetTable, - 'DatasetTableColumn': DatasetTableColumn, - 'DatasetStorageLocation': models.DatasetStorageLocation, - 'Dashboard': models.Dashboard, - 'DataPipeline': models.DataPipeline, - 'Worksheet': models.Worksheet, - }[source.targetType] + model = FeedRegistry.find(source.targetType) target = session.query(model).get(source.targetUri) return target diff --git a/backend/dataall/core/feed/__init__.py b/backend/dataall/core/feed/__init__.py new file mode 100644 index 000000000..d06f5a78f --- /dev/null +++ b/backend/dataall/core/feed/__init__.py @@ -0,0 +1 @@ +"""Contains all code related to feeds""" diff --git a/backend/dataall/core/feed/services/__init__.py 
b/backend/dataall/core/feed/services/__init__.py new file mode 100644 index 000000000..e87be7564 --- /dev/null +++ b/backend/dataall/core/feed/services/__init__.py @@ -0,0 +1 @@ +"""Contains business logic of feed""" diff --git a/backend/dataall/core/feed/services/feed_registry.py b/backend/dataall/core/feed/services/feed_registry.py new file mode 100644 index 000000000..7a382c057 --- /dev/null +++ b/backend/dataall/core/feed/services/feed_registry.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass +from typing import Dict, Type +from dataall.db import Resource, models + + +@dataclass +class FeedDefinition: + target_type: str + model: Type[Resource] + + +class FeedRegistry: + """Registers feeds for different models""" + _DEFINITION: Dict[str, FeedDefinition] = {} + + @classmethod + def register(cls, feed: FeedDefinition): + cls._DEFINITION[feed.target_type] = feed + + @classmethod + def find(cls, target_type: str): + return cls._DEFINITION[target_type] + + @classmethod + def find_by_model(cls, obj: Resource): + for target_type, feed in cls._DEFINITION.items(): + if isinstance(obj, feed.model): + return target_type + return None + + +FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) +FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) +FeedRegistry.register(FeedDefinition("DatasetTable", models.DatasetTable)) +FeedRegistry.register(FeedDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) +FeedRegistry.register(FeedDefinition("Dashboard", models.Dashboard)) diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index cd52bc207..0de251fee 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -2,6 +2,8 @@ import logging from typing import List +from dataall.core.feed.services.feed_registry import FeedRegistry, FeedDefinition +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn 
from dataall.modules.loader import ModuleInterface, ImportMode log = logging.getLogger(__name__) @@ -16,6 +18,7 @@ def is_supported(cls, modes): def __init__(self): import dataall.modules.datasets.api + FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) log.info("API of datasets has been imported") From c15d0902da9a10a846e2c1f0231f0f0fafc6c0f0 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Tue, 11 Apr 2023 19:30:07 +0200 Subject: [PATCH 17/32] Extracted feed and glossary registry and created a model registry --- backend/dataall/api/Objects/Feed/resolvers.py | 6 +-- .../dataall/api/Objects/Glossary/resolvers.py | 23 ++------- backend/dataall/core/feed/__init__.py | 1 - .../dataall/core/feed/services/__init__.py | 1 - .../core/feed/services/feed_registry.py | 36 -------------- backend/dataall/core/utils/__init__.py | 1 + backend/dataall/core/utils/model_registry.py | 47 +++++++++++++++++++ backend/dataall/modules/datasets/__init__.py | 5 +- 8 files changed, 57 insertions(+), 63 deletions(-) delete mode 100644 backend/dataall/core/feed/__init__.py delete mode 100644 backend/dataall/core/feed/services/__init__.py delete mode 100644 backend/dataall/core/feed/services/feed_registry.py create mode 100644 backend/dataall/core/utils/__init__.py create mode 100644 backend/dataall/core/utils/model_registry.py diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index 0fff09053..1f328b1ae 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -1,8 +1,8 @@ from sqlalchemy import or_ -from ....api.context import Context -from ....db import paginate, models -from dataall.core.feed.services.feed_registry import FeedRegistry +from dataall.api.context import Context +from dataall.db import paginate, models +from dataall.core.utils.model_registry import FeedRegistry class Feed: diff --git 
a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index 847f16ac6..c6f2634d0 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -4,6 +4,7 @@ from .... import db from ....api.context import Context +from ....core.utils.model_registry import GlossaryRegistry from ....db import paginate, exceptions, models from ....searchproxy import upsert_dataset from ....searchproxy import upsert_table @@ -11,7 +12,6 @@ from ....api.constants import ( GlossaryRole ) -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn def resolve_glossary_node(obj: models.GlossaryNode, *_): @@ -323,29 +323,12 @@ def get_link(context: Context, source, linkUri: str = None): def target_union_resolver(obj, *_): - if isinstance(obj, DatasetTableColumn): - return 'DatasetTableColumn' - elif isinstance(obj, models.DatasetTable): - return 'DatasetTable' - elif isinstance(obj, models.Dataset): - return 'Dataset' - elif isinstance(obj, models.DatasetStorageLocation): - return 'DatasetStorageLocation' - elif isinstance(obj, models.Dashboard): - return 'Dashboard' - else: - return None + return GlossaryRegistry.find_by_model(obj) def resolve_link_target(context, source, **kwargs): with context.engine.scoped_session() as session: - model = { - 'Dataset': models.Dataset, - 'DatasetTable': models.DatasetTable, - 'Column': DatasetTableColumn, - 'DatasetStorageLocation': models.DatasetStorageLocation, - 'Dashboard': models.Dashboard, - }[source.targetType] + model = GlossaryRegistry.find(source.targetUri) target = session.query(model).get(source.targetUri) return target diff --git a/backend/dataall/core/feed/__init__.py b/backend/dataall/core/feed/__init__.py deleted file mode 100644 index d06f5a78f..000000000 --- a/backend/dataall/core/feed/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Contains all code related to feeds""" diff --git 
a/backend/dataall/core/feed/services/__init__.py b/backend/dataall/core/feed/services/__init__.py deleted file mode 100644 index e87be7564..000000000 --- a/backend/dataall/core/feed/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Contains business logic of feed""" diff --git a/backend/dataall/core/feed/services/feed_registry.py b/backend/dataall/core/feed/services/feed_registry.py deleted file mode 100644 index 7a382c057..000000000 --- a/backend/dataall/core/feed/services/feed_registry.py +++ /dev/null @@ -1,36 +0,0 @@ -from dataclasses import dataclass -from typing import Dict, Type -from dataall.db import Resource, models - - -@dataclass -class FeedDefinition: - target_type: str - model: Type[Resource] - - -class FeedRegistry: - """Registers feeds for different models""" - _DEFINITION: Dict[str, FeedDefinition] = {} - - @classmethod - def register(cls, feed: FeedDefinition): - cls._DEFINITION[feed.target_type] = feed - - @classmethod - def find(cls, target_type: str): - return cls._DEFINITION[target_type] - - @classmethod - def find_by_model(cls, obj: Resource): - for target_type, feed in cls._DEFINITION.items(): - if isinstance(obj, feed.model): - return target_type - return None - - -FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) -FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) -FeedRegistry.register(FeedDefinition("DatasetTable", models.DatasetTable)) -FeedRegistry.register(FeedDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) -FeedRegistry.register(FeedDefinition("Dashboard", models.Dashboard)) diff --git a/backend/dataall/core/utils/__init__.py b/backend/dataall/core/utils/__init__.py new file mode 100644 index 000000000..02ed9cfb4 --- /dev/null +++ b/backend/dataall/core/utils/__init__.py @@ -0,0 +1 @@ +"""Utility functions and classes""" diff --git a/backend/dataall/core/utils/model_registry.py b/backend/dataall/core/utils/model_registry.py new file mode 100644 index 
000000000..9a4c21952 --- /dev/null +++ b/backend/dataall/core/utils/model_registry.py @@ -0,0 +1,47 @@ +from dataclasses import dataclass +from typing import Type, Dict + +from dataall.db import Resource, models + + +@dataclass +class ModelDefinition: + target_type: str + model: Type[Resource] + + +class ModelRegistry: + """Registers models for different target types""" + + def __init__(self): + self._definitions: Dict[str, ModelDefinition] = {} + + def register(self, model: ModelDefinition): + self._definitions[model.target_type] = model + + def find(self, target_type: str): + return self._definitions[target_type] + + def find_by_model(self, obj: Resource): + for target_type, definition in self._definitions.items(): + if isinstance(obj, definition.model): + return target_type + return None + + +# TODO should migrate to a proper file after the modularization +FeedRegistry = ModelRegistry() +GlossaryRegistry = ModelRegistry() + + +FeedRegistry.register(ModelDefinition("Worksheet", models.Worksheet)) +FeedRegistry.register(ModelDefinition("DataPipeline", models.DataPipeline)) +FeedRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) +FeedRegistry.register(ModelDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) +FeedRegistry.register(ModelDefinition("Dashboard", models.Dashboard)) + +GlossaryRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) +GlossaryRegistry.register(ModelDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) +GlossaryRegistry.register(ModelDefinition("Dashboard", models.Dashboard)) +GlossaryRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) +GlossaryRegistry.register(ModelDefinition("Dataset", models.Dataset)) diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 0de251fee..8f30bd897 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -2,7 
+2,7 @@ import logging from typing import List -from dataall.core.feed.services.feed_registry import FeedRegistry, FeedDefinition +from dataall.core.utils.model_registry import ModelDefinition, FeedRegistry, GlossaryRegistry from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode @@ -18,7 +18,8 @@ def is_supported(cls, modes): def __init__(self): import dataall.modules.datasets.api - FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) + FeedRegistry.register(ModelDefinition("DatasetTableColumn", DatasetTableColumn)) + GlossaryRegistry.register(ModelDefinition("DatasetTableColumn", DatasetTableColumn)) log.info("API of datasets has been imported") From 052a2b1f33139dab09a8beb78d5a7cfb72387128 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 11:12:56 +0200 Subject: [PATCH 18/32] Dataset refactoring Fixed tests and added new for dataset module --- tests/core/test_config.py | 6 +++++- tests/tasks/test_subscriptions.py | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/core/test_config.py b/tests/core/test_config.py index 3222e4144..30f69f792 100644 --- a/tests/core/test_config.py +++ b/tests/core/test_config.py @@ -25,4 +25,8 @@ def test_default_config(): assert "notebooks" in modules assert "active" in modules["notebooks"] - assert config.get_property("modules.notebooks.active") == "true" + assert "datasets" in modules + assert "active" in modules["datasets"] + + assert config.get_property("modules.notebooks.active") + assert config.get_property("modules.datasets.active") diff --git a/tests/tasks/test_subscriptions.py b/tests/tasks/test_subscriptions.py index 25cd6178a..874b8ccab 100644 --- a/tests/tasks/test_subscriptions.py +++ b/tests/tasks/test_subscriptions.py @@ -134,10 +134,10 @@ def share( def test_subscriptions(org, env, otherenv, db, dataset, share, mocker): mocker.patch( - 
'dataall.tasks.subscriptions.subscription_service.SubscriptionService.sns_call', + 'dataall.modules.datasets.tasks.subscription_service.SubscriptionService.sns_call', return_value=True, ) - subscriber = dataall.tasks.subscriptions.subscription_service.SubscriptionService() + subscriber = dataall.modules.datasets.tasks.subscription_service.SubscriptionService() messages = [ { 'prefix': 's3://dataset/testtable/csv/', From d9844834646aa7564104ef184ca95d41f0ecbbb2 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 13:12:39 +0200 Subject: [PATCH 19/32] Fixed and unignored test_tables_sync --- tests/tasks/test_tables_sync.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/tasks/test_tables_sync.py b/tests/tasks/test_tables_sync.py index d4e86b83f..9d8282e65 100644 --- a/tests/tasks/test_tables_sync.py +++ b/tests/tasks/test_tables_sync.py @@ -92,7 +92,13 @@ def table(org, env, db, sync_dataset): yield table -def _test_tables_sync(db, org, env, sync_dataset, table, mocker): +@pytest.fixture(scope='module', autouse=True) +def permissions(db): + with db.scoped_session() as session: + yield dataall.db.api.Permission.init_permissions(session) + + +def test_tables_sync(db, org, env, sync_dataset, table, mocker): mocker.patch( 'dataall.aws.handlers.glue.Glue.list_glue_database_tables', return_value=[ @@ -147,7 +153,7 @@ def _test_tables_sync(db, org, env, sync_dataset, table, mocker): ], ) mocker.patch( - 'dataall.modules.datasets.tables_syncer.is_assumable_pivot_role', return_value=True + 'dataall.modules.datasets.tasks.tables_syncer.is_assumable_pivot_role', return_value=True ) mocker.patch( 'dataall.aws.handlers.glue.Glue.grant_principals_all_table_permissions', From dc0c9350b242be12238c8ce37b0dc74a018ed36d Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 14:10:09 +0200 Subject: [PATCH 20/32] Split model registry into feed and glossaries Glossaries had different target types and had to be 
treated differently --- backend/dataall/api/Objects/Feed/resolvers.py | 2 +- .../dataall/api/Objects/Glossary/resolvers.py | 7 +-- backend/dataall/core/feed/__init__.py | 1 + .../dataall/core/feed/services/__init__.py | 1 + .../dataall/core/feed/services/registry.py | 36 ++++++++++++++ backend/dataall/core/glossary/__init__.py | 1 + .../core/glossary/services/__init__.py | 1 + .../core/glossary/services/registry.py | 38 +++++++++++++++ backend/dataall/core/utils/model_registry.py | 47 ------------------- backend/dataall/db/api/glossary.py | 28 ++++------- backend/dataall/modules/datasets/__init__.py | 7 +-- 11 files changed, 97 insertions(+), 72 deletions(-) create mode 100644 backend/dataall/core/feed/__init__.py create mode 100644 backend/dataall/core/feed/services/__init__.py create mode 100644 backend/dataall/core/feed/services/registry.py create mode 100644 backend/dataall/core/glossary/__init__.py create mode 100644 backend/dataall/core/glossary/services/__init__.py create mode 100644 backend/dataall/core/glossary/services/registry.py delete mode 100644 backend/dataall/core/utils/model_registry.py diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index 1f328b1ae..598ec86e1 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -2,7 +2,7 @@ from dataall.api.context import Context from dataall.db import paginate, models -from dataall.core.utils.model_registry import FeedRegistry +from dataall.core.feed.services.registry import FeedRegistry class Feed: diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index c6f2634d0..ae8501993 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -4,7 +4,6 @@ from .... 
import db from ....api.context import Context -from ....core.utils.model_registry import GlossaryRegistry from ....db import paginate, exceptions, models from ....searchproxy import upsert_dataset from ....searchproxy import upsert_table @@ -13,6 +12,8 @@ GlossaryRole ) +from dataall.core.glossary.services.registry import GlossaryRegistry + def resolve_glossary_node(obj: models.GlossaryNode, *_): if obj.nodeType == 'G': @@ -323,12 +324,12 @@ def get_link(context: Context, source, linkUri: str = None): def target_union_resolver(obj, *_): - return GlossaryRegistry.find_by_model(obj) + return GlossaryRegistry.find_object_type(obj) def resolve_link_target(context, source, **kwargs): with context.engine.scoped_session() as session: - model = GlossaryRegistry.find(source.targetUri) + model = GlossaryRegistry.find_model(source.targetUri) target = session.query(model).get(source.targetUri) return target diff --git a/backend/dataall/core/feed/__init__.py b/backend/dataall/core/feed/__init__.py new file mode 100644 index 000000000..39f751553 --- /dev/null +++ b/backend/dataall/core/feed/__init__.py @@ -0,0 +1 @@ +"""Contains logic related to feeds""" diff --git a/backend/dataall/core/feed/services/__init__.py b/backend/dataall/core/feed/services/__init__.py new file mode 100644 index 000000000..5b130b24b --- /dev/null +++ b/backend/dataall/core/feed/services/__init__.py @@ -0,0 +1 @@ +"""Service layer of feeds""" diff --git a/backend/dataall/core/feed/services/registry.py b/backend/dataall/core/feed/services/registry.py new file mode 100644 index 000000000..a69bcdd37 --- /dev/null +++ b/backend/dataall/core/feed/services/registry.py @@ -0,0 +1,36 @@ +from dataclasses import dataclass +from typing import Type, Dict + +from dataall.db import Resource, models + + +@dataclass +class FeedDefinition: + target_type: str + model: Type[Resource] + + +class FeedRegistry: + """Registers models for different target types""" + + def __init__(self): + self._definitions: Dict[str, 
FeedDefinition] = {} + + def register(self, model: FeedDefinition): + self._definitions[model.target_type] = model + + def find(self, target_type: str): + return self._definitions[target_type] + + def find_by_model(self, obj: Resource): + for target_type, definition in self._definitions.items(): + if isinstance(obj, definition.model): + return target_type + return None + + +FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) +FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) +FeedRegistry.register(FeedDefinition("DatasetTable", models.DatasetTable)) +FeedRegistry.register(FeedDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) +FeedRegistry.register(FeedDefinition("Dashboard", models.Dashboard)) diff --git a/backend/dataall/core/glossary/__init__.py b/backend/dataall/core/glossary/__init__.py new file mode 100644 index 000000000..aa81c1e26 --- /dev/null +++ b/backend/dataall/core/glossary/__init__.py @@ -0,0 +1 @@ +"""Contains code related to glossaries""" diff --git a/backend/dataall/core/glossary/services/__init__.py b/backend/dataall/core/glossary/services/__init__.py new file mode 100644 index 000000000..9ed65d261 --- /dev/null +++ b/backend/dataall/core/glossary/services/__init__.py @@ -0,0 +1 @@ +"""Service layer of glossaries""" diff --git a/backend/dataall/core/glossary/services/registry.py b/backend/dataall/core/glossary/services/registry.py new file mode 100644 index 000000000..7484087c4 --- /dev/null +++ b/backend/dataall/core/glossary/services/registry.py @@ -0,0 +1,38 @@ +from dataclasses import dataclass +from typing import Type, Dict, Optional + +from dataall.db import Resource, models + + +@dataclass +class GlossaryDefinition: + target_type: str + object_type: str + model: Type[Resource] + + +class GlossaryRegistry: + _DEFINITIONS: Dict[str, GlossaryDefinition] = {} + + @classmethod + def register(cls, glossary: GlossaryDefinition) -> None: + cls._DEFINITIONS[glossary.target_type] = glossary 
+ + @classmethod + def find_model(cls, target_type: str) -> Optional[Resource]: + definition = cls._DEFINITIONS[target_type] + return definition.model if definition is not None else None + + @classmethod + def find_object_type(cls, model: Resource) -> Optional[str]: + for _, definition in cls._DEFINITIONS.items(): + if isinstance(model, definition.model): + return definition.object_type + return None + + +GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) +GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", models.DatasetStorageLocation)) +GlossaryRegistry.register(GlossaryDefinition("Dashboard", "Dashboard", models.Dashboard)) +GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) +GlossaryRegistry.register(GlossaryDefinition("Dataset", "Dataset", models.Dataset)) diff --git a/backend/dataall/core/utils/model_registry.py b/backend/dataall/core/utils/model_registry.py deleted file mode 100644 index 9a4c21952..000000000 --- a/backend/dataall/core/utils/model_registry.py +++ /dev/null @@ -1,47 +0,0 @@ -from dataclasses import dataclass -from typing import Type, Dict - -from dataall.db import Resource, models - - -@dataclass -class ModelDefinition: - target_type: str - model: Type[Resource] - - -class ModelRegistry: - """Registers models for different target types""" - - def __init__(self): - self._definitions: Dict[str, ModelDefinition] = {} - - def register(self, model: ModelDefinition): - self._definitions[model.target_type] = model - - def find(self, target_type: str): - return self._definitions[target_type] - - def find_by_model(self, obj: Resource): - for target_type, definition in self._definitions.items(): - if isinstance(obj, definition.model): - return target_type - return None - - -# TODO should migrate to a proper file after the modularization -FeedRegistry = ModelRegistry() -GlossaryRegistry = ModelRegistry() - - 
-FeedRegistry.register(ModelDefinition("Worksheet", models.Worksheet)) -FeedRegistry.register(ModelDefinition("DataPipeline", models.DataPipeline)) -FeedRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) -FeedRegistry.register(ModelDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) -FeedRegistry.register(ModelDefinition("Dashboard", models.Dashboard)) - -GlossaryRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) -GlossaryRegistry.register(ModelDefinition("DatasetStorageLocation", models.DatasetStorageLocation)) -GlossaryRegistry.register(ModelDefinition("Dashboard", models.Dashboard)) -GlossaryRegistry.register(ModelDefinition("DatasetTable", models.DatasetTable)) -GlossaryRegistry.register(ModelDefinition("Dataset", models.Dataset)) diff --git a/backend/dataall/db/api/glossary.py b/backend/dataall/db/api/glossary.py index c6313e007..3df8d34f7 100644 --- a/backend/dataall/db/api/glossary.py +++ b/backend/dataall/db/api/glossary.py @@ -4,12 +4,12 @@ from sqlalchemy import asc, or_, and_, literal, case from sqlalchemy.orm import with_expression, aliased -from .. import models, exceptions, permissions, paginate +from .. 
import models, exceptions, permissions, paginate, Resource from .permission_checker import ( has_tenant_perm, ) from ..models.Glossary import GlossaryNodeStatus -from dataall.modules.datasets.db.table_column_model import DatasetTableColumn +from dataall.core.glossary.services.registry import GlossaryRegistry logger = logging.getLogger(__name__) @@ -124,24 +124,16 @@ def link_term(session, username, groups, uri, data=None, check_perm=None): 'associations are allowed for Glossary terms only', ) - targetUri: str = data['targetUri'] - targetType: str = data['targetType'] - - if targetType == 'Dataset': - target = session.query(models.Dataset).get(targetUri) - elif targetType == 'DatasetTable': - target = session.query(models.DatasetTable).get(targetUri) - elif targetType == 'Folder': - target = session.query(models.DatasetStorageLocation).get(targetUri) - elif targetType == 'Column': - target = session.query(DatasetTableColumn).get(targetUri) - elif targetType == 'Dashboard': - target = session.query(models.Dashboard).get(targetUri) - else: + target_uri: str = data['targetUri'] + target_type: str = data['targetType'] + + target_model: Resource = GlossaryRegistry.find_model(target_type) + if not target_model: raise exceptions.InvalidInput( 'NodeType', 'term.nodeType', 'association target type is invalid' ) + target = session.query(target_model).get(target_uri) if not target: raise exceptions.ObjectNotFound('Association target', uri) @@ -150,8 +142,8 @@ def link_term(session, username, groups, uri, data=None, check_perm=None): approvedByOwner=data.get('approvedByOwner', True), approvedBySteward=data.get('approvedBySteward', True), nodeUri=uri, - targetUri=targetUri, - targetType=targetType, + targetUri=target_uri, + targetType=target_type, ) session.add(link) return link diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 8f30bd897..306778f40 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ 
b/backend/dataall/modules/datasets/__init__.py @@ -2,7 +2,8 @@ import logging from typing import List -from dataall.core.utils.model_registry import ModelDefinition, FeedRegistry, GlossaryRegistry +from dataall.core.feed.services.registry import FeedRegistry, FeedDefinition +from dataall.core.glossary.services.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode @@ -18,8 +19,8 @@ def is_supported(cls, modes): def __init__(self): import dataall.modules.datasets.api - FeedRegistry.register(ModelDefinition("DatasetTableColumn", DatasetTableColumn)) - GlossaryRegistry.register(ModelDefinition("DatasetTableColumn", DatasetTableColumn)) + FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) + GlossaryRegistry.register(GlossaryDefinition("DatasetTableColumn", DatasetTableColumn)) log.info("API of datasets has been imported") From 727e3537cfc3b133aa45c0deafc9568b25280b3a Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 15:40:08 +0200 Subject: [PATCH 21/32] Abstraction for glossaries Created API for glossaries to use modularization --- .../core/glossary/services/registry.py | 16 +++++- backend/dataall/db/api/environment.py | 2 +- backend/dataall/db/api/glossary.py | 57 +++++++------------ backend/dataall/db/api/organization.py | 1 - backend/dataall/db/models/Dashboard.py | 3 + backend/dataall/db/models/Dataset.py | 4 ++ .../db/models/DatasetStorageLocation.py | 3 + backend/dataall/db/models/DatasetTable.py | 3 + .../modules/datasets/db/table_column_model.py | 3 + 9 files changed, 50 insertions(+), 42 deletions(-) diff --git a/backend/dataall/core/glossary/services/registry.py b/backend/dataall/core/glossary/services/registry.py index 7484087c4..ee3f10d41 100644 --- a/backend/dataall/core/glossary/services/registry.py +++ b/backend/dataall/core/glossary/services/registry.py @@ -1,14 
+1,22 @@ from dataclasses import dataclass -from typing import Type, Dict, Optional +from typing import Type, Dict, Optional, Protocol, Union from dataall.db import Resource, models +class Identifiable(Protocol): + def uri(self): + ... + + @dataclass class GlossaryDefinition: target_type: str object_type: str - model: Type[Resource] + model: Union[Type[Resource], Identifiable] # should be an intersection, but python typing doesn't have one yet + + def target_uri(self): + return self.model.uri() class GlossaryRegistry: @@ -30,6 +38,10 @@ def find_object_type(cls, model: Resource) -> Optional[str]: return definition.object_type return None + @classmethod + def definitions(cls): + return cls._DEFINITIONS.values() + GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", models.DatasetStorageLocation)) diff --git a/backend/dataall/db/api/environment.py b/backend/dataall/db/api/environment.py index 4a436bf9a..19d5de342 100644 --- a/backend/dataall/db/api/environment.py +++ b/backend/dataall/db/api/environment.py @@ -21,7 +21,7 @@ EnvironmentPermission, ) from ..models.Permission import PermissionType -from ..paginator import Page, paginate +from ..paginator import paginate from dataall.core.environment.models import EnvironmentParameter from ...core.environment.db.repositories import EnvironmentParameterRepository from ...utils.naming_convention import ( diff --git a/backend/dataall/db/api/glossary.py b/backend/dataall/db/api/glossary.py index 3df8d34f7..96c340f62 100644 --- a/backend/dataall/db/api/glossary.py +++ b/backend/dataall/db/api/glossary.py @@ -10,6 +10,7 @@ ) from ..models.Glossary import GlossaryNodeStatus from dataall.core.glossary.services.registry import GlossaryRegistry +from ..paginator import Page logger = logging.getLogger(__name__) @@ -339,46 +340,26 @@ def list_term_associations( ): source = data['source'] filter = 
data['filter'] - datasets = session.query( - models.Dataset.datasetUri.label('targetUri'), - literal('dataset').label('targetType'), - models.Dataset.label.label('label'), - models.Dataset.name.label('name'), - models.Dataset.description.label('description'), - ) - tables = session.query( - models.DatasetTable.tableUri.label('targetUri'), - literal('table').label('targetType'), - models.DatasetTable.label.label('label'), - models.DatasetTable.name.label('name'), - models.DatasetTable.description.label('description'), - ) - columns = session.query( - DatasetTableColumn.columnUri.label('targetUri'), - literal('column').label('targetType'), - DatasetTableColumn.label.label('label'), - DatasetTableColumn.name.label('name'), - DatasetTableColumn.description.label('description'), - ) - folders = session.query( - models.DatasetStorageLocation.locationUri.label('targetUri'), - literal('folder').label('targetType'), - models.DatasetStorageLocation.label.label('label'), - models.DatasetStorageLocation.name.label('name'), - models.DatasetStorageLocation.description.label('description'), - ) - dashboards = session.query( - models.Dashboard.dashboardUri.label('targetUri'), - literal('dashboard').label('targetType'), - models.Dashboard.label.label('label'), - models.Dashboard.name.label('name'), - models.Dashboard.description.label('description'), - ) + query = None + for definition in GlossaryRegistry.definitions(): + model = definition.model + subquery = session.query( + definition.target_uri().label('targetUri'), + literal(definition.target_type.lower()).label('targetType'), + model.label.label('label'), + model.name.label('name'), + model.description.label('description'), + ) + if query: + query.union(subquery) + else: + query = subquery - linked_objects = datasets.union(tables, columns, folders, dashboards).subquery( - 'linked_objects' - ) + if query is None: + return Page([], 1, 1, 0) # empty page. 
All modules are turned off + + linked_objects = query.subquery('linked_objects') path = models.GlossaryNode.path q = ( diff --git a/backend/dataall/db/api/organization.py b/backend/dataall/db/api/organization.py index 979dd1095..dd570eeae 100644 --- a/backend/dataall/db/api/organization.py +++ b/backend/dataall/db/api/organization.py @@ -8,7 +8,6 @@ from . import has_tenant_perm, ResourcePolicy, has_resource_perm from ..models import OrganizationGroup from ..models.Enums import OrganisationUserRole -from ..paginator import Page logger = logging.getLogger(__name__) diff --git a/backend/dataall/db/models/Dashboard.py b/backend/dataall/db/models/Dashboard.py index 1a24ef1cb..0b12ecd96 100644 --- a/backend/dataall/db/models/Dashboard.py +++ b/backend/dataall/db/models/Dashboard.py @@ -18,3 +18,6 @@ class Dashboard(Resource, Base): SamlGroupName = Column(String, nullable=False) userRoleForDashboard = query_expression() + + def uri(self): + return self.dashboardUri diff --git a/backend/dataall/db/models/Dataset.py b/backend/dataall/db/models/Dataset.py index 71a95fe0e..451c7da7c 100644 --- a/backend/dataall/db/models/Dataset.py +++ b/backend/dataall/db/models/Dataset.py @@ -59,3 +59,7 @@ class Dataset(Resource, Base): importedKmsKey = Column(Boolean, default=False) importedAdminRole = Column(Boolean, default=False) imported = Column(Boolean, default=False) + + def uri(self): + return self.datasetUri + diff --git a/backend/dataall/db/models/DatasetStorageLocation.py b/backend/dataall/db/models/DatasetStorageLocation.py index 33b121438..e21ae6694 100644 --- a/backend/dataall/db/models/DatasetStorageLocation.py +++ b/backend/dataall/db/models/DatasetStorageLocation.py @@ -17,3 +17,6 @@ class DatasetStorageLocation(Resource, Base): userRoleForStorageLocation = query_expression() projectPermission = query_expression() environmentEndPoint = query_expression() + + def uri(self): + return self.locationUri diff --git a/backend/dataall/db/models/DatasetTable.py 
b/backend/dataall/db/models/DatasetTable.py index a1b06b192..e97174167 100644 --- a/backend/dataall/db/models/DatasetTable.py +++ b/backend/dataall/db/models/DatasetTable.py @@ -27,3 +27,6 @@ class DatasetTable(Resource, Base): stage = Column(String, default='RAW') topics = Column(postgresql.ARRAY(String), nullable=True) confidentiality = Column(String, nullable=False, default='C1') + + def uri(self): + return self.tableUri diff --git a/backend/dataall/modules/datasets/db/table_column_model.py b/backend/dataall/modules/datasets/db/table_column_model.py index 4d3d7e009..05bc26058 100644 --- a/backend/dataall/modules/datasets/db/table_column_model.py +++ b/backend/dataall/modules/datasets/db/table_column_model.py @@ -18,3 +18,6 @@ class DatasetTableColumn(Resource, Base): columnType = Column( String, default='column' ) # can be either "column" or "partition" + + def uri(self): + return self.columnUri From 49fbb415dbdfc27c20d90366a1cf3bcd41ebea0f Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Wed, 12 Apr 2023 15:47:09 +0200 Subject: [PATCH 22/32] Fixed leftovers --- .../dataall/core/feed/services/registry.py | 19 ++++++++++--------- backend/dataall/modules/datasets/__init__.py | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/backend/dataall/core/feed/services/registry.py b/backend/dataall/core/feed/services/registry.py index a69bcdd37..07f2e77a1 100644 --- a/backend/dataall/core/feed/services/registry.py +++ b/backend/dataall/core/feed/services/registry.py @@ -12,18 +12,19 @@ class FeedDefinition: class FeedRegistry: """Registers models for different target types""" + _DEFINITIONS: Dict[str, FeedDefinition] = {} - def __init__(self): - self._definitions: Dict[str, FeedDefinition] = {} + @classmethod + def register(cls, model: FeedDefinition): + cls._DEFINITIONS[model.target_type] = model - def register(self, model: FeedDefinition): - self._definitions[model.target_type] = model + @classmethod + def find(cls, target_type: str): + return 
cls._DEFINITIONS[target_type] - def find(self, target_type: str): - return self._definitions[target_type] - - def find_by_model(self, obj: Resource): - for target_type, definition in self._definitions.items(): + @classmethod + def find_by_model(cls, obj: Resource): + for target_type, definition in cls._DEFINITIONS.items(): if isinstance(obj, definition.model): return target_type return None diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 306778f40..5b7224a48 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -20,7 +20,7 @@ def is_supported(cls, modes): def __init__(self): import dataall.modules.datasets.api FeedRegistry.register(FeedDefinition("DatasetTableColumn", DatasetTableColumn)) - GlossaryRegistry.register(GlossaryDefinition("DatasetTableColumn", DatasetTableColumn)) + GlossaryRegistry.register(GlossaryDefinition("Column", "DatasetTableColumn", DatasetTableColumn)) log.info("API of datasets has been imported") From 7d029e73046990bae939f8ff5f6b6cfaa8a83d62 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 10:07:11 +0200 Subject: [PATCH 23/32] Datasets refactoring Added and fixed tests --- backend/dataall/core/feed/services/registry.py | 4 ++-- tests/api/test_feed.py | 3 ++- tests/modules/datasets/__init__.py | 0 tests/modules/datasets/test_dataset_feed.py | 11 +++++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 tests/modules/datasets/__init__.py create mode 100644 tests/modules/datasets/test_dataset_feed.py diff --git a/backend/dataall/core/feed/services/registry.py b/backend/dataall/core/feed/services/registry.py index 07f2e77a1..893f37f55 100644 --- a/backend/dataall/core/feed/services/registry.py +++ b/backend/dataall/core/feed/services/registry.py @@ -15,8 +15,8 @@ class FeedRegistry: _DEFINITIONS: Dict[str, FeedDefinition] = {} @classmethod - def register(cls, model: 
FeedDefinition): - cls._DEFINITIONS[model.target_type] = model + def register(cls, definition: FeedDefinition): + cls._DEFINITIONS[definition.target_type] = definition @classmethod def find(cls, target_type: str): diff --git a/tests/api/test_feed.py b/tests/api/test_feed.py index 11f7c4891..f1d8aaf7f 100644 --- a/tests/api/test_feed.py +++ b/tests/api/test_feed.py @@ -103,4 +103,5 @@ def test_get_target(client, worksheet): targetType='Worksheet', username='me', ) - print(response) + assert response.data.getFeed.target.worksheetUri == worksheet.worksheetUri + diff --git a/tests/modules/datasets/__init__.py b/tests/modules/datasets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/modules/datasets/test_dataset_feed.py b/tests/modules/datasets/test_dataset_feed.py new file mode 100644 index 000000000..52c97e990 --- /dev/null +++ b/tests/modules/datasets/test_dataset_feed.py @@ -0,0 +1,11 @@ + +from dataall.core.feed.services.registry import FeedRegistry +from dataall.modules.datasets.db.table_column_model import DatasetTableColumn + + +def test_dataset_registered(): + model = FeedRegistry.find("DatasetTableColumn") + assert model == DatasetTableColumn + + model = DatasetTableColumn() + assert "DatasetTableColumn" == FeedRegistry.find_by_model(model) From be527ebc26e50f989ca3f9a8e84814d3d316a913 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 11:40:52 +0200 Subject: [PATCH 24/32] Added runtime type registration for Union GraphQL type --- backend/dataall/api/gql/graphql_union_type.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/backend/dataall/api/gql/graphql_union_type.py b/backend/dataall/api/gql/graphql_union_type.py index a8fea7e2f..b67fe3c9a 100644 --- a/backend/dataall/api/gql/graphql_union_type.py +++ b/backend/dataall/api/gql/graphql_union_type.py @@ -1,19 +1,31 @@ +from abc import ABC + from ._cache import cache_instances from .utils import get_named_type +class 
UnionTypeRegistry(ABC): + """An abstract class that is used to provide union type in runtime""" + + @classmethod + def types(cls): + raise NotImplementedError("Types method is not implemented") + + @cache_instances class Union: _register = {} - def __init__(self, name, types=[], resolver=lambda *_, **__: None): + def __init__(self, name, types=[], type_registry=None, resolver=lambda *_, **__: None): self.name = name self.types = types + self.type_registry = type_registry self.resolver = resolver Union._register[name] = self def gql(self, *args, **kwargs): - return f"union {self.name} = {'|'.join([get_named_type(t).name for t in self.types])}" + types = self.type_registry.types() if self.type_registry else self.types + return f"union {self.name} = {'|'.join([get_named_type(t).name for t in types])}" if __name__ == '__main__': From 3daf2aab7da67320bab4474de2119a93c46840f8 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 11:42:51 +0200 Subject: [PATCH 25/32] Changed Feed type registration mechanism Moved FeedRegistry to gql since it's more appropriate place for this Started using registry to provide types Renaming and small fixes --- .../feed/services => api/Objects/Feed}/registry.py | 14 ++++++++++---- backend/dataall/api/Objects/Feed/resolvers.py | 6 +++--- backend/dataall/api/Objects/Feed/schema.py | 11 ++--------- backend/dataall/core/feed/__init__.py | 1 - backend/dataall/core/feed/services/__init__.py | 1 - backend/dataall/modules/datasets/__init__.py | 2 +- tests/modules/datasets/test_dataset_feed.py | 6 +++--- 7 files changed, 19 insertions(+), 22 deletions(-) rename backend/dataall/{core/feed/services => api/Objects/Feed}/registry.py (72%) delete mode 100644 backend/dataall/core/feed/__init__.py delete mode 100644 backend/dataall/core/feed/services/__init__.py diff --git a/backend/dataall/core/feed/services/registry.py b/backend/dataall/api/Objects/Feed/registry.py similarity index 72% rename from 
backend/dataall/core/feed/services/registry.py rename to backend/dataall/api/Objects/Feed/registry.py index 893f37f55..a119529ab 100644 --- a/backend/dataall/core/feed/services/registry.py +++ b/backend/dataall/api/Objects/Feed/registry.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from typing import Type, Dict +from dataall.api import gql +from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models @@ -10,7 +12,7 @@ class FeedDefinition: model: Type[Resource] -class FeedRegistry: +class FeedRegistry(UnionTypeRegistry): """Registers models for different target types""" _DEFINITIONS: Dict[str, FeedDefinition] = {} @@ -19,16 +21,20 @@ def register(cls, definition: FeedDefinition): cls._DEFINITIONS[definition.target_type] = definition @classmethod - def find(cls, target_type: str): - return cls._DEFINITIONS[target_type] + def find_model(cls, target_type: str): + return cls._DEFINITIONS[target_type].model @classmethod - def find_by_model(cls, obj: Resource): + def find_target(cls, obj: Resource): for target_type, definition in cls._DEFINITIONS.items(): if isinstance(obj, definition.model): return target_type return None + @classmethod + def types(cls): + return [gql.Ref(target_type) for target_type in cls._DEFINITIONS.keys()] + FeedRegistry.register(FeedDefinition("Worksheet", models.Worksheet)) FeedRegistry.register(FeedDefinition("DataPipeline", models.DataPipeline)) diff --git a/backend/dataall/api/Objects/Feed/resolvers.py b/backend/dataall/api/Objects/Feed/resolvers.py index 598ec86e1..08de0d6b7 100644 --- a/backend/dataall/api/Objects/Feed/resolvers.py +++ b/backend/dataall/api/Objects/Feed/resolvers.py @@ -2,7 +2,7 @@ from dataall.api.context import Context from dataall.db import paginate, models -from dataall.core.feed.services.registry import FeedRegistry +from dataall.api.Objects.Feed.registry import FeedRegistry class Feed: @@ -20,14 +20,14 @@ def targetType(self): def resolve_feed_target_type(obj, *_): - 
return FeedRegistry.find_by_model(obj) + return FeedRegistry.find_target(obj) def resolve_target(context: Context, source: Feed, **kwargs): if not source: return None with context.engine.scoped_session() as session: - model = FeedRegistry.find(source.targetType) + model = FeedRegistry.find_model(source.targetType) target = session.query(model).get(source.targetUri) return target diff --git a/backend/dataall/api/Objects/Feed/schema.py b/backend/dataall/api/Objects/Feed/schema.py index d58918716..42fea86ad 100644 --- a/backend/dataall/api/Objects/Feed/schema.py +++ b/backend/dataall/api/Objects/Feed/schema.py @@ -1,18 +1,11 @@ from ... import gql from .resolvers import * +from dataall.api.Objects.Feed.registry import FeedRegistry FeedTarget = gql.Union( name='FeedTarget', - types=[ - gql.Ref('Dataset'), - gql.Ref('DatasetTable'), - gql.Ref('DatasetTableColumn'), - gql.Ref('DatasetStorageLocation'), - gql.Ref('DataPipeline'), - gql.Ref('Worksheet'), - gql.Ref('Dashboard'), - ], + type_registry=FeedRegistry, resolver=resolve_feed_target_type, ) diff --git a/backend/dataall/core/feed/__init__.py b/backend/dataall/core/feed/__init__.py deleted file mode 100644 index 39f751553..000000000 --- a/backend/dataall/core/feed/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Contains logic related to feeds""" diff --git a/backend/dataall/core/feed/services/__init__.py b/backend/dataall/core/feed/services/__init__.py deleted file mode 100644 index 5b130b24b..000000000 --- a/backend/dataall/core/feed/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Service layer of feeds""" diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 5b7224a48..6ba4a24e2 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -2,7 +2,7 @@ import logging from typing import List -from dataall.core.feed.services.registry import FeedRegistry, FeedDefinition +from dataall.api.Objects.Feed.registry 
import FeedRegistry, FeedDefinition from dataall.core.glossary.services.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode diff --git a/tests/modules/datasets/test_dataset_feed.py b/tests/modules/datasets/test_dataset_feed.py index 52c97e990..db5ff43e2 100644 --- a/tests/modules/datasets/test_dataset_feed.py +++ b/tests/modules/datasets/test_dataset_feed.py @@ -1,11 +1,11 @@ -from dataall.core.feed.services.registry import FeedRegistry +from dataall.api.Objects.Feed.registry import FeedRegistry from dataall.modules.datasets.db.table_column_model import DatasetTableColumn def test_dataset_registered(): - model = FeedRegistry.find("DatasetTableColumn") + model = FeedRegistry.find_model("DatasetTableColumn") assert model == DatasetTableColumn model = DatasetTableColumn() - assert "DatasetTableColumn" == FeedRegistry.find_by_model(model) + assert "DatasetTableColumn" == FeedRegistry.find_target(model) From db3bfd3f51c0b7ea2ef39ab62f9176b99e5ebeb5 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 11:43:56 +0200 Subject: [PATCH 26/32] Added TODO for future refactoring Solve circular dependecy for redshift. It should go away after the migration of redshift --- backend/dataall/aws/handlers/redshift.py | 1 + backend/dataall/db/api/redshift_cluster.py | 7 +++++-- deploy/stacks/container.py | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backend/dataall/aws/handlers/redshift.py b/backend/dataall/aws/handlers/redshift.py index 4d2591520..c186d5df7 100644 --- a/backend/dataall/aws/handlers/redshift.py +++ b/backend/dataall/aws/handlers/redshift.py @@ -9,6 +9,7 @@ from .sts import SessionHelper from ... 
import db from ...db import models +# TODO should be migrated in the redshift module from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) diff --git a/backend/dataall/db/api/redshift_cluster.py b/backend/dataall/db/api/redshift_cluster.py index 4167a555a..8ca3088bf 100644 --- a/backend/dataall/db/api/redshift_cluster.py +++ b/backend/dataall/db/api/redshift_cluster.py @@ -9,7 +9,6 @@ NamingConventionPattern, ) from ...utils.slugify import slugify -from dataall.modules.datasets.services.dataset_table import DatasetTableService log = logging.getLogger(__name__) @@ -495,7 +494,11 @@ def enable_copy_table( session, username, groups, uri, data=None, check_perm=True ) -> models.RedshiftClusterDatasetTable: cluster = RedshiftCluster.get_redshift_cluster_by_uri(session, uri) - table = DatasetTableService.get_dataset_table_by_uri(session, data['tableUri']) + + # TODO should be migrated in the redshift module + table = dataall.modules.datasets.services.dataset_table.DatasetTableService.get_dataset_table_by_uri( + session, data['tableUri'] + ) table = models.RedshiftClusterDatasetTable( clusterUri=uri, datasetUri=data['datasetUri'], diff --git a/deploy/stacks/container.py b/deploy/stacks/container.py index d3c761519..aa7be04df 100644 --- a/deploy/stacks/container.py +++ b/deploy/stacks/container.py @@ -92,6 +92,7 @@ def __init__( envname, resource_prefix, vpc, vpc_endpoints_sg ) + # TODO introduce the ability to change the deployment depending on config.json file sync_tables_task = self.set_scheduled_task( cluster=cluster, command=['python3.8', '-m', 'dataall.modules.datasets.tasks.tables_syncer'], @@ -174,6 +175,7 @@ def __init__( update_bucket_policies_task.task.security_groups ) + # TODO introduce the ability to change the deployment depending on config.json file subscriptions_task = self.set_scheduled_task( cluster=cluster, command=[ From 13b6e92918b453c3fe6ad20be7ec2d8e74194315 Mon Sep 17 00:00:00 2001 From: 
Nikita Podshivalov Date: Thu, 13 Apr 2023 15:11:50 +0200 Subject: [PATCH 27/32] Added GlossaryRegistry for Union scheme --- .../dataall/api/Objects/Glossary/__init__.py | 3 ++- .../Objects/Glossary}/registry.py | 10 ++++++- .../dataall/api/Objects/Glossary/resolvers.py | 27 ++++++++++--------- .../dataall/api/Objects/Glossary/schema.py | 9 ++----- backend/dataall/core/glossary/__init__.py | 1 - .../core/glossary/services/__init__.py | 1 - backend/dataall/db/api/glossary.py | 25 ++++++----------- backend/dataall/modules/datasets/__init__.py | 2 +- tests/api/test_glossary.py | 20 +++++++------- 9 files changed, 46 insertions(+), 52 deletions(-) rename backend/dataall/{core/glossary/services => api/Objects/Glossary}/registry.py (80%) delete mode 100644 backend/dataall/core/glossary/__init__.py delete mode 100644 backend/dataall/core/glossary/services/__init__.py diff --git a/backend/dataall/api/Objects/Glossary/__init__.py b/backend/dataall/api/Objects/Glossary/__init__.py index 0c4ec6166..30e86e17e 100644 --- a/backend/dataall/api/Objects/Glossary/__init__.py +++ b/backend/dataall/api/Objects/Glossary/__init__.py @@ -4,6 +4,7 @@ mutations, resolvers, schema, + registry, ) -__all__ = ['resolvers', 'schema', 'input_types', 'queries', 'mutations'] +__all__ = ['registry', 'resolvers', 'schema', 'input_types', 'queries', 'mutations'] diff --git a/backend/dataall/core/glossary/services/registry.py b/backend/dataall/api/Objects/Glossary/registry.py similarity index 80% rename from backend/dataall/core/glossary/services/registry.py rename to backend/dataall/api/Objects/Glossary/registry.py index ee3f10d41..375f470e2 100644 --- a/backend/dataall/core/glossary/services/registry.py +++ b/backend/dataall/api/Objects/Glossary/registry.py @@ -1,6 +1,8 @@ from dataclasses import dataclass from typing import Type, Dict, Optional, Protocol, Union +from dataall.api import gql +from dataall.api.gql.graphql_union_type import UnionTypeRegistry from dataall.db import Resource, models @@ 
-11,6 +13,7 @@ def uri(self): @dataclass class GlossaryDefinition: + """Glossary's definition used for registration references of other modules""" target_type: str object_type: str model: Union[Type[Resource], Identifiable] # should be an intersection, but python typing doesn't have one yet @@ -19,7 +22,8 @@ def target_uri(self): return self.model.uri() -class GlossaryRegistry: +class GlossaryRegistry(UnionTypeRegistry): + """Registry of glossary definition and API to retrieve data""" _DEFINITIONS: Dict[str, GlossaryDefinition] = {} @classmethod @@ -42,6 +46,10 @@ def find_object_type(cls, model: Resource) -> Optional[str]: def definitions(cls): return cls._DEFINITIONS.values() + @classmethod + def types(cls): + return [gql.Ref(definition.object_type) for definition in cls._DEFINITIONS.values()] + GlossaryRegistry.register(GlossaryDefinition("DatasetTable", "DatasetTable", models.DatasetTable)) GlossaryRegistry.register(GlossaryDefinition("Folder", "DatasetStorageLocation", models.DatasetStorageLocation)) diff --git a/backend/dataall/api/Objects/Glossary/resolvers.py b/backend/dataall/api/Objects/Glossary/resolvers.py index ae8501993..15e77327f 100644 --- a/backend/dataall/api/Objects/Glossary/resolvers.py +++ b/backend/dataall/api/Objects/Glossary/resolvers.py @@ -2,6 +2,7 @@ from sqlalchemy import and_, or_, asc +from dataall.api.Objects.Glossary.registry import GlossaryRegistry from .... 
import db from ....api.context import Context from ....db import paginate, exceptions, models @@ -12,8 +13,6 @@ GlossaryRole ) -from dataall.core.glossary.services.registry import GlossaryRegistry - def resolve_glossary_node(obj: models.GlossaryNode, *_): if obj.nodeType == 'G': @@ -273,8 +272,6 @@ def request_link( with context.engine.scoped_session() as session: return db.api.Glossary.link_term( session=session, - username=context.username, - groups=context.groups, uri=nodeUri, data={ 'targetUri': targetUri, @@ -282,7 +279,7 @@ def request_link( 'approvedByOwner': True, 'approvedBySteward': False, }, - check_perm=True, + target_model=_target_model(targetType), ) @@ -296,8 +293,6 @@ def link_term( with context.engine.scoped_session() as session: return db.api.Glossary.link_term( session=session, - username=context.username, - groups=context.groups, uri=nodeUri, data={ 'targetUri': targetUri, @@ -305,7 +300,7 @@ def link_term( 'approvedByOwner': True, 'approvedBySteward': True, }, - check_perm=True, + target_model=_target_model(targetType), ) @@ -329,7 +324,7 @@ def target_union_resolver(obj, *_): def resolve_link_target(context, source, **kwargs): with context.engine.scoped_session() as session: - model = GlossaryRegistry.find_model(source.targetUri) + model = GlossaryRegistry.find_model(source.targetType) target = session.query(model).get(source.targetUri) return target @@ -342,11 +337,8 @@ def resolve_term_associations( with context.engine.scoped_session() as session: return db.api.Glossary.list_term_associations( session=session, - username=context.username, - groups=context.groups, - uri=None, data={'source': source, 'filter': filter}, - check_perm=True, + target_model_definitions=GlossaryRegistry.definitions() ) @@ -477,3 +469,12 @@ def reindex(context, linkUri): upsert_folder(session=session, es=context.es, locationUri=link.targetUri) elif isinstance(target, models.Dashboard): upsert_dashboard(session=session, es=context.es, dashboardUri=link.targetUri) + + 
+def _target_model(target_type: str): + target_model = GlossaryRegistry.find_model(target_type) + if not target_model: + raise exceptions.InvalidInput( + 'NodeType', 'term.nodeType', 'association target type is invalid' + ) + return target_model diff --git a/backend/dataall/api/Objects/Glossary/schema.py b/backend/dataall/api/Objects/Glossary/schema.py index 36fd1b758..9b71ae4b1 100644 --- a/backend/dataall/api/Objects/Glossary/schema.py +++ b/backend/dataall/api/Objects/Glossary/schema.py @@ -1,6 +1,7 @@ from ... import gql from .resolvers import * from ...constants import GlossaryRole +from dataall.api.Objects.Glossary.registry import GlossaryRegistry GlossaryNode = gql.Union( name='GlossaryNode', @@ -246,13 +247,7 @@ GlossaryTermLinkTarget = gql.Union( name='GlossaryTermLinkTarget', - types=[ - gql.Ref('Dataset'), - gql.Ref('DatasetTable'), - gql.Ref('DatasetStorageLocation'), - gql.Ref('DatasetTableColumn'), - gql.Ref('Dashboard'), - ], + type_registry=GlossaryRegistry, resolver=target_union_resolver, ) diff --git a/backend/dataall/core/glossary/__init__.py b/backend/dataall/core/glossary/__init__.py deleted file mode 100644 index aa81c1e26..000000000 --- a/backend/dataall/core/glossary/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Contains code related to glossaries""" diff --git a/backend/dataall/core/glossary/services/__init__.py b/backend/dataall/core/glossary/services/__init__.py deleted file mode 100644 index 9ed65d261..000000000 --- a/backend/dataall/core/glossary/services/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Service layer of glossaries""" diff --git a/backend/dataall/db/api/glossary.py b/backend/dataall/db/api/glossary.py index 96c340f62..2dc97c62c 100644 --- a/backend/dataall/db/api/glossary.py +++ b/backend/dataall/db/api/glossary.py @@ -5,12 +5,11 @@ from sqlalchemy.orm import with_expression, aliased from .. 
import models, exceptions, permissions, paginate, Resource -from .permission_checker import ( - has_tenant_perm, -) +from .permission_checker import has_tenant_perm from ..models.Glossary import GlossaryNodeStatus -from dataall.core.glossary.services.registry import GlossaryRegistry from ..paginator import Page +from dataall.core.permission_checker import has_tenant_permission +from dataall.core.context import get_context logger = logging.getLogger(__name__) @@ -113,8 +112,8 @@ def update_node(session, username, groups, uri, data=None, check_perm=None): return node @staticmethod - @has_tenant_perm(permissions.MANAGE_GLOSSARIES) - def link_term(session, username, groups, uri, data=None, check_perm=None): + @has_tenant_permission(permissions.MANAGE_GLOSSARIES) + def link_term(session, uri, target_model: Resource, data): term: models.GlossaryNode = session.query(models.GlossaryNode).get(uri) if not term: raise exceptions.ObjectNotFound('Node', uri) @@ -128,18 +127,12 @@ def link_term(session, username, groups, uri, data=None, check_perm=None): target_uri: str = data['targetUri'] target_type: str = data['targetType'] - target_model: Resource = GlossaryRegistry.find_model(target_type) - if not target_model: - raise exceptions.InvalidInput( - 'NodeType', 'term.nodeType', 'association target type is invalid' - ) - target = session.query(target_model).get(target_uri) if not target: raise exceptions.ObjectNotFound('Association target', uri) link = models.TermLink( - owner=username, + owner=get_context().username, approvedByOwner=data.get('approvedByOwner', True), approvedBySteward=data.get('approvedBySteward', True), nodeUri=uri, @@ -335,14 +328,12 @@ def list_node_children(session, source, filter): ).to_dict() @staticmethod - def list_term_associations( - session, username, groups, uri, data=None, check_perm=None - ): + def list_term_associations(session, target_model_definitions, data=None): source = data['source'] filter = data['filter'] query = None - for definition in 
GlossaryRegistry.definitions(): + for definition in target_model_definitions: model = definition.model subquery = session.query( definition.target_uri().label('targetUri'), diff --git a/backend/dataall/modules/datasets/__init__.py b/backend/dataall/modules/datasets/__init__.py index 6ba4a24e2..de1963bd2 100644 --- a/backend/dataall/modules/datasets/__init__.py +++ b/backend/dataall/modules/datasets/__init__.py @@ -3,7 +3,7 @@ from typing import List from dataall.api.Objects.Feed.registry import FeedRegistry, FeedDefinition -from dataall.core.glossary.services.registry import GlossaryRegistry, GlossaryDefinition +from dataall.api.Objects.Glossary.registry import GlossaryRegistry, GlossaryDefinition from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.loader import ModuleInterface, ImportMode diff --git a/tests/api/test_glossary.py b/tests/api/test_glossary.py index 8276dca8c..987ccc1a8 100644 --- a/tests/api/test_glossary.py +++ b/tests/api/test_glossary.py @@ -1,3 +1,4 @@ +from datetime import datetime from typing import List from dataall.db import models from dataall.modules.datasets.db.table_column_model import DatasetTableColumn @@ -197,7 +198,6 @@ def test_list_glossaries(client): } """ ) - print(response) assert response.data.listGlossaries.count == 1 assert response.data.listGlossaries.nodes[0].stats.categories == 2 @@ -246,7 +246,6 @@ def test_hierarchical_search(client): } """ ) - print(response) assert response.data.searchGlossary.count == 4 @@ -263,7 +262,6 @@ def test_get_glossary(client, g1): """, nodeUri=g1.nodeUri, ) - print(r) assert r.data.getGlossary.nodeUri == g1.nodeUri assert r.data.getGlossary.label == g1.label assert r.data.getGlossary.readme == g1.readme @@ -301,7 +299,6 @@ def test_get_term(client, t1): """, nodeUri=t1.nodeUri, ) - print(r) assert r.data.getTerm.nodeUri == t1.nodeUri assert r.data.getTerm.label == t1.label assert r.data.getTerm.readme == t1.readme @@ -552,7 +549,7 @@ def 
test_link_term(client, t1, _columns, group): print(r) -def test_get_term_associations(t1, client): +def test_get_term_associations(t1, db, client): r = client.query( """ query GetTerm($nodeUri:String!){ @@ -579,10 +576,13 @@ def test_get_term_associations(t1, client): nodeUri=t1.nodeUri, username='alice', ) - print(r) + assert r.data.getTerm.nodeUri == t1.nodeUri + assert r.data.getTerm.label == t1.label + assert r.data.getTerm.readme == t1.readme -def test_delete_category(client, c1, group): +def test_delete_category(client, db, c1, group): + now = datetime.now() r = client.query( """ mutation DeleteCategory( @@ -597,7 +597,9 @@ def test_delete_category(client, c1, group): username='alice', groups=[group.name], ) - print(r) + with db.scoped_session() as session: + node = session.query(models.GlossaryNode).get(c1.nodeUri) + assert node.deleted >= now def test_list_glossaries_after_delete(client): @@ -634,7 +636,6 @@ def test_list_glossaries_after_delete(client): } """ ) - print(response) assert response.data.listGlossaries.count == 1 assert response.data.listGlossaries.nodes[0].stats.categories == 0 @@ -683,5 +684,4 @@ def test_hierarchical_search_after_delete(client): } """ ) - print(response) assert response.data.searchGlossary.count == 1 From 144dfea5a1026d665eda7e6384adc5781297c5ba Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 15:15:08 +0200 Subject: [PATCH 28/32] Changed import in redshift module --- backend/dataall/db/api/redshift_cluster.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/backend/dataall/db/api/redshift_cluster.py b/backend/dataall/db/api/redshift_cluster.py index 8ca3088bf..31b795225 100644 --- a/backend/dataall/db/api/redshift_cluster.py +++ b/backend/dataall/db/api/redshift_cluster.py @@ -495,8 +495,9 @@ def enable_copy_table( ) -> models.RedshiftClusterDatasetTable: cluster = RedshiftCluster.get_redshift_cluster_by_uri(session, uri) - # TODO should be migrated in the redshift module - 
table = dataall.modules.datasets.services.dataset_table.DatasetTableService.get_dataset_table_by_uri( + # TODO this dirty hack should be removed in the redshift module or after pipeline migration (circular import) + from dataall.modules.datasets.services.dataset_table import DatasetTableService + table = DatasetTableService.get_dataset_table_by_uri( session, data['tableUri'] ) table = models.RedshiftClusterDatasetTable( From d43b9b31b661287e303cfad8fc958c7f511277d2 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 15:18:04 +0200 Subject: [PATCH 29/32] No need for Utils yet --- backend/dataall/core/utils/__init__.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 backend/dataall/core/utils/__init__.py diff --git a/backend/dataall/core/utils/__init__.py b/backend/dataall/core/utils/__init__.py deleted file mode 100644 index 02ed9cfb4..000000000 --- a/backend/dataall/core/utils/__init__.py +++ /dev/null @@ -1 +0,0 @@ -"""Utility functions and classes""" From 39b244c9b2fc841405e53088a48f8c564850b505 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 13 Apr 2023 15:22:39 +0200 Subject: [PATCH 30/32] Fixed linting --- backend/dataall/db/models/Dataset.py | 1 - .../modules/datasets/handlers/__init__.py | 1 - .../datasets/handlers/glue_column_handler.py | 19 +++++++++---------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/backend/dataall/db/models/Dataset.py b/backend/dataall/db/models/Dataset.py index 451c7da7c..fd65387b7 100644 --- a/backend/dataall/db/models/Dataset.py +++ b/backend/dataall/db/models/Dataset.py @@ -62,4 +62,3 @@ class Dataset(Resource, Base): def uri(self): return self.datasetUri - diff --git a/backend/dataall/modules/datasets/handlers/__init__.py b/backend/dataall/modules/datasets/handlers/__init__.py index 19bd47297..a5d506712 100644 --- a/backend/dataall/modules/datasets/handlers/__init__.py +++ b/backend/dataall/modules/datasets/handlers/__init__.py @@ -8,4 +8,3 @@ ) __all__ = 
["glue_column_handler", "glue_table_handler"] - diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py index 02003eea2..329b702b7 100644 --- a/backend/dataall/modules/datasets/handlers/glue_column_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -67,16 +67,15 @@ def update_table_columns(engine, task: models.Task): updated_table = { k: v for k, v in original_table['Table'].items() - if k - not in [ - 'CatalogId', - 'VersionId', - 'DatabaseName', - 'CreateTime', - 'UpdateTime', - 'CreatedBy', - 'IsRegisteredWithLakeFormation', - ] + if k not in [ + 'CatalogId', + 'VersionId', + 'DatabaseName', + 'CreateTime', + 'UpdateTime', + 'CreatedBy', + 'IsRegisteredWithLakeFormation', + ] } all_columns = updated_table.get('StorageDescriptor', {}).get( 'Columns', [] From ac71f595e576b3bd5a440248f77096d04bec3095 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 13:23:29 +0200 Subject: [PATCH 31/32] Renamed method --- backend/dataall/modules/datasets/handlers/glue_table_handler.py | 2 +- backend/dataall/modules/datasets/services/dataset_table.py | 2 +- backend/dataall/modules/datasets/tasks/tables_syncer.py | 2 +- tests/api/test_dataset_table.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/backend/dataall/modules/datasets/handlers/glue_table_handler.py b/backend/dataall/modules/datasets/handlers/glue_table_handler.py index 9bb50c501..272641e4e 100644 --- a/backend/dataall/modules/datasets/handlers/glue_table_handler.py +++ b/backend/dataall/modules/datasets/handlers/glue_table_handler.py @@ -26,5 +26,5 @@ def list_tables(engine, task: models.Task): tables = Glue.list_glue_database_tables( account_id, dataset.GlueDatabaseName, region ) - DatasetTableService.sync(session, dataset.datasetUri, glue_tables=tables) + DatasetTableService.sync_existing_tables(session, dataset.datasetUri, glue_tables=tables) return 
tables diff --git a/backend/dataall/modules/datasets/services/dataset_table.py b/backend/dataall/modules/datasets/services/dataset_table.py index 873cbe01e..23b510083 100644 --- a/backend/dataall/modules/datasets/services/dataset_table.py +++ b/backend/dataall/modules/datasets/services/dataset_table.py @@ -224,7 +224,7 @@ def get_dataset_table_by_uri(session, table_uri): return table @staticmethod - def sync(session, datasetUri, glue_tables=None): + def sync_existing_tables(session, datasetUri, glue_tables=None): dataset: Dataset = session.query(Dataset).get(datasetUri) if dataset: diff --git a/backend/dataall/modules/datasets/tasks/tables_syncer.py b/backend/dataall/modules/datasets/tasks/tables_syncer.py index 27a870d60..c7a57aeef 100644 --- a/backend/dataall/modules/datasets/tasks/tables_syncer.py +++ b/backend/dataall/modules/datasets/tasks/tables_syncer.py @@ -62,7 +62,7 @@ def sync_tables(engine, es=None): f'Found {len(tables)} tables on Glue database {dataset.GlueDatabaseName}' ) - DatasetTableService.sync( + DatasetTableService.sync_existing_tables( session, dataset.datasetUri, glue_tables=tables ) diff --git a/tests/api/test_dataset_table.py b/tests/api/test_dataset_table.py index a2fcb2add..c285aa5f7 100644 --- a/tests/api/test_dataset_table.py +++ b/tests/api/test_dataset_table.py @@ -291,7 +291,7 @@ def test_sync_tables_and_columns(client, table, dataset1, db): }, ] - assert DatasetTableService.sync(session, dataset1.datasetUri, glue_tables) + assert DatasetTableService.sync_existing_tables(session, dataset1.datasetUri, glue_tables) new_table: dataall.db.models.DatasetTable = ( session.query(dataall.db.models.DatasetTable) .filter(dataall.db.models.DatasetTable.name == 'new_table') From ec3228f0f0b152544d153a2da256a44076476f09 Mon Sep 17 00:00:00 2001 From: Nikita Podshivalov Date: Thu, 20 Apr 2023 17:58:54 +0200 Subject: [PATCH 32/32] Added AWS for glue and lake formation clients --- .../dataall/modules/datasets/aws/__init__.py | 0 
.../modules/datasets/aws/glue_table_client.py | 47 +++++++ .../modules/datasets/aws/lf_table_client.py | 53 +++++++ .../datasets/handlers/glue_column_handler.py | 131 +++++------------- backend/dataall/modules/loader.py | 4 +- 5 files changed, 133 insertions(+), 102 deletions(-) create mode 100644 backend/dataall/modules/datasets/aws/__init__.py create mode 100644 backend/dataall/modules/datasets/aws/glue_table_client.py create mode 100644 backend/dataall/modules/datasets/aws/lf_table_client.py diff --git a/backend/dataall/modules/datasets/aws/__init__.py b/backend/dataall/modules/datasets/aws/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/backend/dataall/modules/datasets/aws/glue_table_client.py b/backend/dataall/modules/datasets/aws/glue_table_client.py new file mode 100644 index 000000000..78f3e4485 --- /dev/null +++ b/backend/dataall/modules/datasets/aws/glue_table_client.py @@ -0,0 +1,47 @@ +import logging + +from botocore.exceptions import ClientError + +from dataall.db.models import DatasetTable + +log = logging.getLogger(__name__) + + +class GlueTableClient: + """Makes requests to AWS Glue API""" + def __init__(self, aws_session, table: DatasetTable): + self._client = aws_session.client('glue', region_name=table.region) + self._table = table + + def get_table(self): + dataset_table = self._table + try: + glue_table = self._client.get_table( + CatalogId=dataset_table.AWSAccountId, + DatabaseName=dataset_table.GlueDatabaseName, + Name=dataset_table.name, + ) + return glue_table + except ClientError as e: + log.error( + f'Failed to get table aws://{dataset_table.AWSAccountId}' + f'//{dataset_table.GlueDatabaseName}' + f'//{dataset_table.name} due to: ' + f'{e}' + ) + return {} + + def update_table_for_column(self, column_name, table_input): + try: + response = self._client.update_table( + DatabaseName=self._table.name, + TableInput=table_input, + ) + log.info( + f'Column {column_name} updated successfully: {response}' + ) + except 
ClientError as e: + log.error( + f'Failed to update table column {column_name} description: {e}' + ) + raise e diff --git a/backend/dataall/modules/datasets/aws/lf_table_client.py b/backend/dataall/modules/datasets/aws/lf_table_client.py new file mode 100644 index 000000000..f978e0e73 --- /dev/null +++ b/backend/dataall/modules/datasets/aws/lf_table_client.py @@ -0,0 +1,53 @@ +import logging +from botocore.exceptions import ClientError + +from dataall.aws.handlers.sts import SessionHelper +from dataall.db.models import DatasetTable + +log = logging.getLogger(__name__) + + +class LakeFormationTableClient: + """Requests to AWS LakeFormation""" + + def __init__(self, aws_session, table: DatasetTable): + self._client = aws_session.client('lakeformation', region_name=table.reg) + self._table = table + + def grant_pivot_role_all_table_permissions(self): + """ + Pivot role needs to have all permissions + for tables managed inside dataall + :param aws_session: + :param table: + :return: + """ + table = self._table + try: + grant_dict = dict( + Principal={ + 'DataLakePrincipalIdentifier': SessionHelper.get_delegation_role_arn( + table.AWSAccountId + ) + }, + Resource={ + 'Table': { + 'DatabaseName': table.GlueDatabaseName, + 'Name': table.name, + } + }, + Permissions=['SELECT', 'ALTER', 'DROP', 'INSERT'], + ) + response = self._client.grant_permissions(**grant_dict) + log.error( + f'Successfully granted pivot role all table ' + f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' + f'access: {response}' + ) + except ClientError as e: + log.error( + f'Failed to grant pivot role all table ' + f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' + f'access: {e}' + ) + raise e diff --git a/backend/dataall/modules/datasets/handlers/glue_column_handler.py b/backend/dataall/modules/datasets/handlers/glue_column_handler.py index 329b702b7..9d97470ff 100644 --- a/backend/dataall/modules/datasets/handlers/glue_column_handler.py +++ 
b/backend/dataall/modules/datasets/handlers/glue_column_handler.py @@ -1,10 +1,10 @@ import logging -from botocore.exceptions import ClientError - from dataall.aws.handlers.sts import SessionHelper from dataall.db import models from dataall.aws.handlers.service_handlers import Worker +from dataall.modules.datasets.aws.glue_table_client import GlueTableClient +from dataall.modules.datasets.aws.lf_table_client import LakeFormationTableClient from dataall.modules.datasets.db.table_column_model import DatasetTableColumn from dataall.modules.datasets.services.dataset_table import DatasetTableService @@ -22,21 +22,8 @@ def get_table_columns(engine, task: models.Task): task.targetUri ) aws = SessionHelper.remote_session(dataset_table.AWSAccountId) - glue_client = aws.client('glue', region_name=dataset_table.region) - glue_table = {} - try: - glue_table = glue_client.get_table( - CatalogId=dataset_table.AWSAccountId, - DatabaseName=dataset_table.GlueDatabaseName, - Name=dataset_table.name, - ) - except glue_client.exceptions.ClientError as e: - log.error( - f'Failed to get table aws://{dataset_table.AWSAccountId}' - f'//{dataset_table.GlueDatabaseName}' - f'//{dataset_table.name} due to: ' - f'{e}' - ) + glue_table = GlueTableClient(aws, dataset_table).get_table() + DatasetTableService.sync_table_columns( session, dataset_table, glue_table['Table'] ) @@ -52,90 +39,34 @@ def update_table_columns(engine, task: models.Task): table: models.DatasetTable = session.query(models.DatasetTable).get( column.tableUri ) - try: - aws_session = SessionHelper.remote_session(table.AWSAccountId) - - DatasetColumnGlueHandler.grant_pivot_role_all_table_permissions(aws_session, table) - glue_client = aws_session.client('glue', region_name=table.region) + aws_session = SessionHelper.remote_session(table.AWSAccountId) - original_table = glue_client.get_table( - CatalogId=table.AWSAccountId, - DatabaseName=table.GlueDatabaseName, - Name=table.name, - ) - updated_table = { - k: v - for k, v in 
original_table['Table'].items() - if k not in [ - 'CatalogId', - 'VersionId', - 'DatabaseName', - 'CreateTime', - 'UpdateTime', - 'CreatedBy', - 'IsRegisteredWithLakeFormation', - ] - } - all_columns = updated_table.get('StorageDescriptor', {}).get( - 'Columns', [] - ) + updated_table.get('PartitionKeys', []) - for col in all_columns: - if col['Name'] == column.name: - col['Comment'] = column.description - log.info( - f'Found column {column.name} adding description {column.description}' - ) - response = glue_client.update_table( - DatabaseName=table.GlueDatabaseName, - TableInput=updated_table, - ) - log.info( - f'Column {column.name} updated successfully: {response}' - ) - return True + LakeFormationTableClient(aws_session, table).grant_pivot_role_all_table_permissions() + glue_client = GlueTableClient(aws_session, table) + original_table = glue_client.get_table() + updated_table = { + k: v + for k, v in original_table['Table'].items() + if k not in [ + 'CatalogId', + 'VersionId', + 'DatabaseName', + 'CreateTime', + 'UpdateTime', + 'CreatedBy', + 'IsRegisteredWithLakeFormation', + ] + } + all_columns = updated_table.get('StorageDescriptor', {}).get( + 'Columns', [] + ) + updated_table.get('PartitionKeys', []) + for col in all_columns: + if col['Name'] == column.name: + col['Comment'] = column.description + log.info( + f'Found column {column.name} adding description {column.description}' + ) - except ClientError as e: - log.error( - f'Failed to update table column {column.name} description: {e}' - ) - raise e + glue_client.update_table_for_column(column.name, updated_table) - @staticmethod - def grant_pivot_role_all_table_permissions(aws_session, table): - """ - Pivot role needs to have all permissions - for tables managed inside dataall - :param aws_session: - :param table: - :return: - """ - try: - lf_client = aws_session.client('lakeformation', region_name=table.region) - grant_dict = dict( - Principal={ - 'DataLakePrincipalIdentifier': 
SessionHelper.get_delegation_role_arn( - table.AWSAccountId - ) - }, - Resource={ - 'Table': { - 'DatabaseName': table.GlueDatabaseName, - 'Name': table.name, - } - }, - Permissions=['SELECT', 'ALTER', 'DROP', 'INSERT'], - ) - response = lf_client.grant_permissions(**grant_dict) - log.error( - f'Successfully granted pivot role all table ' - f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' - f'access: {response}' - ) - except ClientError as e: - log.error( - f'Failed to grant pivot role all table ' - f'aws://{table.AWSAccountId}/{table.GlueDatabaseName}/{table.name} ' - f'access: {e}' - ) - raise e diff --git a/backend/dataall/modules/loader.py b/backend/dataall/modules/loader.py index 9fa3c69bf..e48583c13 100644 --- a/backend/dataall/modules/loader.py +++ b/backend/dataall/modules/loader.py @@ -48,11 +48,11 @@ def load_modules(modes: List[ImportMode]) -> None: log.info("Found %d modules that have been found in the config", len(modules)) for name, props in modules.items(): - active = props["active"] - if "active" not in props: raise ValueError(f"Status is not defined for {name} module") + active = props["active"] + if not active: log.info(f"Module {name} is not active. Skipping...") continue