Generic dataset module and specific s3_datasets module - part 5 (Move DatasetServiceInterface to datasets_base, add property, create first list API for datasets_base) #1281

Merged 36 commits on May 21, 2024.

Commits
035a637  Rename datasets to s3_datasets (dlpzx, May 6, 2024)
aa99257  Rename datasets to s3_datasets (dlpzx, May 6, 2024)
8ea5ca2  Rename datasets to s3_datasets (dlpzx, May 6, 2024)
2ad4a2e  Fix references to config in frontend (dlpzx, May 6, 2024)
59d4c93  Merge branch 'refs/heads/main' into feat/generic-dataset-model-refact… (dlpzx, May 6, 2024)
17fe992  Fix s3_dataset references frontend (dlpzx, May 6, 2024)
2c333a2  Added datasets_base module and dependencies (dlpzx, May 7, 2024)
7d66809  Moved dataset_enums to datasets_base (dlpzx, May 7, 2024)
38a2275  Use S3Dataset instead of Dataset in s3_dataset module (dlpzx, May 7, 2024)
fe71766  Use S3Dataset instead of Dataset in dataset_sharing module (dlpzx, May 7, 2024)
3985167  Use S3Dataset instead of Dataset in tests+some missing in modules (dlpzx, May 7, 2024)
f05a4f6  Use S3Dataset instead of Dataset in migration scripts and init (dlpzx, May 7, 2024)
aacd1f0  Fix foreign key between datasetBase and s3dataset (dlpzx, May 7, 2024)
41db8ed  Fix migration references to Dataset and add new migration script with… (dlpzx, May 7, 2024)
b3849cd  Added first draft of migration scripts (dlpzx, May 7, 2024)
313dd89  Merge remote-tracking branch 'refs/remotes/origin/main' into feat/gen… (dlpzx, May 8, 2024)
7e287d1  Fix details of init files (dlpzx, May 8, 2024)
98660df  Finis migration scripts (dlpzx, May 10, 2024)
c4ec66d  Add datasets_base in config.json (dlpzx, May 13, 2024)
58ea763  Merge remote-tracking branch 'refs/remotes/origin/feat/generic-datase… (dlpzx, May 14, 2024)
449d689  Merge remote-tracking branch 'refs/remotes/origin/main' into feat/gen… (dlpzx, May 15, 2024)
5d472e8  Adapt permission resourceType to DatasetBase (dlpzx, May 15, 2024)
8a573da  Adapt permission resourceType to DatasetBase (dlpzx, May 15, 2024)
b806100  linting (dlpzx, May 15, 2024)
b31922f  Fix issues in foreign keys migration scripts (dlpzx, May 15, 2024)
ffe372e  PR review comments - fix downgrade and add enums to dataset tables (dlpzx, May 16, 2024)
a98632f  Move DatasetLock and Activity to DatasetBaseRepository (dlpzx, May 16, 2024)
9e9415b  Move DatasetLock model (dlpzx, May 16, 2024)
3a6765e  Fixes from PR review: stewards and polymorphic definition with enum (dlpzx, May 17, 2024)
8939faa  Move DatasetServiceInterface to datasets_base, add property, create f… (dlpzx, May 16, 2024)
8f1605e  Adapt initialization of datasetBase module and FE response to listDat… (dlpzx, May 17, 2024)
254b23b  Merge branch 'refs/heads/feat/generic-dataset-model-refactoring-3' in… (dlpzx, May 17, 2024)
8219f22  Merge branch 'refs/heads/feat/generic-dataset-model-refactoring-4' in… (dlpzx, May 17, 2024)
132970b  Make changes to frontend to show generic info (dlpzx, May 17, 2024)
933ce1f  Merge branch 'refs/heads/main' into feat/generic-dataset-model-refact… (dlpzx, May 21, 2024)
f39296f  Better typing hints (dlpzx, May 21, 2024)
backend/dataall/modules/dataset_sharing/__init__.py (+2, -0)

@@ -24,11 +24,13 @@ def __init__(self):
         from dataall.modules.dataset_sharing import api
         from dataall.modules.dataset_sharing.services.managed_share_policy_service import SharePolicyService
         from dataall.modules.s3_datasets.services.dataset_service import DatasetService
+        from dataall.modules.datasets_base.services.dataset_list_service import DatasetListService
         from dataall.modules.dataset_sharing.services.dataset_sharing_service import DatasetSharingService
         from dataall.modules.dataset_sharing.db.share_object_repositories import ShareEnvironmentResource

         EnvironmentResourceManager.register(ShareEnvironmentResource())
         DatasetService.register(DatasetSharingService())
+        DatasetListService.register(DatasetSharingService())
         log.info('API of dataset sharing has been imported')
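The two register calls are the whole integration surface: DatasetService (still in s3_datasets) and the new DatasetListService (in datasets_base) each keep a module-level registry of DatasetServiceInterface implementations and consult every entry at runtime, so optional modules extend dataset behavior without datasets_base importing them. A minimal sketch of how a future module could plug in, assuming the interface requires only the three methods this PR exercises (MyModuleService is illustrative, not part of this PR; the interface contract is sketched after the dataset_sharing_service.py diff below):

from sqlalchemy import false

from dataall.modules.datasets_base.db.dataset_models import DatasetBase
from dataall.modules.datasets_base.services.dataset_list_service import DatasetListService
from dataall.modules.datasets_base.services.dataset_service_interface import DatasetServiceInterface
from dataall.modules.datasets_base.services.datasets_enums import DatasetType


class MyModuleService(DatasetServiceInterface):
    """Hypothetical service, for illustration only."""

    @property
    def dataset_type(self):
        return DatasetType.S3  # declare which dataset type this module serves

    @staticmethod
    def resolve_additional_dataset_user_role(session, uri, username, groups):
        return None  # contribute no extra role

    @staticmethod
    def append_to_list_user_datasets(session, username, groups):
        return session.query(DatasetBase).filter(false())  # contribute no rows


DatasetListService.register(MyModuleService())  # consulted by every listDatasets call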
backend/dataall/modules/dataset_sharing/db/share_object_repositories.py

@@ -22,6 +22,7 @@
 from dataall.modules.dataset_sharing.db.share_object_models import ShareObjectItem, ShareObject
 from dataall.modules.s3_datasets.db.dataset_repositories import DatasetRepository
 from dataall.modules.s3_datasets.db.dataset_models import DatasetStorageLocation, DatasetTable, S3Dataset, DatasetBucket
+from dataall.modules.datasets_base.db.dataset_models import DatasetBase

 logger = logging.getLogger(__name__)

@@ -1002,10 +1003,10 @@ def delete_shares_with_no_shared_items(session, dataset_uri):
     def query_user_shared_datasets(session, username, groups) -> Query:
         share_item_shared_states = ShareItemSM.get_share_item_shared_states()
         query = (
-            session.query(S3Dataset)
+            session.query(DatasetBase)
             .outerjoin(
                 ShareObject,
-                ShareObject.datasetUri == S3Dataset.datasetUri,
+                ShareObject.datasetUri == DatasetBase.datasetUri,
             )
             .outerjoin(ShareObjectItem, ShareObjectItem.shareUri == ShareObject.shareUri)
             .filter(
@@ -1021,7 +1022,7 @@ def query_user_shared_datasets(session, username, groups) -> Query:
                 )
             )
         )
-        return query.distinct(S3Dataset.datasetUri)
+        return query.distinct(DatasetBase.datasetUri)

     @staticmethod
     def find_dataset_shares(session, dataset_uri):
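Switching the share query from S3Dataset to DatasetBase works because the model refactor (commits aacd1f0 and 3a6765e above) makes S3Dataset a subtype of DatasetBase with an enum discriminator. A sketch of the SQLAlchemy joined-table inheritance setup this implies; table names and columns beyond datasetUri/datasetType are assumptions, not taken from this diff:

from sqlalchemy import Column, ForeignKey, String
from sqlalchemy.orm import declarative_base

Base = declarative_base()


class DatasetBase(Base):
    __tablename__ = 'dataset'  # assumed table name
    datasetUri = Column(String, primary_key=True)
    datasetType = Column(String)  # an enum discriminator in the real model
    __mapper_args__ = {'polymorphic_on': datasetType, 'polymorphic_identity': 'DatasetBase'}


class S3Dataset(DatasetBase):
    __tablename__ = 's3_dataset'  # assumed table name
    datasetUri = Column(String, ForeignKey('dataset.datasetUri'), primary_key=True)
    __mapper_args__ = {'polymorphic_identity': 'S3'}


# session.query(DatasetBase) now also returns S3Dataset rows (and any future
# subtype), which is what lets the share query above drop its S3 specificity.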
backend/dataall/modules/dataset_sharing/services/dataset_sharing_service.py

@@ -20,10 +20,9 @@
     DELETE_DATASET_FOLDER,
     CREDENTIALS_DATASET,
 )
-
 from dataall.modules.s3_datasets.db.dataset_models import S3Dataset
-from dataall.modules.datasets_base.services.datasets_enums import DatasetRole
-from dataall.modules.s3_datasets.services.dataset_service import DatasetServiceInterface
+from dataall.modules.datasets_base.services.datasets_enums import DatasetRole, DatasetType
+from dataall.modules.datasets_base.services.dataset_service_interface import DatasetServiceInterface


 import logging
@@ -32,6 +31,10 @@


 class DatasetSharingService(DatasetServiceInterface):
+    @property
+    def dataset_type(self):
+        return DatasetType.S3
+
     @staticmethod
     def resolve_additional_dataset_user_role(session, uri, username, groups):
         """Implemented as part of the DatasetServiceInterface"""
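The interface now lives in datasets_base and gains a dataset_type property, so each implementation declares which dataset type it serves. The interface file itself is not shown in this diff; the contract below is reconstructed from the methods this PR exercises (resolve_additional_dataset_user_role here, append_to_list_user_datasets in DatasetListService) and is a sketch, not a verbatim copy — the real interface may define more hooks:

from abc import ABC, abstractmethod


class DatasetServiceInterface(ABC):  # sketch of dataset_service_interface.py
    @property
    @abstractmethod
    def dataset_type(self):
        """DatasetType value handled by the implementing module."""

    @staticmethod
    @abstractmethod
    def resolve_additional_dataset_user_role(session, uri, username, groups):
        """Return a module-specific DatasetRole value for this dataset, or None."""

    @staticmethod
    @abstractmethod
    def append_to_list_user_datasets(session, username, groups):
        """Return a Query of extra DatasetBase rows visible to this user."""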
backend/dataall/modules/datasets_base/__init__.py (+12, -1)

@@ -6,7 +6,6 @@ class DatasetBaseModuleInterface(ModuleInterface):
     @staticmethod
     def is_supported(modes: Set[ImportMode]) -> bool:
         supported_modes = {
-            ImportMode.API,
             ImportMode.CDK,
             ImportMode.HANDLERS,
             ImportMode.STACK_UPDATER_TASK,
@@ -16,3 +15,15 @@ def is_supported(modes: Set[ImportMode]) -> bool:

     def __init__(self):
         import dataall.modules.datasets_base.services.datasets_enums
+
+
+class DatasetBaseApiModuleInterface(ModuleInterface):
+    """Implements ModuleInterface for the datasets_base GraphQL lambda"""
+
+    @classmethod
+    def is_supported(cls, modes):
+        return ImportMode.API in modes
+
+    def __init__(self):
+        import dataall.modules.datasets_base.api
+        import dataall.modules.datasets_base.services.datasets_enums
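Dropping ImportMode.API from the base interface and adding a dedicated API interface means the enums and models still load everywhere (CDK, handlers, background tasks), while the new GraphQL schema is imported only in the API lambda. A sketch of the check the module loader performs; the ImportMode import path is an assumption:

from dataall.base.loader import ImportMode  # assumed import path
from dataall.modules.datasets_base import (
    DatasetBaseApiModuleInterface,
    DatasetBaseModuleInterface,
)

modes = {ImportMode.API}
print(DatasetBaseModuleInterface.is_supported(modes))     # expected False after this PR
print(DatasetBaseApiModuleInterface.is_supported(modes))  # True: schema gets imported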
backend/dataall/modules/datasets_base/api/__init__.py (new file, +5)

"""The package defines the schema for Dataset_base lists"""

from dataall.modules.datasets_base.api import input_types, queries, types, resolvers

__all__ = ['types', 'input_types', 'queries', 'resolvers']
backend/dataall/modules/datasets_base/api/input_types.py (new file, +27)

from dataall.base.api import gql
from dataall.base.api.constants import SortDirection
from dataall.modules.datasets_base.services.datasets_enums import DatasetSortField


DatasetSortCriteria = gql.InputType(
    name='DatasetSortCriteria',
    arguments=[
        gql.Argument(name='field', type=gql.NonNullableType(DatasetSortField.toGraphQLEnum())),
        gql.Argument(name='direction', type=SortDirection.toGraphQLEnum()),
    ],
)


DatasetFilter = gql.InputType(
    name='DatasetFilter',
    arguments=[
        gql.Argument('term', gql.String),
        gql.Argument('roles', gql.ArrayType(gql.Ref('DatasetRole'))),
        gql.Argument('InProject', gql.String),
        gql.Argument('notInProject', gql.String),
        gql.Argument('displayArchived', gql.Boolean),
        gql.Argument('sort', gql.ArrayType(DatasetSortCriteria)),
        gql.Argument('page', gql.Integer),
        gql.Argument('pageSize', gql.Integer),
    ],
)
backend/dataall/modules/datasets_base/api/queries.py (new file, +13)

from dataall.base.api import gql
from dataall.modules.datasets_base.api.input_types import DatasetFilter
from dataall.modules.datasets_base.api.resolvers import (
    list_all_user_datasets,
)
from dataall.modules.datasets_base.api.types import DatasetBaseSearchResult

listDatasets = gql.QueryField(
    name='listDatasets',
    args=[gql.Argument('filter', DatasetFilter)],
    type=DatasetBaseSearchResult,
    resolver=list_all_user_datasets,
)
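With the field registered, a GraphQL client can page through the unified dataset list. A sketch of a request against this schema; the selection set uses fields defined in types.py below, and the filter values are illustrative:

LIST_DATASETS = """
query listDatasets($filter: DatasetFilter) {
  listDatasets(filter: $filter) {
    count
    page
    pages
    hasNext
    nodes {
      datasetUri
      datasetType
      label
      userRoleForDataset
    }
  }
}
"""
variables = {'filter': {'term': 'sales', 'page': 1, 'pageSize': 10}}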
backend/dataall/modules/datasets_base/api/resolvers.py (new file, +71)

import logging

from dataall.base.api.context import Context
from dataall.core.environment.services.environment_service import EnvironmentService
from dataall.core.organizations.db.organization_repositories import OrganizationRepository
from dataall.core.stacks.services.stack_service import StackService
from dataall.modules.datasets_base.services.dataset_list_service import DatasetListService
from dataall.modules.datasets_base.services.datasets_enums import DatasetRole
from dataall.modules.datasets_base.db.dataset_models import DatasetBase

log = logging.getLogger(__name__)


def list_all_user_datasets(context: Context, source, filter: dict = None):
    if not filter:
        filter = {'page': 1, 'pageSize': 5}
    return DatasetListService.list_all_user_datasets(filter)


def resolve_user_role(context: Context, source: DatasetBase, **kwargs):
    if not source:
        return None
    if source.owner == context.username:
        return DatasetRole.Creator.value
    elif source.SamlAdminGroupName in context.groups:
        return DatasetRole.Admin.value
    elif source.stewards in context.groups:
        return DatasetRole.DataSteward.value
    else:
        with context.engine.scoped_session() as session:
            other_modules_user_role = DatasetListService.get_other_modules_dataset_user_role(
                session, source.datasetUri, context.username, context.groups
            )
            if other_modules_user_role is not None:
                return other_modules_user_role
    return DatasetRole.NoPermission.value


def get_dataset_organization(context, source: DatasetBase, **kwargs):
    if not source:
        return None
    with context.engine.scoped_session() as session:
        return OrganizationRepository.get_organization_by_uri(session, source.organizationUri)


def get_dataset_environment(context, source: DatasetBase, **kwargs):
    if not source:
        return None
    with context.engine.scoped_session() as session:
        return EnvironmentService.get_environment_by_uri(session, source.environmentUri)


def get_dataset_owners_group(context, source: DatasetBase, **kwargs):
    if not source:
        return None
    return source.SamlAdminGroupName


def get_dataset_stewards_group(context, source: DatasetBase, **kwargs):
    if not source:
        return None
    return source.stewards


def resolve_dataset_stack(context: Context, source: DatasetBase, **kwargs):
    if not source:
        return None
    return StackService.get_stack_with_cfn_resources(
        targetUri=source.datasetUri,
        environmentUri=source.environmentUri,
    )
backend/dataall/modules/datasets_base/api/types.py (new file, +79)

from dataall.base.api import gql
from dataall.modules.datasets_base.services.datasets_enums import DatasetRole
from dataall.modules.datasets_base.api.resolvers import (
    get_dataset_environment,
    get_dataset_organization,
    get_dataset_owners_group,
    get_dataset_stewards_group,
    resolve_user_role,
    resolve_dataset_stack,
)
from dataall.core.environment.api.enums import EnvironmentPermission

DatasetBase = gql.ObjectType(
    name='DatasetBase',
    fields=[
        gql.Field(name='datasetUri', type=gql.ID),
        gql.Field(name='datasetType', type=gql.String),
        gql.Field(name='label', type=gql.String),
        gql.Field(name='name', type=gql.String),
        gql.Field(name='description', type=gql.String),
        gql.Field(name='tags', type=gql.ArrayType(gql.String)),
        gql.Field(name='owner', type=gql.String),
        gql.Field(name='created', type=gql.String),
        gql.Field(name='updated', type=gql.String),
        gql.Field(name='admins', type=gql.ArrayType(gql.String)),
        gql.Field(name='AwsAccountId', type=gql.String),
        gql.Field(name='region', type=gql.String),
        gql.Field(name='SamlAdminGroupName', type=gql.String),
        gql.Field(name='businessOwnerEmail', type=gql.String),
        gql.Field(name='businessOwnerDelegationEmails', type=gql.ArrayType(gql.String)),
        gql.Field(name='imported', type=gql.Boolean),
        gql.Field(
            name='environment',
            type=gql.Ref('Environment'),
            resolver=get_dataset_environment,
        ),
        gql.Field(
            name='organization',
            type=gql.Ref('Organization'),
            resolver=get_dataset_organization,
        ),
        gql.Field(
            name='owners',
            type=gql.String,
            resolver=get_dataset_owners_group,
        ),
        gql.Field(
            name='stewards',
            type=gql.String,
            resolver=get_dataset_stewards_group,
        ),
        gql.Field(
            name='userRoleForDataset',
            type=DatasetRole.toGraphQLEnum(),
            resolver=resolve_user_role,
        ),
        gql.Field(name='userRoleInEnvironment', type=EnvironmentPermission.toGraphQLEnum()),
        gql.Field(name='topics', type=gql.ArrayType(gql.Ref('Topic'))),
        gql.Field(name='confidentiality', type=gql.String),
        gql.Field(name='language', type=gql.Ref('Language')),
        gql.Field(name='autoApprovalEnabled', type=gql.Boolean),
        gql.Field(name='stack', type=gql.Ref('Stack'), resolver=resolve_dataset_stack),
    ],
)

DatasetBaseSearchResult = gql.ObjectType(
    name='DatasetBaseSearchResult',
    fields=[
        gql.Field(name='count', type=gql.Integer),
        gql.Field(name='nodes', type=gql.ArrayType(DatasetBase)),
        gql.Field(name='pageSize', type=gql.Integer),
        gql.Field(name='nextPage', type=gql.Integer),
        gql.Field(name='pages', type=gql.Integer),
        gql.Field(name='page', type=gql.Integer),
        gql.Field(name='previousPage', type=gql.Integer),
        gql.Field(name='hasNext', type=gql.Boolean),
        gql.Field(name='hasPrevious', type=gql.Boolean),
    ],
)
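DatasetBaseSearchResult mirrors the dictionary that paginate(...).to_dict() produces in the repository below, so the resolver can return it directly. An illustrative shape, assuming the keys of the to_dict() contract match the field names above (values are made up for the example):

# What list_all_user_datasets hands back to GraphQL (sketch):
{
    'count': 12,
    'page': 1,
    'pages': 3,
    'pageSize': 5,
    'nextPage': 2,
    'previousPage': 1,
    'hasNext': True,
    'hasPrevious': False,
    'nodes': ['<DatasetBase rows>'],
}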
backend/dataall/modules/datasets_base/db/dataset_repositories.py (+41, -0)

@@ -1,4 +1,8 @@
 import logging
+from typing import List
+from sqlalchemy import and_, or_
+from sqlalchemy.orm import Query
+from dataall.base.db import paginate
 from dataall.core.activity.db.activity_models import Activity
 from dataall.modules.datasets_base.db.dataset_models import DatasetBase, DatasetLock

@@ -32,3 +36,40 @@ def update_dataset_activity(session, dataset: DatasetBase, username):
         )
         session.add(activity)
         session.commit()
+
+
+class DatasetListRepository:
+    """DAO layer for Listing Datasets in Environments"""
+
+    @staticmethod
+    def paginated_all_user_datasets(session, username, groups, all_subqueries: List[Query], data=None) -> dict:
+        return paginate(
+            query=DatasetListRepository._query_all_user_datasets(session, username, groups, all_subqueries, data),
+            page=data.get('page', 1),
+            page_size=data.get('pageSize', 10),
+        ).to_dict()
+
+    @staticmethod
+    def _query_all_user_datasets(session, username, groups, all_subqueries: List[Query], filter: dict = None) -> Query:
+        query = session.query(DatasetBase).filter(
+            or_(
+                DatasetBase.owner == username,
+                DatasetBase.SamlAdminGroupName.in_(groups),
+                DatasetBase.stewards.in_(groups),
+            )
+        )
+        if query.first() is not None:
+            all_subqueries.append(query)
+        if len(all_subqueries) == 1:
+            query = all_subqueries[0]
+        elif len(all_subqueries) > 1:
+            query = all_subqueries[0].union(*all_subqueries[1:])
+
+        if filter and filter.get('term'):
+            query = query.filter(
+                or_(
+                    DatasetBase.description.ilike(filter.get('term') + '%%'),
+                    DatasetBase.label.ilike(filter.get('term') + '%%'),
+                )
+            )
+        return query.order_by(DatasetBase.label).distinct(DatasetBase.datasetUri, DatasetBase.label)
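_query_all_user_datasets builds one UNION across direct ownership and every registered module's contribution before filtering and ordering. The effective query, written out as a pseudo-SQL sketch:

# Pseudo-SQL for the composed query (sketch, not generated output):
#   SELECT * FROM dataset
#    WHERE owner = :username
#       OR "SamlAdminGroupName" IN (:groups)
#       OR stewards IN (:groups)
#   UNION
#   <one subquery per registered module, e.g. datasets shared with the user>
#   -- then, if a search term is set:
#   --   WHERE description ILIKE :term || '%' OR label ILIKE :term || '%'
#   ORDER BY label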
backend/dataall/modules/datasets_base/services/dataset_list_service.py (new file, +46)

import logging
from sqlalchemy.orm import Query
from typing import List
from dataall.modules.datasets_base.services.dataset_service_interface import DatasetServiceInterface
from dataall.base.context import get_context
from dataall.modules.datasets_base.db.dataset_repositories import DatasetListRepository

log = logging.getLogger(__name__)


class DatasetListService:
    _interfaces: List[DatasetServiceInterface] = []

    @classmethod
    def register(cls, interface: DatasetServiceInterface):
        cls._interfaces.append(interface)

    @classmethod
    def _list_all_user_interface_datasets(cls, session, username, groups) -> List[Query]:
        """All list_datasets from other modules that need to be appended to the list of datasets"""
        return [
            query
            for interface in cls._interfaces
            for query in [interface.append_to_list_user_datasets(session, username, groups)]
            if query.first() is not None
        ]

    @classmethod
    def get_other_modules_dataset_user_role(cls, session, uri, username, groups) -> str:
        """All other user role types that might come from other modules"""
        for interface in cls._interfaces:
            role = interface.resolve_additional_dataset_user_role(session, uri, username, groups)
            if role is not None:
                return role
        return None

    @staticmethod
    def list_all_user_datasets(data: dict):
        context = get_context()
        with context.db_engine.scoped_session() as session:
            all_subqueries = DatasetListService._list_all_user_interface_datasets(
                session, context.username, context.groups
            )
            return DatasetListRepository.paginated_all_user_datasets(
                session, context.username, context.groups, all_subqueries, data=data
            )
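Putting part 5 together, a listDatasets call flows API resolver → service → repository, consulting every registered module along the way. A sketch of the call sequence, assuming an active request context:

from dataall.modules.datasets_base.services.dataset_list_service import DatasetListService

# 1. resolvers.list_all_user_datasets defaults the filter, then delegates:
result = DatasetListService.list_all_user_datasets({'page': 1, 'pageSize': 5})
# 2. the service collects one subquery per registered DatasetServiceInterface
#    (dataset_sharing contributes datasets shared with the user's groups)
# 3. DatasetListRepository unions them with owned/administered/stewarded
#    datasets, applies the term filter, and paginates
print(result['count'], [node.label for node in result['nodes']])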