# Refactor: uncouple datasets and dataset_sharing modules - part 2 (#1185)
### Feature or Bugfix
- Refactoring

### Detail

Remove and move logic from the `datasets` module to the `dataset_sharing` module. This is
needed as explained in the full PR [AFTER 2.4] Refactor: uncouple datasets
and dataset_sharing modules #1179
- [X] Move the `verifyDatasetShareObjects` mutation to the `dataset_sharing`
module
- [X] Move the dataset_subscription task to `dataset_sharing`
- [X] Move the `listDatasetShareObjects` query to the `dataset_sharing` module
- [X] Remove the unused `shares` field from the Dataset GraphQL type, as it
was not used by the frontend queries listDatasets, listOwnedDatasets,
listDatasetsOwnedByEnvGroup, listDatasetsCreatedInEnvironment and
getDataset
- [X] Move `getSharedDatasetTables` to the `dataset_sharing` module and fix
the reference to DatasetService

I am aware that some of the queries and mutations that this PR moves look a
bit out of place in the dataset_sharing module, but this will be resolved once
data sharing is split into dataset_sharing_base and s3_dataset_sharing.
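
Because the mutation and queries only change backend modules, their GraphQL names and signatures stay the same for API consumers. As a rough, hedged illustration (not part of this PR; the endpoint URL, token and URIs below are placeholders), a client could still call the relocated `verifyDatasetShareObjects` mutation like this:

```python
# Hypothetical client-side call; the endpoint, token and URIs are placeholders.
import json
import urllib.request

GRAPHQL_ENDPOINT = 'https://example.com/graphql/api'  # placeholder, not a real data.all URL
AUTH_TOKEN = '<jwt>'  # placeholder

VERIFY_DATASET_SHARES = """
mutation verifyDatasetShareObjects($input: ShareObjectSelectorInput) {
  verifyDatasetShareObjects(input: $input)
}
"""


def run_graphql(query: str, variables: dict) -> dict:
    """POST a GraphQL document to the API and return the decoded JSON response."""
    payload = json.dumps({'query': query, 'variables': variables}).encode()
    request = urllib.request.Request(
        GRAPHQL_ENDPOINT,
        data=payload,
        headers={'Content-Type': 'application/json', 'Authorization': AUTH_TOKEN},
    )
    with urllib.request.urlopen(request) as response:
        return json.load(response)


if __name__ == '__main__':
    # Re-verify the health of two share objects of a dataset (URIs are made up).
    result = run_graphql(
        VERIFY_DATASET_SHARES,
        {'input': {'datasetUri': 'uri-of-dataset', 'shareUris': ['uri-of-share-1', 'uri-of-share-2']}},
    )
    print(result)
```

The same holds for `listDatasetShareObjects` and `getSharedDatasetTables`: only the resolver wiring moves from `datasets` to `dataset_sharing`.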


### Relates
#1179

### Security
Please answer the questions below briefly where applicable, or write `N/A`. Based on
[OWASP 10](https://owasp.org/Top10/en/).

- Does this PR introduce or modify any input fields or queries - this includes
  fetching data from storage outside the application (e.g. a database, an S3 bucket)?
  - Is the input sanitized?
  - What precautions are you taking before deserializing the data you consume?
  - Is injection prevented by parametrizing queries?
  - Have you ensured no `eval` or similar functions are used?
- Does this PR introduce any functionality or component that requires authorization?
  - How have you ensured it respects the existing AuthN/AuthZ mechanisms?
  - Are you logging failed auth attempts?
- Are you using or adding any cryptographic features?
  - Do you use standard, proven implementations?
  - Are the used keys controlled by the customer? Where are they stored?
- Are you introducing any new policies/roles/users?
  - Have you used the least-privilege principle? How?


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
dlpzx authored Apr 29, 2024
1 parent d7a9518 commit 6386fe1
Showing 23 changed files with 159 additions and 295 deletions.
8 changes: 8 additions & 0 deletions backend/dataall/modules/dataset_sharing/api/mutations.py
@@ -12,6 +12,7 @@
    update_share_reject_purpose,
    update_share_request_purpose,
    verify_items_share_object,
    verify_dataset_share_objects,
)

createShareObject = gql.MutationField(
@@ -117,3 +118,10 @@
    type=gql.Boolean,
    resolver=update_share_request_purpose,
)

verifyDatasetShareObjects = gql.MutationField(
    name='verifyDatasetShareObjects',
    args=[gql.Argument(name='input', type=gql.Ref('ShareObjectSelectorInput'))],
    type=gql.Boolean,
    resolver=verify_dataset_share_objects,
)
23 changes: 23 additions & 0 deletions backend/dataall/modules/dataset_sharing/api/queries.py
@@ -4,6 +4,8 @@
    list_shared_with_environment_data_items,
    list_shares_in_my_inbox,
    list_shares_in_my_outbox,
    list_dataset_share_objects,
    list_shared_tables_by_env_dataset,
)

getShareObject = gql.QueryField(
@@ -38,3 +40,24 @@
    type=gql.Ref('EnvironmentPublishedItemSearchResults'),
    test_scope='Dataset',
)

listShareObjects = gql.QueryField(
    name='listDatasetShareObjects',
    resolver=list_dataset_share_objects,
    args=[
        gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
        gql.Argument(name='environmentUri', type=gql.String),
        gql.Argument(name='page', type=gql.Integer),
    ],
    type=gql.Ref('ShareSearchResult'),
)

getSharedDatasetTables = gql.QueryField(
    name='getSharedDatasetTables',
    args=[
        gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
        gql.Argument(name='envUri', type=gql.NonNullableType(gql.String)),
    ],
    type=gql.ArrayType(gql.Ref('SharedDatasetTableItem')),
    resolver=list_shared_tables_by_env_dataset,
)
29 changes: 29 additions & 0 deletions backend/dataall/modules/dataset_sharing/api/resolvers.py
@@ -9,6 +9,7 @@
from dataall.modules.dataset_sharing.db.share_object_models import ShareObjectItem, ShareObject
from dataall.modules.dataset_sharing.services.share_item_service import ShareItemService
from dataall.modules.dataset_sharing.services.share_object_service import ShareObjectService
from dataall.modules.dataset_sharing.services.dataset_sharing_service import DatasetSharingService
from dataall.modules.dataset_sharing.aws.glue_client import GlueClient
from dataall.modules.datasets_base.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets_base.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset
@@ -37,6 +38,15 @@ def validate_item_selector_input(data):
        if not data.get('itemUris'):
            raise RequiredParameter('itemUris')

    @staticmethod
    def validate_dataset_share_selector_input(data):
        if not data:
            raise RequiredParameter(data)
        if not data.get('datasetUri'):
            raise RequiredParameter('datasetUri')
        if not data.get('shareUris'):
            raise RequiredParameter('shareUris')


def create_share_object(
    context: Context,
@@ -303,3 +313,22 @@ def update_share_reject_purpose(context: Context, source, shareUri: str = None,
        uri=shareUri,
        reject_purpose=rejectPurpose,
    )


def verify_dataset_share_objects(context: Context, source, input):
    RequestValidator.validate_dataset_share_selector_input(input)
    dataset_uri = input.get('datasetUri')
    verify_share_uris = input.get('shareUris')
    return DatasetSharingService.verify_dataset_share_objects(uri=dataset_uri, share_uris=verify_share_uris)


def list_dataset_share_objects(context, source, filter: dict = None):
    if not source:
        return None
    if not filter:
        filter = {'page': 1, 'pageSize': 5}
    return DatasetSharingService.list_dataset_share_objects(source, filter)


def list_shared_tables_by_env_dataset(context: Context, source, datasetUri: str, envUri: str):
    return DatasetSharingService.list_shared_tables_by_env_dataset(datasetUri, envUri)
backend/dataall/modules/dataset_sharing/db/share_object_repositories.py
@@ -1291,3 +1291,40 @@ def count_role_principal_shares(session, principal_id: str, principal_type: Prin
            )
            .count()
        )

    @staticmethod
    def query_dataset_tables_shared_with_env(
        session, environment_uri: str, dataset_uri: str, username: str, groups: [str]
    ):
        """For a given dataset, returns the list of Tables shared with the environment
        This means looking at approved ShareObject items
        for the share object associating the dataset and environment
        """
        share_item_shared_states = ShareItemSM.get_share_item_shared_states()
        env_tables_shared = (
            session.query(DatasetTable)  # all tables
            .join(
                ShareObjectItem,  # found in ShareObjectItem
                ShareObjectItem.itemUri == DatasetTable.tableUri,
            )
            .join(
                ShareObject,  # jump to share object
                ShareObject.shareUri == ShareObjectItem.shareUri,
            )
            .filter(
                and_(
                    ShareObject.datasetUri == dataset_uri,  # for this dataset
                    ShareObject.environmentUri == environment_uri,  # for this environment
                    ShareObjectItem.status.in_(share_item_shared_states),
                    ShareObject.principalType
                    != PrincipalType.ConsumptionRole.value,  # Exclude Consumption roles shares
                    or_(
                        ShareObject.owner == username,
                        ShareObject.principalId.in_(groups),
                    ),
                )
            )
            .all()
        )

        return env_tables_shared
backend/dataall/modules/dataset_sharing/services/dataset_sharing_service.py
@@ -0,0 +1,51 @@
from dataall.core.permissions.services.resource_policy_service import ResourcePolicyService
from dataall.core.permissions.services.tenant_policy_service import TenantPolicyService
from dataall.base.context import get_context
from dataall.modules.dataset_sharing.db.share_object_repositories import (
    ShareObjectRepository,
    ShareItemSM,
)
from dataall.modules.dataset_sharing.services.share_item_service import ShareItemService
from dataall.modules.datasets.services.dataset_permissions import (
    MANAGE_DATASETS,
    UPDATE_DATASET,
)

from dataall.modules.datasets_base.db.dataset_models import Dataset

import logging

log = logging.getLogger(__name__)


class DatasetSharingService:
    @staticmethod
    @TenantPolicyService.has_tenant_permission(MANAGE_DATASETS)
    @ResourcePolicyService.has_resource_permission(UPDATE_DATASET)
    def verify_dataset_share_objects(uri: str, share_uris: list):
        with get_context().db_engine.scoped_session() as session:
            for share_uri in share_uris:
                share = ShareObjectRepository.get_share_by_uri(session, share_uri)
                states = ShareItemSM.get_share_item_revokable_states()
                items = ShareObjectRepository.list_shareable_items(
                    session, share, states, {'pageSize': 1000, 'isShared': True}
                )
                item_uris = [item.shareItemUri for item in items.get('nodes', [])]
                ShareItemService.verify_items_share_object(uri=share_uri, item_uris=item_uris)
        return True

    @staticmethod
    def list_dataset_share_objects(dataset: Dataset, data: dict = None):
        with get_context().db_engine.scoped_session() as session:
            return ShareObjectRepository.paginated_dataset_shares(session=session, uri=dataset.datasetUri, data=data)

    @staticmethod
    def list_shared_tables_by_env_dataset(dataset_uri: str, env_uri: str):
        context = get_context()
        with context.db_engine.scoped_session() as session:
            return [
                {'tableUri': t.tableUri, 'GlueTableName': t.GlueTableName}
                for t in ShareObjectRepository.query_dataset_tables_shared_with_env(
                    session, env_uri, dataset_uri, context.username, context.groups
                )
            ]
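
As a minimal sketch of how the relocated logic could be exercised in isolation (illustrative only, not part of this PR, and assuming the data.all backend package is importable in the test environment), the new DatasetSharingService shown above can be unit-tested with the request context and repository patched out:

```python
# Illustrative unit-test sketch; the mocked values and the test are not part of this PR.
from unittest.mock import MagicMock, patch

from dataall.modules.dataset_sharing.db.share_object_repositories import ShareObjectRepository
from dataall.modules.dataset_sharing.services.dataset_sharing_service import DatasetSharingService


def test_list_shared_tables_by_env_dataset_maps_fields():
    # A fake table row and a fake request context standing in for the real ones.
    fake_table = MagicMock(tableUri='table-1', GlueTableName='glue_table_1')
    fake_context = MagicMock(username='alice', groups=['team-a'])

    with patch(
        'dataall.modules.dataset_sharing.services.dataset_sharing_service.get_context',
        return_value=fake_context,
    ), patch.object(
        ShareObjectRepository,
        'query_dataset_tables_shared_with_env',
        return_value=[fake_table],
    ):
        result = DatasetSharingService.list_shared_tables_by_env_dataset('dataset-1', 'env-1')

    # The service maps ORM rows to plain dicts consumed by the GraphQL layer.
    assert result == [{'tableUri': 'table-1', 'GlueTableName': 'glue_table_1'}]
```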
backend/dataall/modules/dataset_sharing/tasks/dataset_subscription_task.py
@@ -15,7 +15,7 @@
from dataall.modules.datasets.aws.sns_dataset_client import SnsDatasetClient
from dataall.modules.datasets.db.dataset_location_repositories import DatasetLocationRepository
from dataall.modules.datasets.db.dataset_table_repositories import DatasetTableRepository
from dataall.modules.datasets.tasks.subscriptions import poll_queues
from dataall.modules.dataset_sharing.tasks.subscriptions import poll_queues
from dataall.modules.datasets_base.db.dataset_repositories import DatasetRepository
from dataall.modules.datasets_base.db.dataset_models import DatasetStorageLocation, DatasetTable, Dataset

8 changes: 0 additions & 8 deletions backend/dataall/modules/datasets/api/dataset/mutations.py
@@ -11,7 +11,6 @@
    delete_dataset,
    import_dataset,
    start_crawler,
    verify_dataset_share_objects,
)

createDataset = gql.MutationField(
@@ -69,10 +68,3 @@
    resolver=start_crawler,
    type=gql.Ref('GlueCrawler'),
)

verifyDatasetShareObjects = gql.MutationField(
    name='verifyDatasetShareObjects',
    args=[gql.Argument(name='input', type=gql.Ref('ShareObjectSelectorInput'))],
    type=gql.Boolean,
    resolver=verify_dataset_share_objects,
)
12 changes: 0 additions & 12 deletions backend/dataall/modules/datasets/api/dataset/queries.py
@@ -6,7 +6,6 @@
    list_owned_datasets,
    get_dataset_assume_role_url,
    get_file_upload_presigned_url,
    list_dataset_share_objects,
    list_datasets_owned_by_env_group,
    list_datasets_created_in_environment,
)
@@ -57,17 +56,6 @@
    resolver=get_file_upload_presigned_url,
)

listShareObjects = gql.QueryField(
    name='listDatasetShareObjects',
    resolver=list_dataset_share_objects,
    args=[
        gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
        gql.Argument(name='environmentUri', type=gql.String),
        gql.Argument(name='page', type=gql.Integer),
    ],
    type=gql.Ref('ShareSearchResult'),
)

listDatasetsOwnedByEnvGroup = gql.QueryField(
    name='listDatasetsOwnedByEnvGroup',
    type=gql.Ref('DatasetSearchResult'),
24 changes: 0 additions & 24 deletions backend/dataall/modules/datasets/api/dataset/resolvers.py
@@ -131,14 +131,6 @@ def start_crawler(context: Context, source, datasetUri: str, input: dict = None)
    return DatasetService.start_crawler(uri=datasetUri, data=input)


def list_dataset_share_objects(context, source, filter: dict = None):
    if not source:
        return None
    if not filter:
        filter = {'page': 1, 'pageSize': 5}
    return DatasetService.list_dataset_share_objects(source, filter)


@is_feature_enabled('modules.datasets.features.aws_actions')
def generate_dataset_access_token(context, source, datasetUri: str = None):
    return DatasetService.generate_dataset_access_token(uri=datasetUri)
@@ -178,13 +170,6 @@ def list_datasets_owned_by_env_group(
    return DatasetService.list_datasets_owned_by_env_group(environmentUri, groupUri, filter)


def verify_dataset_share_objects(context: Context, source, input):
    RequestValidator.validate_dataset_share_selector_input(input)
    dataset_uri = input.get('datasetUri')
    verify_share_uris = input.get('shareUris')
    return DatasetService.verify_dataset_share_objects(uri=dataset_uri, share_uris=verify_share_uris)


class RequestValidator:
    @staticmethod
    def validate_creation_request(data):
@@ -205,12 +190,3 @@ def validate_import_request(data):
        RequestValidator.validate_creation_request(data)
        if not data.get('bucketName'):
            raise RequiredParameter('bucketName')

    @staticmethod
    def validate_dataset_share_selector_input(data):
        if not data:
            raise RequiredParameter(data)
        if not data.get('datasetUri'):
            raise RequiredParameter('datasetUri')
        if not data.get('shareUris'):
            raise RequiredParameter('shareUris')
15 changes: 0 additions & 15 deletions backend/dataall/modules/datasets/api/dataset/types.py
@@ -9,7 +9,6 @@
    list_locations,
    resolve_user_role,
    get_dataset_statistics,
    list_dataset_share_objects,
    get_dataset_glossary_terms,
    resolve_dataset_stack,
)
@@ -100,20 +99,6 @@
        ),
        gql.Field(name='userRoleInEnvironment', type=EnvironmentPermission.toGraphQLEnum()),
        gql.Field(name='statistics', type=DatasetStatistics, resolver=get_dataset_statistics),
        gql.Field(
            name='shares',
            args=[gql.Argument(name='filter', type=gql.Ref('ShareObjectFilter'))],
            type=gql.Ref('ShareSearchResult'),
            resolver=list_dataset_share_objects,
            test_scope='ShareObject',
            test_cases=[
                'anonymous',
                'businessowner',
                'admins',
                'stewards',
                'unauthorized',
            ],
        ),
        gql.Field(
            name='terms',
            resolver=get_dataset_glossary_terms,
12 changes: 1 addition & 11 deletions backend/dataall/modules/datasets/api/table/queries.py
@@ -1,6 +1,6 @@
from dataall.base.api import gql
from dataall.modules.datasets.api.table.input_types import DatasetTableFilter
from dataall.modules.datasets.api.table.resolvers import get_table, list_shared_tables_by_env_dataset, preview
from dataall.modules.datasets.api.table.resolvers import get_table, preview
from dataall.modules.datasets.api.table.types import (
    DatasetTable,
    DatasetTableSearchResult,
@@ -36,13 +36,3 @@
    resolver=preview,
    type=gql.Ref('QueryPreviewResult'),
)

getSharedDatasetTables = gql.QueryField(
    name='getSharedDatasetTables',
    args=[
        gql.Argument(name='datasetUri', type=gql.NonNullableType(gql.String)),
        gql.Argument(name='envUri', type=gql.NonNullableType(gql.String)),
    ],
    type=gql.ArrayType(gql.Ref('SharedDatasetTableItem')),
    resolver=list_shared_tables_by_env_dataset,
)
4 changes: 0 additions & 4 deletions backend/dataall/modules/datasets/api/table/resolvers.py
@@ -56,7 +56,3 @@ def resolve_glossary_terms(context: Context, source: DatasetTable, **kwargs):
        return None
    with context.engine.scoped_session() as session:
        return GlossaryRepository.get_glossary_terms_links(session, source.tableUri, 'DatasetTable')


def list_shared_tables_by_env_dataset(context: Context, source, datasetUri: str, envUri: str):
    return DatasetTableService.list_shared_tables_by_env_dataset(datasetUri, envUri)
37 changes: 0 additions & 37 deletions backend/dataall/modules/datasets/db/dataset_table_repositories.py
@@ -43,43 +43,6 @@ def create_synced_table(session, dataset: Dataset, table: dict):
    def delete(session, table: DatasetTable):
        session.delete(table)

    @staticmethod
    def query_dataset_tables_shared_with_env(
        session, environment_uri: str, dataset_uri: str, username: str, groups: [str]
    ):
        """For a given dataset, returns the list of Tables shared with the environment
        This means looking at approved ShareObject items
        for the share object associating the dataset and environment
        """
        share_item_shared_states = ShareItemSM.get_share_item_shared_states()
        env_tables_shared = (
            session.query(DatasetTable)  # all tables
            .join(
                ShareObjectItem,  # found in ShareObjectItem
                ShareObjectItem.itemUri == DatasetTable.tableUri,
            )
            .join(
                ShareObject,  # jump to share object
                ShareObject.shareUri == ShareObjectItem.shareUri,
            )
            .filter(
                and_(
                    ShareObject.datasetUri == dataset_uri,  # for this dataset
                    ShareObject.environmentUri == environment_uri,  # for this environment
                    ShareObjectItem.status.in_(share_item_shared_states),
                    ShareObject.principalType
                    != PrincipalType.ConsumptionRole.value,  # Exclude Consumption roles shares
                    or_(
                        ShareObject.owner == username,
                        ShareObject.principalId.in_(groups),
                    ),
                )
            )
            .all()
        )

        return env_tables_shared

    @staticmethod
    def get_dataset_table_by_uri(session, table_uri):
        table: DatasetTable = session.query(DatasetTable).get(table_uri)