From 6abb70a7c3cfb46d741837c8cba77a5635949edd Mon Sep 17 00:00:00 2001
From: dlpzx <71252798+dlpzx@users.noreply.github.com>
Date: Wed, 22 Feb 2023 09:33:48 +0100
Subject: [PATCH] Check existing Lake Formation data lake location and make
 dataset resource names unique (#324)

### Feature or Bugfix
- Feature

### Detail
- When a dataset is imported, the dataset stack checks whether the specified S3 location is already registered with Lake Formation. If it is, data.all does nothing; if it is not, data.all registers it as a Lake Formation data location (a minimal sketch of this check follows the diff below).
- The names of the Glue profiling job, the data quality job, and the crawler now carry the datasetUri as a suffix. This makes the resources easier to track and avoids name clashes if we later implement a "multiple databases mapping to one single bucket" feature.

### Relates
- #321

By submitting this pull request, I confirm that my contribution is made under
the terms of the Apache 2.0 license.
---
 backend/dataall/aws/handlers/lakeformation.py |  5 ++--
 backend/dataall/cdkproxy/stacks/dataset.py    | 26 ++++++++++++-------
 backend/dataall/db/api/dataset.py             | 10 +++----
 tests/cdkproxy/test_dataset_stack.py          |  4 +++
 4 files changed, 29 insertions(+), 16 deletions(-)

diff --git a/backend/dataall/aws/handlers/lakeformation.py b/backend/dataall/aws/handlers/lakeformation.py
index 60bd1b78d..703b117f7 100644
--- a/backend/dataall/aws/handlers/lakeformation.py
+++ b/backend/dataall/aws/handlers/lakeformation.py
@@ -23,14 +23,15 @@ def describe_resource(resource_arn, accountid, region):
         try:
             response = lf_client.describe_resource(ResourceArn=resource_arn)
 
-            log.debug(f'LF data location already registered: {response}')
+            log.info(f'LF data location already registered: {response}')
 
             return response['ResourceInfo']
 
         except ClientError as e:
-            log.error(
+            log.info(
                 f'LF data location for resource {resource_arn} not found due to {e}'
             )
+            return False
 
     @staticmethod
     def grant_pivot_role_all_database_permissions(accountid, region, database):
diff --git a/backend/dataall/cdkproxy/stacks/dataset.py b/backend/dataall/cdkproxy/stacks/dataset.py
index dd9ce5b2e..7e8ccb8ba 100644
--- a/backend/dataall/cdkproxy/stacks/dataset.py
+++ b/backend/dataall/cdkproxy/stacks/dataset.py
@@ -22,6 +22,7 @@
 from .manager import stack
 from ... import db
 from ...aws.handlers.quicksight import Quicksight
+from ...aws.handlers.lakeformation import LakeFormation
 from ...aws.handlers.sts import SessionHelper
 from ...db import models
 from ...db.api import Environment, ShareObject
@@ -432,16 +433,23 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs):
             on_event_handler=glue_db_handler,
         )
-        storage_location = CfnResource(
-            self,
-            'DatasetStorageLocation',
-            type='AWS::LakeFormation::Resource',
-            properties={
-                'ResourceArn': f'arn:aws:s3:::{dataset.S3BucketName}',
-                'RoleArn': f'arn:aws:iam::{env.AwsAccountId}:role/{pivot_role_name}',
-                'UseServiceLinkedRole': False,
-            },
+        existing_location = LakeFormation.describe_resource(
+            resource_arn=f'arn:aws:s3:::{dataset.S3BucketName}',
+            accountid=env.AwsAccountId,
+            region=env.region
         )
+
+        if not existing_location:
+            storage_location = CfnResource(
+                self,
+                'DatasetStorageLocation',
+                type='AWS::LakeFormation::Resource',
+                properties={
+                    'ResourceArn': f'arn:aws:s3:::{dataset.S3BucketName}',
+                    'RoleArn': f'arn:aws:iam::{env.AwsAccountId}:role/{pivot_role_name}',
+                    'UseServiceLinkedRole': False,
+                },
+            )
 
         dataset_admins = [
             dataset_admin_role.role_arn,
             f'arn:aws:iam::{env.AwsAccountId}:role/{pivot_role_name}',
diff --git a/backend/dataall/db/api/dataset.py b/backend/dataall/db/api/dataset.py
index 4f3b6c51c..230f5b85f 100644
--- a/backend/dataall/db/api/dataset.py
+++ b/backend/dataall/db/api/dataset.py
@@ -171,13 +171,13 @@ def _set_dataset_aws_resources(dataset: models.Dataset, data, environment):
         dataset.IAMDatasetAdminRoleArn = iam_role_arn
         dataset.IAMDatasetAdminUserArn = iam_role_arn
 
-        dataset.GlueCrawlerName = f'{dataset.S3BucketName}-crawler'
-        dataset.GlueProfilingJobName = f'{dataset.S3BucketName}-profiler'
+        dataset.GlueCrawlerName = f'{dataset.S3BucketName}-{dataset.datasetUri}-crawler'
+        dataset.GlueProfilingJobName = f'{dataset.S3BucketName}-{dataset.datasetUri}-profiler'
         dataset.GlueProfilingTriggerSchedule = None
-        dataset.GlueProfilingTriggerName = f'{dataset.S3BucketName}-trigger'
-        dataset.GlueDataQualityJobName = f'{dataset.S3BucketName}-dataquality'
+        dataset.GlueProfilingTriggerName = f'{dataset.S3BucketName}-{dataset.datasetUri}-trigger'
+        dataset.GlueDataQualityJobName = f'{dataset.S3BucketName}-{dataset.datasetUri}-dataquality'
         dataset.GlueDataQualitySchedule = None
-        dataset.GlueDataQualityTriggerName = f'{dataset.S3BucketName}-dqtrigger'
+        dataset.GlueDataQualityTriggerName = f'{dataset.S3BucketName}-{dataset.datasetUri}-dqtrigger'
         return dataset
 
     @staticmethod
diff --git a/tests/cdkproxy/test_dataset_stack.py b/tests/cdkproxy/test_dataset_stack.py
index 1ed73a03e..19a30d513 100644
--- a/tests/cdkproxy/test_dataset_stack.py
+++ b/tests/cdkproxy/test_dataset_stack.py
@@ -16,6 +16,10 @@ def patch_methods(mocker, db, dataset, env, org):
         'dataall.aws.handlers.sts.SessionHelper.get_delegation_role_name',
         return_value="dataall-pivot-role-name-pytest",
     )
+    mocker.patch(
+        'dataall.aws.handlers.lakeformation.LakeFormation.describe_resource',
+        return_value=False,
+    )
     mocker.patch(
         'dataall.utils.runtime_stacks_tagging.TagsUtil.get_target',
         return_value=dataset,
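
Note for reviewers: the new guard in `dataset.py` relies on `LakeFormation.describe_resource` returning a falsy value when the S3 location is not yet registered, which is also the contract the new test mock (`return_value=False`) simulates. Below is a minimal, self-contained sketch of that behavior; it uses a plain boto3 client rather than the pivot-role session data.all obtains through `SessionHelper`, and the bucket ARN and region in the usage example are made-up placeholders:

```python
import logging

import boto3
from botocore.exceptions import ClientError

log = logging.getLogger(__name__)


def describe_resource(resource_arn: str, region: str):
    """Return Lake Formation registration info for an S3 location, or
    False if the location has not been registered as a data location."""
    # data.all builds this client from the pivot-role session; a plain
    # regional client keeps the sketch self-contained.
    lf_client = boto3.client('lakeformation', region_name=region)
    try:
        response = lf_client.describe_resource(ResourceArn=resource_arn)
        log.info(f'LF data location already registered: {response}')
        return response['ResourceInfo']
    except ClientError as e:
        # Unregistered locations raise EntityNotFoundException, which
        # botocore surfaces as a ClientError.
        log.info(f'LF data location for resource {resource_arn} not found due to {e}')
        return False


# Usage: mirror the dataset stack's guard and create the
# AWS::LakeFormation::Resource only when the location is unregistered.
if not describe_resource('arn:aws:s3:::example-dataset-bucket', 'eu-west-1'):
    print('Location not registered: the dataset stack would register it.')
```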