Skip to content

Commit

Permalink
Check existing Lake Formation data lake location and make dataset res…
Browse files Browse the repository at this point in the history
…ources names unique (#324)

### Feature or Bugfix
- Feature

### Detail
- When a dataset is imported, the dataset stack checks whether the
specified S3 location has been registered with Lake Formation. If it has
already been registered, data.all does nothing; if it was not registered,
it registers the location as a Lake Formation data location.
- For the Glue profiling job, the data quality job, and the crawler, a
suffix containing the datasetUri has been added to the resource names to
track them better and to avoid issues in the future if we decide to
implement a "multiple databases mapping to one single Bucket" feature.

### Relates
- #321 

By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
  • Loading branch information
dlpzx committed Mar 6, 2023
1 parent a64530f commit 6abb70a
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 16 deletions.
5 changes: 3 additions & 2 deletions backend/dataall/aws/handlers/lakeformation.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,15 @@ def describe_resource(resource_arn, accountid, region):

response = lf_client.describe_resource(ResourceArn=resource_arn)

log.debug(f'LF data location already registered: {response}')
log.info(f'LF data location already registered: {response}')

return response['ResourceInfo']

except ClientError as e:
log.error(
log.info(
f'LF data location for resource {resource_arn} not found due to {e}'
)
return False

@staticmethod
def grant_pivot_role_all_database_permissions(accountid, region, database):
Expand Down
26 changes: 17 additions & 9 deletions backend/dataall/cdkproxy/stacks/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from .manager import stack
from ... import db
from ...aws.handlers.quicksight import Quicksight
from ...aws.handlers.lakeformation import LakeFormation
from ...aws.handlers.sts import SessionHelper
from ...db import models
from ...db.api import Environment, ShareObject
Expand Down Expand Up @@ -432,16 +433,23 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs):
on_event_handler=glue_db_handler,
)

storage_location = CfnResource(
self,
'DatasetStorageLocation',
type='AWS::LakeFormation::Resource',
properties={
'ResourceArn': f'arn:aws:s3:::{dataset.S3BucketName}',
'RoleArn': f'arn:aws:iam::{env.AwsAccountId}:role/{pivot_role_name}',
'UseServiceLinkedRole': False,
},
existing_location = LakeFormation.describe_resource(
resource_arn=f'arn:aws:s3:::{dataset.S3BucketName}',
accountid=env.AwsAccountId,
region=env.region
)

if not existing_location:
storage_location = CfnResource(
self,
'DatasetStorageLocation',
type='AWS::LakeFormation::Resource',
properties={
'ResourceArn': f'arn:aws:s3:::{dataset.S3BucketName}',
'RoleArn': f'arn:aws:iam::{env.AwsAccountId}:role/{pivot_role_name}',
'UseServiceLinkedRole': False,
},
)
dataset_admins = [
dataset_admin_role.role_arn,
f'arn:aws:iam::{env.AwsAccountId}:role/{pivot_role_name}',
Expand Down
10 changes: 5 additions & 5 deletions backend/dataall/db/api/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,13 +171,13 @@ def _set_dataset_aws_resources(dataset: models.Dataset, data, environment):
dataset.IAMDatasetAdminRoleArn = iam_role_arn
dataset.IAMDatasetAdminUserArn = iam_role_arn

dataset.GlueCrawlerName = f'{dataset.S3BucketName}-crawler'
dataset.GlueProfilingJobName = f'{dataset.S3BucketName}-profiler'
dataset.GlueCrawlerName = f'{dataset.S3BucketName}-{dataset.datasetUri}-crawler'
dataset.GlueProfilingJobName = f'{dataset.S3BucketName}-{dataset.datasetUri}-profiler'
dataset.GlueProfilingTriggerSchedule = None
dataset.GlueProfilingTriggerName = f'{dataset.S3BucketName}-trigger'
dataset.GlueDataQualityJobName = f'{dataset.S3BucketName}-dataquality'
dataset.GlueProfilingTriggerName = f'{dataset.S3BucketName}-{dataset.datasetUri}-trigger'
dataset.GlueDataQualityJobName = f'{dataset.S3BucketName}-{dataset.datasetUri}-dataquality'
dataset.GlueDataQualitySchedule = None
dataset.GlueDataQualityTriggerName = f'{dataset.S3BucketName}-dqtrigger'
dataset.GlueDataQualityTriggerName = f'{dataset.S3BucketName}-{dataset.datasetUri}-dqtrigger'
return dataset

@staticmethod
Expand Down
4 changes: 4 additions & 0 deletions tests/cdkproxy/test_dataset_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,10 @@ def patch_methods(mocker, db, dataset, env, org):
'dataall.aws.handlers.sts.SessionHelper.get_delegation_role_name',
return_value="dataall-pivot-role-name-pytest",
)
mocker.patch(
'dataall.aws.handlers.lakeformation.LakeFormation.describe_resource',
return_value=False,
)
mocker.patch(
'dataall.utils.runtime_stacks_tagging.TagsUtil.get_target',
return_value=dataset,
Expand Down

0 comments on commit 6abb70a

Please sign in to comment.