Skip to content

Commit

Permalink
Resolve Dataset Profiling Glue Job (#649)
Browse files Browse the repository at this point in the history
### Feature or Bugfix
<!-- please choose -->
- Bugfix

### Detail
- Specify `SPARK_VERSION` as an environment variable for `pydeequ`
before import
- Add IAM Permissions to Dataset IAM Role to Allow for Glue Job logging
in CloudWatch
- Add LF Permissions to resolve insufficient permissions error thrown
when looking for `default` database

### Relates


By submitting this pull request, I confirm that my contribution is made
under the terms of the Apache 2.0 license.
  • Loading branch information
noah-paige authored Aug 10, 2023
1 parent 63137ac commit a39fd43
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ def on_create(event):
except ClientError as e:
pass

default_db_exists = False
try:
glue_client.get_database(Name="default")
default_db_exists = True
except ClientError as e:
pass

if not exists:
try:
db_input = props.get('DatabaseInput').copy()
Expand All @@ -63,7 +70,7 @@ def on_create(event):
raise Exception(f"Could not create Glue Database {props['DatabaseInput']['Name']} in aws://{AWS_ACCOUNT}/{AWS_REGION}, received {str(e)}")

Entries = []
for i, role_arn in enumerate(props.get('DatabaseAdministrators')):
for i, role_arn in enumerate(props.get('DatabaseAdministrators', [])):
Entries.append(
{
'Id': str(uuid.uuid4()),
Expand Down Expand Up @@ -103,6 +110,20 @@ def on_create(event):
'PermissionsWithGrantOption': ['SELECT', 'ALTER', 'DESCRIBE'],
}
)
if default_db_exists:
Entries.append(
{
'Id': str(uuid.uuid4()),
'Principal': {'DataLakePrincipalIdentifier': role_arn},
'Resource': {
'Database': {
'Name': 'default'
}
},
'Permissions': ['Describe'.upper()],
}
)

lf_client.batch_grant_permissions(CatalogId=props['CatalogId'], Entries=Entries)
physical_id = props['DatabaseInput']['Imported'] + props['DatabaseInput']['Name']

Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import os
import logging
import pprint
import sys
Expand All @@ -8,7 +9,6 @@
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from pydeequ.profiles import *

sc = SparkContext.getOrCreate()
sc._jsc.hadoopConfiguration().set('fs.s3.canned.acl', 'BucketOwnerFullControl')
Expand All @@ -32,6 +32,7 @@
'environmentBucket',
'dataallRegion',
'table',
"SPARK_VERSION"
]
try:
args = getResolvedOptions(sys.argv, list_args)
Expand All @@ -43,6 +44,10 @@
list_args.remove('table')
args = getResolvedOptions(sys.argv, list_args)

os.environ["SPARK_VERSION"] = args.get("SPARK_VERSION", "3.1")

from pydeequ.profiles import *

logger.info('Parsed Retrieved parameters')

logger.info('Parsed Args = %s', pprint.pformat(args))
Expand Down
10 changes: 7 additions & 3 deletions backend/dataall/cdkproxy/stacks/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -295,24 +295,26 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs):
]
),
iam.PolicyStatement(
sid="CreateLoggingGlueCrawler",
sid="CreateLoggingGlue",
actions=[
'logs:CreateLogGroup',
'logs:CreateLogStream',
],
effect=iam.Effect.ALLOW,
resources=[
f'arn:aws:logs:{dataset.region}:{dataset.AwsAccountId}:log-group:/aws-glue/crawlers*',
f'arn:aws:logs:{dataset.region}:{dataset.AwsAccountId}:log-group:/aws-glue/jobs/*',
],
),
iam.PolicyStatement(
sid="LoggingGlueCrawler",
sid="LoggingGlue",
actions=[
'logs:PutLogEvents',
],
effect=iam.Effect.ALLOW,
resources=[
f'arn:aws:logs:{dataset.region}:{dataset.AwsAccountId}:log-group:/aws-glue/crawlers:log-stream:{dataset.GlueCrawlerName}',
f'arn:aws:logs:{dataset.region}:{dataset.AwsAccountId}:log-group:/aws-glue/jobs/*',
],
),
iam.PolicyStatement(
Expand Down Expand Up @@ -443,7 +445,8 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs):
'CreateTableDefaultPermissions': [],
'Imported': 'IMPORTED-' if dataset.imported else 'CREATED-'
},
'DatabaseAdministrators': dataset_admins
'DatabaseAdministrators': dataset_admins,
'TriggerUpdate': True
},
)

Expand Down Expand Up @@ -484,6 +487,7 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs):
'--enable-metrics': 'true',
'--enable-continuous-cloudwatch-log': 'true',
'--enable-glue-datacatalog': 'true',
'--SPARK_VERSION': '3.1',
}

job = glue.CfnJob(
Expand Down
3 changes: 3 additions & 0 deletions documentation/userguide/docs/tables.md
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,9 @@ By selecting the **Metrics** tab of your data table you can run a profiling job

![](pictures/tables/table_metrics.png#zoom#shadow)

!!! warning "Profiling Job Prerequisite"
    Before running the profiling job, ensure that the **default** Glue database exists in the AWS account where the data resides (this database exists by default in new accounts). It is required so that the Glue profiling job can use the metadata stored in the Glue Catalog.

### :material-trash-can-outline: **Delete a table**
Deleting a table means deleting it from the data.all Catalog, but it will still be available in the AWS Glue Catalog.
Moreover, when data owners
Expand Down

0 comments on commit a39fd43

Please sign in to comment.