diff --git a/backend/dataall/modules/datasets/cdk/assets/gluedatabasecustomresource/index.py b/backend/dataall/modules/datasets/cdk/assets/gluedatabasecustomresource/index.py
index e548dcbf6..ce01c0f9a 100644
--- a/backend/dataall/modules/datasets/cdk/assets/gluedatabasecustomresource/index.py
+++ b/backend/dataall/modules/datasets/cdk/assets/gluedatabasecustomresource/index.py
@@ -49,6 +49,13 @@ def on_create(event):
     except ClientError as e:
         pass

+    default_db_exists = False
+    try:
+        glue_client.get_database(Name="default")
+        default_db_exists = True
+    except ClientError as e:
+        pass
+
     if not exists:
         try:
             db_input = props.get('DatabaseInput').copy()
@@ -63,7 +70,7 @@ def on_create(event):
             raise Exception(f"Could not create Glue Database {props['DatabaseInput']['Name']} in aws://{AWS_ACCOUNT}/{AWS_REGION}, received {str(e)}")

     Entries = []
-    for i, role_arn in enumerate(props.get('DatabaseAdministrators')):
+    for i, role_arn in enumerate(props.get('DatabaseAdministrators', [])):
         Entries.append(
             {
                 'Id': str(uuid.uuid4()),
@@ -103,6 +110,20 @@ def on_create(event):
                 'PermissionsWithGrantOption': ['SELECT', 'ALTER', 'DESCRIBE'],
             }
         )
+        if default_db_exists:
+            Entries.append(
+                {
+                    'Id': str(uuid.uuid4()),
+                    'Principal': {'DataLakePrincipalIdentifier': role_arn},
+                    'Resource': {
+                        'Database': {
+                            'Name': 'default'
+                        }
+                    },
+                    'Permissions': ['DESCRIBE'],
+                }
+            )
+
     lf_client.batch_grant_permissions(CatalogId=props['CatalogId'], Entries=Entries)

     physical_id = props['DatabaseInput']['Imported'] + props['DatabaseInput']['Name']
diff --git a/backend/dataall/modules/datasets/cdk/assets/glueprofilingjob/glue_script.py b/backend/dataall/modules/datasets/cdk/assets/glueprofilingjob/glue_script.py
index 8279bc11c..e974c6bf9 100644
--- a/backend/dataall/modules/datasets/cdk/assets/glueprofilingjob/glue_script.py
+++ b/backend/dataall/modules/datasets/cdk/assets/glueprofilingjob/glue_script.py
@@ -1,4 +1,5 @@
 import json
+import os
 import logging
 import pprint
 import sys
@@ -8,7 +9,6 @@
 from awsglue.transforms import *
 from awsglue.utils import getResolvedOptions
 from pyspark.context import SparkContext
-from pydeequ.profiles import *

 sc = SparkContext.getOrCreate()
 sc._jsc.hadoopConfiguration().set('fs.s3.canned.acl', 'BucketOwnerFullControl')
@@ -32,6 +32,7 @@
     'environmentBucket',
     'dataallRegion',
     'table',
+    'SPARK_VERSION',
 ]
 try:
     args = getResolvedOptions(sys.argv, list_args)
@@ -43,6 +44,10 @@
     list_args.remove('table')
     args = getResolvedOptions(sys.argv, list_args)

+os.environ["SPARK_VERSION"] = args.get("SPARK_VERSION", "3.1")
+
+from pydeequ.profiles import *
+
 logger.info('Parsed Retrieved parameters')
 logger.info('Parsed Args = %s', pprint.pformat(args))

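Note on the glue_script.py change above: pydeequ resolves the Spark version from the `SPARK_VERSION` environment variable at import time, which is why the wildcard import moves below the `os.environ` assignment instead of staying in the header. A minimal sketch of the pattern outside of any Glue specifics, assuming pydeequ is installed; the `args` dict stands in for the result of `getResolvedOptions`, and the `"3.1"` fallback mirrors the job's new `--SPARK_VERSION` default argument:

```python
import os

# Stand-in for getResolvedOptions(sys.argv, list_args) in the real job.
args = {"SPARK_VERSION": "3.1"}

# Set the variable BEFORE importing pydeequ; pydeequ reads it at import
# time to pick the matching Deequ Maven coordinates, so importing earlier
# fails on a fresh interpreter.
os.environ["SPARK_VERSION"] = args.get("SPARK_VERSION", "3.1")

from pydeequ.profiles import ColumnProfilerRunner  # safe only after the line above
```

The import order is load-bearing here, which is also why `import os` is added at the very top of the script.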
sid="LoggingGlueCrawler", + sid="LoggingGlue", actions=[ 'logs:PutLogEvents', ], effect=iam.Effect.ALLOW, resources=[ f'arn:aws:logs:{dataset.region}:{dataset.AwsAccountId}:log-group:/aws-glue/crawlers:log-stream:{dataset.GlueCrawlerName}', + f'arn:aws:logs:{dataset.region}:{dataset.AwsAccountId}:log-group:/aws-glue/jobs/*', ], ), iam.PolicyStatement( @@ -443,7 +445,8 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs): 'CreateTableDefaultPermissions': [], 'Imported': 'IMPORTED-' if dataset.imported else 'CREATED-' }, - 'DatabaseAdministrators': dataset_admins + 'DatabaseAdministrators': dataset_admins, + 'TriggerUpdate': True }, ) @@ -484,6 +487,7 @@ def __init__(self, scope, id, target_uri: str = None, **kwargs): '--enable-metrics': 'true', '--enable-continuous-cloudwatch-log': 'true', '--enable-glue-datacatalog': 'true', + '--SPARK_VERSION': '3.1', } job = glue.CfnJob( diff --git a/backend/migrations/versions/b1cdc0dc987a_fix_template_column_in_table.py b/backend/migrations/versions/b1cdc0dc987a_fix_template_column_in_table.py index 5c92d6ca5..7c134012e 100644 --- a/backend/migrations/versions/b1cdc0dc987a_fix_template_column_in_table.py +++ b/backend/migrations/versions/b1cdc0dc987a_fix_template_column_in_table.py @@ -7,7 +7,9 @@ """ from alembic import op import sqlalchemy as sa +from sqlalchemy import orm, Column, String from sqlalchemy.dialects import postgresql +from sqlalchemy.ext.declarative import declarative_base # revision identifiers, used by Alembic. revision = 'b1cdc0dc987a' @@ -15,13 +17,51 @@ branch_labels = None depends_on = None +Base = declarative_base() + + +class DataPipeline(Base): + __tablename__ = 'datapipeline' + DataPipelineUri = Column( + String, nullable=False, primary_key=True + ) + devStrategy = Column(String, nullable=True) + devStages = Column(postgresql.ARRAY(String), nullable=True) + def upgrade(): # ### commands auto generated by Alembic - please adjust! ### + # Modify column types + print("Upgrade devStages and devStrategy column types. Updating nullable to True...") op.add_column( 'datapipeline', sa.Column('template', sa.String(), nullable=True) ) + op.alter_column( + 'datapipeline', + 'devStages', + existing_type=postgresql.ARRAY(sa.VARCHAR()), + nullable=True + ) + op.alter_column( + 'datapipeline', + 'devStrategy', + existing_type=sa.VARCHAR(), + nullable=True + ) + print("Backfilling values for devStages and devStrategy...") + # Backfill values + bind = op.get_bind() + session = orm.Session(bind=bind) + session.query(DataPipeline).filter(DataPipeline.devStrategy is None).update( + {DataPipeline.devStrategy: 'gitflowBlueprint'}, synchronize_session=False) + + session.query(DataPipeline).filter(DataPipeline.devStages is None).update( + {DataPipeline.devStages: ['dev', 'test', 'prod']}, synchronize_session=False) + session.commit() + + print("Backfilling values for devStages and devStrategy is done. Updating nullable to False...") + # Force nullable = False op.alter_column( 'datapipeline', 'devStages', diff --git a/deploy/cdk_exec_policy/cdkExecPolicy.yaml b/deploy/cdk_exec_policy/cdkExecPolicy.yaml index a285d0d59..cc98cf0ac 100644 --- a/deploy/cdk_exec_policy/cdkExecPolicy.yaml +++ b/deploy/cdk_exec_policy/cdkExecPolicy.yaml @@ -1,9 +1,6 @@ AWSTemplateFormatVersion: 2010-09-09 Description: Custom least privilege IAM policy for linking environments to dataall Parameters: - AwsAccountId: - Description: AWS AccountId of the account that we wish to link. 
diff --git a/deploy/cdk_exec_policy/cdkExecPolicy.yaml b/deploy/cdk_exec_policy/cdkExecPolicy.yaml
index a285d0d59..cc98cf0ac 100644
--- a/deploy/cdk_exec_policy/cdkExecPolicy.yaml
+++ b/deploy/cdk_exec_policy/cdkExecPolicy.yaml
@@ -1,9 +1,6 @@
 AWSTemplateFormatVersion: 2010-09-09
 Description: Custom least privilege IAM policy for linking environments to dataall
 Parameters:
-  AwsAccountId:
-    Description: AWS AccountId of the account that we wish to link.
-    Type: String
   PolicyName:
     Description: IAM policy name (The same name must be used during CDK bootstrapping. Default is DataAllCustomCDKPolicy.)
     Type: String
@@ -48,14 +45,14 @@ Resources:
          Effect: Allow
          Action: 'athena:CreateWorkGroup'
          Resource:
-            - !Sub 'arn:aws:athena:*:${AWS::AccountId}:workgroup/*'
+            - !Sub 'arn:${AWS::Partition}:athena:*:${AWS::AccountId}:workgroup/*'
        - Sid: IAM
          Action:
            - 'iam:CreatePolicy'
            - 'iam:GetPolicy'
          Effect: Allow
          Resource:
-            - !Sub 'arn:aws:iam::${AWS::AccountId}:policy/*'
+            - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/*'
        - Sid: IAMRole
          Action:
            - 'iam:AttachRolePolicy'
@@ -82,7 +79,7 @@ Resources:
            - 'iam:CreatePolicyVersion'
            - 'iam:DeletePolicyVersion'
          Resource:
-            - !Sub 'arn:aws:iam::${AWS::AccountId}:policy/service-role/AWSQuickSight*'
+            - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:policy/service-role/AWSQuickSight*'
        - Sid: QuickSight
          Effect: Allow
          Action:
@@ -114,14 +111,14 @@ Resources:
            - 'kms:CreateAlias'
          Effect: Allow
          Resource:
-            - !Sub 'arn:aws:kms:*:${AWS::AccountId}:alias/*'
+            - !Sub 'arn:${AWS::Partition}:kms:*:${AWS::AccountId}:alias/*'
        - Sid: KMSKey
          Action:
            - 's3:PutBucketAcl'
            - 's3:PutBucketNotification'
          Effect: Allow
          Resource:
-            - !Sub 'arn:aws:s3:::${EnvironmentResourcePrefix}-logging-*'
+            - !Sub 'arn:${AWS::Partition}:s3:::${EnvironmentResourcePrefix}-logging-*'
        - Sid: ReadBuckets
          Action:
            - 'kms:CreateAlias'
@@ -136,7 +133,7 @@ Resources:
            - 'kms:PutKeyPolicy'
            - 'kms:TagResource'
          Effect: Allow
-          Resource: !Sub 'arn:aws:kms:*:${AWS::AccountId}:key/*'
+          Resource: !Sub 'arn:${AWS::Partition}:kms:*:${AWS::AccountId}:key/*'
        - Sid: Lambda
          Action:
            - 'lambda:AddPermission'
@@ -154,7 +151,7 @@ Resources:
          Action:
            - 'lambda:PublishLayerVersion'
          Resource:
-            - !Sub 'arn:aws:lambda:*:${AWS::AccountId}:layer:*'
+            - !Sub 'arn:${AWS::Partition}:lambda:*:${AWS::AccountId}:layer:*'
        - Sid: S3
          Action:
            - 's3:CreateBucket'
@@ -170,13 +167,13 @@ Resources:
            - 's3:DeleteBucketPolicy'
            - 's3:DeleteBucket'
          Effect: Allow
-          Resource: 'arn:aws:s3:::*'
+          Resource: !Sub 'arn:${AWS::Partition}:s3:::*'
        - Sid: SQS
          Effect: Allow
          Action:
            - 'sqs:CreateQueue'
            - 'sqs:SetQueueAttributes'
-          Resource: !Sub 'arn:aws:sqs:*:${AWS::AccountId}:*'
+          Resource: !Sub 'arn:${AWS::Partition}:sqs:*:${AWS::AccountId}:*'
        - Sid: SSM
          Effect: Allow
          Action:
@@ -190,18 +187,18 @@ Resources:
            - 'logs:CreateLogStream'
            - 'logs:PutLogEvents'
            - 'logs:DescribeLogStreams'
-          Resource: 'arn:aws:logs:*:*:*'
+          Resource: !Sub 'arn:${AWS::Partition}:logs:*:*:*'
        - Sid: STS
          Effect: Allow
          Action:
            - 'sts:AssumeRole'
            - 'iam:*Role*'
-          Resource: !Sub 'arn:aws:iam::${AWS::AccountId}:role/cdk-*'
+          Resource: !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/cdk-*'
        - Sid: CloudFormation
          Effect: Allow
          Action:
            - 'cloudformation:*'
-          Resource: !Sub 'arn:aws:cloudformation:*:${AWS::AccountId}:stack/CDKToolkit/*'
+          Resource: !Sub 'arn:${AWS::Partition}:cloudformation:*:${AWS::AccountId}:stack/CDKToolkit/*'
        - Sid: ECR
          Effect: Allow
          Action:
@@ -211,14 +208,14 @@ Resources:
            - 'ecr:DescribeRepositories'
            - 'ecr:CreateRepository'
            - 'ecr:DeleteRepository'
-          Resource: !Sub 'arn:aws:ecr:*:${AWS::AccountId}:repository/cdk-*'
+          Resource: !Sub 'arn:${AWS::Partition}:ecr:*:${AWS::AccountId}:repository/cdk-*'
        - Sid: SSMTwo
          Effect: Allow
          Action:
            - 'ssm:GetParameter'
            - 'ssm:PutParameter'
            - 'ssm:DeleteParameter'
-          Resource: !Sub 'arn:aws:ssm:*:${AWS::AccountId}:parameter/cdk-bootstrap/*'
+          Resource: !Sub 'arn:${AWS::Partition}:ssm:*:${AWS::AccountId}:parameter/cdk-bootstrap/*'
        - Sid: CloudformationTwo
          Effect: Allow
          Action:
@@ -232,7 +229,7 @@ Resources:
          Action:
            - 's3:*'
          Resource:
-            - !Sub 'arn:aws:s3:::cdktoolkit-stagingbucket-*'
+            - !Sub 'arn:${AWS::Partition}:s3:::cdk-hnb659fds-assets-${AWS::AccountId}-${AWS::Region}*'
        - Sid: Pipelines
          Effect: Allow
          Action:
@@ -261,7 +258,7 @@ Resources:
            - 's3:ListBucket'
            - 's3:GetBucketPolicy'
          Resource:
-            - 'arn:aws:s3::*:codepipeline-*'
+            - !Sub 'arn:${AWS::Partition}:s3::*:codepipeline-*'
        - Sid: CodeStarNotificationsReadOnly
          Effect: Allow
          Action:
@@ -269,7 +266,7 @@ Resources:
          Resource: '*'
          Condition:
            'StringLike':
-              'codestar-notifications:NotificationsForResource': 'arn:aws:codepipeline:*'
+              'codestar-notifications:NotificationsForResource': !Sub 'arn:${AWS::Partition}:codepipeline:*'
        - Sid: Eventrules
          Effect: Allow
          Action:
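Note on the policy template above: replacing hardcoded `arn:aws:` prefixes with `!Sub 'arn:${AWS::Partition}:...'` lets the same template resolve correctly in the `aws`, `aws-us-gov`, and `aws-cn` partitions. For Python tooling that assembles ARNs by hand, a hedged sketch of the equivalent idea is to read the partition off the caller identity; the STS call is a standard boto3 API, the bucket name is a made-up placeholder, and default credentials are assumed:

```python
import boto3

sts = boto3.client('sts')

# e.g. arn:aws:iam::123456789012:user/me; field 1 is the partition.
caller_arn = sts.get_caller_identity()['Arn']
partition = caller_arn.split(':')[1]  # 'aws', 'aws-us-gov' or 'aws-cn'

# Build partition-aware ARNs instead of hardcoding 'arn:aws:...'.
bucket_arn = f'arn:{partition}:s3:::my-example-bucket'  # hypothetical bucket
print(bucket_arn)
```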
diff --git a/deploy/stacks/backend_stack.py b/deploy/stacks/backend_stack.py
index 29e276ba8..502f9fa55 100644
--- a/deploy/stacks/backend_stack.py
+++ b/deploy/stacks/backend_stack.py
@@ -35,6 +35,7 @@ def __init__(
         image_tag=None,
         pipeline_bucket=None,
         vpc_id=None,
+        vpc_restricted_nacls=False,
         vpc_endpoints_sg=None,
         internet_facing=True,
         custom_domain=None,
@@ -64,6 +65,7 @@ def __init__(
             resource_prefix=resource_prefix,
             vpc_endpoints_sg=vpc_endpoints_sg,
             vpc_id=vpc_id,
+            restricted_nacl=vpc_restricted_nacls,
             **kwargs,
         )
         vpc = self.vpc_stack.vpc
diff --git a/deploy/stacks/backend_stage.py b/deploy/stacks/backend_stage.py
index 58d693385..9a9d12dc1 100644
--- a/deploy/stacks/backend_stage.py
+++ b/deploy/stacks/backend_stage.py
@@ -17,6 +17,7 @@ def __init__(
         tooling_account_id=None,
         pipeline_bucket=None,
         vpc_id=None,
+        vpc_restricted_nacls=False,
         vpc_endpoints_sg=None,
         internet_facing=True,
         custom_domain=None,
@@ -45,6 +46,7 @@ def __init__(
             pipeline_bucket=pipeline_bucket,
             image_tag=commit_id,
             vpc_id=vpc_id,
+            vpc_restricted_nacls=vpc_restricted_nacls,
             vpc_endpoints_sg=vpc_endpoints_sg,
             internet_facing=internet_facing,
             custom_domain=custom_domain,
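For context on the param_store_stack.py change that follows: the external-id lookup now treats any failure to assume the CDK lookup role or read the SSM parameter as a first deployment and returns `False`, dropping the old Secrets Manager fallback. A standalone sketch of that flow under the same assumptions; the role name and parameter path mirror the stack code, and the coarse exception handling mirrors the original:

```python
import boto3


def get_external_id_value(envname: str, account_id: str, region: str):
    """Return the pivot role externalId, or False on a first deployment."""
    role_arn = (
        f'arn:aws:iam::{account_id}:role/'
        f'cdk-hnb659fds-lookup-role-{account_id}-{region}'
    )
    sts = boto3.client('sts', region_name=region)
    try:
        # Assume the CDK bootstrap lookup role in the target account.
        creds = sts.assume_role(
            RoleArn=role_arn, RoleSessionName='external-id-lookup'
        )['Credentials']
        ssm = boto3.client(
            'ssm',
            region_name=region,
            aws_access_key_id=creds['AccessKeyId'],
            aws_secret_access_key=creds['SecretAccessKey'],
            aws_session_token=creds['SessionToken'],
        )
        return ssm.get_parameter(
            Name=f'/dataall/{envname}/pivotRole/externalId'
        )['Parameter']['Value']
    except Exception:
        # Role or parameter not there yet: first deployment, generate anew.
        return False
```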
diff --git a/deploy/stacks/param_store_stack.py b/deploy/stacks/param_store_stack.py
index d06548746..b2991495b 100644
--- a/deploy/stacks/param_store_stack.py
+++ b/deploy/stacks/param_store_stack.py
@@ -115,9 +115,9 @@
     )

 def _get_external_id_value(envname, account_id, region):
-    """For first deployments it returns False,
-    for existing deployments it returns the ssm parameter value generated in the first deployment
-    for prior to V1.5.1 upgrades it returns the secret from secrets manager
+    """
+    For first deployments and upgrades from <=v1.5.6 to >=v1.6: returns False and a new SSM parameter is created.
+    For existing >=v1.6 deployments: returns the SSM parameter value generated in the first deployment.
     """
     cdk_look_up_role = 'arn:aws:iam::{}:role/cdk-hnb659fds-lookup-role-{}-{}'.format(account_id, account_id, region)
     base_session = boto3.Session()
@@ -130,29 +130,21 @@ def _get_external_id_value(envname, account_id, region):
         region_name=region,
         endpoint_url=f"https://sts.{region}.amazonaws.com"
     )
-    response = sts.assume_role(**assume_role_dict)
-    session = boto3.Session(
-        aws_access_key_id=response['Credentials']['AccessKeyId'],
-        aws_secret_access_key=response['Credentials']['SecretAccessKey'],
-        aws_session_token=response['Credentials']['SessionToken'],
-    )
-
-    secret_id = f"dataall-externalId-{envname}"
     parameter_path = f"/dataall/{envname}/pivotRole/externalId"
     try:
+        response = sts.assume_role(**assume_role_dict)
+        session = boto3.Session(
+            aws_access_key_id=response['Credentials']['AccessKeyId'],
+            aws_secret_access_key=response['Credentials']['SecretAccessKey'],
+            aws_session_token=response['Credentials']['SessionToken'],
+        )
         ssm_client = session.client('ssm', region_name=region)
         parameter_value = ssm_client.get_parameter(Name=parameter_path)['Parameter']['Value']
         return parameter_value
     except:
-        try:
-            secrets_client = session.client('secretsmanager', region_name=region)
-            if secrets_client.describe_secret(SecretId=secret_id):
-                secret_value = SecretValue.secrets_manager(secret_id).unsafe_unwrap()
-            else:
-                raise Exception
-            return secret_value
-        except:
-            return False
+        return False
+

 def _generate_external_id():
     allowed_chars = string.ascii_uppercase + string.ascii_lowercase + string.digits
diff --git a/deploy/stacks/pipeline.py b/deploy/stacks/pipeline.py
index d9e5c05bd..cbdb447e3 100644
--- a/deploy/stacks/pipeline.py
+++ b/deploy/stacks/pipeline.py
@@ -604,6 +604,7 @@ def set_backend_stage(self, target_env, repository_name):
             commit_id=self.image_tag,
             vpc_id=target_env.get('vpc_id'),
             vpc_endpoints_sg=target_env.get('vpc_endpoints_sg'),
+            vpc_restricted_nacls=target_env.get('vpc_restricted_nacl', False),
             internet_facing=target_env.get('internet_facing', True),
             custom_domain=target_env.get('custom_domain'),
             ip_ranges=target_env.get('ip_ranges'),
diff --git a/documentation/userguide/docs/tables.md b/documentation/userguide/docs/tables.md
index 192757b51..cb8396a28 100644
--- a/documentation/userguide/docs/tables.md
+++ b/documentation/userguide/docs/tables.md
@@ -70,6 +70,9 @@ By selecting the **Metrics** tab of your data table you can run a profiling job

 ![](pictures/tables/table_metrics.png#zoom#shadow)

+!!! warning "Profiling Job Prerequisite"
+    Before running the profiling job, make sure the **default** Glue database exists in the AWS account where the data lives (it exists by default in new accounts). The Glue profiling job needs it to read the metadata stored in the Glue Catalog.
+
 ### :material-trash-can-outline: **Delete a table**

 Deleting a table means deleting it from the data.all Catalog, but it will be still available on the AWS Glue Catalog. Moreover, when data owners
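Companion to the docs warning above: a quick way to verify, and create when missing, the `default` Glue database in the data account before launching the profiling job. Plain boto3, assuming credentials for the account that holds the data:

```python
import boto3
from botocore.exceptions import ClientError

glue = boto3.client('glue')

try:
    glue.get_database(Name='default')
    print('default database already exists')
except ClientError as e:
    if e.response['Error']['Code'] == 'EntityNotFoundException':
        # Same call the Glue console makes implicitly for new accounts.
        glue.create_database(DatabaseInput={'Name': 'default'})
        print('created default database')
    else:
        raise
```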
diff --git a/frontend/src/views/Environments/EnvironmentCreateForm.js b/frontend/src/views/Environments/EnvironmentCreateForm.js
index 8ab11d9fb..1dd171197 100644
--- a/frontend/src/views/Environments/EnvironmentCreateForm.js
+++ b/frontend/src/views/Environments/EnvironmentCreateForm.js
@@ -99,7 +99,9 @@ const EnvironmentCreateForm = (props) => {
   };

   const getCDKExecPolicyUrl = async () => {
-    const response = await client.query(getCDKExecPolicyPresignedUrl(params.uri));
+    const response = await client.query(
+      getCDKExecPolicyPresignedUrl(params.uri)
+    );
     if (!response.errors) {
       window.open(response.data.getCDKExecPolicyPresignedUrl, '_blank');
     } else {
@@ -282,73 +284,111 @@ const EnvironmentCreateForm = (props) => {
              <Box sx={{ mt: 3 }}>
                <Typography color="textSecondary" variant="subtitle2">
-                  Bootstrap your AWS account with AWS CDK
-                </Typography>
-                <Typography color="textPrimary" variant="subtitle2">
-                  <CopyToClipboard
-                    onCopy={() => copyNotification()}
-                    text={`cdk bootstrap --trust ${trustedAccount} -c @aws-cdk/core:newStyleStackSynthesis=true --cloudformation-execution-policies arn:aws:iam::aws:policy/AdministratorAccess aws://ACCOUNT_ID/REGION`}
-                  >
-                    <CopyAllOutlined
-                      sx={{
-                        color: 'primary.main',
-                        mr: 0.5
-                      }}
-                    />
-                  </CopyToClipboard>
-                  {`cdk bootstrap --trust ${trustedAccount} -c @aws-cdk/core:newStyleStackSynthesis=true --cloudformation-execution-policies arn:aws:iam::aws:policy/AdministratorAccess aws://ACCOUNT_ID/REGION`}
-                </Typography>
-                <Typography color="textSecondary" variant="subtitle2">
-                  Use the below CloudFormation stack to create the custom IAM policy.
-                </Typography>
-                <Typography color="textPrimary" variant="subtitle2">
-                  <CopyToClipboard
-                    onCopy={() => copyNotification()}
-                    text={`cdk bootstrap --trust ${trustedAccount} -c @aws-cdk/core:newStyleStackSynthesis=true --cloudformation-execution-policies arn:aws:iam::ACCOUNT_ID:policy/DataAllCustomCDKPolicy aws://ACCOUNT_ID/REGION`}
-                  >
-                    <CopyAllOutlined
-                      sx={{
-                        color: 'primary.main',
-                        mr: 0.5
-                      }}
-                    />
-                  </CopyToClipboard>
-                  {`cdk bootstrap --trust ${trustedAccount} -c @aws-cdk/core:newStyleStackSynthesis=true --cloudformation-execution-policies arn:aws:iam::ACCOUNT_ID:policy/DataAllCustomCDKPolicy aws://ACCOUNT_ID/REGION`}
+                  1. (OPTIONAL) As part of setting up your AWS Environment with
+                  CDK you need to specify an IAM Policy that gives permission
+                  for CDK to create AWS Resources via CloudFormation (i.e. CDK
+                  Execution Policy). You can optionally use the below
+                  CloudFormation template to create a custom IAM policy that
+                  is more restrictive than the default{' '}
+                  <b>AdministratorAccess</b> policy.
+                </Typography>
+                <Button
+                  color="primary"
+                  size="small"
+                  onClick={getCDKExecPolicyUrl}
+                >
+                  CloudFormation stack for custom CDK Execution Policy
+                </Button>
+                <Typography color="textSecondary" variant="subtitle2">
+                  2. Bootstrap your AWS account with AWS CDK
+                </Typography>
+                <Typography color="textPrimary" variant="subtitle2">
+                  <CopyToClipboard
+                    onCopy={() => copyNotification()}
+                    text={`cdk bootstrap --trust ${trustedAccount} -c @aws-cdk/core:newStyleStackSynthesis=true aws://ACCOUNT_ID/REGION`}
+                  >
+                    <CopyAllOutlined
+                      sx={{
+                        color: 'primary.main',
+                        mr: 0.5
+                      }}
+                    />
+                  </CopyToClipboard>
+                  {`cdk bootstrap --trust ${trustedAccount} -c @aws-cdk/core:newStyleStackSynthesis=true aws://ACCOUNT_ID/REGION`}
+                </Typography>
+                <Typography color="textSecondary" variant="subtitle2">
+                  If Using Custom CDK Execution Policy (From Step 1):
+                </Typography>
+                <Typography color="textPrimary" variant="subtitle2">
+                  <CopyToClipboard
+                    onCopy={() => copyNotification()}
+                    text={`cdk bootstrap --trust ${trustedAccount} -c @aws-cdk/core:newStyleStackSynthesis=true --cloudformation-execution-policies arn:aws:iam::ACCOUNT_ID:policy/DataAllCustomCDKPolicy aws://ACCOUNT_ID/REGION`}
+                  >
+                    <CopyAllOutlined
+                      sx={{
+                        color: 'primary.main',
+                        mr: 0.5
+                      }}
+                    />
+                  </CopyToClipboard>
+                  {`cdk bootstrap --trust ${trustedAccount} -c @aws-cdk/core:newStyleStackSynthesis=true --cloudformation-execution-policies arn:aws:iam::ACCOUNT_ID:policy/DataAllCustomCDKPolicy aws://ACCOUNT_ID/REGION`}
                </Typography>
              </Box>
              {process.env.REACT_APP_ENABLE_PIVOT_ROLE_AUTO_CREATE === 'True' ? (
                <Box>
                  <Typography color="textSecondary" variant="subtitle2">
-                    As part of the environment CloudFormation stack data.all
+                    3. As part of the environment CloudFormation stack data.all
                    will create an IAM role (Pivot Role) to manage AWS
                    operations in the environment AWS Account.
                  </Typography>
@@ -357,8 +397,8 @@ const EnvironmentCreateForm = (props) => {
              ) : (
                <Box>
                  <Typography color="textSecondary" variant="subtitle2">
-                    Create an IAM role named {pivotRoleName} using the
-                    AWS CloudFormation stack below
+                    3. Create an IAM role named {pivotRoleName} using
+                    the AWS CloudFormation stack below
                  </Typography>
                  <Typography color="textPrimary" variant="subtitle2">
diff --git a/template_cdk.json b/template_cdk.json
index 75a36cfbd..9cbd28eb2 100644
--- a/template_cdk.json
+++ b/template_cdk.json
@@ -21,6 +21,7 @@
     "with_approval": "boolean_ADD_CODEPIPELINE_APPROVAL_STEP|DEFAULT=false",
     "vpc_id": "string_DEPLOY_WITHIN_AN_EXISTING_VPC|DEFAULT=None",
     "vpc_endpoints_sg": "string_DEPLOY_WITHIN_EXISTING_VPC_SG|DEFAULT=None",
+    "vpc_restricted_nacl": "boolean_CREATE_CUSTOM_NACL|DEFAULT=false",
     "internet_facing": "boolean_CLOUDFRONT_IF_TRUE_ELSE_ECS_BEHIND_INTERNAL_ALB|DEFAULT=true",
     "custom_domain": {
       "hosted_zone_name": "string_ROUTE_53_EXISTING_DOMAIN_NAME|DEFAULT=None, REQUIRED if internet_facing=false",