From c158a0530a6bf7a66ab2121046e8242b694a64ac Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Sun, 10 Jan 2021 17:48:36 +0000 Subject: [PATCH 01/50] feat(aws-glue): add Job construct (#12443) closes #12443 - Add GlueVersion, WorkerType, JobCommandName enum-like classes for future-proofing these values - Add PythonVersion as a normal enum as it's less likely to change - Add Job construct - CloudFormation base resource is a complex resource with almost all properties being optional and allowed in specific combinations based on command, glue version and worker types - It supports multiple job types e.g. ETL, Streaming and Python Shell and languages e.g. Scala and Python. This requires different combinations of props - One possibility is to try to create different constructs for the different types but I've opted to do just one base on resembling CloudFormation which does not prevent us from introducing more specialized types later - Documentation is not accurate, based on integ test deployments which led me to not having any assertions on valid props combinations - https://docs.aws.amazon.com/glue/latest/dg/add-job.html - https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-jobs-job.html - Not exposing some CloudFormation props - allocatedCapacity as it's deprecated in favour of maxCapacity - logUri which is not used and is reserved for future use --- packages/@aws-cdk/aws-glue/README.md | 14 + packages/@aws-cdk/aws-glue/lib/index.ts | 1 + packages/@aws-cdk/aws-glue/lib/job.ts | 442 +++++++++++++ packages/@aws-cdk/aws-glue/package.json | 1 + .../aws-glue/test/integ.job.expected.json | 596 ++++++++++++++++++ packages/@aws-cdk/aws-glue/test/integ.job.ts | 48 ++ .../aws-glue/test/job-script/hello_world.py | 1 + packages/@aws-cdk/aws-glue/test/job.test.ts | 273 ++++++++ 8 files changed, 1376 insertions(+) create mode 100644 packages/@aws-cdk/aws-glue/lib/job.ts create mode 100644 packages/@aws-cdk/aws-glue/test/integ.job.expected.json create mode 100644 
packages/@aws-cdk/aws-glue/test/integ.job.ts create mode 100644 packages/@aws-cdk/aws-glue/test/job-script/hello_world.py create mode 100644 packages/@aws-cdk/aws-glue/test/job.test.ts diff --git a/packages/@aws-cdk/aws-glue/README.md b/packages/@aws-cdk/aws-glue/README.md index 20e08d7c14e31..aa9eaca4994e8 100644 --- a/packages/@aws-cdk/aws-glue/README.md +++ b/packages/@aws-cdk/aws-glue/README.md @@ -41,6 +41,20 @@ If you need to use a connection type that doesn't exist as a static member on `C See [Adding a Connection to Your Data Store](https://docs.aws.amazon.com/glue/latest/dg/populate-add-connection.html) and [Connection Structure](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-connections.html#aws-glue-api-catalog-connections-Connection) documentation for more information on the supported data stores and their configurations. +## Job + +A `Job` encapsulates a script that connects to a data source, processes it, and then writes output to a data target. +Typically, a job runs extract, transform, and load (ETL) scripts. Jobs can also run general-purpose Python scripts (Python shell jobs). + +```ts +new glue.Job(stack, 'Job', { + jobCommand: glue.JobCommand.pythonShell('s3://bucketName/script.py', glue.PythonVersion.TWO), + description: 'an example pythonshell job', +}); +``` + +See [documentation](https://docs.aws.amazon.com/glue/latest/dg/add-job.html) for more information on adding jobs in Glue. + ## Database A `Database` is a logical grouping of `Tables` in the Glue Catalog. 
diff --git a/packages/@aws-cdk/aws-glue/lib/index.ts b/packages/@aws-cdk/aws-glue/lib/index.ts index a3dfa85b3be71..936d1f003aa6e 100644 --- a/packages/@aws-cdk/aws-glue/lib/index.ts +++ b/packages/@aws-cdk/aws-glue/lib/index.ts @@ -4,6 +4,7 @@ export * from './glue.generated'; export * from './connection'; export * from './data-format'; export * from './database'; +export * from './job'; export * from './schema'; export * from './security-configuration'; export * from './table'; \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts new file mode 100644 index 0000000000000..c82a1d24fcd1c --- /dev/null +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -0,0 +1,442 @@ +import * as iam from '@aws-cdk/aws-iam'; +import * as cdk from '@aws-cdk/core'; +import * as constructs from 'constructs'; +import { CfnJob } from './glue.generated'; + +/** + * TODO Consider adding the following + * - metrics/events methods + * - helper constants class with known glue special params for use in default arguments https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + +/** + * AWS Glue version determines the versions of Apache Spark and Python that are available to the job. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/add-job.html. + * + * If you need to use a GlueVersion that doesn't exist as a static member, you + * can instantiate a `GlueVersion` object, e.g: `new GlueVersion('1.5')`. + */ +export class GlueVersion { + /** + * A list of all known `GlueVersion`s. 
+ */ + public static readonly ALL = new Array(); + + /** + * Glue version using Spark 2.2.1 and Python 2.7 + */ + public static readonly ZERO_POINT_NINE = new GlueVersion('0.9'); + + /** + * Glue version using Spark 2.4.3, Python 2.7 and Python 3.6 + */ + public static readonly ONE_POINT_ZERO = new GlueVersion('1.0'); + + /** + * Glue version using Spark 2.4.3 and Python 3.7 + */ + public static readonly TWO_POINT_ZERO = new GlueVersion('2.0'); + + /** + * The name of this GlueVersion, as expected by Job resource. + */ + public readonly name: string; + + constructor(name: string) { + this.name = name; + GlueVersion.ALL.push(this); + } + + /** + * The glue version name as expected by job resource. + */ + public toString(): string { + return this.name; + } +} + +/** + * The type of predefined worker that is allocated when a job runs. + * + * If you need to use a WorkerType that doesn't exist as a static member, you + * can instantiate a `WorkerType` object, e.g: `new WorkerType('other type')`. + */ +export class WorkerType { + /** A list of all known `WorkerType`s. */ + public static readonly ALL = new Array(); + + /** + * Each worker provides 4 vCPU, 16 GB of memory and a 50GB disk, and 2 executors per worker. + */ + public static readonly STANDARD = new WorkerType('Standard'); + + /** + * Each worker maps to 1 DPU (4 vCPU, 16 GB of memory, 64 GB disk), and provides 1 executor per worker. Suitable for memory-intensive jobs. + */ + public static readonly G_1X = new WorkerType('G.1X'); + + /** + * Each worker maps to 2 DPU (8 vCPU, 32 GB of memory, 128 GB disk), and provides 1 executor per worker. Suitable for memory-intensive jobs. + */ + public static readonly G_2X = new WorkerType('G.2X'); + + /** + * The name of this WorkerType, as expected by Job resource. + */ + public readonly name: string; + + constructor(name: string) { + this.name = name; + WorkerType.ALL.push(this); + } + + /** + * The worker type name as expected by Job resource. 
+ */ + public toString(): string { + return this.name; + } +} + +/** + * Python version + */ +export enum PythonVersion { + /** + * Python 2 (the exact version depends on GlueVersion and JobCommand used) + */ + TWO = '2', + + /** + * Python 3 (the exact version depends on GlueVersion and JobCommand used) + */ + THREE = '3', +} + +/** + * The job command name used for job run. + * + * If you need to use a JobCommandName that doesn't exist as a static member, you + * can instantiate a `JobCommandName` object, e.g: `new JobCommandName('other name')`. + */ +export class JobCommandName { + /** A list of all known `JobCommandName`s. */ + public static readonly ALL = new Array(); + + /** + * Command for running a Glue ETL job. + */ + public static readonly GLUE_ETL = new JobCommandName('glueetl'); + + /** + * Command for running a Glue streaming job. + */ + public static readonly GLUE_STREAMING = new JobCommandName('gluestreaming'); + + /** + * Command for running a Glue python shell job. + */ + public static readonly PYTHON_SHELL = new JobCommandName('pythonshell'); + + /** + * The name of this JobCommandName, as expected by Job resource. + */ + public readonly name: string; + + constructor(name: string) { + this.name = name; + JobCommandName.ALL.push(this); + } + + /** + * The job command name as expected by Job resource. + */ + public toString(): string { + return this.name; + } +} + +/** + * JobCommand specifies the execution environment and the code executed when a job is run. + */ +export class JobCommand { + + /** + * Create a glueetl JobCommand with the given scriptLocation + * + * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + * @param pythonVersion specifies the Python shell version for the ETL job. Versions supported vary depending on GlueVersion. 
+ */ + public static glueEtl(scriptLocation: string, pythonVersion?: PythonVersion) { + return new JobCommand(JobCommandName.GLUE_ETL, scriptLocation, pythonVersion); + } + + /** + * Create a gluestreaming JobCommand with the given scriptLocation + * + * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + * @param pythonVersion specifies the Python shell version for the streaming job. Versions supported vary depending on GlueVersion. + */ + public static glueStreaming(scriptLocation: string, pythonVersion?: PythonVersion) { + return new JobCommand(JobCommandName.GLUE_STREAMING, scriptLocation, pythonVersion); + } + + /** + * Create a pythonshell JobCommand with the given scriptLocation and pythonVersion + * + * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + * @param pythonVersion the Python version being used to execute a Python shell job. + */ + public static pythonShell(scriptLocation: string, pythonVersion?: PythonVersion) { + return new JobCommand(JobCommandName.PYTHON_SHELL, scriptLocation, pythonVersion); + } + + /** + * The name of the job command e.g. glueetl for an Apache Spark ETL job or pythonshell for a Python shell job. + */ + readonly name: JobCommandName; + + /** + * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + */ + readonly scriptLocation: string; + + /** + * The Python version being used to execute a Python shell job. + */ + readonly pythonVersion?: PythonVersion; + + constructor(name: JobCommandName, scriptLocation: string, pythonVersion?: PythonVersion) { + this.name = name; + this.scriptLocation = scriptLocation; + this.pythonVersion = pythonVersion; + } +} + +/** + * Interface representing a created or an imported {@link Job}. + */ +export interface IJob extends cdk.IResource { + /** + * The name of the job. 
+ * @attribute + */ + readonly jobName: string; + + /** + * The ARN of the job. + * @attribute + */ + readonly jobArn: string; +} + +/** + * Attributes for importing {@link Job}. + */ +export interface JobAttributes { + /** + * The name of the job. + */ + readonly jobName: string; +} + +/** + * Construction properties for {@link Job}. + */ +export interface JobProps { + /** + * The name of the job. + * + * @default cloudformation generated name. + */ + readonly jobName?: string; + + /** + * The description of the job. + * + * @default no value. + */ + readonly description?: string; + + /** + * The number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. + * + * @default 10 when you specify an Apache Spark ETL or Streaming job, 0.0625 DPU when you specify a Python shell job. + */ + readonly maxCapacity?: number; + + /** + * The maximum number of times to retry this job after a JobRun fails. + * + * @default ? + */ + readonly maxRetries?: number; + + /** + * The maximum number of concurrent runs allowed for the job. + * An error is returned when this threshold is reached. The maximum value you can specify is controlled by a service limit. + * + * @default 1. + */ + readonly maxConcurrentRuns?: number; + + /** + * The number of minutes to wait after a job run starts, before sending a job run delay notification. + * + * @default ? + */ + readonly notifyDelayAfter?: cdk.Duration; + + /** + * The maximum time that a job run can consume resources before it is terminated and enters TIMEOUT status. + * + * @default 2,880 minutes (48 hours) + */ + readonly timeout?: cdk.Duration; + + /** + * Glue version determines the versions of Apache Spark and Python that AWS Glue supports. The Python version indicates the version supported for jobs of type Spark. + * + * @default 0.9 + */ + readonly glueVersion?: GlueVersion; + + /** + * The type of predefined worker that is allocated when a job runs. + * + * @default ? 
+ */ + readonly workerType?: WorkerType; + + /** + * The number of workers of a defined {@link WorkerType} that are allocated when a job runs. + * + * @default ? + */ + readonly numberOfWorkers?: number; + + /** + * The {@link Connection}s used for this job. + * + * TODO Enable after https://github.com/aws/aws-cdk/issues/12442 is merged. + */ + // readonly connections?: IConnection []; + + /** + * The {@link SecurityConfiguration} to use for this job. + * + * TODO Enable after https://github.com/aws/aws-cdk/issues/12449 is merged. + */ + // readonly securityConfiguration?: ISecurityConfiguration; + + /** + * The default arguments for this job, specified as name-value pairs. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html for a list of Special Parameters Used by AWS Glue + * @default no arguments + */ + readonly defaultArguments?: { [key: string]: string }; + + /** + * The tags to use with this job. + * + * @default no tags + */ + readonly tags?: { [key: string]: string }; + + /** + * The IAM role associated with this job. + * @default a new IAM role with arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole managed policy + */ + readonly role?: iam.IRole; + + /** + * The job command specifying the type of the job e.g. glueetl or pythonshell and relevant parameters. + */ + readonly jobCommand: JobCommand; +} + +/** + * A Glue Job. + */ +export class Job extends cdk.Resource implements IJob { + /** + * Imports an existing Glue Job from its attributes + * + * @param scope The scope creating construct (usually `this`). + * @param id The construct's id. 
+ * @param attrs Import attributes + */ + public static fromJobAttributes(scope: constructs.Construct, id: string, attrs: JobAttributes): IJob { + class Import extends cdk.Resource implements IJob { + public readonly jobName = attrs.jobName; + public readonly jobArn = Job.buildJobArn(scope, attrs.jobName); + } + + return new Import(scope, id); + } + + private static buildJobArn(scope: constructs.Construct, jobName: string) : string { + return cdk.Stack.of(scope).formatArn({ + service: 'glue', + resource: 'job', + resourceName: jobName, + }); + } + + /** + * The ARN of the job. + */ + public readonly jobArn: string; + + /** + * The name of the job. + */ + public readonly jobName: string; + + /** + * The IAM role associated with this job. + */ + public readonly role: iam.IRole; + + constructor(scope: constructs.Construct, id: string, props: JobProps) { + super(scope, id, { + physicalName: props.jobName, + }); + + // Create a basic service role if one is not provided https://docs.aws.amazon.com/glue/latest/dg/create-service-policy.html + this.role = props.role || new iam.Role(this, 'ServiceRole', { + assumedBy: new iam.ServicePrincipal('glue'), + // arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole + managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')], + }); + + const jobResource = new CfnJob(this, 'Resource', { + name: props.jobName, + description: props.description, + role: this.role.roleArn, + command: { + name: props.jobCommand.name.name, + scriptLocation: props.jobCommand.scriptLocation, + pythonVersion: props.jobCommand.pythonVersion, + }, + glueVersion: props.glueVersion ? props.glueVersion.name : undefined, + workerType: props.workerType ? props.workerType.name : undefined, + numberOfWorkers: props.numberOfWorkers, + maxCapacity: props.maxCapacity, + maxRetries: props.maxRetries, + executionProperty: props.maxConcurrentRuns ? 
{ maxConcurrentRuns: props.maxConcurrentRuns } : undefined, + notificationProperty: props.notifyDelayAfter ? { notifyDelayAfter: props.notifyDelayAfter.toMinutes() } : undefined, + timeout: props.timeout ? props.timeout.toMinutes() : undefined, + // connections: props.connections ? { connections: props.connections.map((connection) => connection.connectionName) } : undefined, + // securityConfiguration: props.securityConfiguration ? props.securityConfiguration.securityConfigurationName : undefined, + defaultArguments: props.defaultArguments, + tags: props.tags, + }); + + const resourceName = this.getResourceNameAttribute(jobResource.ref); + this.jobArn = Job.buildJobArn(this, resourceName); + this.jobName = resourceName; + } +} diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index 15fb49b3bc56b..c5435c94f2689 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -75,6 +75,7 @@ "devDependencies": { "@types/jest": "^26.0.23", "@aws-cdk/cx-api": "0.0.0", + "@aws-cdk/aws-s3-assets": "0.0.0", "@types/nodeunit": "^0.0.31", "cdk-build-tools": "0.0.0", "cdk-integ-tools": "0.0.0", diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json new file mode 100644 index 0000000000000..cc776129ddbd3 --- /dev/null +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -0,0 +1,596 @@ +{ + "Parameters": { + "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8": { + "Type": "String", + "Description": "S3 bucket for asset \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + }, + "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377": { + "Type": "String", + "Description": "S3 key for asset version \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + }, + 
"AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bArtifactHashB9AA8E72": { + "Type": "String", + "Description": "Artifact hash for asset \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + } + }, + "Resources": { + "MinimalGlueEtlJobServiceRole60989380": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Action": "sts:AssumeRole", + "Effect": "Allow", + "Principal": { + "Service": "glue.amazonaws.com" + } + } + ], + "Version": "2012-10-17" + }, + "ManagedPolicyArns": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":iam::aws:policy/service-role/AWSGlueServiceRole" + ] + ] + } + ] + } + }, + "MinimalGlueEtlJobServiceRoleDefaultPolicyEDA57791": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "s3:GetObject*", + "s3:GetBucket*", + "s3:List*" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + }, + "/*" + ] + ] + } + ] + } + ], + "Version": "2012-10-17" + }, + "PolicyName": "MinimalGlueEtlJobServiceRoleDefaultPolicyEDA57791", + "Roles": [ + { + "Ref": "MinimalGlueEtlJobServiceRole60989380" + } + ] + } + }, + "MinimalGlueEtlJobF8C90254": { + "Type": "AWS::Glue::Job", + "Properties": { + "Command": { + "Name": "glueetl", + "ScriptLocation": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + }, + "/", + { + "Fn::Select": [ + 0, + { + "Fn::Split": [ + "||", + { + "Ref": 
"AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + } + ] + } + ] + }, + { + "Fn::Select": [ + 1, + { + "Fn::Split": [ + "||", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + } + ] + } + ] + } + ] + ] + } + }, + "Role": { + "Fn::GetAtt": [ + "MinimalGlueEtlJobServiceRole60989380", + "Arn" + ] + } + } + }, + "MinimalGlueStreamingJobServiceRole77973DB5": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Action": "sts:AssumeRole", + "Effect": "Allow", + "Principal": { + "Service": "glue.amazonaws.com" + } + } + ], + "Version": "2012-10-17" + }, + "ManagedPolicyArns": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":iam::aws:policy/service-role/AWSGlueServiceRole" + ] + ] + } + ] + } + }, + "MinimalGlueStreamingJobServiceRoleDefaultPolicyCA892591": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "s3:GetObject*", + "s3:GetBucket*", + "s3:List*" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + }, + "/*" + ] + ] + } + ] + } + ], + "Version": "2012-10-17" + }, + "PolicyName": "MinimalGlueStreamingJobServiceRoleDefaultPolicyCA892591", + "Roles": [ + { + "Ref": "MinimalGlueStreamingJobServiceRole77973DB5" + } + ] + } + }, + "MinimalGlueStreamingJobC58FD856": { + "Type": "AWS::Glue::Job", + "Properties": { + "Command": { + "Name": "gluestreaming", + "ScriptLocation": { + "Fn::Join": [ + "", + [ + "s3://", + { + 
"Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + }, + "/", + { + "Fn::Select": [ + 0, + { + "Fn::Split": [ + "||", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + } + ] + } + ] + }, + { + "Fn::Select": [ + 1, + { + "Fn::Split": [ + "||", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + } + ] + } + ] + } + ] + ] + } + }, + "Role": { + "Fn::GetAtt": [ + "MinimalGlueStreamingJobServiceRole77973DB5", + "Arn" + ] + } + } + }, + "MinimalPythonShellJobServiceRole4944649D": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Action": "sts:AssumeRole", + "Effect": "Allow", + "Principal": { + "Service": "glue.amazonaws.com" + } + } + ], + "Version": "2012-10-17" + }, + "ManagedPolicyArns": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":iam::aws:policy/service-role/AWSGlueServiceRole" + ] + ] + } + ] + } + }, + "MinimalPythonShellJobServiceRoleDefaultPolicy0FFC6CE9": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "s3:GetObject*", + "s3:GetBucket*", + "s3:List*" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + }, + "/*" + ] + ] + } + ] + } + ], + "Version": "2012-10-17" + }, + "PolicyName": "MinimalPythonShellJobServiceRoleDefaultPolicy0FFC6CE9", + "Roles": [ + { + "Ref": "MinimalPythonShellJobServiceRole4944649D" + } + ] + } + }, + 
"MinimalPythonShellJob43B4A269": { + "Type": "AWS::Glue::Job", + "Properties": { + "Command": { + "Name": "pythonshell", + "ScriptLocation": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + }, + "/", + { + "Fn::Select": [ + 0, + { + "Fn::Split": [ + "||", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + } + ] + } + ] + }, + { + "Fn::Select": [ + 1, + { + "Fn::Split": [ + "||", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + } + ] + } + ] + } + ] + ] + } + }, + "Role": { + "Fn::GetAtt": [ + "MinimalPythonShellJobServiceRole4944649D", + "Arn" + ] + } + } + }, + "JobServiceRole4F432993": { + "Type": "AWS::IAM::Role", + "Properties": { + "AssumeRolePolicyDocument": { + "Statement": [ + { + "Action": "sts:AssumeRole", + "Effect": "Allow", + "Principal": { + "Service": "glue.amazonaws.com" + } + } + ], + "Version": "2012-10-17" + }, + "ManagedPolicyArns": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":iam::aws:policy/service-role/AWSGlueServiceRole" + ] + ] + } + ] + } + }, + "JobServiceRoleDefaultPolicy03F68F9D": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "s3:GetObject*", + "s3:GetBucket*", + "s3:List*" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + }, + "/*" + ] + ] + } + ] + } + ], + "Version": "2012-10-17" + }, + "PolicyName": 
"JobServiceRoleDefaultPolicy03F68F9D", + "Roles": [ + { + "Ref": "JobServiceRole4F432993" + } + ] + } + }, + "JobB9D00F9F": { + "Type": "AWS::Glue::Job", + "Properties": { + "Command": { + "Name": "glueetl", + "ScriptLocation": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + }, + "/", + { + "Fn::Select": [ + 0, + { + "Fn::Split": [ + "||", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + } + ] + } + ] + }, + { + "Fn::Select": [ + 1, + { + "Fn::Split": [ + "||", + { + "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + } + ] + } + ] + } + ] + ] + } + }, + "Role": { + "Fn::GetAtt": [ + "JobServiceRole4F432993", + "Arn" + ] + }, + "DefaultArguments": { + "arg1": "value1", + "arg2": "value2" + }, + "ExecutionProperty": { + "MaxConcurrentRuns": 2 + }, + "GlueVersion": "2.0", + "MaxRetries": 2, + "NotificationProperty": { + "NotifyDelayAfter": 1 + }, + "NumberOfWorkers": 10, + "Tags": { + "key": "value" + }, + "Timeout": 5, + "WorkerType": "G.2X" + } + } + } +} \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts new file mode 100644 index 0000000000000..0df46b515b286 --- /dev/null +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -0,0 +1,48 @@ +import * as path from 'path'; +import * as s3_assets from '@aws-cdk/aws-s3-assets'; +import * as cdk from '@aws-cdk/core'; +import * as glue from '../lib'; + +const app = new cdk.App(); + +const stack = new cdk.Stack(app, 'aws-glue-job'); + +const script = new s3_assets.Asset(stack, 'script', { + path: path.join(__dirname, 'job-script/hello_world.py'), +}); + +const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { + jobCommand: glue.JobCommand.glueEtl(script.s3ObjectUrl), +}); 
+script.bucket.grantRead(minimalEtlJob.role); + +const minimalStreamingJob = new glue.Job(stack, 'MinimalGlueStreamingJob', { + jobCommand: glue.JobCommand.glueStreaming(script.s3ObjectUrl), +}); +script.bucket.grantRead(minimalStreamingJob.role); + +const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { + jobCommand: glue.JobCommand.pythonShell(script.s3ObjectUrl), +}); +script.bucket.grantRead(minimalPythonShellJob.role); + +const etlJob = new glue.Job(stack, 'Job', { + jobCommand: glue.JobCommand.glueEtl(script.s3ObjectUrl), + glueVersion: glue.GlueVersion.TWO_POINT_ZERO, + workerType: glue.WorkerType.G_2X, + numberOfWorkers: 10, + maxConcurrentRuns: 2, + maxRetries: 2, + timeout: cdk.Duration.minutes(5), + notifyDelayAfter: cdk.Duration.minutes(1), + defaultArguments: { + arg1: 'value1', + arg2: 'value2', + }, + tags: { + key: 'value', + }, +}); +script.bucket.grantRead(etlJob.role); + +app.synth(); diff --git a/packages/@aws-cdk/aws-glue/test/job-script/hello_world.py b/packages/@aws-cdk/aws-glue/test/job-script/hello_world.py new file mode 100644 index 0000000000000..4c613bd3178b5 --- /dev/null +++ b/packages/@aws-cdk/aws-glue/test/job-script/hello_world.py @@ -0,0 +1 @@ +println("hello world") \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts new file mode 100644 index 0000000000000..4fc1f6fc99215 --- /dev/null +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -0,0 +1,273 @@ +import * as cdkassert from '@aws-cdk/assert-internal'; +import * as iam from '@aws-cdk/aws-iam'; +import * as cdk from '@aws-cdk/core'; +import '@aws-cdk/assert-internal/jest'; +import * as glue from '../lib'; + +describe('GlueVersion', () => { + test('.ZERO_POINT_NINE', () => expect(glue.GlueVersion.ZERO_POINT_NINE.name).toEqual('0.9')); + + test('.ONE_POINT_ZERO', () => expect(glue.GlueVersion.ONE_POINT_ZERO.name).toEqual('1.0')); + + test('.TWO_POINT_ZERO', () => 
expect(glue.GlueVersion.TWO_POINT_ZERO.name).toEqual('2.0')); + + test('new sets name correctly', () => expect(new glue.GlueVersion('CustomVersion').name).toEqual('CustomVersion')); +}); + +describe('WorkerType', () => { + test('.STANDARD', () => expect(glue.WorkerType.STANDARD.name).toEqual('Standard')); + + test('.G_1X', () => expect(glue.WorkerType.G_1X.name).toEqual('G.1X')); + + test('.G_2X', () => expect(glue.WorkerType.G_2X.name).toEqual('G.2X')); + + test('new sets name correctly', () => expect(new glue.WorkerType('CustomType').name).toEqual('CustomType')); +}); + +describe('JobCommandName', () => { + test('.GLUE_ETL', () => expect(glue.JobCommandName.GLUE_ETL.name).toEqual('glueetl')); + + test('.GLUE_STREAMING', () => expect(glue.JobCommandName.GLUE_STREAMING.name).toEqual('gluestreaming')); + + test('.PYTHON_SHELL', () => expect(glue.JobCommandName.PYTHON_SHELL.name).toEqual('pythonshell')); + + test('new sets name correctly', () => expect(new glue.JobCommandName('CustomName').name).toEqual('CustomName')); +}); + +describe('JobCommand', () => { + let scriptLocation: string; + + beforeEach(() => { + scriptLocation = 's3://bucketName/script'; + }); + + describe('new', () => { + let jobCommandName: glue.JobCommandName; + + // known command names + custom one + glue.JobCommandName.ALL.concat(new glue.JobCommandName('CustomName')).forEach((name) => { + describe(`with ${name} JobCommandName`, () => { + + beforeEach(() => { + jobCommandName = name; + }); + + test('without specified python version sets properties correctly', () => { + const jobCommand = new glue.JobCommand(jobCommandName, scriptLocation); + + expect(jobCommand.name).toEqual(jobCommandName); + expect(jobCommand.scriptLocation).toEqual(scriptLocation); + expect(jobCommand.pythonVersion).toBeUndefined(); + }); + + test('with specified python version sets properties correctly', () => { + const pythonVersion = glue.PythonVersion.TWO; + const jobCommand = new glue.JobCommand(jobCommandName, 
scriptLocation, pythonVersion); + + expect(jobCommand.name).toEqual(jobCommandName); + expect(jobCommand.scriptLocation).toEqual(scriptLocation); + expect(jobCommand.pythonVersion).toEqual(pythonVersion); + }); + }); + }); + }); + + test('.glueEtl uses GLUE_ETL JobCommandName', () => { + const jobCommand = glue.JobCommand.glueEtl(scriptLocation); + + expect(jobCommand.name).toEqual(glue.JobCommandName.GLUE_ETL); + expect(jobCommand.scriptLocation).toEqual(scriptLocation); + expect(jobCommand.pythonVersion).toBeUndefined(); + }); + + test('.glueStreaming uses GLUE_STREAMING JobCommandName', () => { + const jobCommand = glue.JobCommand.glueStreaming(scriptLocation, glue.PythonVersion.THREE); + + expect(jobCommand.name).toEqual(glue.JobCommandName.GLUE_STREAMING); + expect(jobCommand.scriptLocation).toEqual(scriptLocation); + expect(jobCommand.pythonVersion).toEqual(glue.PythonVersion.THREE); + }); + + test('.pythonShell uses PYTHON_SHELL JobCommandName', () => { + const jobCommand = glue.JobCommand.pythonShell(scriptLocation, glue.PythonVersion.TWO); + + expect(jobCommand.name).toEqual(glue.JobCommandName.PYTHON_SHELL); + expect(jobCommand.scriptLocation).toEqual(scriptLocation); + expect(jobCommand.pythonVersion).toEqual(glue.PythonVersion.TWO); + }); +}); + +describe('Job', () => { + let stack: cdk.Stack; + let scriptLocation: string; + let jobName: string; + let job: glue.Job; + + beforeEach(() => { + stack = new cdk.Stack(); + scriptLocation = 's3://bucketName/script'; + jobName = 'test-job'; + }); + + test('.fromJobAttributes should return correct jobName and jobArn', () => { + const iJob = glue.Job.fromJobAttributes(stack, 'ImportedJob', { jobName }); + + expect(iJob.jobName).toEqual(jobName); + expect(iJob.jobArn).toEqual(stack.formatArn({ + service: 'glue', + resource: 'job', + resourceName: jobName, + })); + }); + + describe('new', () => { + describe('with necessary props only', () => { + beforeEach(() => { + job = new glue.Job(stack, 'Job', { + jobCommand: 
glue.JobCommand.glueEtl(scriptLocation), + }); + }); + + test('should create a role and use it with the job', () => { + // check the role + expect(job.role).toBeDefined(); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::IAM::Role', { + AssumeRolePolicyDocument: { + Statement: [ + { + Action: 'sts:AssumeRole', + Effect: 'Allow', + Principal: { + Service: 'glue.amazonaws.com', + }, + }, + ], + Version: '2012-10-17', + }, + ManagedPolicyArns: [ + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':iam::aws:policy/service-role/AWSGlueServiceRole', + ], + ], + }, + ], + })); + + // check the job using the role + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Command: { + Name: 'glueetl', + ScriptLocation: scriptLocation, + }, + Role: { + 'Fn::GetAtt': [ + 'JobServiceRole4F432993', + 'Arn', + ], + }, + })); + }); + + test('should return correct jobName and jobArn from CloudFormation', () => { + expect(stack.resolve(job.jobName)).toEqual({ Ref: 'JobB9D00F9F' }); + expect(stack.resolve(job.jobArn)).toEqual({ + 'Fn::Join': ['', [ + 'arn:', { Ref: 'AWS::Partition' }, + ':glue:', { Ref: 'AWS::Region' }, ':', + { Ref: 'AWS::AccountId' }, ':job/', { Ref: 'JobB9D00F9F' }, + ]], + }); + }); + + test('with a custom role should use it and set it in CloudFormation', () => { + const role = iam.Role.fromRoleArn(stack, 'Role', 'arn:aws:iam::123456789012:role/TestRole'); + job = new glue.Job(stack, 'JobWithRole', { + jobCommand: glue.JobCommand.glueEtl(scriptLocation), + role, + }); + + expect(job.role).toEqual(role); + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + Role: role.roleArn, + })); + }); + + test('with a custom jobName should set it in CloudFormation', () => { + job = new glue.Job(stack, 'JobWithName', { + jobCommand: glue.JobCommand.glueEtl(scriptLocation), + jobName, + }); + + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + Name: jobName, + })); + }); + }); + 
+ describe('with props', () => { + beforeEach(() => { + job = new glue.Job(stack, 'Job', { + jobName, + description: 'test job', + jobCommand: glue.JobCommand.glueEtl(scriptLocation), + glueVersion: glue.GlueVersion.TWO_POINT_ZERO, + workerType: glue.WorkerType.G_2X, + numberOfWorkers: 10, + maxConcurrentRuns: 2, + maxRetries: 2, + timeout: cdk.Duration.minutes(5), + notifyDelayAfter: cdk.Duration.minutes(1), + defaultArguments: { + arg1: 'value1', + arg2: 'value2', + }, + tags: { + key: 'value', + }, + }); + }); + + test('should synthesize correctly', () => { + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Command: { + Name: 'glueetl', + ScriptLocation: 's3://bucketName/script', + }, + Role: { + 'Fn::GetAtt': [ + 'JobServiceRole4F432993', + 'Arn', + ], + }, + DefaultArguments: { + arg1: 'value1', + arg2: 'value2', + }, + Description: 'test job', + ExecutionProperty: { + MaxConcurrentRuns: 2, + }, + GlueVersion: '2.0', + MaxRetries: 2, + Name: 'test-job', + NotificationProperty: { + NotifyDelayAfter: 1, + }, + NumberOfWorkers: 10, + Tags: { + key: 'value', + }, + Timeout: 5, + WorkerType: 'G.2X', + })); + }); + }); + }); +}); From 9ea48b67ac136aea3d4410a1fa1748fa9e86b02e Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Thu, 14 Jan 2021 18:16:32 +0000 Subject: [PATCH 02/50] support job's event rules and rule-based metrics - glue job does not emit success or failure metrics to cloudwatch metrics. 
Instead, it emits events to cloudwatch events - add JobEventState enum for known job states in https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types - add utility methods to create event rules and cloudwatch metrics based on those rules supporting the most common use cases: success, failure and timeout --- packages/@aws-cdk/aws-glue/lib/job.ts | 160 +++++- packages/@aws-cdk/aws-glue/package.json | 4 + .../aws-glue/test/integ.job.expected.json | 37 ++ packages/@aws-cdk/aws-glue/test/integ.job.ts | 2 + packages/@aws-cdk/aws-glue/test/job.test.ts | 476 +++++++++++++++++- 5 files changed, 668 insertions(+), 11 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index c82a1d24fcd1c..62ba43d2f5b57 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -1,3 +1,5 @@ +import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; +import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; @@ -5,8 +7,8 @@ import { CfnJob } from './glue.generated'; /** * TODO Consider adding the following - * - metrics/events methods - * - helper constans class with known glue special params for use in default arguments https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + * - cloudwatch metrics helpers/methods + * - helper constants class with known glue special params for use in default arguments https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ /** @@ -114,6 +116,48 @@ export enum PythonVersion { THREE = '3', } +/** + * Job states emitted by Glue to CloudWatch Events. + * + * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types for more information.
+ */ +export enum JobEventState { + /** + * State indicating job run succeeded + */ + SUCCEEDED = 'SUCCEEDED', + + /** + * State indicating job run failed + */ + FAILED = 'FAILED', + + /** + * State indicating job run timed out + */ + TIMEOUT = 'TIMEOUT', + + /** + * State indicating job is starting + */ + STARTING = 'STARTING', + + /** + * State indicating job is running + */ + RUNNING = 'RUNNING', + + /** + * State indicating job is stopping + */ + STOPPING = 'STOPPING', + + /** + * State indicating job stopped + */ + STOPPED = 'STOPPED', +} + /** * The job command name used for job run. * @@ -377,6 +421,23 @@ export class Job extends cdk.Resource implements IJob { return new Import(scope, id); } + /** + * Create a CloudWatch Metric with namespace = 'AWS/Events', metricName = 'TriggeredRules' and RuleName = rule.ruleName dimension. + * This is used by ruleXXXMetric methods. + * + * @param rule for use in setting RuleName dimension value + * @param props metric properties + */ + public static ruleMetric(rule: events.IRule, props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return new cloudwatch.Metric({ + namespace: 'AWS/Events', + metricName: 'TriggeredRules', + dimensions: { RuleName: rule.ruleName }, + statistic: cloudwatch.Statistic.SUM, + ...props, + }); + } + private static buildJobArn(scope: constructs.Construct, jobName: string) : string { return cdk.Stack.of(scope).formatArn({ service: 'glue', @@ -400,6 +461,12 @@ export class Job extends cdk.Resource implements IJob { */ public readonly role: iam.IRole; + /** + * Used to cache results of ._rule calls. 
+ * @private + */ + private _rules: Record = {}; + constructor(scope: constructs.Construct, id: string, props: JobProps) { super(scope, id, { physicalName: props.jobName, @@ -439,4 +506,93 @@ export class Job extends cdk.Resource implements IJob { this.jobArn = Job.buildJobArn(this, resourceName); this.jobName = resourceName; } + + + /** + * Create a CloudWatch Event Rule matching transition into the given `JobEventState`s + * + * @param state used in matching the CloudWatch Event + * + * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types + */ + public rule(state: JobEventState, props?: events.RuleProps): events.Rule { + const ruleId = `${state}Rule`; + return new events.Rule(this, ruleId, { + description: `Event triggered when Glue job ${this.jobName} is in ${state} state`, + eventPattern: { + source: ['aws.glue'], + detailType: ['Glue Job State Change', 'Glue Job Run Status'], + detail: { + state: [state], + jobName: [this.jobName], + }, + }, + ...props, + }); + } + + /** + * Return a CloudWatch Event Rule matching JobEventState.SUCCEEDED. + * The rule is cached for later usage. + * + * @param props rule props for first invocation. If props are passed again they are ignored. + */ + public successRule(props?: events.RuleProps): events.Rule { + return this._rule(JobEventState.SUCCEEDED, props); + } + + /** + * Return a CloudWatch Event Rule matching JobEventState.FAILED. + * The rule is cached for later usage. + * + * @param props rule props for first invocation. If props are passed again they are ignored. + */ + public failureRule(props?: events.RuleProps): events.Rule { + return this._rule(JobEventState.FAILED, props); + } + + /** + * Return a CloudWatch Event Rule matching JobEventState.TIMEOUT. + * The rule is cached for later usage. + * + * @param props rule props for first invocation. If props are passed again they are ignored. 
+ */ + public timeoutRule(props?: events.RuleProps): events.Rule { + return this._rule(JobEventState.TIMEOUT, props); + } + + /** + * Return a CloudWatch Metric indicating job success that's based on successRule() + */ + public successRuleMetric(props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return Job.ruleMetric(this.successRule(), props); + } + + /** + * Return a CloudWatch Metric indicating job success that's based on failureRule() + */ + public failureRuleMetric(props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return Job.ruleMetric(this.failureRule(), props); + } + + /** + * Return a CloudWatch Metric indicating job success that's based on timeoutRule() + */ + public timeoutRuleMetric(props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return Job.ruleMetric(this.timeoutRule(), props); + } + + /** + * Create a Rule with the given props and caches the results. Subsequent calls ignore props + * + * @param state used in matching the CloudWatch Event + * @param props rule properties + * @private + */ + private _rule(state: JobEventState, props?: events.RuleProps) { + if (!this._rules[state]) { + this._rules[state] = this.rule(state, props); + } + return this._rules[state]; + } } diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index c5435c94f2689..07e9432402ebe 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -85,6 +85,8 @@ "@aws-cdk/assert-internal": "0.0.0" }, "dependencies": { + "@aws-cdk/aws-cloudwatch": "0.0.0", + "@aws-cdk/aws-events": "0.0.0", "@aws-cdk/aws-ec2": "0.0.0", "@aws-cdk/aws-iam": "0.0.0", "@aws-cdk/aws-kms": "0.0.0", @@ -94,6 +96,8 @@ }, "homepage": "https://github.com/aws/aws-cdk", "peerDependencies": { + "@aws-cdk/aws-cloudwatch": "0.0.0", + "@aws-cdk/aws-events": "0.0.0", "@aws-cdk/aws-ec2": "0.0.0", "@aws-cdk/aws-iam": "0.0.0", "@aws-cdk/aws-kms": "0.0.0", diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json 
b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json index cc776129ddbd3..c445bbaf24d57 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -591,6 +591,43 @@ "Timeout": 5, "WorkerType": "G.2X" } + }, + "JobSUCCEEDEDRule682F039B": { + "Type": "AWS::Events::Rule", + "Properties": { + "Description": { + "Fn::Join": [ + "", + [ + "Event triggered when Glue job ", + { + "Ref": "JobB9D00F9F" + }, + " is in SUCCEEDED state" + ] + ] + }, + "EventPattern": { + "source": [ + "aws.glue" + ], + "detail-type": [ + "Glue Job State Change", + "Glue Job Run Status" + ], + "detail": { + "state": [ + "SUCCEEDED" + ], + "jobName": [ + { + "Ref": "JobB9D00F9F" + } + ] + } + }, + "State": "ENABLED" + } } } } \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index 0df46b515b286..6da0263a47167 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -45,4 +45,6 @@ const etlJob = new glue.Job(stack, 'Job', { }); script.bucket.grantRead(etlJob.role); +etlJob.successRuleMetric(); + app.synth(); diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 4fc1f6fc99215..65c2df0c4b486 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -1,4 +1,6 @@ import * as cdkassert from '@aws-cdk/assert-internal'; +import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; +import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; import * as cdk from '@aws-cdk/core'; import '@aws-cdk/assert-internal/jest'; @@ -72,7 +74,7 @@ describe('JobCommand', () => { }); }); - test('.glueEtl uses GLUE_ETL JobCommandName', () => { + test('.glueEtl() uses GLUE_ETL JobCommandName', () => { const jobCommand = glue.JobCommand.glueEtl(scriptLocation); 
expect(jobCommand.name).toEqual(glue.JobCommandName.GLUE_ETL); @@ -80,7 +82,7 @@ describe('JobCommand', () => { expect(jobCommand.pythonVersion).toBeUndefined(); }); - test('.glueStreaming uses GLUE_STREAMING JobCommandName', () => { + test('.glueStreaming() uses GLUE_STREAMING JobCommandName', () => { const jobCommand = glue.JobCommand.glueStreaming(scriptLocation, glue.PythonVersion.THREE); expect(jobCommand.name).toEqual(glue.JobCommandName.GLUE_STREAMING); @@ -88,7 +90,7 @@ describe('JobCommand', () => { expect(jobCommand.pythonVersion).toEqual(glue.PythonVersion.THREE); }); - test('.pythonShell uses PYTHON_SHELL JobCommandName', () => { + test('.pythonShell() uses PYTHON_SHELL JobCommandName', () => { const jobCommand = glue.JobCommand.pythonShell(scriptLocation, glue.PythonVersion.TWO); expect(jobCommand.name).toEqual(glue.JobCommandName.PYTHON_SHELL); @@ -99,17 +101,14 @@ describe('JobCommand', () => { describe('Job', () => { let stack: cdk.Stack; - let scriptLocation: string; let jobName: string; - let job: glue.Job; beforeEach(() => { stack = new cdk.Stack(); - scriptLocation = 's3://bucketName/script'; jobName = 'test-job'; }); - test('.fromJobAttributes should return correct jobName and jobArn', () => { + test('.fromJobAttributes() should return correct jobName and jobArn', () => { const iJob = glue.Job.fromJobAttributes(stack, 'ImportedJob', { jobName }); expect(iJob.jobName).toEqual(jobName); @@ -120,7 +119,44 @@ describe('Job', () => { })); }); - describe('new', () => { + describe('.ruleMetric()', () => { + let rule: events.IRule; + + beforeEach(() => { + rule = events.Rule.fromEventRuleArn(stack, 'Rule', 'arn:aws:events:us-east-1:123456789012:rule/example'); + }); + + test('with no props returns default metric', () => { + expect(glue.Job.ruleMetric(rule)).toEqual(new cloudwatch.Metric({ + dimensions: { + RuleName: 'example', + }, + metricName: 'TriggeredRules', + namespace: 'AWS/Events', + statistic: 'Sum', + })); + }); + + test('with props 
overrides', () => { + expect(glue.Job.ruleMetric(rule, { statistic: cloudwatch.Statistic.AVERAGE })).toEqual(new cloudwatch.Metric({ + dimensions: { + RuleName: 'example', + }, + metricName: 'TriggeredRules', + namespace: 'AWS/Events', + statistic: 'Average', + })); + }); + }); + + describe('new instance', () => { + let scriptLocation: string; + let job: glue.Job; + + beforeEach(() => { + scriptLocation = 's3://bucketName/script'; + }); + describe('with necessary props only', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { @@ -211,7 +247,7 @@ describe('Job', () => { }); }); - describe('with props', () => { + describe('with extended props', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { jobName, @@ -269,5 +305,427 @@ describe('Job', () => { })); }); }); + + describe('event rules and rule-based metrics', () => { + beforeEach(() => { + job = new glue.Job(stack, 'Job', { + jobCommand: glue.JobCommand.glueEtl(scriptLocation), + }); + }); + + test('.rule() creates the expected event rule', () => { + job.rule(glue.JobEventState.STOPPING); + + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Event triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in STOPPING state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'STOPPING', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + + describe('.successRule()', () => { + test('without props and multiple calls should create one resource and cache it', () => { + const firstInvocationRule = job.successRule(); + const subsequentInvocationRule = job.successRule(); + + expect(subsequentInvocationRule).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 1); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { 
+ Description: { + 'Fn::Join': [ + '', + [ + 'Event triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in SUCCEEDED state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'SUCCEEDED', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + + test('with props and multiple calls should create one resource and cache it and ignore later props', () => { + const firstInvocationRule = job.successRule({ description: 'description override' }); + const subsequentInvocationRuleWithProps = job.successRule({ description: 'description to ignore' }); + const subsequentInvocationRuleWithoutProps = job.successRule(); + + expect(subsequentInvocationRuleWithProps).toEqual(firstInvocationRule); + expect(subsequentInvocationRuleWithoutProps).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 1); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Description: 'description override', + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'SUCCEEDED', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + }); + + describe('.failureRule()', () => { + test('without props and multiple calls should create one resource and cache it', () => { + const firstInvocationRule = job.failureRule(); + const subsequentInvocationRule = job.failureRule(); + + expect(subsequentInvocationRule).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 1); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Event triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in FAILED state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', 
+ ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'FAILED', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + + test('with props and multiple calls should create one resource and cache it and ignore later props', () => { + const firstInvocationRule = job.failureRule({ description: 'description override' }); + const subsequentInvocationRuleWithProps = job.failureRule({ description: 'description to ignore' }); + const subsequentInvocationRuleWithoutProps = job.failureRule(); + + expect(subsequentInvocationRuleWithProps).toEqual(firstInvocationRule); + expect(subsequentInvocationRuleWithoutProps).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 1); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Description: 'description override', + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'FAILED', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + }); + + describe('.timeoutRule()', () => { + test('without props and multiple calls should create one resource and cache it', () => { + const firstInvocationRule = job.timeoutRule(); + const subsequentInvocationRule = job.timeoutRule(); + + expect(subsequentInvocationRule).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 1); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Event triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in TIMEOUT state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'TIMEOUT', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + 
})); + }); + + test('with props and multiple calls should create one resource and cache it and ignore later props', () => { + const firstInvocationRule = job.timeoutRule({ description: 'description override' }); + const subsequentInvocationRuleWithProps = job.timeoutRule({ description: 'description to ignore' }); + const subsequentInvocationRuleWithoutProps = job.timeoutRule(); + + expect(subsequentInvocationRuleWithProps).toEqual(firstInvocationRule); + expect(subsequentInvocationRuleWithoutProps).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 1); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Description: 'description override', + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'TIMEOUT', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + }); + + test('.successRuleMetric() creates the expected event rule and corresponding metric', () => { + const metric = job.successRuleMetric(); + + expect(metric).toEqual(new cloudwatch.Metric({ + dimensions: { + RuleName: job.successRule().ruleName, + }, + metricName: 'TriggeredRules', + namespace: 'AWS/Events', + statistic: 'Sum', + })); + + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Event triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in SUCCEEDED state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'SUCCEEDED', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + + test('.failureRuleMetric() creates the expected event rule and corresponding metric', () => { + const metric = job.failureRuleMetric(); + + expect(metric).toEqual(new cloudwatch.Metric({ + 
dimensions: { + RuleName: job.failureRule().ruleName, + }, + metricName: 'TriggeredRules', + namespace: 'AWS/Events', + statistic: 'Sum', + })); + + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Event triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in FAILED state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'FAILED', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + + test('.timeoutRuleMetric() creates the expected event rule and corresponding metric', () => { + const metric = job.timeoutRuleMetric(); + + expect(metric).toEqual(new cloudwatch.Metric({ + dimensions: { + RuleName: job.timeoutRule().ruleName, + }, + metricName: 'TriggeredRules', + namespace: 'AWS/Events', + statistic: 'Sum', + })); + + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Event triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in TIMEOUT state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'TIMEOUT', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); + }); + }); }); }); From 638129c52cbe96abc5edeaac82aef7b1dcf2726b Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Thu, 14 Jan 2021 22:42:09 +0000 Subject: [PATCH 03/50] add metric helper method https://docs.aws.amazon.com/glue/latest/dg/monitoring-awsglue-with-cloudwatch-metrics.html --- packages/@aws-cdk/aws-glue/lib/job.ts | 44 ++++++++++++++++++-- packages/@aws-cdk/aws-glue/test/job.test.ts | 45 ++++++++++++++++++--- 2 files changed, 81 insertions(+), 8 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts 
b/packages/@aws-cdk/aws-glue/lib/job.ts index 62ba43d2f5b57..f1d25f03e2cd1 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -7,7 +7,6 @@ import { CfnJob } from './glue.generated'; /** * TODO Consider adding the following - * - cloudwatch metrics helpers/methods * - helper constants class with known glue special params for use in default arguments https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ @@ -158,6 +157,23 @@ export enum JobEventState { STOPPED = 'STOPPED', } +/** + * The Glue CloudWatch metric type. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitoring-awsglue-with-cloudwatch-metrics.html + */ +export enum MetricType { + /** + * A value at a point in time. + */ + GAUGE = 'gauge', + + /** + * An aggregate number. + */ + COUNT = 'count', +} + /** * The job command name used for job run. * @@ -435,7 +451,7 @@ export class Job extends cdk.Resource implements IJob { dimensions: { RuleName: rule.ruleName }, statistic: cloudwatch.Statistic.SUM, ...props, - }); + }).attachTo(rule); } private static buildJobArn(scope: constructs.Construct, jobName: string) : string { @@ -583,7 +599,29 @@ export class Job extends cdk.Resource implements IJob { } /** - * Create a Rule with the given props and caches the results. Subsequent calls ignore props + * Create a CloudWatch metric. + * + * @param metricName name of the metric typically prefixed with `glue.driver.`, `glue..` or `glue.ALL.`. + * @param jobRunId a dimension that filters for metrics of a specific JobRun ID, or `ALL`. + * @param type the metric type. 
+ * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitoring-awsglue-with-cloudwatch-metrics.html + */ + public metric(metricName: string, jobRunId: string, type: MetricType, props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return new cloudwatch.Metric({ + metricName, + namespace: 'Glue', + dimensions: { + JobName: this.jobName, + JobRunId: jobRunId, + Type: type, + }, + ...props, + }).attachTo(this); + } + + /** + * Create a Rule with the given props and caches the resulting rule. Subsequent returns cached value and ignores props * * @param state used in matching the CloudWatch Event * @param props rule properties diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 65c2df0c4b486..9280f97bd1b60 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -149,7 +149,7 @@ describe('Job', () => { }); }); - describe('new instance', () => { + describe('new', () => { let scriptLocation: string; let job: glue.Job; @@ -316,7 +316,7 @@ describe('Job', () => { test('.rule() creates the expected event rule', () => { job.rule(glue.JobEventState.STOPPING); - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Events::Rule', { + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -595,7 +595,7 @@ describe('Job', () => { statistic: 'Sum', })); - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Events::Rule', { + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -643,7 +643,7 @@ describe('Job', () => { statistic: 'Sum', })); - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Events::Rule', { + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -691,7 +691,7 @@ describe('Job', () => { statistic: 'Sum', })); - 
cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Events::Rule', { + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -727,5 +727,40 @@ describe('Job', () => { })); }); }); + + describe('.metric()', () => { + + test('to create a count sum metric', () => { + const metricName = 'glue.driver.aggregate.bytesRead'; + const props = { statistic: cloudwatch.Statistic.SUM }; + + expect(job.metric(metricName, 'ALL', glue.MetricType.COUNT, props)).toEqual(new cloudwatch.Metric({ + metricName, + statistic: 'Sum', + namespace: 'Glue', + dimensions: { + JobName: job.jobName, + JobRunId: 'ALL', + Type: 'count', + }, + })); + }); + + test('to create a gauge average metric', () => { + const metricName = 'glue.driver.BlockManager.disk.diskSpaceUsed_MB'; + const props = { statistic: cloudwatch.Statistic.AVERAGE }; + + expect(job.metric(metricName, 'ALL', glue.MetricType.GAUGE, props)).toEqual(new cloudwatch.Metric({ + metricName, + statistic: 'Average', + namespace: 'Glue', + dimensions: { + JobName: job.jobName, + JobRunId: 'ALL', + Type: 'gauge', + }, + })); + }); + }); }); }); From b99dbcec15ef77e53b2bf5bd100ab14c743afcd5 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Fri, 15 Jan 2021 15:01:38 +0000 Subject: [PATCH 04/50] add JobSpecialArgumentNames for glue special parameters https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html --- packages/@aws-cdk/aws-glue/lib/job.ts | 130 ++++++++++++++++++++++-- packages/@aws-cdk/aws-glue/package.json | 3 +- 2 files changed, 126 insertions(+), 7 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index f1d25f03e2cd1..1c114780f8f29 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -5,11 +5,6 @@ import * as cdk from '@aws-cdk/core'; import * as constructs from 
'constructs'; import { CfnJob } from './glue.generated'; -/** - * TODO Consider adding the following - * - helper constants class with known glue special params for use in default arguments https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - /** * AWS Glue version determines the versions of Apache Spark and Python that are available to the job. * @@ -274,6 +269,128 @@ export class JobCommand { } } +/** + * Constants for some of the special parameters used by {@link Job}. + * + * These constants can be used as argument names in {@link JobProps.defaultArguments}. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html + */ +export enum JobSpecialArgumentNames { + /** + * The script programming language. This value must be either `scala` or `python`. If this parameter is not present, the default is `python`. + */ + JOB_LANGUAGE = '--job-language', + + /** + * The Scala class that serves as the entry point for your Scala script. This applies only if your `--job-language` is set to `scala`. + */ + CLASS = '--class', + + /** + * The Amazon Simple Storage Service (Amazon S3) location where your ETL script is located (in the form `s3://path/to/my/script.py`). + * This parameter overrides a script location set in the JobCommand object. + */ + SCRIPT_LOCATION = '--scriptLocation', + + /** + * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. + * Multiple values must be complete paths separated by a comma (`,`). + */ + EXTRA_PY_FILES = '--extra-py-files', + + /** + * The Amazon S3 paths to additional Java `.jar` files that AWS Glue adds to the Java classpath before executing your script. + * Multiple values must be complete paths separated by a comma (`,`). 
+ */ + EXTRA_JARS = '--extra-jars', + + /** + * Setting this value to true prioritizes the customer's extra JAR files in the classpath. + * This option is only available in AWS Glue version 2.0. + */ + USER_JARS_FIRST = '--user-jars-first', + + /** + * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * Multiple values must be complete paths separated by a comma (`,`). + */ + EXTRA_FILES = '--extra-files', + + /** + * Controls the behavior of a job bookmark. The following option values can be set. + */ + JOB_BOOKMARK_OPTION = '--job-bookmark-option', + + /** + * Specifies an Amazon S3 path to a bucket that can be used as a temporary directory for the job. + */ + TEMP_DIR = '--TempDir', + + /** + * Enables the EMRFS S3-optimized committer for writing Parquet data into Amazon S3. + * Setting the value to `true` enables the committer. By default the flag is turned off. + */ + ENABLE_S3_PARQUET_OPTIMIZED_COMMITTER = '--enable-s3-parquet-optimized-committer', + + /** + * Sets the EMRFS rename algorithm version to version 2. + * This option is only available on AWS Glue version 1.0. + */ + ENABLE_RENAME_ALGORITHM_V2 = '--enable-rename-algorithm-v2', + + /** + * Enables using the AWS Glue Data Catalog as an Apache Spark Hive metastore. + */ + ENABLE_GLUE_DATA_CATALOG = '--enable-glue-datacatalog', + + /** + * Enables the collection of metrics for job profiling for this job run. + * To enable metrics, only specify the key; no value is needed. + */ + ENABLE_METRICS = '--enable-metrics', + + /** + * Enables real-time continuous logging for AWS Glue jobs to view real-time Apache Spark job logs in CloudWatch. + */ + ENABLE_CONTINUOUS_LOGGING = '--enable-continuous-cloudwatch-log', + + /** + * Specifies a standard filter (true) or no filter (false) for continuous logging. 
+ * Choosing the standard filter prunes out non-useful Apache Spark driver/executor and Apache Hadoop YARN heartbeat log messages. + * Choosing no filter gives all the log messages. + */ + ENABLE_CONTINUOUS_LOG_FILTER = '--enable-continuous-log-filter', + + /** + * Specifies a custom Amazon CloudWatch log group name for a job enabled for continuous logging. + */ + LOG_GROUP = '--continuous-log-logGroup', + + /** + * Specifies a custom CloudWatch log stream prefix for a job enabled for continuous logging. + */ + LOG_STREAM_PREFIX = '--continuous-log-logStreamPrefix', + + /** + * Specifies a custom conversion log pattern for a job enabled for continuous logging. + */ + LOG_CONVERSION_PATTERN = '--continuous-log-conversionPattern', + + /** + * Enables Apache Spark web UI. + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html + */ + ENABLE_SPARK_UI = '--enable-spark-ui', + + /** + * Specifies the Amazon S3 path for storing the Spark event logs for the job. + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html + */ + SPARK_UI_LOGS_PATH = '--spark-event-logs-path', +} + /** * Interface representing a created or an imported {@link Job}. */ @@ -393,7 +510,8 @@ export interface JobProps { /** * The default arguments for this job, specified as name-value pairs. * - * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html for a list of Special Parameters Used by AWS Glue + * {@link JobSpecialArgumentNames} defines some of the Special Parameters used by AWS Glue. 
+ * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html A list of Special Parameters Used by AWS Glue * @default no arguments */ readonly defaultArguments?: { [key: string]: string }; diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index 07e9432402ebe..9cae577424184 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -148,7 +148,8 @@ "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.XML", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.PARQUET", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.ORC", - "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.value" + "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.value", + "no-unused-type:@aws-cdk/aws-glue.JobSpecialArgumentNames" ] }, "awscdkio": { From 06b4d55dcfc474749bf2afdab86aa61de62d35e7 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 17 Feb 2021 16:01:23 +0000 Subject: [PATCH 05/50] rebase to use Connection and SecurityConfiguration --- packages/@aws-cdk/aws-glue/lib/job.ts | 28 ++++++--------------- packages/@aws-cdk/aws-glue/test/job.test.ts | 11 +++++++- 2 files changed, 18 insertions(+), 21 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 1c114780f8f29..1d4d3a3b3946c 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -3,7 +3,9 @@ import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; +import { IConnection } from './connection'; import { CfnJob } from './glue.generated'; +import { ISecurityConfiguration } from './security-configuration'; /** * AWS Glue version determines the versions of Apache Spark and Python that are available to the job. 
@@ -14,11 +16,6 @@ import { CfnJob } from './glue.generated'; * can instantiate a `GlueVersion` object, e.g: `new GlueVersion('1.5')`. */ export class GlueVersion { - /** - * A list of all known `GlueVersion`s. - */ - public static readonly ALL = new Array(); - /** * Glue version using Spark 2.2.1 and Python 2.7 */ @@ -41,7 +38,6 @@ export class GlueVersion { constructor(name: string) { this.name = name; - GlueVersion.ALL.push(this); } /** @@ -59,9 +55,6 @@ export class GlueVersion { * can instantiate a `WorkerType` object, e.g: `new WorkerType('other type')`. */ export class WorkerType { - /** A list of all known `WorkerType`s. */ - public static readonly ALL = new Array(); - /** * Each worker provides 4 vCPU, 16 GB of memory and a 50GB disk, and 2 executors per worker. */ @@ -84,7 +77,6 @@ export class WorkerType { constructor(name: string) { this.name = name; - WorkerType.ALL.push(this); } /** @@ -176,9 +168,6 @@ export enum MetricType { * can instantiate a `WorkerType` object, e.g: `new JobCommandName('other name')`. */ export class JobCommandName { - /** A list of all known `JobCommandName`s. */ - public static readonly ALL = new Array(); - /** * Command for running a Glue ETL job. */ @@ -201,7 +190,6 @@ export class JobCommandName { constructor(name: string) { this.name = name; - JobCommandName.ALL.push(this); } /** @@ -496,16 +484,16 @@ export interface JobProps { /** * The {@link Connection}s used for this job. * - * TODO Enable after https://github.com/aws/aws-cdk/issues/12442 is merged. + * @default no connection. */ - // readonly connections?: IConnection []; + readonly connections?: IConnection []; /** * The {@link SecurityConfiguration} to use for this job. * - * TODO Enable after https://github.com/aws/aws-cdk/issues/12449 is merged. + * @default no security configuration. 
*/ - // readonly securityConfiguration?: ISecurityConfiguration; + readonly securityConfiguration?: ISecurityConfiguration; /** * The default arguments for this job, specified as name-value pairs. @@ -630,8 +618,8 @@ export class Job extends cdk.Resource implements IJob { executionProperty: props.maxConcurrentRuns ? { maxConcurrentRuns: props.maxConcurrentRuns } : undefined, notificationProperty: props.notifyDelayAfter ? { notifyDelayAfter: props.notifyDelayAfter.toMinutes() } : undefined, timeout: props.timeout ? props.timeout.toMinutes() : undefined, - // connections: props.connections ? { connections: props.connections.map((connection) => connection.connectionName) } : undefined, - // securityConfiguration: props.securityConfiguration ? props.securityConfiguration.securityConfigurationName : undefined, + connections: props.connections ? { connections: props.connections.map((connection) => connection.connectionName) } : undefined, + securityConfiguration: props.securityConfiguration ? 
props.securityConfiguration.securityConfigurationName : undefined, defaultArguments: props.defaultArguments, tags: props.tags, }); diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 9280f97bd1b60..8fed6f5892f01 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -47,7 +47,8 @@ describe('JobCommand', () => { let jobCommandName: glue.JobCommandName; // known command names + custom one - glue.JobCommandName.ALL.concat(new glue.JobCommandName('CustomName')).forEach((name) => { + [glue.JobCommandName.GLUE_STREAMING, glue.JobCommandName.PYTHON_SHELL, glue.JobCommandName.GLUE_ETL, + new glue.JobCommandName('CustomName')].forEach((name) => { describe(`with ${name} JobCommandName`, () => { beforeEach(() => { @@ -264,6 +265,8 @@ describe('Job', () => { arg1: 'value1', arg2: 'value2', }, + connections: [glue.Connection.fromConnectionName(stack, 'ImportedConnection', 'ConnectionName')], + securityConfiguration: glue.SecurityConfiguration.fromSecurityConfigurationName(stack, 'ImportedSecurityConfiguration', 'SecurityConfigurationName'), tags: { key: 'value', }, @@ -302,6 +305,12 @@ describe('Job', () => { }, Timeout: 5, WorkerType: 'G.2X', + Connections: { + Connections: [ + 'ConnectionName', + ], + }, + SecurityConfiguration: 'SecurityConfigurationName', })); }); }); From 6bcdad877b3187289d4fd4bb8c48dfba49ae78f5 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Tue, 27 Jul 2021 12:39:07 +0100 Subject: [PATCH 06/50] address some comments - introduce JobBase to contain common logic between Job and Import - JobBase now handles CloudWatch Event Rules and Rule-based metrics methods - address the rename comments - use the private constructor + static of() pattern where relevant - Make JobProps.glueVersion required --- packages/@aws-cdk/aws-glue/lib/job.ts | 447 +++++++++++------- packages/@aws-cdk/aws-glue/package.json | 3 +- .../aws-glue/test/integ.job.expected.json | 
17 +- packages/@aws-cdk/aws-glue/test/integ.job.ts | 15 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 260 ++++++---- 5 files changed, 482 insertions(+), 260 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 1d4d3a3b3946c..b2828d390c1d4 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -19,24 +19,32 @@ export class GlueVersion { /** * Glue version using Spark 2.2.1 and Python 2.7 */ - public static readonly ZERO_POINT_NINE = new GlueVersion('0.9'); + public static readonly V0_9 = new GlueVersion('0.9'); /** * Glue version using Spark 2.4.3, Python 2.7 and Python 3.6 */ - public static readonly ONE_POINT_ZERO = new GlueVersion('1.0'); + public static readonly V1_0 = new GlueVersion('1.0'); /** * Glue version using Spark 2.4.3 and Python 3.7 */ - public static readonly TWO_POINT_ZERO = new GlueVersion('2.0'); + public static readonly V2_0 = new GlueVersion('2.0'); + + /** + * Custom Glue version + * @param version custom version + */ + public static of(version: string): GlueVersion { + return new GlueVersion(version); + } /** * The name of this GlueVersion, as expected by Job resource. */ public readonly name: string; - constructor(name: string) { + private constructor(name: string) { this.name = name; } @@ -70,12 +78,20 @@ export class WorkerType { */ public static readonly G_2X = new WorkerType('G.2X'); + /** + * Custom worker type + * @param workerType custom worker type + */ + public static of(workerType: string): WorkerType { + return new WorkerType(workerType); + } + /** * The name of this WorkerType, as expected by Job resource. */ public readonly name: string; - constructor(name: string) { + private constructor(name: string) { this.name = name; } @@ -107,7 +123,7 @@ export enum PythonVersion { * * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types for more information. 
*/ -export enum JobEventState { +export enum JobState { /** * State indicating job run succeeded */ @@ -183,12 +199,20 @@ export class JobCommandName { */ public static readonly PYTHON_SHELL = new JobCommandName('pythonshell'); + /** + * Custom command name + * @param name command name + */ + public static of(name: string): WorkerType { + return new JobCommandName(name); + } + /** * The name of this JobCommandName, as expected by Job resource. */ public readonly name: string; - constructor(name: string) { + private constructor(name: string) { this.name = name; } @@ -211,7 +235,7 @@ export class JobCommand { * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. * @param pythonVersion specifies the Python shell version for the ETL job. Versions supported vary depending on GlueVersion. */ - public static glueEtl(scriptLocation: string, pythonVersion?: PythonVersion) { + public static etl(scriptLocation: string, pythonVersion?: PythonVersion) { return new JobCommand(JobCommandName.GLUE_ETL, scriptLocation, pythonVersion); } @@ -221,7 +245,7 @@ export class JobCommand { * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. * @param pythonVersion specifies the Python shell version for the streaming job. Versions supported vary depending on GlueVersion. */ - public static glueStreaming(scriptLocation: string, pythonVersion?: PythonVersion) { + public static streaming(scriptLocation: string, pythonVersion?: PythonVersion) { return new JobCommand(JobCommandName.GLUE_STREAMING, scriptLocation, pythonVersion); } @@ -231,7 +255,7 @@ export class JobCommand { * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. * @param pythonVersion the Python version being used to execute a Python shell job. 
*/ - public static pythonShell(scriptLocation: string, pythonVersion?: PythonVersion) { + public static python(scriptLocation: string, pythonVersion?: PythonVersion) { return new JobCommand(JobCommandName.PYTHON_SHELL, scriptLocation, pythonVersion); } @@ -394,6 +418,252 @@ export interface IJob extends cdk.IResource { * @attribute */ readonly jobArn: string; + + /** + * Defines a CloudWatch event rule triggered when something happens with this job. + * + * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types + */ + onEvent(id: string, options?: events.OnEventOptions): events.Rule; + + /** + * Defines a CloudWatch event rule triggered when this job moves to the SUCCEEDED state. + * + * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types + */ + onSuccess(id?: string, options?: events.OnEventOptions): events.Rule; + + /** + * Defines a CloudWatch event rule triggered when this job moves to the FAILED state. + * + * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types + */ + onFailure(id?: string, options?: events.OnEventOptions): events.Rule; + + /** + * Defines a CloudWatch event rule triggered when this job moves to the TIMEOUT state. + * + * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types + */ + onTimeout(id?: string, options?: events.OnEventOptions): events.Rule; + + /** + * Create a CloudWatch metric. + * + * @param metricName name of the metric typically prefixed with `glue.driver.`, `glue..` or `glue.ALL.`. + * @param jobRunId a dimension that filters for metrics of a specific JobRun ID, or `ALL`. + * @param type the metric type. + * @param props metric options. 
+ * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitoring-awsglue-with-cloudwatch-metrics.html + */ + metric(metricName: string, jobRunId: string, type: MetricType, props?: cloudwatch.MetricOptions): cloudwatch.Metric; + + /** + * Create a CloudWatch Metric indicating job success. + */ + metricSuccess(props?: cloudwatch.MetricOptions): cloudwatch.Metric; + + /** + * Create a CloudWatch Metric indicating job failure. + */ + metricFailure(props?: cloudwatch.MetricOptions): cloudwatch.Metric; + + /** + * Create a CloudWatch Metric indicating job timeout. + */ + metricTimeout(props?: cloudwatch.MetricOptions): cloudwatch.Metric; +} + +abstract class JobBase extends cdk.Resource implements IJob { + + /** + * Create a CloudWatch Metric that's based on Glue Job events + * {@see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types} + * The metric has namespace = 'AWS/Events', metricName = 'TriggeredRules' and RuleName = rule.ruleName dimension. + * + * @param rule for use in setting RuleName dimension value + * @param props metric properties + */ + public static metricRule(rule: events.IRule, props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return new cloudwatch.Metric({ + namespace: 'AWS/Events', + metricName: 'TriggeredRules', + dimensions: { RuleName: rule.ruleName }, + statistic: cloudwatch.Statistic.SUM, + ...props, + }).attachTo(rule); + } + + + /** + * Returns the job arn + * @param scope + * @param jobName + */ + public static buildJobArn(scope: constructs.Construct, jobName: string) : string { + return cdk.Stack.of(scope).formatArn({ + service: 'glue', + resource: 'job', + resourceName: jobName, + }); + } + + public abstract readonly jobArn: string; + public abstract readonly jobName: string; + private cachedRules: Record = {}; + + /** + * Create a CloudWatch Event Rule for this Glue Job when it's in a given state + * + * @param id construct id + * @param options event options. 
Note that some values are overridden if provided, these are + * - eventPattern.source = ['aws.glue'] + * - eventPattern.detailType = ['Glue Job State Change', 'Glue Job Run Status'] + * - eventPattern.detail.jobName = [this.jobName] + * + * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types + */ + public onEvent(id: string, options: events.OnEventOptions = {}): events.Rule { + const rule = new events.Rule(this, id, options); + rule.addTarget(options.target); + rule.addEventPattern({ + source: ['aws.glue'], + detailType: ['Glue Job State Change', 'Glue Job Run Status'], + detail: { + jobName: [this.jobName], + }, + }); + return rule; + } + + /** + * Return a CloudWatch Event Rule matching JobState.SUCCEEDED. + * + * If no id or options are provided, the created rule is cached. Later no-args calls retrieve the rule from the cache. + * Later calls with args lead to the creation of a new Rule + * + * @param id optional construct id. default is SUCCEEDEDRule + * @param options optional event options. default is {} + */ + public onSuccess(id?: string, options: events.OnEventOptions = {}): events.Rule { + return this.rule(JobState.SUCCEEDED, id, options); + } + + /** + * Return a CloudWatch Event Rule matching FAILED state. + * + * If no id or options are provided, the created rule is cached. Later no-args calls retrieve the rule from the cache. + * Later calls with args lead to the creation of a new Rule + * + * @param id optional construct id. default is FAILEDRule + * @param options optional event options. default is {} + */ + public onFailure(id?: string, options: events.OnEventOptions = {}): events.Rule { + return this.rule(JobState.FAILED, id, options); + } + + /** + * Return a CloudWatch Event Rule matching TIMEOUT state. + * + * If no id or options are provided, the created rule is cached. Later no-args calls retrieve the rule from the cache.
+ * Later calls with args lead to the creation of a new Rule + * + * @param id optional construct id. default is TIMEOUTRule + * @param options optional event options. default is {} + */ + public onTimeout(id?: string, options: events.OnEventOptions = {}): events.Rule { + return this.rule(JobState.TIMEOUT, id, options); + } + + /** + * Create a CloudWatch metric. + * + * @param metricName name of the metric typically prefixed with `glue.driver.`, `glue..` or `glue.ALL.`. + * @param jobRunId a dimension that filters for metrics of a specific JobRun ID, or `ALL`. + * @param type the metric type. + * @param props metric options. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitoring-awsglue-with-cloudwatch-metrics.html + */ + public metric(metricName: string, jobRunId: string, type: MetricType, props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return new cloudwatch.Metric({ + metricName, + namespace: 'Glue', + dimensions: { + JobName: this.jobName, + JobRunId: jobRunId, + Type: type, + }, + ...props, + }).attachTo(this); + } + + /** + * Return a CloudWatch Metric indicating job success. + * + * This metric is based on the Rule returned by no-args onSuccess() call. + */ + public metricSuccess(props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return JobBase.metricRule(this.onSuccess(), props); + } + + /** + * Return a CloudWatch Metric indicating job failure. + * + * This metric is based on the Rule returned by no-args onFailure() call. + */ + public metricFailure(props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return JobBase.metricRule(this.onFailure(), props); + } + + /** + * Return a CloudWatch Metric indicating job timeout. + * + * This metric is based on the Rule returned by no-args onTimeout() call. 
+ */ + public metricTimeout(props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return JobBase.metricRule(this.onTimeout(), props); + } + + /** + * Creates a new rule for a transition into the input jobState, or creates (if necessary) and retrieves the default rule + * - A new rule is created but not cached if the id parameter is specified + * - A create/retrieve from cache scenario happens when no explicit id (and options) are provided + * The reason is that the default rule is used by onSuccess, onFailure and onTimeout methods which are in turn used by metrics methods. + * + * @param jobState the job state + * @param id optional construct id + * @param options optional event options + * @private + */ + private rule(jobState: JobState, id?: string, options: events.OnEventOptions = {}): events.Rule { + // No caching + if (id) { + const rule = this.onEvent(id, options); + rule.addEventPattern({ + detail: { + state: [jobState], + }, + }); + return rule; + } + // Caching + const ruleId = `${jobState}Rule`; + if (!this.cachedRules[ruleId]) { + const rule = this.onEvent(ruleId, { + description: `Rule triggered when Glue job ${this.jobName} is in ${jobState} state`, + }); + rule.addEventPattern({ + detail: { + state: [jobState], + }, + }); + this.cachedRules[ruleId] = rule; + } + return this.cachedRules[ruleId]; + } + } /** @@ -434,7 +704,7 @@ export interface JobProps { /** * The maximum number of times to retry this job after a JobRun fails. * - * @default ? + * @default 0 */ readonly maxRetries?: number; @@ -449,7 +719,7 @@ export interface JobProps { /** * The number of minutes to wait after a job run starts, before sending a job run delay notification. * - * @default ? + * @default N/A (no delay notifications) */ readonly notifyDelayAfter?: cdk.Duration; @@ -463,9 +733,8 @@ export interface JobProps { /** * Glue version determines the versions of Apache Spark and Python that AWS Glue supports.
The Python version indicates the version supported for jobs of type Spark. * - * @default 0.9 */ - readonly glueVersion?: GlueVersion; + readonly glueVersion: GlueVersion; /** * The type of predefined worker that is allocated when a job runs. @@ -526,7 +795,7 @@ export interface JobProps { /** * A Glue Job. */ -export class Job extends cdk.Resource implements IJob { +export class Job extends JobBase { /** * Creates a Glue Job * @@ -535,39 +804,14 @@ export class Job extends cdk.Resource implements IJob { * @param attrs Import attributes */ public static fromJobAttributes(scope: constructs.Construct, id: string, attrs: JobAttributes): IJob { - class Import extends cdk.Resource implements IJob { + class Import extends JobBase { public readonly jobName = attrs.jobName; - public readonly jobArn = Job.buildJobArn(scope, attrs.jobName); + public readonly jobArn = JobBase.buildJobArn(scope, attrs.jobName); } return new Import(scope, id); } - /** - * Create a CloudWatch Metric with namespace = 'AWS/Events', metricName = 'TriggeredRules' and RuleName = rule.ruleName dimension. - * This is used by ruleXXXMetric methods. - * - * @param rule for use in setting RuleName dimension value - * @param props metric properties - */ - public static ruleMetric(rule: events.IRule, props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return new cloudwatch.Metric({ - namespace: 'AWS/Events', - metricName: 'TriggeredRules', - dimensions: { RuleName: rule.ruleName }, - statistic: cloudwatch.Statistic.SUM, - ...props, - }).attachTo(rule); - } - - private static buildJobArn(scope: constructs.Construct, jobName: string) : string { - return cdk.Stack.of(scope).formatArn({ - service: 'glue', - resource: 'job', - resourceName: jobName, - }); - } - /** * The ARN of the job. */ @@ -583,12 +827,6 @@ export class Job extends cdk.Resource implements IJob { */ public readonly role: iam.IRole; - /** - * Used to cache results of ._rule calls. 
- * @private - */ - private _rules: Record = {}; - constructor(scope: constructs.Construct, id: string, props: JobProps) { super(scope, id, { physicalName: props.jobName, @@ -625,118 +863,7 @@ export class Job extends cdk.Resource implements IJob { }); const resourceName = this.getResourceNameAttribute(jobResource.ref); - this.jobArn = Job.buildJobArn(this, resourceName); + this.jobArn = JobBase.buildJobArn(this, resourceName); this.jobName = resourceName; } - - - /** - * Create a CloudWatch Event Rule matching transition into the given `JobEventState`s - * - * @param state used in matching the CloudWatch Event - * - * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types - */ - public rule(state: JobEventState, props?: events.RuleProps): events.Rule { - const ruleId = `${state}Rule`; - return new events.Rule(this, ruleId, { - description: `Event triggered when Glue job ${this.jobName} is in ${state} state`, - eventPattern: { - source: ['aws.glue'], - detailType: ['Glue Job State Change', 'Glue Job Run Status'], - detail: { - state: [state], - jobName: [this.jobName], - }, - }, - ...props, - }); - } - - /** - * Return a CloudWatch Event Rule matching JobEventState.SUCCEEDED. - * The rule is cached for later usage. - * - * @param props rule props for first invocation. If props are passed again they are ignored. - */ - public successRule(props?: events.RuleProps): events.Rule { - return this._rule(JobEventState.SUCCEEDED, props); - } - - /** - * Return a CloudWatch Event Rule matching JobEventState.FAILED. - * The rule is cached for later usage. - * - * @param props rule props for first invocation. If props are passed again they are ignored. - */ - public failureRule(props?: events.RuleProps): events.Rule { - return this._rule(JobEventState.FAILED, props); - } - - /** - * Return a CloudWatch Event Rule matching JobEventState.TIMEOUT. - * The rule is cached for later usage. 
- * - * @param props rule props for first invocation. If props are passed again they are ignored. - */ - public timeoutRule(props?: events.RuleProps): events.Rule { - return this._rule(JobEventState.TIMEOUT, props); - } - - /** - * Return a CloudWatch Metric indicating job success that's based on successRule() - */ - public successRuleMetric(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return Job.ruleMetric(this.successRule(), props); - } - - /** - * Return a CloudWatch Metric indicating job success that's based on failureRule() - */ - public failureRuleMetric(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return Job.ruleMetric(this.failureRule(), props); - } - - /** - * Return a CloudWatch Metric indicating job success that's based on timeoutRule() - */ - public timeoutRuleMetric(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return Job.ruleMetric(this.timeoutRule(), props); - } - - /** - * Create a CloudWatch metric. - * - * @param metricName name of the metric typically prefixed with `glue.driver.`, `glue..` or `glue.ALL.`. - * @param jobRunId a dimension that filters for metrics of a specific JobRun ID, or `ALL`. - * @param type the metric type. - * - * @see https://docs.aws.amazon.com/glue/latest/dg/monitoring-awsglue-with-cloudwatch-metrics.html - */ - public metric(metricName: string, jobRunId: string, type: MetricType, props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return new cloudwatch.Metric({ - metricName, - namespace: 'Glue', - dimensions: { - JobName: this.jobName, - JobRunId: jobRunId, - Type: type, - }, - ...props, - }).attachTo(this); - } - - /** - * Create a Rule with the given props and caches the resulting rule. 
Subsequent returns cached value and ignores props - * - * @param state used in matching the CloudWatch Event - * @param props rule properties - * @private - */ - private _rule(state: JobEventState, props?: events.RuleProps) { - if (!this._rules[state]) { - this._rules[state] = this.rule(state, props); - } - return this._rules[state]; - } } diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index 9cae577424184..2faa6107e3004 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -149,7 +149,8 @@ "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.PARQUET", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.ORC", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.value", - "no-unused-type:@aws-cdk/aws-glue.JobSpecialArgumentNames" + "no-unused-type:@aws-cdk/aws-glue.JobSpecialArgumentNames", + "no-unused-type:@aws-cdk/aws-glue.JobState" ] }, "awscdkio": { diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json index c445bbaf24d57..23ebc8ba9361a 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -151,7 +151,8 @@ "MinimalGlueEtlJobServiceRole60989380", "Arn" ] - } + }, + "GlueVersion": "2.0" } }, "MinimalGlueStreamingJobServiceRole77973DB5": { @@ -291,7 +292,8 @@ "MinimalGlueStreamingJobServiceRole77973DB5", "Arn" ] - } + }, + "GlueVersion": "2.0" } }, "MinimalPythonShellJobServiceRole4944649D": { @@ -431,7 +433,8 @@ "MinimalPythonShellJobServiceRole4944649D", "Arn" ] - } + }, + "GlueVersion": "2.0" } }, "JobServiceRole4F432993": { @@ -599,7 +602,7 @@ "Fn::Join": [ "", [ - "Event triggered when Glue job ", + "Rule triggered when Glue job ", { "Ref": "JobB9D00F9F" }, @@ -616,13 +619,13 @@ "Glue Job Run Status" ], "detail": { - "state": [ - "SUCCEEDED" - ], "jobName": [ { "Ref": "JobB9D00F9F" } + ], + "state": [ + 
"SUCCEEDED" ] } }, diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index 6da0263a47167..63d00f5b23fa1 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -12,23 +12,26 @@ const script = new s3_assets.Asset(stack, 'script', { }); const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { - jobCommand: glue.JobCommand.glueEtl(script.s3ObjectUrl), + glueVersion: glue.GlueVersion.V2_0, + jobCommand: glue.JobCommand.etl(script.s3ObjectUrl), }); script.bucket.grantRead(minimalEtlJob.role); const minimalStreamingJob = new glue.Job(stack, 'MinimalGlueStreamingJob', { - jobCommand: glue.JobCommand.glueStreaming(script.s3ObjectUrl), + glueVersion: glue.GlueVersion.V2_0, + jobCommand: glue.JobCommand.streaming(script.s3ObjectUrl), }); script.bucket.grantRead(minimalStreamingJob.role); const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { - jobCommand: glue.JobCommand.pythonShell(script.s3ObjectUrl), + glueVersion: glue.GlueVersion.V2_0, + jobCommand: glue.JobCommand.python(script.s3ObjectUrl), }); script.bucket.grantRead(minimalPythonShellJob.role); const etlJob = new glue.Job(stack, 'Job', { - jobCommand: glue.JobCommand.glueEtl(script.s3ObjectUrl), - glueVersion: glue.GlueVersion.TWO_POINT_ZERO, + glueVersion: glue.GlueVersion.V2_0, + jobCommand: glue.JobCommand.etl(script.s3ObjectUrl), workerType: glue.WorkerType.G_2X, numberOfWorkers: 10, maxConcurrentRuns: 2, @@ -45,6 +48,6 @@ const etlJob = new glue.Job(stack, 'Job', { }); script.bucket.grantRead(etlJob.role); -etlJob.successRuleMetric(); +etlJob.metricSuccess(); app.synth(); diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 8fed6f5892f01..7ecdc681c786e 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -7,13 +7,13 @@ import '@aws-cdk/assert-internal/jest'; 
import * as glue from '../lib'; describe('GlueVersion', () => { - test('.ZERO_POINT_NINE', () => expect(glue.GlueVersion.ZERO_POINT_NINE.name).toEqual('0.9')); + test('.V0_9', () => expect(glue.GlueVersion.V0_9.name).toEqual('0.9')); - test('.ONE_POINT_ZERO', () => expect(glue.GlueVersion.ONE_POINT_ZERO.name).toEqual('1.0')); + test('.V1_0', () => expect(glue.GlueVersion.V1_0.name).toEqual('1.0')); - test('.TWO_POINT_ZERO', () => expect(glue.GlueVersion.TWO_POINT_ZERO.name).toEqual('2.0')); + test('.V2_0', () => expect(glue.GlueVersion.V2_0.name).toEqual('2.0')); - test('new sets name correctly', () => expect(new glue.GlueVersion('CustomVersion').name).toEqual('CustomVersion')); + test('of(customVersion) sets name correctly', () => expect(glue.GlueVersion.of('CustomVersion').name).toEqual('CustomVersion')); }); describe('WorkerType', () => { @@ -23,7 +23,7 @@ describe('WorkerType', () => { test('.G_2X', () => expect(glue.WorkerType.G_2X.name).toEqual('G.2X')); - test('new sets name correctly', () => expect(new glue.WorkerType('CustomType').name).toEqual('CustomType')); + test('of(customType) sets name correctly', () => expect(glue.WorkerType.of('CustomType').name).toEqual('CustomType')); }); describe('JobCommandName', () => { @@ -33,7 +33,7 @@ describe('JobCommandName', () => { test('.PYTHON_SHELL', () => expect(glue.JobCommandName.PYTHON_SHELL.name).toEqual('pythonshell')); - test('new sets name correctly', () => expect(new glue.JobCommandName('CustomName').name).toEqual('CustomName')); + test('of(customName) sets name correctly', () => expect(glue.JobCommandName.of('CustomName').name).toEqual('CustomName')); }); describe('JobCommand', () => { @@ -48,7 +48,7 @@ describe('JobCommand', () => { // known command names + custom one [glue.JobCommandName.GLUE_STREAMING, glue.JobCommandName.PYTHON_SHELL, glue.JobCommandName.GLUE_ETL, - new glue.JobCommandName('CustomName')].forEach((name) => { + glue.JobCommandName.of('CustomName')].forEach((name) => { describe(`with 
${name} JobCommandName`, () => { beforeEach(() => { @@ -75,24 +75,24 @@ describe('JobCommand', () => { }); }); - test('.glueEtl() uses GLUE_ETL JobCommandName', () => { - const jobCommand = glue.JobCommand.glueEtl(scriptLocation); + test('.etl() uses GLUE_ETL JobCommandName', () => { + const jobCommand = glue.JobCommand.etl(scriptLocation); expect(jobCommand.name).toEqual(glue.JobCommandName.GLUE_ETL); expect(jobCommand.scriptLocation).toEqual(scriptLocation); expect(jobCommand.pythonVersion).toBeUndefined(); }); - test('.glueStreaming() uses GLUE_STREAMING JobCommandName', () => { - const jobCommand = glue.JobCommand.glueStreaming(scriptLocation, glue.PythonVersion.THREE); + test('.streaming() uses GLUE_STREAMING JobCommandName', () => { + const jobCommand = glue.JobCommand.streaming(scriptLocation, glue.PythonVersion.THREE); expect(jobCommand.name).toEqual(glue.JobCommandName.GLUE_STREAMING); expect(jobCommand.scriptLocation).toEqual(scriptLocation); expect(jobCommand.pythonVersion).toEqual(glue.PythonVersion.THREE); }); - test('.pythonShell() uses PYTHON_SHELL JobCommandName', () => { - const jobCommand = glue.JobCommand.pythonShell(scriptLocation, glue.PythonVersion.TWO); + test('.python() uses PYTHON_SHELL JobCommandName', () => { + const jobCommand = glue.JobCommand.python(scriptLocation, glue.PythonVersion.TWO); expect(jobCommand.name).toEqual(glue.JobCommandName.PYTHON_SHELL); expect(jobCommand.scriptLocation).toEqual(scriptLocation); @@ -128,7 +128,7 @@ describe('Job', () => { }); test('with no props returns default metric', () => { - expect(glue.Job.ruleMetric(rule)).toEqual(new cloudwatch.Metric({ + expect(glue.Job.metricRule(rule)).toEqual(new cloudwatch.Metric({ dimensions: { RuleName: 'example', }, @@ -139,7 +139,7 @@ describe('Job', () => { }); test('with props overrides', () => { - expect(glue.Job.ruleMetric(rule, { statistic: cloudwatch.Statistic.AVERAGE })).toEqual(new cloudwatch.Metric({ + expect(glue.Job.metricRule(rule, { statistic: 
cloudwatch.Statistic.AVERAGE })).toEqual(new cloudwatch.Metric({ dimensions: { RuleName: 'example', }, @@ -161,7 +161,8 @@ describe('Job', () => { describe('with necessary props only', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - jobCommand: glue.JobCommand.glueEtl(scriptLocation), + glueVersion: glue.GlueVersion.V2_0, + jobCommand: glue.JobCommand.etl(scriptLocation), }); }); @@ -226,7 +227,8 @@ describe('Job', () => { test('with a custom role should use it and set it in CloudFormation', () => { const role = iam.Role.fromRoleArn(stack, 'Role', 'arn:aws:iam::123456789012:role/TestRole'); job = new glue.Job(stack, 'JobWithRole', { - jobCommand: glue.JobCommand.glueEtl(scriptLocation), + glueVersion: glue.GlueVersion.V2_0, + jobCommand: glue.JobCommand.etl(scriptLocation), role, }); @@ -238,7 +240,8 @@ describe('Job', () => { test('with a custom jobName should set it in CloudFormation', () => { job = new glue.Job(stack, 'JobWithName', { - jobCommand: glue.JobCommand.glueEtl(scriptLocation), + glueVersion: glue.GlueVersion.V2_0, + jobCommand: glue.JobCommand.etl(scriptLocation), jobName, }); @@ -253,8 +256,8 @@ describe('Job', () => { job = new glue.Job(stack, 'Job', { jobName, description: 'test job', - jobCommand: glue.JobCommand.glueEtl(scriptLocation), - glueVersion: glue.GlueVersion.TWO_POINT_ZERO, + jobCommand: glue.JobCommand.etl(scriptLocation), + glueVersion: glue.GlueVersion.V2_0, workerType: glue.WorkerType.G_2X, numberOfWorkers: 10, maxConcurrentRuns: 2, @@ -318,26 +321,15 @@ describe('Job', () => { describe('event rules and rule-based metrics', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - jobCommand: glue.JobCommand.glueEtl(scriptLocation), + glueVersion: glue.GlueVersion.V2_0, + jobCommand: glue.JobCommand.etl(scriptLocation), }); }); - test('.rule() creates the expected event rule', () => { - job.rule(glue.JobEventState.STOPPING); + test('.onEvent() creates the expected event rule', () => { + job.onEvent('eventId', 
{}); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { - Description: { - 'Fn::Join': [ - '', - [ - 'Event triggered when Glue job ', - { - Ref: 'JobB9D00F9F', - }, - ' is in STOPPING state', - ], - ], - }, EventPattern: { 'source': [ 'aws.glue', @@ -347,9 +339,6 @@ describe('Job', () => { 'Glue Job Run Status', ], 'detail': { - state: [ - 'STOPPING', - ], jobName: [ { Ref: 'JobB9D00F9F', @@ -361,10 +350,10 @@ describe('Job', () => { })); }); - describe('.successRule()', () => { - test('without props and multiple calls should create one resource and cache it', () => { - const firstInvocationRule = job.successRule(); - const subsequentInvocationRule = job.successRule(); + describe('.onSuccess()', () => { + test('with no-args and multiple calls should create one resource and cache it', () => { + const firstInvocationRule = job.onSuccess(); + const subsequentInvocationRule = job.onSuccess(); expect(subsequentInvocationRule).toEqual(firstInvocationRule); cdkassert.countResources('AWS::Events::Rule', 1); @@ -373,7 +362,7 @@ describe('Job', () => { 'Fn::Join': [ '', [ - 'Event triggered when Glue job ', + 'Rule triggered when Glue job ', { Ref: 'JobB9D00F9F', }, @@ -404,14 +393,47 @@ describe('Job', () => { })); }); - test('with props and multiple calls should create one resource and cache it and ignore later props', () => { - const firstInvocationRule = job.successRule({ description: 'description override' }); - const subsequentInvocationRuleWithProps = job.successRule({ description: 'description to ignore' }); - const subsequentInvocationRuleWithoutProps = job.successRule(); + test('with args should ignore the cached rule and return a new one', () => { + const firstInvocationRule = job.onSuccess(); + const subsequentInvocationRuleWithNoArgs = job.onSuccess(); + job.onSuccess('noCache', { description: 'description override' }); - expect(subsequentInvocationRuleWithProps).toEqual(firstInvocationRule); - 
expect(subsequentInvocationRuleWithoutProps).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 1); + expect(subsequentInvocationRuleWithNoArgs).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 2); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Rule triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in SUCCEEDED state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'SUCCEEDED', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: 'description override', EventPattern: { @@ -438,10 +460,10 @@ describe('Job', () => { }); }); - describe('.failureRule()', () => { - test('without props and multiple calls should create one resource and cache it', () => { - const firstInvocationRule = job.failureRule(); - const subsequentInvocationRule = job.failureRule(); + describe('.onFailure()', () => { + test('with no-args and multiple calls should create one resource and cache it', () => { + const firstInvocationRule = job.onFailure(); + const subsequentInvocationRule = job.onFailure(); expect(subsequentInvocationRule).toEqual(firstInvocationRule); cdkassert.countResources('AWS::Events::Rule', 1); @@ -450,7 +472,7 @@ describe('Job', () => { 'Fn::Join': [ '', [ - 'Event triggered when Glue job ', + 'Rule triggered when Glue job ', { Ref: 'JobB9D00F9F', }, @@ -481,14 +503,47 @@ describe('Job', () => { })); }); - test('with props and multiple calls should create one resource and cache it and ignore later props', () => { - const firstInvocationRule = job.failureRule({ description: 'description override' }); - const subsequentInvocationRuleWithProps = job.failureRule({ description: 'description to ignore' 
}); - const subsequentInvocationRuleWithoutProps = job.failureRule(); + test('with args should ignore the cached rule and return a new one', () => { + const firstInvocationRule = job.onFailure(); + const subsequentInvocationRuleWithNoArgs = job.onFailure(); + job.onFailure('noCache', { description: 'description override' }); - expect(subsequentInvocationRuleWithProps).toEqual(firstInvocationRule); - expect(subsequentInvocationRuleWithoutProps).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 1); + expect(subsequentInvocationRuleWithNoArgs).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 2); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Rule triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in FAILED state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'FAILED', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: 'description override', EventPattern: { @@ -515,10 +570,10 @@ describe('Job', () => { }); }); - describe('.timeoutRule()', () => { - test('without props and multiple calls should create one resource and cache it', () => { - const firstInvocationRule = job.timeoutRule(); - const subsequentInvocationRule = job.timeoutRule(); + describe('.onTimeout()', () => { + test('with no-args and multiple calls should create one resource and cache it', () => { + const firstInvocationRule = job.onTimeout(); + const subsequentInvocationRule = job.onTimeout(); expect(subsequentInvocationRule).toEqual(firstInvocationRule); cdkassert.countResources('AWS::Events::Rule', 1); @@ -527,7 +582,7 @@ describe('Job', () => { 'Fn::Join': [ '', [ - 'Event triggered when Glue job ', + 'Rule 
triggered when Glue job ', { Ref: 'JobB9D00F9F', }, @@ -558,14 +613,47 @@ describe('Job', () => { })); }); - test('with props and multiple calls should create one resource and cache it and ignore later props', () => { - const firstInvocationRule = job.timeoutRule({ description: 'description override' }); - const subsequentInvocationRuleWithProps = job.timeoutRule({ description: 'description to ignore' }); - const subsequentInvocationRuleWithoutProps = job.timeoutRule(); + test('with args should ignore the cached rule and return a new one', () => { + const firstInvocationRule = job.onTimeout(); + job.onTimeout('noCache', { description: 'description override' }); + const subsequentInvocationRuleWithNoArgs = job.onTimeout(); - expect(subsequentInvocationRuleWithProps).toEqual(firstInvocationRule); - expect(subsequentInvocationRuleWithoutProps).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 1); + expect(subsequentInvocationRuleWithNoArgs).toEqual(firstInvocationRule); + cdkassert.countResources('AWS::Events::Rule', 2); + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Description: { + 'Fn::Join': [ + '', + [ + 'Rule triggered when Glue job ', + { + Ref: 'JobB9D00F9F', + }, + ' is in TIMEOUT state', + ], + ], + }, + EventPattern: { + 'source': [ + 'aws.glue', + ], + 'detail-type': [ + 'Glue Job State Change', + 'Glue Job Run Status', + ], + 'detail': { + state: [ + 'TIMEOUT', + ], + jobName: [ + { + Ref: 'JobB9D00F9F', + }, + ], + }, + }, + State: 'ENABLED', + })); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: 'description override', EventPattern: { @@ -592,12 +680,12 @@ describe('Job', () => { }); }); - test('.successRuleMetric() creates the expected event rule and corresponding metric', () => { - const metric = job.successRuleMetric(); + test('.metricSuccess() creates the expected event rule and corresponding metric', () => { + const metric = job.metricSuccess(); 
expect(metric).toEqual(new cloudwatch.Metric({ dimensions: { - RuleName: job.successRule().ruleName, + RuleName: job.onSuccess().ruleName, }, metricName: 'TriggeredRules', namespace: 'AWS/Events', @@ -609,7 +697,7 @@ describe('Job', () => { 'Fn::Join': [ '', [ - 'Event triggered when Glue job ', + 'Rule triggered when Glue job ', { Ref: 'JobB9D00F9F', }, @@ -640,12 +728,12 @@ describe('Job', () => { })); }); - test('.failureRuleMetric() creates the expected event rule and corresponding metric', () => { - const metric = job.failureRuleMetric(); + test('.metricFailure() creates the expected event rule and corresponding metric', () => { + const metric = job.metricFailure(); expect(metric).toEqual(new cloudwatch.Metric({ dimensions: { - RuleName: job.failureRule().ruleName, + RuleName: job.onFailure().ruleName, }, metricName: 'TriggeredRules', namespace: 'AWS/Events', @@ -657,7 +745,7 @@ describe('Job', () => { 'Fn::Join': [ '', [ - 'Event triggered when Glue job ', + 'Rule triggered when Glue job ', { Ref: 'JobB9D00F9F', }, @@ -688,12 +776,12 @@ describe('Job', () => { })); }); - test('.timeoutRuleMetric() creates the expected event rule and corresponding metric', () => { - const metric = job.timeoutRuleMetric(); + test('.metricTimeout() creates the expected event rule and corresponding metric', () => { + const metric = job.metricTimeout(); expect(metric).toEqual(new cloudwatch.Metric({ dimensions: { - RuleName: job.timeoutRule().ruleName, + RuleName: job.onTimeout().ruleName, }, metricName: 'TriggeredRules', namespace: 'AWS/Events', @@ -705,7 +793,7 @@ describe('Job', () => { 'Fn::Join': [ '', [ - 'Event triggered when Glue job ', + 'Rule triggered when Glue job ', { Ref: 'JobB9D00F9F', }, From b2d866f3a0d97c30fac7ea91c9b7a62885833afb Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Tue, 27 Jul 2021 21:52:40 +0100 Subject: [PATCH 07/50] improve docs --- packages/@aws-cdk/aws-glue/lib/job.ts | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git 
a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index b2828d390c1d4..1dd4e3aa960f1 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -13,7 +13,7 @@ import { ISecurityConfiguration } from './security-configuration'; * @see https://docs.aws.amazon.com/glue/latest/dg/add-job.html. * * If you need to use a GlueVersion that doesn't exist as a static member, you - * can instantiate a `GlueVersion` object, e.g: `new GlueVersion('1.5')`. + * can instantiate a `GlueVersion` object, e.g: `GlueVersion.of('1.5')`. */ export class GlueVersion { /** @@ -60,7 +60,7 @@ export class GlueVersion { * The type of predefined worker that is allocated when a job runs. * * If you need to use a WorkerType that doesn't exist as a static member, you - * can instantiate a `WorkerType` object, e.g: `new WorkerType('other type')`. + * can instantiate a `WorkerType` object, e.g: `WorkerType.of('other type')`. */ export class WorkerType { /** @@ -181,7 +181,7 @@ export enum MetricType { * The job command name used for job run. * * If you need to use a JobCommandName that doesn't exist as a static member, you - * can instantiate a `WorkerType` object, e.g: `new JobCommandName('other name')`. + * can instantiate a `JobCommandName` object, e.g: `JobCommandName.of('other name')`. */ export class JobCommandName { /** @@ -696,6 +696,7 @@ export interface JobProps { /** * The number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. + * Should not be used for Glue version 2.0 and later - workerType and numberOfWorkers should be used instead. * * @default 10 when you specify an Apache Spark ETL or Sreaming job, 0.0625 DPU when you specify a Python shell job. */ @@ -738,15 +739,17 @@ export interface JobProps { /** * The type of predefined worker that is allocated when a job runs. + * Should not be used for Glue version 1.0 and earlier - maxCapacity should be used instead. * - * @default ? 
+ * @default differs based on specific glueVersion */ readonly workerType?: WorkerType; /** * The number of workers of a defined {@link WorkerType} that are allocated when a job runs. + * Should not be used for Glue version 1.0 and earlier - maxCapacity should be used instead. * - * @default ? + * @default differs based on specific glueVersion/workerType */ readonly numberOfWorkers?: number; From 29ff157ff3b76507d17c5dfee28a5a341533d86f Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 28 Jul 2021 10:28:45 +0100 Subject: [PATCH 08/50] rename JobCommandName constants and JobCommand methods --- packages/@aws-cdk/aws-glue/lib/job.ts | 10 +++++----- packages/@aws-cdk/aws-glue/test/integ.job.ts | 2 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 18 +++++++++--------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 1dd4e3aa960f1..c920dee59495f 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -187,12 +187,12 @@ export class JobCommandName { /** * Command for running a Glue ETL job. */ - public static readonly GLUE_ETL = new JobCommandName('glueetl'); + public static readonly ETL = new JobCommandName('glueetl'); /** * Command for running a Glue streaming job. */ - public static readonly GLUE_STREAMING = new JobCommandName('gluestreaming'); + public static readonly STREAMING = new JobCommandName('gluestreaming'); /** * Command for running a Glue python shell job. @@ -236,7 +236,7 @@ export class JobCommand { * @param pythonVersion specifies the Python shell version for the ETL job. Versions supported vary depending on GlueVersion. 
*/ public static etl(scriptLocation: string, pythonVersion?: PythonVersion) { - return new JobCommand(JobCommandName.GLUE_ETL, scriptLocation, pythonVersion); + return new JobCommand(JobCommandName.ETL, scriptLocation, pythonVersion); } /** @@ -246,7 +246,7 @@ export class JobCommand { * @param pythonVersion specifies the Python shell version for the streaming job. Versions supported vary depending on GlueVersion. */ public static streaming(scriptLocation: string, pythonVersion?: PythonVersion) { - return new JobCommand(JobCommandName.GLUE_STREAMING, scriptLocation, pythonVersion); + return new JobCommand(JobCommandName.STREAMING, scriptLocation, pythonVersion); } /** @@ -255,7 +255,7 @@ export class JobCommand { * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. * @param pythonVersion the Python version being used to execute a Python shell job. */ - public static python(scriptLocation: string, pythonVersion?: PythonVersion) { + public static pythonShell(scriptLocation: string, pythonVersion?: PythonVersion) { return new JobCommand(JobCommandName.PYTHON_SHELL, scriptLocation, pythonVersion); } diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index 63d00f5b23fa1..f70ca31b56ca6 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -25,7 +25,7 @@ script.bucket.grantRead(minimalStreamingJob.role); const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { glueVersion: glue.GlueVersion.V2_0, - jobCommand: glue.JobCommand.python(script.s3ObjectUrl), + jobCommand: glue.JobCommand.pythonShell(script.s3ObjectUrl), }); script.bucket.grantRead(minimalPythonShellJob.role); diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 7ecdc681c786e..50b29b339da3c 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ 
b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -27,9 +27,9 @@ describe('WorkerType', () => { }); describe('JobCommandName', () => { - test('.GLUE_ETL', () => expect(glue.JobCommandName.GLUE_ETL.name).toEqual('glueetl')); + test('.ETL', () => expect(glue.JobCommandName.ETL.name).toEqual('glueetl')); - test('.GLUE_STREAMING', () => expect(glue.JobCommandName.GLUE_STREAMING.name).toEqual('gluestreaming')); + test('.STREAMING', () => expect(glue.JobCommandName.STREAMING.name).toEqual('gluestreaming')); test('.PYTHON_SHELL', () => expect(glue.JobCommandName.PYTHON_SHELL.name).toEqual('pythonshell')); @@ -47,7 +47,7 @@ describe('JobCommand', () => { let jobCommandName: glue.JobCommandName; // known command names + custom one - [glue.JobCommandName.GLUE_STREAMING, glue.JobCommandName.PYTHON_SHELL, glue.JobCommandName.GLUE_ETL, + [glue.JobCommandName.STREAMING, glue.JobCommandName.PYTHON_SHELL, glue.JobCommandName.ETL, glue.JobCommandName.of('CustomName')].forEach((name) => { describe(`with ${name} JobCommandName`, () => { @@ -75,24 +75,24 @@ describe('JobCommand', () => { }); }); - test('.etl() uses GLUE_ETL JobCommandName', () => { + test('.etl() uses ETL JobCommandName', () => { const jobCommand = glue.JobCommand.etl(scriptLocation); - expect(jobCommand.name).toEqual(glue.JobCommandName.GLUE_ETL); + expect(jobCommand.name).toEqual(glue.JobCommandName.ETL); expect(jobCommand.scriptLocation).toEqual(scriptLocation); expect(jobCommand.pythonVersion).toBeUndefined(); }); - test('.streaming() uses GLUE_STREAMING JobCommandName', () => { + test('.streaming() uses STREAMING JobCommandName', () => { const jobCommand = glue.JobCommand.streaming(scriptLocation, glue.PythonVersion.THREE); - expect(jobCommand.name).toEqual(glue.JobCommandName.GLUE_STREAMING); + expect(jobCommand.name).toEqual(glue.JobCommandName.STREAMING); expect(jobCommand.scriptLocation).toEqual(scriptLocation); expect(jobCommand.pythonVersion).toEqual(glue.PythonVersion.THREE); }); - test('.python() uses 
PYTHON_SHELL JobCommandName', () => { - const jobCommand = glue.JobCommand.python(scriptLocation, glue.PythonVersion.TWO); + test('.pythonShell() uses PYTHON_SHELL JobCommandName', () => { + const jobCommand = glue.JobCommand.pythonShell(scriptLocation, glue.PythonVersion.TWO); expect(jobCommand.name).toEqual(glue.JobCommandName.PYTHON_SHELL); expect(jobCommand.scriptLocation).toEqual(scriptLocation); From c31d7e4449fac1ae0c58ba83a897cab6055fb9e3 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Fri, 30 Jul 2021 10:48:49 +0100 Subject: [PATCH 09/50] drop unnecessary toString() methods --- packages/@aws-cdk/aws-glue/lib/job.ts | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index c920dee59495f..299e4c51d2bfe 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -47,13 +47,6 @@ export class GlueVersion { private constructor(name: string) { this.name = name; } - - /** - * The glue version name as expected by job resource. - */ - public toString(): string { - return this.name; - } } /** @@ -94,13 +87,6 @@ export class WorkerType { private constructor(name: string) { this.name = name; } - - /** - * The worker type name as expected by Job resource. - */ - public toString(): string { - return this.name; - } } /** @@ -215,13 +201,6 @@ export class JobCommandName { private constructor(name: string) { this.name = name; } - - /** - * The worker type name as expected by Job resource. 
- */ - public toString(): string { - return this.name; - } } /** From f05afa79d2d28f9a4009de8cb1c04c3b27370378 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Fri, 30 Jul 2021 11:15:20 +0100 Subject: [PATCH 10/50] indicate PythonVersion.TWO is the default for JobCommand --- packages/@aws-cdk/aws-glue/lib/job.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 299e4c51d2bfe..b3c4e6f150119 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -212,7 +212,7 @@ export class JobCommand { * Create a glueetl JobCommand with the given scriptLocation * * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. - * @param pythonVersion specifies the Python shell version for the ETL job. Versions supported vary depending on GlueVersion. + * @param pythonVersion specifies the Python shell version for the ETL job. default is PythonVersion.TWO. */ public static etl(scriptLocation: string, pythonVersion?: PythonVersion) { return new JobCommand(JobCommandName.ETL, scriptLocation, pythonVersion); @@ -222,7 +222,7 @@ export class JobCommand { * Create a gluestreaming JobCommand with the given scriptLocation * * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. - * @param pythonVersion specifies the Python shell version for the streaming job. Versions supported vary depending on GlueVersion. + * @param pythonVersion specifies the Python shell version for the streaming job. default is PythonVersion.TWO. 
*/ public static streaming(scriptLocation: string, pythonVersion?: PythonVersion) { return new JobCommand(JobCommandName.STREAMING, scriptLocation, pythonVersion); @@ -232,7 +232,7 @@ export class JobCommand { * Create a pythonshell JobCommand with the given scriptLocation and pythonVersion * * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. - * @param pythonVersion the Python version being used to execute a Python shell job. + * @param pythonVersion the Python version being used to execute a Python shell job. default is PythonVersion.TWO. */ public static pythonShell(scriptLocation: string, pythonVersion?: PythonVersion) { return new JobCommand(JobCommandName.PYTHON_SHELL, scriptLocation, pythonVersion); From 5a46dc5aa2c6c0a36178a77ae4a41a1a53367ec6 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Fri, 30 Jul 2021 11:15:57 +0100 Subject: [PATCH 11/50] make metricRule and buildJobArn protected --- packages/@aws-cdk/aws-glue/lib/job.ts | 4 +-- packages/@aws-cdk/aws-glue/test/job.test.ts | 31 --------------------- 2 files changed, 2 insertions(+), 33 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index b3c4e6f150119..f3a86257dbe3e 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -464,7 +464,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * @param rule for use in setting RuleName dimension value * @param props metric properties */ - public static metricRule(rule: events.IRule, props?: cloudwatch.MetricOptions): cloudwatch.Metric { + protected static metricRule(rule: events.IRule, props?: cloudwatch.MetricOptions): cloudwatch.Metric { return new cloudwatch.Metric({ namespace: 'AWS/Events', metricName: 'TriggeredRules', @@ -480,7 +480,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * @param scope * @param jobName */ - public static buildJobArn(scope: 
constructs.Construct, jobName: string) : string { + protected static buildJobArn(scope: constructs.Construct, jobName: string) : string { return cdk.Stack.of(scope).formatArn({ service: 'glue', resource: 'job', diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 50b29b339da3c..44e8b02b7ee0c 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -1,6 +1,5 @@ import * as cdkassert from '@aws-cdk/assert-internal'; import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; -import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; import * as cdk from '@aws-cdk/core'; import '@aws-cdk/assert-internal/jest'; @@ -120,36 +119,6 @@ describe('Job', () => { })); }); - describe('.ruleMetric()', () => { - let rule: events.IRule; - - beforeEach(() => { - rule = events.Rule.fromEventRuleArn(stack, 'Rule', 'arn:aws:events:us-east-1:123456789012:rule/example'); - }); - - test('with no props returns default metric', () => { - expect(glue.Job.metricRule(rule)).toEqual(new cloudwatch.Metric({ - dimensions: { - RuleName: 'example', - }, - metricName: 'TriggeredRules', - namespace: 'AWS/Events', - statistic: 'Sum', - })); - }); - - test('with props overrides', () => { - expect(glue.Job.metricRule(rule, { statistic: cloudwatch.Statistic.AVERAGE })).toEqual(new cloudwatch.Metric({ - dimensions: { - RuleName: 'example', - }, - metricName: 'TriggeredRules', - namespace: 'AWS/Events', - statistic: 'Average', - })); - }); - }); - describe('new', () => { let scriptLocation: string; let job: glue.Job; From f59d688d3501d5fd5380ae3422f6399ed5c6bfa9 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Fri, 30 Jul 2021 11:31:55 +0100 Subject: [PATCH 12/50] address more comments --- packages/@aws-cdk/aws-glue/lib/job.ts | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts 
b/packages/@aws-cdk/aws-glue/lib/job.ts index f3a86257dbe3e..9f5de61392bb9 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -675,7 +675,7 @@ export interface JobProps { /** * The number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. - * Should not be used for Glue version 2.0 and later - workerType and numberOfWorkers should be used instead. + * Cannot be used for Glue version 2.0 and later - workerType and numberOfWorkers should be used instead. * * @default 10 when you specify an Apache Spark ETL or Sreaming job, 0.0625 DPU when you specify a Python shell job. */ @@ -692,21 +692,21 @@ export interface JobProps { * The maximum number of concurrent runs allowed for the job. * An error is returned when this threshold is reached. The maximum value you can specify is controlled by a service limit. * - * @default 1. + * @default 1 */ readonly maxConcurrentRuns?: number; /** * The number of minutes to wait after a job run starts, before sending a job run delay notification. * - * @default N/A (no delay notifications) + * @default - no delay notifications */ readonly notifyDelayAfter?: cdk.Duration; /** * The maximum time that a job run can consume resources before it is terminated and enters TIMEOUT status. * - * @default 2,880 minutes (48 hours) + * @default cdk.Duration.hours(48) */ readonly timeout?: cdk.Duration; @@ -718,7 +718,6 @@ export interface JobProps { /** * The type of predefined worker that is allocated when a job runs. - * Should not be used for Glue version 1.0 and earlier - maxCapacity should be used instead. * * @default differs based on specific glueVersion */ @@ -726,7 +725,6 @@ export interface JobProps { /** * The number of workers of a defined {@link WorkerType} that are allocated when a job runs. - * Should not be used for Glue version 1.0 and earlier - maxCapacity should be used instead. 
* * @default differs based on specific glueVersion/workerType */ @@ -764,7 +762,8 @@ export interface JobProps { /** * The IAM role associated with this job. - * @default a new IAM role with arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole managed policy + * + * @default an IAM role is generated */ readonly role?: iam.IRole; @@ -815,9 +814,8 @@ export class Job extends JobBase { }); // Create a basic service role if one is not provided https://docs.aws.amazon.com/glue/latest/dg/create-service-policy.html - this.role = props.role || new iam.Role(this, 'ServiceRole', { - assumedBy: new iam.ServicePrincipal('glue'), - // arn:aws:iam::aws:policy/service-role/AWSGlueServiceRole + this.role = props.role ?? new iam.Role(this, 'ServiceRole', { + assumedBy: new iam.ServicePrincipal('glue.amazonaws.com'), managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')], }); @@ -830,8 +828,8 @@ export class Job extends JobBase { scriptLocation: props.jobCommand.scriptLocation, pythonVersion: props.jobCommand.pythonVersion, }, - glueVersion: props.glueVersion ? props.glueVersion.name : undefined, - workerType: props.workerType ? 
props.workerType.name : undefined, + glueVersion: props.glueVersion?.name, + workerType: props.workerType?.name, numberOfWorkers: props.numberOfWorkers, maxCapacity: props.maxCapacity, maxRetries: props.maxRetries, From fba208da1ca1fb456e496dd7705a1a470a1ff919 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Mon, 2 Aug 2021 11:42:09 +0100 Subject: [PATCH 13/50] drop jobRunId from metric()'s arguments --- packages/@aws-cdk/aws-glue/lib/job.ts | 8 +++----- packages/@aws-cdk/aws-glue/test/job.test.ts | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 9f5de61392bb9..f9e326c36c49e 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -430,13 +430,12 @@ export interface IJob extends cdk.IResource { * Create a CloudWatch metric. * * @param metricName name of the metric typically prefixed with `glue.driver.`, `glue..` or `glue.ALL.`. - * @param jobRunId a dimension that filters for metrics of a specific JobRun ID, or `ALL`. * @param type the metric type. * @param props metric options. * * @see https://docs.aws.amazon.com/glue/latest/dg/monitoring-awsglue-with-cloudwatch-metrics.html */ - metric(metricName: string, jobRunId: string, type: MetricType, props?: cloudwatch.MetricOptions): cloudwatch.Metric; + metric(metricName: string, type: MetricType, props?: cloudwatch.MetricOptions): cloudwatch.Metric; /** * Create a CloudWatch Metric indicating job success. @@ -559,19 +558,18 @@ abstract class JobBase extends cdk.Resource implements IJob { * Create a CloudWatch metric. * * @param metricName name of the metric typically prefixed with `glue.driver.`, `glue..` or `glue.ALL.`. - * @param jobRunId a dimension that filters for metrics of a specific JobRun ID, or `ALL`. * @param type the metric type. * @param props metric options. 
* * @see https://docs.aws.amazon.com/glue/latest/dg/monitoring-awsglue-with-cloudwatch-metrics.html */ - public metric(metricName: string, jobRunId: string, type: MetricType, props?: cloudwatch.MetricOptions): cloudwatch.Metric { + public metric(metricName: string, type: MetricType, props?: cloudwatch.MetricOptions): cloudwatch.Metric { return new cloudwatch.Metric({ metricName, namespace: 'Glue', dimensions: { JobName: this.jobName, - JobRunId: jobRunId, + JobRunId: 'ALL', Type: type, }, ...props, diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 44e8b02b7ee0c..4588dceba1e40 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -800,7 +800,7 @@ describe('Job', () => { const metricName = 'glue.driver.aggregate.bytesRead'; const props = { statistic: cloudwatch.Statistic.SUM }; - expect(job.metric(metricName, 'ALL', glue.MetricType.COUNT, props)).toEqual(new cloudwatch.Metric({ + expect(job.metric(metricName, glue.MetricType.COUNT, props)).toEqual(new cloudwatch.Metric({ metricName, statistic: 'Sum', namespace: 'Glue', @@ -816,7 +816,7 @@ describe('Job', () => { const metricName = 'glue.driver.BlockManager.disk.diskSpaceUsed_MB'; const props = { statistic: cloudwatch.Statistic.AVERAGE }; - expect(job.metric(metricName, 'ALL', glue.MetricType.GAUGE, props)).toEqual(new cloudwatch.Metric({ + expect(job.metric(metricName, glue.MetricType.GAUGE, props)).toEqual(new cloudwatch.Metric({ metricName, statistic: 'Average', namespace: 'Glue', From 6032807b9d46573c2cdf5bf198944c55617b1476 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Mon, 2 Aug 2021 11:58:37 +0100 Subject: [PATCH 14/50] change how event.Rules caching is done --- packages/@aws-cdk/aws-glue/lib/job.ts | 40 +++++++++++---------------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 
f9e326c36c49e..500cf48dc9dee 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -489,7 +489,6 @@ abstract class JobBase extends cdk.Resource implements IJob { public abstract readonly jobArn: string; public abstract readonly jobName: string; - private cachedRules: Record = {}; /** * Create a CloudWatch Event Rule for this Glue Job when it's in a given state @@ -615,30 +614,23 @@ abstract class JobBase extends cdk.Resource implements IJob { * @private */ private rule(jobState: JobState, id?: string, options: events.OnEventOptions = {}): events.Rule { - // No caching - if (id) { - const rule = this.onEvent(id, options); - rule.addEventPattern({ - detail: { - state: [jobState], - }, - }); - return rule; + // Caching (for metric methods and default arg-less event methods) + const cachedRuleId = `${jobState}Rule`; + const cachedRule = this.node.tryFindChild(cachedRuleId); + // Use the already created rule if no id is provided (arg-less event methods or events supporting metrics) + if (!id && cachedRule) { + return cachedRule as events.Rule; } - // Caching - const ruleId = `${jobState}Rule`; - if (!this.cachedRules[ruleId]) { - const rule = this.onEvent(ruleId, { - description: `Rule triggered when Glue job ${this.jobName} is in ${jobState} state`, - }); - rule.addEventPattern({ - detail: { - state: [jobState], - }, - }); - this.cachedRules[ruleId] = rule; - } - return this.cachedRules[ruleId]; + const rule = this.onEvent(id || cachedRuleId, { + description: `Rule triggered when Glue job ${this.jobName} is in ${jobState} state`, + ...options, + }); + rule.addEventPattern({ + detail: { + state: [jobState], + }, + }); + return rule; } } From 98c19dfcfe17269af8eb81471ce5d7d5078e8fb0 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Mon, 2 Aug 2021 12:09:44 +0100 Subject: [PATCH 15/50] drop JobSpecialArgumentNames --- packages/@aws-cdk/aws-glue/lib/job.ts | 125 +----------------------- packages/@aws-cdk/aws-glue/package.json | 1 
- 2 files changed, 1 insertion(+), 125 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 500cf48dc9dee..f01dbe2f23a54 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -260,128 +260,6 @@ export class JobCommand { } } -/** - * Constants for some of the special parameters used by {@link Job}. - * - * These constants can be used as argument names in {@link JobProps.defaultArguments}. - * - * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html - */ -export enum JobSpecialArgumentNames { - /** - * The script programming language. This value must be either `scala` or `python`. If this parameter is not present, the default is `python`. - */ - JOB_LANGUAGE = '--job-language', - - /** - * The Scala class that serves as the entry point for your Scala script. This applies only if your `--job-language` is set to `scala`. - */ - CLASS = '--class', - - /** - * The Amazon Simple Storage Service (Amazon S3) location where your ETL script is located (in the form `s3://path/to/my/script.py`). - * This parameter overrides a script location set in the JobCommand object. - */ - SCRIPT_LOCATION = '--scriptLocation', - - /** - * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. - * Multiple values must be complete paths separated by a comma (`,`). - */ - EXTRA_PY_FILES = '--extra-py-files', - - /** - * The Amazon S3 paths to additional Java `.jar` files that AWS Glue adds to the Java classpath before executing your script. - * Multiple values must be complete paths separated by a comma (`,`). - */ - EXTRA_JARS = '--extra-jars', - - /** - * Setting this value to true prioritizes the customer's extra JAR files in the classpath. - * This option is only available in AWS Glue version 2.0. 
- */ - USER_JARS_FIRST = '--user-jars-first', - - /** - * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. - * Multiple values must be complete paths separated by a comma (`,`). - */ - EXTRA_FILES = '--extra-files', - - /** - * Controls the behavior of a job bookmark. The following option values can be set. - */ - JOB_BOOKMARK_OPTION = '--job-bookmark-option', - - /** - * Specifies an Amazon S3 path to a bucket that can be used as a temporary directory for the job. - */ - TEMP_DIR = '--TempDir', - - /** - * Enables the EMRFS S3-optimized committer for writing Parquet data into Amazon S3. - * Setting the value to `true` enables the committer. By default the flag is turned off. - */ - ENABLE_S3_PARQUET_OPTIMIZED_COMMITTER = '--enable-s3-parquet-optimized-committer', - - /** - * Sets the EMRFS rename algorithm version to version 2. - * This option is only available on AWS Glue version 1.0. - */ - ENABLE_RENAME_ALGORITHM_V2 = '--enable-rename-algorithm-v2', - - /** - * Enables using the AWS Glue Data Catalog as an Apache Spark Hive metastore. - */ - ENABLE_GLUE_DATA_CATALOG = '--enable-glue-datacatalog', - - /** - * Enables the collection of metrics for job profiling for this job run. - * To enable metrics, only specify the key; no value is needed. - */ - ENABLE_METRICS = '--enable-metrics', - - /** - * Enables real-time continuous logging for AWS Glue jobs to view real-time Apache Spark job logs in CloudWatch. - */ - ENABLE_CONTINUOUS_LOGGING = '--enable-continuous-cloudwatch-log', - - /** - * Specifies a standard filter (true) or no filter (false) for continuous logging. - * Choosing the standard filter prunes out non-useful Apache Spark driver/executor and Apache Hadoop YARN heartbeat log messages. - * Choosing no filter gives all the log messages. 
- */ - ENABLE_CONTINUOUS_LOG_FILTER = '--enable-continuous-log-filter', - - /** - * Specifies a custom Amazon CloudWatch log group name for a job enabled for continuous logging. - */ - LOG_GROUP = '--continuous-log-logGroup', - - /** - * Specifies a custom CloudWatch log stream prefix for a job enabled for continuous logging. - */ - LOG_STREAM_PREFIX = '--continuous-log-logStreamPrefix', - - /** - * Specifies a custom conversion log pattern for a job enabled for continuous logging. - */ - LOG_CONVERSION_PATTERN = '--continuous-log-conversionPattern', - - /** - * Enables Apache Spark web UI. - * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html - */ - ENABLE_SPARK_UI = '--enable-spark-ui', - - /** - * Specifies the Amazon S3 path for storing the Spark event logs for the job. - * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html - */ - SPARK_UI_LOGS_PATH = '--spark-event-logs-path', -} - /** * Interface representing a created or an imported {@link Job}. */ @@ -737,8 +615,7 @@ export interface JobProps { /** * The default arguments for this job, specified as name-value pairs. * - * {@link JobSpecialArgumentNames} defines some of the Special Parameters used by AWS Glue. 
- * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html A list of Special Parameters Used by AWS Glue + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html for a list of special parameters Used by AWS Glue * @default no arguments */ readonly defaultArguments?: { [key: string]: string }; diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index 68a40f69ea472..5cb2bfc955db3 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -149,7 +149,6 @@ "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.PARQUET", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.ORC", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.value", - "no-unused-type:@aws-cdk/aws-glue.JobSpecialArgumentNames", "no-unused-type:@aws-cdk/aws-glue.JobState" ] }, From 933ebcd86abbca830bafec4015dd8d81e04d43e5 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 11 Aug 2021 16:10:27 +0100 Subject: [PATCH 16/50] introduce JobExecutable and refactor accordingly - introduce JobExecutable - drop JobCommand - rename JobCommandName to JobType --- packages/@aws-cdk/aws-glue/lib/job.ts | 425 +++++++++++++++--- .../aws-glue/test/integ.job.expected.json | 14 + packages/@aws-cdk/aws-glue/test/integ.job.ts | 31 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 127 +++--- 4 files changed, 458 insertions(+), 139 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index f01dbe2f23a54..71e8db0bd53ee 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -164,99 +164,391 @@ export enum MetricType { } /** - * The job command name used for job run. + * Runtime language of the Glue job + */ +export enum JobLanguage { + /** + * Scala + */ + SCALA = 'scala', + + /** + * Python + */ + PYTHON = 'python', +} + +/** + * The job type. 
* - * If you need to use a JobCommandName that doesn't exist as a static member, you - * can instantiate a `JobCommandName` object, e.g: `JobCommandName.of('other name')`. + * If you need to use a JobType that doesn't exist as a static member, you + * can instantiate a `JobType` object, e.g: `JobType.of('other name', ['supported languages'])`. */ -export class JobCommandName { +export class JobType { /** * Command for running a Glue ETL job. */ - public static readonly ETL = new JobCommandName('glueetl'); + public static readonly ETL = new JobType('glueetl', [JobLanguage.SCALA, JobLanguage.PYTHON]); /** * Command for running a Glue streaming job. */ - public static readonly STREAMING = new JobCommandName('gluestreaming'); + public static readonly STREAMING = new JobType('gluestreaming', [JobLanguage.SCALA, JobLanguage.PYTHON]); /** * Command for running a Glue python shell job. */ - public static readonly PYTHON_SHELL = new JobCommandName('pythonshell'); + public static readonly PYTHON_SHELL = new JobType('pythonshell', [JobLanguage.PYTHON]); /** - * Custom command name - * @param name command name + * Custom type name + * @param name type name + * @param languages languages supported by this job type */ - public static of(name: string): WorkerType { - return new JobCommandName(name); + public static of(name: string, languages: JobLanguage[]): JobType { + return new JobType(name, languages); } /** - * The name of this JobCommandName, as expected by Job resource. + * The name of this JobType, as expected by Job resource. 
*/ public readonly name: string; - private constructor(name: string) { + /** + * Languages supported by this job type + */ + public languages: JobLanguage[]; + + private constructor(name: string, languages: JobLanguage[]) { + if (languages.length === 0) { + throw new Error('languages cannot be empty'); + } this.name = name; + this.languages = languages; } } /** - * JobCommand specifies the execution environment and the code executed when a job is run. + * TODO: Q for reviewer - any way to better model these props, Omit seems to cause problems with JSII? + */ +interface JobExecutableProps { + readonly glueVersion: GlueVersion; + + readonly language: JobLanguage; + + readonly type: JobType; + + readonly pythonVersion?: PythonVersion; + + readonly scriptLocation: string; + + readonly className?: string; + + readonly extraJars?: string[]; + + readonly extraPythonFiles?: string[]; + + readonly extraFiles?: string[]; + + readonly extraJarsFirst?: boolean; +} + +/** + * Props for creating a Scala Spark (ETL or Streaming) job executable + */ +export interface ScalaJobExecutableProps { + + /** + * Glue version. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html + */ + readonly glueVersion: GlueVersion; + + /** + * Specify the type of the job whether it's an Apache Spark ETL or streaming one. + */ + readonly type: JobType; + + /** + * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + */ + readonly scriptLocation: string; + + /** + * The fully qualified Scala class name that serves as the entry point for the job. + * + * @see `--class` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly className: string; + + /** + * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. + * Only individual files are supported, not a directory path. 
+ * + * @default - no extra jars and argument is not set + * + * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJars?: string[]; + + /** + * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * Only individual files are supported, not a directory path. + * + * @default - no extra files and argument is not set + * + * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraFiles?: string[]; + + /** + * Setting this value to true prioritizes the customer's extra JAR files in the classpath. + * + * @default - priortiy is not given to extra jars and argument is not set + * + * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJarsFirst?: boolean; +} + +/** + * Props for creating a Python Spark (ETL or Streaming) job executable */ -export class JobCommand { +export interface PythonJobExecutableProps { /** - * Create a glueetl JobCommand with the given scriptLocation + * Glue version. * - * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. - * @param pythonVersion specifies the Python shell version for the ETL job. default is PythonVersion.TWO. + * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html + */ + readonly glueVersion: GlueVersion; + + /** + * Specify the type of the job whether it's an Apache Spark ETL or streaming one. */ - public static etl(scriptLocation: string, pythonVersion?: PythonVersion) { - return new JobCommand(JobCommandName.ETL, scriptLocation, pythonVersion); + readonly type: JobType; + + /** + * The Python version to use. 
+ */ + readonly pythonVersion: PythonVersion; + + /** + * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + */ + readonly scriptLocation: string; + + /** + * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. + * Only individual files are supported, not a directory path. + * + * @default - no extra jars and argument is not set + * + * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJars?: string[]; + + /** + * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. + * Only individual files are supported, not a directory path. + * + * @default - no extra python files and argument is not set + * + * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraPythonFiles?: string[]; + + /** + * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * Only individual files are supported, not a directory path. + * + * @default - no extra files and argument is not set + * + * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraFiles?: string[]; + + /** + * Setting this value to true prioritizes the customer's extra JAR files in the classpath. + * + * @default - priortiy is not given to extra jars and argument is not set + * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJarsFirst?: boolean; +} + +/** + * Props for creating a Python shell job executable + */ +export interface PythonShellExecutableProps { + + /** + * Glue version. 
+ * + * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html + */ + readonly glueVersion: GlueVersion; + + /** + * The Python version to use. + */ + readonly pythonVersion: PythonVersion; + + /** + * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + */ + readonly scriptLocation: string; + + /** + * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. + * Only individual files are supported, not a directory path. + * + * @default - no extra python files and argument is not set + * + * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraPythonFiles?: string[]; + + /** + * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * Only individual files are supported, not a directory path. + * + * @default - no extra files and argument is not set + * + * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraFiles?: string[]; +} + +/** + * The executable properties related to the Glue job's GlueVersion, JobType and code + * + * TODO test for exceptions + */ +export class JobExecutable { + + /** + * Create Scala executable props for Apache Spark jobs (ETL or streaming) + * + * @param props Scala Apache Spark Job props + */ + public static scala(props: ScalaJobExecutableProps): JobExecutable { + return new JobExecutable({ + ...props, + language: JobLanguage.SCALA, + }); } /** - * Create a gluestreaming JobCommand with the given scriptLocation + * Create Python executable props for Apache Spark jobs (ETL or streaming) * - * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. 
- * @param pythonVersion specifies the Python shell version for the streaming job. default is PythonVersion.TWO. + * @param props Python Apache Spark Job props */ - public static streaming(scriptLocation: string, pythonVersion?: PythonVersion) { - return new JobCommand(JobCommandName.STREAMING, scriptLocation, pythonVersion); + public static python(props: PythonJobExecutableProps): JobExecutable { + return new JobExecutable({ + ...props, + language: JobLanguage.PYTHON, + }); } /** - * Create a pythonshell JobCommand with the given scriptLocation and pythonVersion + * Create Python executable props for python shell jobs * - * @param scriptLocation specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. - * @param pythonVersion the Python version being used to execute a Python shell job. default is PythonVersion.TWO. + * @param props Python Shell Job props */ - public static pythonShell(scriptLocation: string, pythonVersion?: PythonVersion) { - return new JobCommand(JobCommandName.PYTHON_SHELL, scriptLocation, pythonVersion); + public static pythonShell(props: PythonShellExecutableProps): JobExecutable { + return new JobExecutable({ + ...props, + type: JobType.PYTHON_SHELL, + language: JobLanguage.PYTHON, + }); } /** - * The name of the job command e.g. glueetl for an Apache Spark ETL job or pythonshell for a Python shell job. + * Glue version. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html */ - readonly name: JobCommandName; + public readonly glueVersion: GlueVersion; + + /** + * The language of the job (Scala or Python). + * + * @see `--job-languae` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + public readonly language: JobLanguage; + + /** + * Specify the type of the job whether it's an Apache Spark ETL or streaming one or if it's a Python shell job. + */ + public readonly type: JobType; + + /** + * The Python version to use. 
+ */ + public readonly pythonVersion?: PythonVersion; /** * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. */ - readonly scriptLocation: string; + public readonly scriptLocation: string; /** - * The Python version being used to execute a Python shell job. + * The Scala class that serves as the entry point for the job. This applies only if your the job langauage is Scala. + * + * @see `--class` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly pythonVersion?: PythonVersion; + public readonly className?: string; - constructor(name: JobCommandName, scriptLocation: string, pythonVersion?: PythonVersion) { - this.name = name; - this.scriptLocation = scriptLocation; - this.pythonVersion = pythonVersion; + /** + * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. + * Only individual files are supported, not a directory path. + * + * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + public readonly extraJars?: string[]; + + /** + * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. + * Only individual files are supported, not a directory path. + * + * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + public readonly extraPythonFiles?: string[]; + + /** + * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * Only individual files are supported, not a directory path. 
+ * + * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + public readonly extraFiles?: string[]; + + /** + * Setting this value to true prioritizes the customer's extra JAR files in the classpath. + * + * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + public readonly extraJarsFirst?: boolean; + + private constructor(props: JobExecutableProps) { + if (!props.type.languages.includes(props.language)) { + throw new Error(`${props.type.name} JobType does not support ${props.language} language`); + } + if (props.extraJarsFirst && [GlueVersion.V0_9, GlueVersion.V1_0].includes(props.glueVersion)) { + throw new Error(`${props.type.name} JobType does not support ${props.language} language`); + } + this.glueVersion = props.glueVersion; + this.language = props.language; + this.type = props.type; + this.pythonVersion = props.pythonVersion; + this.scriptLocation = props.scriptLocation; + this.className = props.className; + this.extraJars = props.extraJars; + this.extraPythonFiles = props.extraPythonFiles; + this.extraFiles = props.extraFiles; + this.extraJarsFirst = props.extraJarsFirst; } } @@ -578,23 +870,17 @@ export interface JobProps { */ readonly timeout?: cdk.Duration; - /** - * Glue version determines the versions of Apache Spark and Python that AWS Glue supports. The Python version indicates the version supported for jobs of type Spark. - * - */ - readonly glueVersion: GlueVersion; - /** * The type of predefined worker that is allocated when a job runs. * - * @default differs based on specific glueVersion + * @default differs based on specific glue version */ readonly workerType?: WorkerType; /** * The number of workers of a defined {@link WorkerType} that are allocated when a job runs. 
* - * @default differs based on specific glueVersion/workerType + * @default differs based on specific glue version/worker type */ readonly numberOfWorkers?: number; @@ -635,9 +921,14 @@ export interface JobProps { readonly role?: iam.IRole; /** - * The job command specifying the type of the job e.g. glueetl or pythonshell and relevant parameters. + * The job's executable properties. */ - readonly jobCommand: JobCommand; + readonly executable: JobExecutable; + + // TODO '--TempDir': 's3-path-to-directory' we should have a prop to enable setting this, and enable a bucket to be created + // or one specified, the role should also be updated to have access to that bucket + + // TODO --enable-metrics confirm if it's needed for count/guage metrics or not and add a prop } /** @@ -686,16 +977,21 @@ export class Job extends JobBase { managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')], }); + const defaultArguments = { + ...this.executableArguments(props.executable), + ...props.defaultArguments, + }; + const jobResource = new CfnJob(this, 'Resource', { name: props.jobName, description: props.description, role: this.role.roleArn, command: { - name: props.jobCommand.name.name, - scriptLocation: props.jobCommand.scriptLocation, - pythonVersion: props.jobCommand.pythonVersion, + name: props.executable.type.name, + scriptLocation: props.executable.scriptLocation, + pythonVersion: props.executable.pythonVersion, }, - glueVersion: props.glueVersion?.name, + glueVersion: props.executable.glueVersion.name, workerType: props.workerType?.name, numberOfWorkers: props.numberOfWorkers, maxCapacity: props.maxCapacity, @@ -705,12 +1001,33 @@ export class Job extends JobBase { timeout: props.timeout ? props.timeout.toMinutes() : undefined, connections: props.connections ? { connections: props.connections.map((connection) => connection.connectionName) } : undefined, securityConfiguration: props.securityConfiguration ? 
props.securityConfiguration.securityConfigurationName : undefined, - defaultArguments: props.defaultArguments, tags: props.tags, + defaultArguments, }); const resourceName = this.getResourceNameAttribute(jobResource.ref); this.jobArn = JobBase.buildJobArn(this, resourceName); this.jobName = resourceName; } + + private executableArguments(executable: JobExecutable) { + const args: { [key: string]: string } = {}; + args['--job-language'] = executable.language; + if (executable.className) { + args['--class'] = executable.className; + } + if (executable.extraJars && executable.extraJars.length > 0) { + args['--extra-jars'] = executable.extraJars.join(','); + } + if (executable.extraPythonFiles && executable.extraPythonFiles.length > 0) { + args['--extra-py-files'] = executable.extraPythonFiles.join(','); + } + if (executable.extraFiles && executable.extraFiles.length > 0) { + args['--extra-files'] = executable.extraFiles.join(','); + } + if (executable.extraJarsFirst) { + args['--user-jars-first'] = 'true'; + } + return args; + } } diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json index 23ebc8ba9361a..1610c0c3d8bb4 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -107,6 +107,7 @@ "Properties": { "Command": { "Name": "glueetl", + "PythonVersion": "2", "ScriptLocation": { "Fn::Join": [ "", @@ -152,6 +153,9 @@ "Arn" ] }, + "DefaultArguments": { + "--job-language": "python" + }, "GlueVersion": "2.0" } }, @@ -248,6 +252,7 @@ "Properties": { "Command": { "Name": "gluestreaming", + "PythonVersion": "2", "ScriptLocation": { "Fn::Join": [ "", @@ -293,6 +298,9 @@ "Arn" ] }, + "DefaultArguments": { + "--job-language": "python" + }, "GlueVersion": "2.0" } }, @@ -389,6 +397,7 @@ "Properties": { "Command": { "Name": "pythonshell", + "PythonVersion": "2", "ScriptLocation": { "Fn::Join": [ "", @@ -434,6 +443,9 @@ "Arn" ] }, + 
"DefaultArguments": { + "--job-language": "python" + }, "GlueVersion": "2.0" } }, @@ -530,6 +542,7 @@ "Properties": { "Command": { "Name": "glueetl", + "PythonVersion": "2", "ScriptLocation": { "Fn::Join": [ "", @@ -576,6 +589,7 @@ ] }, "DefaultArguments": { + "--job-language": "python", "arg1": "value1", "arg2": "value2" }, diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index f70ca31b56ca6..cdf8b909e817b 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -12,26 +12,41 @@ const script = new s3_assets.Asset(stack, 'script', { }); const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { - glueVersion: glue.GlueVersion.V2_0, - jobCommand: glue.JobCommand.etl(script.s3ObjectUrl), + executable: glue.JobExecutable.python({ + glueVersion: glue.GlueVersion.V2_0, + type: glue.JobType.ETL, + pythonVersion: glue.PythonVersion.TWO, + scriptLocation: script.s3ObjectUrl, + }), }); script.bucket.grantRead(minimalEtlJob.role); const minimalStreamingJob = new glue.Job(stack, 'MinimalGlueStreamingJob', { - glueVersion: glue.GlueVersion.V2_0, - jobCommand: glue.JobCommand.streaming(script.s3ObjectUrl), + executable: glue.JobExecutable.python({ + glueVersion: glue.GlueVersion.V2_0, + type: glue.JobType.STREAMING, + pythonVersion: glue.PythonVersion.TWO, + scriptLocation: script.s3ObjectUrl, + }), }); script.bucket.grantRead(minimalStreamingJob.role); const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { - glueVersion: glue.GlueVersion.V2_0, - jobCommand: glue.JobCommand.pythonShell(script.s3ObjectUrl), + executable: glue.JobExecutable.pythonShell({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.TWO, + scriptLocation: script.s3ObjectUrl, + }), }); script.bucket.grantRead(minimalPythonShellJob.role); const etlJob = new glue.Job(stack, 'Job', { - glueVersion: glue.GlueVersion.V2_0, - jobCommand: 
glue.JobCommand.etl(script.s3ObjectUrl), + executable: glue.JobExecutable.python({ + glueVersion: glue.GlueVersion.V2_0, + type: glue.JobType.ETL, + pythonVersion: glue.PythonVersion.TWO, + scriptLocation: script.s3ObjectUrl, + }), workerType: glue.WorkerType.G_2X, numberOfWorkers: 10, maxConcurrentRuns: 2, diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 4588dceba1e40..38f3ae7da2c8e 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -25,77 +25,26 @@ describe('WorkerType', () => { test('of(customType) sets name correctly', () => expect(glue.WorkerType.of('CustomType').name).toEqual('CustomType')); }); -describe('JobCommandName', () => { - test('.ETL', () => expect(glue.JobCommandName.ETL.name).toEqual('glueetl')); - - test('.STREAMING', () => expect(glue.JobCommandName.STREAMING.name).toEqual('gluestreaming')); - - test('.PYTHON_SHELL', () => expect(glue.JobCommandName.PYTHON_SHELL.name).toEqual('pythonshell')); - - test('of(customName) sets name correctly', () => expect(glue.JobCommandName.of('CustomName').name).toEqual('CustomName')); -}); - -describe('JobCommand', () => { - let scriptLocation: string; - - beforeEach(() => { - scriptLocation = 's3://bucketName/script'; - }); - - describe('new', () => { - let jobCommandName: glue.JobCommandName; - - // known command names + custom one - [glue.JobCommandName.STREAMING, glue.JobCommandName.PYTHON_SHELL, glue.JobCommandName.ETL, - glue.JobCommandName.of('CustomName')].forEach((name) => { - describe(`with ${name} JobCommandName`, () => { - - beforeEach(() => { - jobCommandName = name; - }); - - test('without specified python version sets properties correctly', () => { - const jobCommand = new glue.JobCommand(jobCommandName, scriptLocation); - - expect(jobCommand.name).toEqual(jobCommandName); - expect(jobCommand.scriptLocation).toEqual(scriptLocation); - 
expect(jobCommand.pythonVersion).toBeUndefined(); - }); - - test('with specified python version sets properties correctly', () => { - const pythonVersion = glue.PythonVersion.TWO; - const jobCommand = new glue.JobCommand(jobCommandName, scriptLocation, pythonVersion); - - expect(jobCommand.name).toEqual(jobCommandName); - expect(jobCommand.scriptLocation).toEqual(scriptLocation); - expect(jobCommand.pythonVersion).toEqual(pythonVersion); - }); - }); - }); +describe('JobType', () => { + test('.ETL', () => { + expect(glue.JobType.ETL.name).toEqual('glueetl'); + expect(glue.JobType.ETL.languages).toEqual([glue.JobLanguage.SCALA, glue.JobLanguage.PYTHON]); }); - test('.etl() uses ETL JobCommandName', () => { - const jobCommand = glue.JobCommand.etl(scriptLocation); - - expect(jobCommand.name).toEqual(glue.JobCommandName.ETL); - expect(jobCommand.scriptLocation).toEqual(scriptLocation); - expect(jobCommand.pythonVersion).toBeUndefined(); + test('.STREAMING', () => { + expect(glue.JobType.STREAMING.name).toEqual('gluestreaming'); + expect(glue.JobType.STREAMING.languages).toEqual([glue.JobLanguage.SCALA, glue.JobLanguage.PYTHON]); }); - test('.streaming() uses STREAMING JobCommandName', () => { - const jobCommand = glue.JobCommand.streaming(scriptLocation, glue.PythonVersion.THREE); - - expect(jobCommand.name).toEqual(glue.JobCommandName.STREAMING); - expect(jobCommand.scriptLocation).toEqual(scriptLocation); - expect(jobCommand.pythonVersion).toEqual(glue.PythonVersion.THREE); + test('.PYTHON_SHELL', () => { + expect(glue.JobType.PYTHON_SHELL.name).toEqual('pythonshell'); + expect(glue.JobType.PYTHON_SHELL.languages).toEqual([glue.JobLanguage.PYTHON]); }); - test('.pythonShell() uses PYTHON_SHELL JobCommandName', () => { - const jobCommand = glue.JobCommand.pythonShell(scriptLocation, glue.PythonVersion.TWO); - - expect(jobCommand.name).toEqual(glue.JobCommandName.PYTHON_SHELL); - expect(jobCommand.scriptLocation).toEqual(scriptLocation); - 
expect(jobCommand.pythonVersion).toEqual(glue.PythonVersion.TWO); + test('of(customName, supportedLanguages) sets name correctly', () => { + const jobType = glue.JobType.of('CustomName', [glue.JobLanguage.SCALA]); + expect(jobType.name).toEqual('CustomName'); + expect(jobType.languages).toEqual([glue.JobLanguage.SCALA]); }); }); @@ -121,17 +70,23 @@ describe('Job', () => { describe('new', () => { let scriptLocation: string; + let className: string; let job: glue.Job; beforeEach(() => { scriptLocation = 's3://bucketName/script'; + className = 'com.amazon.test.ClassName'; }); describe('with necessary props only', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - glueVersion: glue.GlueVersion.V2_0, - jobCommand: glue.JobCommand.etl(scriptLocation), + executable: glue.JobExecutable.scala({ + type: glue.JobType.ETL, + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), }); }); @@ -196,8 +151,12 @@ describe('Job', () => { test('with a custom role should use it and set it in CloudFormation', () => { const role = iam.Role.fromRoleArn(stack, 'Role', 'arn:aws:iam::123456789012:role/TestRole'); job = new glue.Job(stack, 'JobWithRole', { - glueVersion: glue.GlueVersion.V2_0, - jobCommand: glue.JobCommand.etl(scriptLocation), + executable: glue.JobExecutable.scala({ + type: glue.JobType.ETL, + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), role, }); @@ -209,8 +168,12 @@ describe('Job', () => { test('with a custom jobName should set it in CloudFormation', () => { job = new glue.Job(stack, 'JobWithName', { - glueVersion: glue.GlueVersion.V2_0, - jobCommand: glue.JobCommand.etl(scriptLocation), + executable: glue.JobExecutable.scala({ + type: glue.JobType.ETL, + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), jobName, }); @@ -225,8 +188,12 @@ describe('Job', () => { job = new glue.Job(stack, 'Job', { jobName, description: 'test job', - jobCommand: glue.JobCommand.etl(scriptLocation), - 
glueVersion: glue.GlueVersion.V2_0, + executable: glue.JobExecutable.scala({ + type: glue.JobType.ETL, + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), workerType: glue.WorkerType.G_2X, numberOfWorkers: 10, maxConcurrentRuns: 2, @@ -258,8 +225,10 @@ describe('Job', () => { ], }, DefaultArguments: { - arg1: 'value1', - arg2: 'value2', + '--job-language': 'scala', + '--class': 'com.amazon.test.ClassName', + 'arg1': 'value1', + 'arg2': 'value2', }, Description: 'test job', ExecutionProperty: { @@ -290,8 +259,12 @@ describe('Job', () => { describe('event rules and rule-based metrics', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - glueVersion: glue.GlueVersion.V2_0, - jobCommand: glue.JobCommand.etl(scriptLocation), + executable: glue.JobExecutable.scala({ + type: glue.JobType.ETL, + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), }); }); From 6b3d31ecba5fbcb0984ee0a4e39a40be0fcfb354 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Thu, 12 Aug 2021 10:54:17 +0100 Subject: [PATCH 17/50] refactor JobExecutable and add more tests --- packages/@aws-cdk/aws-glue/README.md | 6 +- packages/@aws-cdk/aws-glue/lib/index.ts | 1 + .../@aws-cdk/aws-glue/lib/job-executable.ts | 407 +++++++++++++++ packages/@aws-cdk/aws-glue/lib/job.ts | 480 +----------------- packages/@aws-cdk/aws-glue/package.json | 1 + packages/@aws-cdk/aws-glue/test/integ.job.ts | 11 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 245 +++++++-- 7 files changed, 649 insertions(+), 502 deletions(-) create mode 100644 packages/@aws-cdk/aws-glue/lib/job-executable.ts diff --git a/packages/@aws-cdk/aws-glue/README.md b/packages/@aws-cdk/aws-glue/README.md index aa9eaca4994e8..e96f279383574 100644 --- a/packages/@aws-cdk/aws-glue/README.md +++ b/packages/@aws-cdk/aws-glue/README.md @@ -48,7 +48,11 @@ Typically, a job runs extract, transform, and load (ETL) scripts. 
Jobs can also ```ts new glue.Job(stack, 'Job', { - jobCommand: glue.JobCommand.pythonShell('s3://bucketName/script.py', glue.PythonVersion.TWO), + executable: glue.JobExecutable.shellPython({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: PythonVersion.TWO, + scriptLocation: 's3://bucketName/script.py', + }), description: 'an example pythonshell job', }); ``` diff --git a/packages/@aws-cdk/aws-glue/lib/index.ts b/packages/@aws-cdk/aws-glue/lib/index.ts index 936d1f003aa6e..d0e13085804ae 100644 --- a/packages/@aws-cdk/aws-glue/lib/index.ts +++ b/packages/@aws-cdk/aws-glue/lib/index.ts @@ -5,6 +5,7 @@ export * from './connection'; export * from './data-format'; export * from './database'; export * from './job'; +export * from './job-executable'; export * from './schema'; export * from './security-configuration'; export * from './table'; \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts new file mode 100644 index 0000000000000..1e97dba2716a8 --- /dev/null +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -0,0 +1,407 @@ +import * as constructs from 'constructs'; + +/** + * AWS Glue version determines the versions of Apache Spark and Python that are available to the job. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/add-job.html. + * + * If you need to use a GlueVersion that doesn't exist as a static member, you + * can instantiate a `GlueVersion` object, e.g: `GlueVersion.of('1.5')`. 
+ */ +export class GlueVersion { + /** + * Glue version using Spark 2.2.1 and Python 2.7 + */ + public static readonly V0_9 = new GlueVersion('0.9'); + + /** + * Glue version using Spark 2.4.3, Python 2.7 and Python 3.6 + */ + public static readonly V1_0 = new GlueVersion('1.0'); + + /** + * Glue version using Spark 2.4.3 and Python 3.7 + */ + public static readonly V2_0 = new GlueVersion('2.0'); + + /** + * Custom Glue version + * @param version custom version + */ + public static of(version: string): GlueVersion { + return new GlueVersion(version); + } + + /** + * The name of this GlueVersion, as expected by Job resource. + */ + public readonly name: string; + + private constructor(name: string) { + this.name = name; + } +} + +/** + * Runtime language of the Glue job + */ +export enum JobLanguage { + /** + * Scala + */ + SCALA = 'scala', + + /** + * Python + */ + PYTHON = 'python', +} + +/** + * Python version + */ +export enum PythonVersion { + /** + * Python 2 (the exact version depends on GlueVersion and JobCommand used) + */ + TWO = '2', + + /** + * Python 3 (the exact version depends on GlueVersion and JobCommand used) + */ + THREE = '3', +} + +/** + * The job type. + * + * If you need to use a JobType that doesn't exist as a static member, you + * can instantiate a `JobType` object, e.g: `JobType.of('other name')`. + */ +export class JobType { + /** + * Command for running a Glue ETL job. + */ + public static readonly ETL = new JobType('glueetl'); + + /** + * Command for running a Glue streaming job. + */ + public static readonly STREAMING = new JobType('gluestreaming'); + + /** + * Command for running a Glue python shell job. + */ + public static readonly PYTHON_SHELL = new JobType('pythonshell'); + + /** + * Custom type name + * @param name type name + */ + public static of(name: string): JobType { + return new JobType(name); + } + + /** + * The name of this JobType, as expected by Job resource. 
+ */ + public readonly name: string; + + private constructor(name: string) { + this.name = name; + } +} + +interface SharedJobExecutableProps { + /** + * Glue version. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html + */ + readonly glueVersion: GlueVersion; + + /** + * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + */ + readonly scriptLocation: string; + + /** + * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * Only individual files are supported, not a directory path. + * + * @default - no extra files and argument is not set + * + * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraFiles?: string[]; +} + +interface SharedSparkJobExecutableProps extends SharedJobExecutableProps { + /** + * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. + * Only individual files are supported, not a directory path. + * + * @default - no extra jars and argument is not set + * + * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJars?: string[]; + + /** + * Setting this value to true prioritizes the customer's extra JAR files in the classpath. + * + * @default - priortiy is not given to extra jars and argument is not set + * + * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJarsFirst?: boolean; +} + +/** + * Result of binding a `JobExecutable` into a `Job`. + */ +export interface JobExecutableConfig { + /** + * Glue version. 
+ * + * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html + */ + readonly glueVersion: GlueVersion; + + /** + * The language of the job (Scala or Python). + * + * @see `--job-languae` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly language: JobLanguage; + + /** + * Specify the type of the job whether it's an Apache Spark ETL or streaming one or if it's a Python shell job. + */ + readonly type: JobType; + + /** + * The Python version to use. + * + * @default - no python version specified + */ + readonly pythonVersion?: PythonVersion; + + /** + * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + */ + readonly scriptLocation: string; + + /** + * The Scala class that serves as the entry point for the job. This applies only if your the job langauage is Scala. + * + * @default - no scala className specified + * + * @see `--class` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly className?: string; + + /** + * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. + * Only individual files are supported, not a directory path. + * + * @default - no extra jars specified. + * + * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJars?: string[]; + + /** + * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. + * Only individual files are supported, not a directory path. + * + * @default - no extra python files specified. 
+ * + * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraPythonFiles?: string[]; + + /** + * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * Only individual files are supported, not a directory path. + * + * @default - no extra files specified. + * + * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraFiles?: string[]; + + /** + * Setting this value to true prioritizes the customer's extra JAR files in the classpath. + * + * @default - extra jars are not prioritized. + * + * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJarsFirst?: boolean; +} + +/** + * Props for creating a Scala Spark (ETL or Streaming) job executable + */ +export interface ScalaJobExecutableProps extends SharedSparkJobExecutableProps { + /** + * The fully qualified Scala class name that serves as the entry point for the job. + * + * @see `--class` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly className: string; +} + +/** + * Props for creating a Python Spark (ETL or Streaming) job executable + */ +export interface PythonJobExecutableProps extends SharedSparkJobExecutableProps { + + /** + * The Python version to use. + */ + readonly pythonVersion: PythonVersion; + + /** + * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. + * Only individual files are supported, not a directory path. 
+ * + * @default - no extra python files and argument is not set + * + * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraPythonFiles?: string[]; + + /** + * Setting this value to true prioritizes the customer's extra JAR files in the classpath. + * + * @default - priortiy is not given to extra jars and argument is not set + * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJarsFirst?: boolean; +} + +/** + * Props for creating a Python shell job executable + */ +export interface PythonShellExecutableProps extends SharedJobExecutableProps { + + /** + * The Python version to use. + */ + readonly pythonVersion: PythonVersion; + + /** + * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. + * Only individual files are supported, not a directory path. + * + * @default - no extra python files and argument is not set + * + * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraPythonFiles?: string[]; +} + +/** + * The executable properties related to the Glue job's GlueVersion, JobType and code + * + * TODO test for exceptions + */ +export class JobExecutable { + + /** + * Create Scala executable props for Apache Spark ETL job. + * + * @param props Scala Apache Spark Job props + */ + public static etlScala(props: ScalaJobExecutableProps): JobExecutable { + return new JobExecutable({ + ...props, + type: JobType.ETL, + language: JobLanguage.SCALA, + }); + } + + /** + * Create Scala executable props for Apache Spark Streaming job. 
+ * + * @param props Scala Apache Spark Job props + */ + public static streamingScala(props: ScalaJobExecutableProps): JobExecutable { + return new JobExecutable({ + ...props, + type: JobType.STREAMING, + language: JobLanguage.SCALA, + }); + } + + /** + * Create Python executable props for Apache Spark ETL job. + * + * @param props Python Apache Spark Job props + */ + public static etlPython(props: PythonJobExecutableProps): JobExecutable { + return new JobExecutable({ + ...props, + type: JobType.ETL, + language: JobLanguage.PYTHON, + }); + } + + /** + * Create Python executable props for Apache Spark Streaming job. + * + * @param props Python Apache Spark Job props + */ + public static streamingPython(props: PythonJobExecutableProps): JobExecutable { + return new JobExecutable({ + ...props, + type: JobType.STREAMING, + language: JobLanguage.PYTHON, + }); + } + + /** + * Create Python executable props for python shell jobs. + * + * @param props Python Shell Job props. + */ + public static shellPython(props: PythonShellExecutableProps): JobExecutable { + return new JobExecutable({ + ...props, + type: JobType.PYTHON_SHELL, + language: JobLanguage.PYTHON, + }); + } + + /** + * Create a custom JobExecutable. + * + * @param config custom job executable configuration. 
+ */ + public static of(config: JobExecutableConfig): JobExecutable { + return new JobExecutable(config); + } + + private config: JobExecutableConfig; + + private constructor(config: JobExecutableConfig) { + if (JobType.PYTHON_SHELL === config.type) { + if (config.language !== JobLanguage.PYTHON) { + throw new Error('Python shell requires the language to be set to Python'); + } + if ([GlueVersion.V0_9, GlueVersion.V1_0].includes(config.glueVersion)) { + throw new Error(`Specified GlueVersion ${config.glueVersion.name} does not support Python Shell`); + } + } + if (config.extraJarsFirst && [GlueVersion.V0_9, GlueVersion.V1_0].includes(config.glueVersion)) { + throw new Error(`Specified GlueVersion ${config.glueVersion.name} does not support extraJarsFirst`); + } + this.config = config; + } + + public bind(_scope: constructs.Construct): JobExecutableConfig { + return this.config; + } +} \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 71e8db0bd53ee..1563b4ffda177 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -3,52 +3,11 @@ import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; +import { JobExecutable, JobExecutableConfig } from '.'; import { IConnection } from './connection'; import { CfnJob } from './glue.generated'; import { ISecurityConfiguration } from './security-configuration'; -/** - * AWS Glue version determines the versions of Apache Spark and Python that are available to the job. - * - * @see https://docs.aws.amazon.com/glue/latest/dg/add-job.html. - * - * If you need to use a GlueVersion that doesn't exist as a static member, you - * can instantiate a `GlueVersion` object, e.g: `GlueVersion.of('1.5')`. 
- */ -export class GlueVersion { - /** - * Glue version using Spark 2.2.1 and Python 2.7 - */ - public static readonly V0_9 = new GlueVersion('0.9'); - - /** - * Glue version using Spark 2.4.3, Python 2.7 and Python 3.6 - */ - public static readonly V1_0 = new GlueVersion('1.0'); - - /** - * Glue version using Spark 2.4.3 and Python 3.7 - */ - public static readonly V2_0 = new GlueVersion('2.0'); - - /** - * Custom Glue version - * @param version custom version - */ - public static of(version: string): GlueVersion { - return new GlueVersion(version); - } - - /** - * The name of this GlueVersion, as expected by Job resource. - */ - public readonly name: string; - - private constructor(name: string) { - this.name = name; - } -} - /** * The type of predefined worker that is allocated when a job runs. * @@ -89,21 +48,6 @@ export class WorkerType { } } -/** - * Python version - */ -export enum PythonVersion { - /** - * Python 2 (the exact version depends on GlueVersion and JobCommand used) - */ - TWO = '2', - - /** - * Python 3 (the exact version depends on GlueVersion and JobCommand used) - */ - THREE = '3', -} - /** * Job states emitted by Glue to CloudWatch Events. * @@ -163,395 +107,6 @@ export enum MetricType { COUNT = 'count', } -/** - * Runtime language of the Glue job - */ -export enum JobLanguage { - /** - * Scala - */ - SCALA = 'scala', - - /** - * Python - */ - PYTHON = 'python', -} - -/** - * The job type. - * - * If you need to use a JobType that doesn't exist as a static member, you - * can instantiate a `JobType` object, e.g: `JobType.of('other name', ['supported languages'])`. - */ -export class JobType { - /** - * Command for running a Glue ETL job. - */ - public static readonly ETL = new JobType('glueetl', [JobLanguage.SCALA, JobLanguage.PYTHON]); - - /** - * Command for running a Glue streaming job. 
- */ - public static readonly STREAMING = new JobType('gluestreaming', [JobLanguage.SCALA, JobLanguage.PYTHON]); - - /** - * Command for running a Glue python shell job. - */ - public static readonly PYTHON_SHELL = new JobType('pythonshell', [JobLanguage.PYTHON]); - - /** - * Custom type name - * @param name type name - * @param languages languages supported by this job type - */ - public static of(name: string, languages: JobLanguage[]): JobType { - return new JobType(name, languages); - } - - /** - * The name of this JobType, as expected by Job resource. - */ - public readonly name: string; - - /** - * Languages supported by this job type - */ - public languages: JobLanguage[]; - - private constructor(name: string, languages: JobLanguage[]) { - if (languages.length === 0) { - throw new Error('languages cannot be empty'); - } - this.name = name; - this.languages = languages; - } -} - -/** - * TODO: Q for reviewer - any way to better model these props, Omit seems to cause problems with JSII? - */ -interface JobExecutableProps { - readonly glueVersion: GlueVersion; - - readonly language: JobLanguage; - - readonly type: JobType; - - readonly pythonVersion?: PythonVersion; - - readonly scriptLocation: string; - - readonly className?: string; - - readonly extraJars?: string[]; - - readonly extraPythonFiles?: string[]; - - readonly extraFiles?: string[]; - - readonly extraJarsFirst?: boolean; -} - -/** - * Props for creating a Scala Spark (ETL or Streaming) job executable - */ -export interface ScalaJobExecutableProps { - - /** - * Glue version. - * - * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html - */ - readonly glueVersion: GlueVersion; - - /** - * Specify the type of the job whether it's an Apache Spark ETL or streaming one. - */ - readonly type: JobType; - - /** - * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. 
- */ - readonly scriptLocation: string; - - /** - * The fully qualified Scala class name that serves as the entry point for the job. - * - * @see `--class` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly className: string; - - /** - * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. - * Only individual files are supported, not a directory path. - * - * @default - no extra jars and argument is not set - * - * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraJars?: string[]; - - /** - * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. - * Only individual files are supported, not a directory path. - * - * @default - no extra files and argument is not set - * - * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraFiles?: string[]; - - /** - * Setting this value to true prioritizes the customer's extra JAR files in the classpath. - * - * @default - priortiy is not given to extra jars and argument is not set - * - * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraJarsFirst?: boolean; -} - -/** - * Props for creating a Python Spark (ETL or Streaming) job executable - */ -export interface PythonJobExecutableProps { - - /** - * Glue version. - * - * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html - */ - readonly glueVersion: GlueVersion; - - /** - * Specify the type of the job whether it's an Apache Spark ETL or streaming one. - */ - readonly type: JobType; - - /** - * The Python version to use. 
- */ - readonly pythonVersion: PythonVersion; - - /** - * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. - */ - readonly scriptLocation: string; - - /** - * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. - * Only individual files are supported, not a directory path. - * - * @default - no extra jars and argument is not set - * - * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraJars?: string[]; - - /** - * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. - * Only individual files are supported, not a directory path. - * - * @default - no extra python files and argument is not set - * - * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraPythonFiles?: string[]; - - /** - * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. - * Only individual files are supported, not a directory path. - * - * @default - no extra files and argument is not set - * - * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraFiles?: string[]; - - /** - * Setting this value to true prioritizes the customer's extra JAR files in the classpath. - * - * @default - priortiy is not given to extra jars and argument is not set - * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraJarsFirst?: boolean; -} - -/** - * Props for creating a Python shell job executable - */ -export interface PythonShellExecutableProps { - - /** - * Glue version. 
- * - * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html - */ - readonly glueVersion: GlueVersion; - - /** - * The Python version to use. - */ - readonly pythonVersion: PythonVersion; - - /** - * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. - */ - readonly scriptLocation: string; - - /** - * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. - * Only individual files are supported, not a directory path. - * - * @default - no extra python files and argument is not set - * - * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraPythonFiles?: string[]; - - /** - * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. - * Only individual files are supported, not a directory path. - * - * @default - no extra files and argument is not set - * - * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraFiles?: string[]; -} - -/** - * The executable properties related to the Glue job's GlueVersion, JobType and code - * - * TODO test for exceptions - */ -export class JobExecutable { - - /** - * Create Scala executable props for Apache Spark jobs (ETL or streaming) - * - * @param props Scala Apache Spark Job props - */ - public static scala(props: ScalaJobExecutableProps): JobExecutable { - return new JobExecutable({ - ...props, - language: JobLanguage.SCALA, - }); - } - - /** - * Create Python executable props for Apache Spark jobs (ETL or streaming) - * - * @param props Python Apache Spark Job props - */ - public static python(props: PythonJobExecutableProps): JobExecutable { - return new JobExecutable({ - ...props, - language: JobLanguage.PYTHON, - }); - } - - /** - * Create Python 
executable props for python shell jobs - * - * @param props Python Shell Job props - */ - public static pythonShell(props: PythonShellExecutableProps): JobExecutable { - return new JobExecutable({ - ...props, - type: JobType.PYTHON_SHELL, - language: JobLanguage.PYTHON, - }); - } - - /** - * Glue version. - * - * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html - */ - public readonly glueVersion: GlueVersion; - - /** - * The language of the job (Scala or Python). - * - * @see `--job-languae` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - public readonly language: JobLanguage; - - /** - * Specify the type of the job whether it's an Apache Spark ETL or streaming one or if it's a Python shell job. - */ - public readonly type: JobType; - - /** - * The Python version to use. - */ - public readonly pythonVersion?: PythonVersion; - - /** - * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. - */ - public readonly scriptLocation: string; - - /** - * The Scala class that serves as the entry point for the job. This applies only if your the job langauage is Scala. - * - * @see `--class` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - public readonly className?: string; - - /** - * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. - * Only individual files are supported, not a directory path. - * - * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - public readonly extraJars?: string[]; - - /** - * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. - * Only individual files are supported, not a directory path. 
- * - * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - public readonly extraPythonFiles?: string[]; - - /** - * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. - * Only individual files are supported, not a directory path. - * - * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - public readonly extraFiles?: string[]; - - /** - * Setting this value to true prioritizes the customer's extra JAR files in the classpath. - * - * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - public readonly extraJarsFirst?: boolean; - - private constructor(props: JobExecutableProps) { - if (!props.type.languages.includes(props.language)) { - throw new Error(`${props.type.name} JobType does not support ${props.language} language`); - } - if (props.extraJarsFirst && [GlueVersion.V0_9, GlueVersion.V1_0].includes(props.glueVersion)) { - throw new Error(`${props.type.name} JobType does not support ${props.language} language`); - } - this.glueVersion = props.glueVersion; - this.language = props.language; - this.type = props.type; - this.pythonVersion = props.pythonVersion; - this.scriptLocation = props.scriptLocation; - this.className = props.className; - this.extraJars = props.extraJars; - this.extraPythonFiles = props.extraPythonFiles; - this.extraFiles = props.extraFiles; - this.extraJarsFirst = props.extraJarsFirst; - } -} - /** * Interface representing a created or an imported {@link Job}. 
*/ @@ -977,8 +532,9 @@ export class Job extends JobBase { managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')], }); + const executable = props.executable.bind(this); const defaultArguments = { - ...this.executableArguments(props.executable), + ...this.executableArguments(executable), ...props.defaultArguments, }; @@ -987,11 +543,11 @@ export class Job extends JobBase { description: props.description, role: this.role.roleArn, command: { - name: props.executable.type.name, - scriptLocation: props.executable.scriptLocation, - pythonVersion: props.executable.pythonVersion, + name: executable.type.name, + scriptLocation: executable.scriptLocation, + pythonVersion: executable.pythonVersion, }, - glueVersion: props.executable.glueVersion.name, + glueVersion: executable.glueVersion.name, workerType: props.workerType?.name, numberOfWorkers: props.numberOfWorkers, maxCapacity: props.maxCapacity, @@ -1010,22 +566,22 @@ export class Job extends JobBase { this.jobName = resourceName; } - private executableArguments(executable: JobExecutable) { + private executableArguments(config: JobExecutableConfig) { const args: { [key: string]: string } = {}; - args['--job-language'] = executable.language; - if (executable.className) { - args['--class'] = executable.className; + args['--job-language'] = config.language; + if (config.className) { + args['--class'] = config.className; } - if (executable.extraJars && executable.extraJars.length > 0) { - args['--extra-jars'] = executable.extraJars.join(','); + if (config.extraJars && config.extraJars.length > 0) { + args['--extra-jars'] = config.extraJars.join(','); } - if (executable.extraPythonFiles && executable.extraPythonFiles.length > 0) { - args['--extra-py-files'] = executable.extraPythonFiles.join(','); + if (config.extraPythonFiles && config.extraPythonFiles.length > 0) { + args['--extra-py-files'] = config.extraPythonFiles.join(','); } - if (executable.extraFiles && 
executable.extraFiles.length > 0) { - args['--extra-files'] = executable.extraFiles.join(','); + if (config.extraFiles && config.extraFiles.length > 0) { + args['--extra-files'] = config.extraFiles.join(','); } - if (executable.extraJarsFirst) { + if (config.extraJarsFirst) { args['--user-jars-first'] = 'true'; } return args; diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index 5cb2bfc955db3..190cb2b0c9967 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -149,6 +149,7 @@ "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.PARQUET", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.ORC", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.value", + "docs-public-apis:@aws-cdk/aws-glue.JobExecutable.bind", "no-unused-type:@aws-cdk/aws-glue.JobState" ] }, diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index cdf8b909e817b..9cadac4c9c564 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -12,9 +12,8 @@ const script = new s3_assets.Asset(stack, 'script', { }); const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { - executable: glue.JobExecutable.python({ + executable: glue.JobExecutable.etlPython({ glueVersion: glue.GlueVersion.V2_0, - type: glue.JobType.ETL, pythonVersion: glue.PythonVersion.TWO, scriptLocation: script.s3ObjectUrl, }), @@ -22,9 +21,8 @@ const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { script.bucket.grantRead(minimalEtlJob.role); const minimalStreamingJob = new glue.Job(stack, 'MinimalGlueStreamingJob', { - executable: glue.JobExecutable.python({ + executable: glue.JobExecutable.streamingPython({ glueVersion: glue.GlueVersion.V2_0, - type: glue.JobType.STREAMING, pythonVersion: glue.PythonVersion.TWO, scriptLocation: script.s3ObjectUrl, }), @@ -32,7 +30,7 @@ const minimalStreamingJob = new 
glue.Job(stack, 'MinimalGlueStreamingJob', { script.bucket.grantRead(minimalStreamingJob.role); const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { - executable: glue.JobExecutable.pythonShell({ + executable: glue.JobExecutable.shellPython({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.TWO, scriptLocation: script.s3ObjectUrl, @@ -41,9 +39,8 @@ const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { script.bucket.grantRead(minimalPythonShellJob.role); const etlJob = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.python({ + executable: glue.JobExecutable.etlPython({ glueVersion: glue.GlueVersion.V2_0, - type: glue.JobType.ETL, pythonVersion: glue.PythonVersion.TWO, scriptLocation: script.s3ObjectUrl, }), diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 38f3ae7da2c8e..966ee73509bf1 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -4,6 +4,7 @@ import * as iam from '@aws-cdk/aws-iam'; import * as cdk from '@aws-cdk/core'; import '@aws-cdk/assert-internal/jest'; import * as glue from '../lib'; +import { PythonVersion } from '../lib'; describe('GlueVersion', () => { test('.V0_9', () => expect(glue.GlueVersion.V0_9.name).toEqual('0.9')); @@ -26,26 +27,13 @@ describe('WorkerType', () => { }); describe('JobType', () => { - test('.ETL', () => { - expect(glue.JobType.ETL.name).toEqual('glueetl'); - expect(glue.JobType.ETL.languages).toEqual([glue.JobLanguage.SCALA, glue.JobLanguage.PYTHON]); - }); + test('.ETL', () => expect(glue.JobType.ETL.name).toEqual('glueetl')); - test('.STREAMING', () => { - expect(glue.JobType.STREAMING.name).toEqual('gluestreaming'); - expect(glue.JobType.STREAMING.languages).toEqual([glue.JobLanguage.SCALA, glue.JobLanguage.PYTHON]); - }); + test('.STREAMING', () => expect(glue.JobType.STREAMING.name).toEqual('gluestreaming')); - 
test('.PYTHON_SHELL', () => { - expect(glue.JobType.PYTHON_SHELL.name).toEqual('pythonshell'); - expect(glue.JobType.PYTHON_SHELL.languages).toEqual([glue.JobLanguage.PYTHON]); - }); + test('.PYTHON_SHELL', () => expect(glue.JobType.PYTHON_SHELL.name).toEqual('pythonshell')); - test('of(customName, supportedLanguages) sets name correctly', () => { - const jobType = glue.JobType.of('CustomName', [glue.JobLanguage.SCALA]); - expect(jobType.name).toEqual('CustomName'); - expect(jobType.languages).toEqual([glue.JobLanguage.SCALA]); - }); + test('of(customName) sets name correctly', () => expect(glue.JobType.of('CustomName').name).toEqual('CustomName')); }); describe('Job', () => { @@ -70,19 +58,24 @@ describe('Job', () => { describe('new', () => { let scriptLocation: string; + let extraJars: string[]; + let extraFiles: string[]; + let extraPythonFiles: string[]; let className: string; let job: glue.Job; beforeEach(() => { scriptLocation = 's3://bucketName/script'; className = 'com.amazon.test.ClassName'; + extraJars = ['s3://bucketName/file1.jar', 's3://bucketName/file2.jar']; + extraPythonFiles = ['s3://bucketName/file1.py', 's3://bucketName/file2.py']; + extraFiles = ['s3://bucketName/file1.txt', 's3://bucketName/file2.txt']; }); describe('with necessary props only', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scala({ - type: glue.JobType.ETL, + executable: glue.JobExecutable.etlScala({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, @@ -151,10 +144,9 @@ describe('Job', () => { test('with a custom role should use it and set it in CloudFormation', () => { const role = iam.Role.fromRoleArn(stack, 'Role', 'arn:aws:iam::123456789012:role/TestRole'); job = new glue.Job(stack, 'JobWithRole', { - executable: glue.JobExecutable.scala({ - type: glue.JobType.ETL, + executable: glue.JobExecutable.etlPython({ glueVersion: glue.GlueVersion.V2_0, - className, + pythonVersion: PythonVersion.TWO, scriptLocation, }), 
role, @@ -168,8 +160,7 @@ describe('Job', () => { test('with a custom jobName should set it in CloudFormation', () => { job = new glue.Job(stack, 'JobWithName', { - executable: glue.JobExecutable.scala({ - type: glue.JobType.ETL, + executable: glue.JobExecutable.streamingScala({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, @@ -188,10 +179,9 @@ describe('Job', () => { job = new glue.Job(stack, 'Job', { jobName, description: 'test job', - executable: glue.JobExecutable.scala({ - type: glue.JobType.ETL, + executable: glue.JobExecutable.streamingPython({ glueVersion: glue.GlueVersion.V2_0, - className, + pythonVersion: PythonVersion.TWO, scriptLocation, }), workerType: glue.WorkerType.G_2X, @@ -215,8 +205,9 @@ describe('Job', () => { test('should synthesize correctly', () => { cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { Command: { - Name: 'glueetl', + Name: 'gluestreaming', ScriptLocation: 's3://bucketName/script', + PythonVersion: '2', }, Role: { 'Fn::GetAtt': [ @@ -225,8 +216,7 @@ describe('Job', () => { ], }, DefaultArguments: { - '--job-language': 'scala', - '--class': 'com.amazon.test.ClassName', + '--job-language': 'python', 'arg1': 'value1', 'arg2': 'value2', }, @@ -256,11 +246,202 @@ describe('Job', () => { }); }); + describe('python shell job', () => { + + test('with minimal props should synthesize correctly', () => { + new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.shellPython({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: PythonVersion.THREE, + scriptLocation, + }), + }); + + // check the job using the role + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Command: { + Name: 'pythonshell', + ScriptLocation: scriptLocation, + PythonVersion: '3', + }, + GlueVersion: '2.0', + })); + }); + + test('with unsupported glue version', () => { + expect(() => new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.shellPython({ + glueVersion: glue.GlueVersion.V0_9, + 
pythonVersion: PythonVersion.TWO, + scriptLocation, + }), + })).toThrow('Specified GlueVersion 0.9 does not support Python Shell'); + }); + + test('with all props should synthesize correctly', () => { + new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.shellPython({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: PythonVersion.THREE, + scriptLocation, + extraPythonFiles, + extraFiles, + }), + }); + + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Command: { + Name: 'pythonshell', + ScriptLocation: scriptLocation, + PythonVersion: '3', + }, + GlueVersion: '2.0', + DefaultArguments: { + '--job-language': 'python', + '--extra-py-files': 's3://bucketName/file1.py,s3://bucketName/file2.py', + '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', + }, + })); + }); + }); + + describe('python etl job', () => { + + test('with minimal props should synthesize correctly', () => { + new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.etlPython({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: PythonVersion.TWO, + scriptLocation, + }), + }); + + // check the job using the role + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + GlueVersion: '2.0', + Command: { + Name: 'glueetl', + ScriptLocation: scriptLocation, + PythonVersion: '2', + }, + Role: { + 'Fn::GetAtt': [ + 'JobServiceRole4F432993', + 'Arn', + ], + }, + DefaultArguments: { + '--job-language': 'python', + }, + })); + }); + + test('with all props should synthesize correctly', () => { + new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.etlPython({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: PythonVersion.TWO, + extraJarsFirst: true, + scriptLocation, + extraPythonFiles, + extraJars, + extraFiles, + }), + }); + + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + GlueVersion: '2.0', + Command: { + Name: 'glueetl', + ScriptLocation: scriptLocation, + PythonVersion: '2', + }, + 
Role: { + 'Fn::GetAtt': [ + 'JobServiceRole4F432993', + 'Arn', + ], + }, + DefaultArguments: { + '--job-language': 'python', + '--extra-jars': 's3://bucketName/file1.jar,s3://bucketName/file2.jar', + '--extra-py-files': 's3://bucketName/file1.py,s3://bucketName/file2.py', + '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', + '--user-jars-first': 'true', + }, + })); + }); + }); + + describe('scala streaming job', () => { + + test('with minimal props should synthesize correctly', () => { + new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.streamingScala({ + glueVersion: glue.GlueVersion.V2_0, + scriptLocation, + className, + }), + }); + + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + GlueVersion: '2.0', + Command: { + Name: 'gluestreaming', + ScriptLocation: scriptLocation, + }, + Role: { + 'Fn::GetAtt': [ + 'JobServiceRole4F432993', + 'Arn', + ], + }, + DefaultArguments: { + '--job-language': 'scala', + '--class': 'com.amazon.test.ClassName', + }, + })); + }); + + test('with all props should synthesize correctly', () => { + new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.streamingScala({ + glueVersion: glue.GlueVersion.V2_0, + extraJarsFirst: true, + className, + scriptLocation, + extraJars, + extraFiles, + }), + }); + + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + GlueVersion: '2.0', + Command: { + Name: 'gluestreaming', + ScriptLocation: scriptLocation, + }, + Role: { + 'Fn::GetAtt': [ + 'JobServiceRole4F432993', + 'Arn', + ], + }, + DefaultArguments: { + '--job-language': 'scala', + '--class': 'com.amazon.test.ClassName', + '--extra-jars': 's3://bucketName/file1.jar,s3://bucketName/file2.jar', + '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', + '--user-jars-first': 'true', + }, + })); + }); + }); + describe('event rules and rule-based metrics', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - executable: 
glue.JobExecutable.scala({ - type: glue.JobType.ETL, + executable: glue.JobExecutable.etlScala({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, From 2f0279864420c9daa4fc70cdac98c917c2c1ecd5 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Thu, 12 Aug 2021 11:08:31 +0100 Subject: [PATCH 18/50] add enableProfilingMetrics to JobProps --- packages/@aws-cdk/aws-glue/lib/job.ts | 21 ++++++++++++++------- packages/@aws-cdk/aws-glue/test/job.test.ts | 2 ++ 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 1563b4ffda177..bda70cfd2bb63 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -374,6 +374,11 @@ export interface JobAttributes { * Construction properties for {@link Job}. */ export interface JobProps { + /** + * The job's executable properties. + */ + readonly executable: JobExecutable; + /** * The name of the job. * @@ -476,14 +481,13 @@ export interface JobProps { readonly role?: iam.IRole; /** - * The job's executable properties. + * Enables the collection of metrics for job profiling. + * + * @default - no profiling metrics emitted. + * + * @see `--enable-metrics` at https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly executable: JobExecutable; - - // TODO '--TempDir': 's3-path-to-directory' we should have a prop to enable setting this, and enable a bucket to be created - // or one specified, the role should also be updated to have access to that bucket - - // TODO --enable-metrics confirm if it's needed for count/guage metrics or not and add a prop + readonly enableProfilingMetrics? 
:boolean; } /** @@ -537,6 +541,9 @@ export class Job extends JobBase { ...this.executableArguments(executable), ...props.defaultArguments, }; + if (props.enableProfilingMetrics) { + defaultArguments['--enable-metrics'] = ''; + } const jobResource = new CfnJob(this, 'Resource', { name: props.jobName, diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 966ee73509bf1..b489e02e78412 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -196,6 +196,7 @@ describe('Job', () => { }, connections: [glue.Connection.fromConnectionName(stack, 'ImportedConnection', 'ConnectionName')], securityConfiguration: glue.SecurityConfiguration.fromSecurityConfigurationName(stack, 'ImportedSecurityConfiguration', 'SecurityConfigurationName'), + enableProfilingMetrics: true, tags: { key: 'value', }, @@ -217,6 +218,7 @@ describe('Job', () => { }, DefaultArguments: { '--job-language': 'python', + '--enable-metrics': '', 'arg1': 'value1', 'arg2': 'value2', }, From fe0808957a0b43dc3907dc9b2ca18fd3955e3df3 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Thu, 12 Aug 2021 11:37:43 +0100 Subject: [PATCH 19/50] add @aws-cdk/assert-internal to package.json after merge --- packages/@aws-cdk/aws-glue/package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index cc766381cd6b7..7911c793165c8 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -75,6 +75,7 @@ "devDependencies": { "@types/jest": "^26.0.24", "@aws-cdk/assertions": "0.0.0", + "@aws-cdk/assert-internal": "0.0.0", "@aws-cdk/cx-api": "0.0.0", "@types/nodeunit": "^0.0.32", "@aws-cdk/aws-s3-assets": "0.0.0", From 55b5ee420da29ddfc0a3a10fe069eb37e71204a3 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Thu, 12 Aug 2021 15:38:15 +0100 Subject: [PATCH 20/50] add sparkUI optional prop to JobProps --- 
packages/@aws-cdk/aws-glue/lib/job.ts | 85 +++++++- packages/@aws-cdk/aws-glue/test/job.test.ts | 212 +++++++++++++++++++- 2 files changed, 294 insertions(+), 3 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index bda70cfd2bb63..419f83ea4076c 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -1,9 +1,10 @@ import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; +import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; -import { JobExecutable, JobExecutableConfig } from '.'; +import { JobExecutable, JobExecutableConfig, JobType } from '.'; import { IConnection } from './connection'; import { CfnJob } from './glue.generated'; import { ISecurityConfiguration } from './security-configuration'; @@ -360,6 +361,48 @@ abstract class JobBase extends cdk.Resource implements IJob { } +/** + * Properties for enabling Spark UI monitoring feature for Spark-based Glue jobs. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ +export interface SparkUIProps { + /** + * The bucket where the Glue job stores the logs. + * + * @default a new bucket will be created. + */ + readonly bucket?: s3.IBucket; + + /** + * The path inside the bucket (objects prefix) where the Glue job stores the logs. + * + * @default - no path will be used, the logs will be written at the root of the bucket. + */ + readonly path?: string; +} + +/** + * The Spark UI monitoring configurations for Spark-based Glue jobs. 
+ * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ +export interface SparkUIConfig { + /** + * The bucket where the Glue job stores the logs. + */ + readonly bucket: s3.IBucket; + + /** + * The path inside the bucket (objects prefix) where the Glue job stores the logs. + * + * @default - no path will be used, the logs will be written at the root of the bucket. + */ + readonly path?: string; +} + /** * Attributes for importing {@link Job}. */ @@ -488,6 +531,16 @@ export interface JobProps { * @see `--enable-metrics` at https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ readonly enableProfilingMetrics? :boolean; + + /** + * Enables the Spark UI debugging and monitoring with the specified props. + * + * @default - Spark UI debugging and monitoring is disabled. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly sparkUI?: SparkUIProps, } /** @@ -525,18 +578,27 @@ export class Job extends JobBase { */ public readonly role: iam.IRole; + /** + * The Spark monitoring configuration (Bucket, path) if Spark UI monitoring and debugging is enabled. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + public readonly sparkUIConfig?: SparkUIConfig; + constructor(scope: constructs.Construct, id: string, props: JobProps) { super(scope, id, { physicalName: props.jobName, }); + const executable = props.executable.bind(this); + // Create a basic service role if one is not provided https://docs.aws.amazon.com/glue/latest/dg/create-service-policy.html this.role = props.role ?? 
new iam.Role(this, 'ServiceRole', { assumedBy: new iam.ServicePrincipal('glue.amazonaws.com'), managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')], }); - const executable = props.executable.bind(this); const defaultArguments = { ...this.executableArguments(executable), ...props.defaultArguments, @@ -544,6 +606,9 @@ export class Job extends JobBase { if (props.enableProfilingMetrics) { defaultArguments['--enable-metrics'] = ''; } + if (props.sparkUI) { + this.sparkUIConfig = this.setupSparkUI(executable, this.role, defaultArguments, props.sparkUI); + } const jobResource = new CfnJob(this, 'Resource', { name: props.jobName, @@ -593,4 +658,20 @@ export class Job extends JobBase { } return args; } + + private setupSparkUI(executable: JobExecutableConfig, role: iam.IRole, args: {[key: string]: string}, props: SparkUIProps): SparkUIConfig { + if (![JobType.ETL, JobType.STREAMING].includes(executable.type)) { + throw new Error('Spark UI can only be configured for JobType.ETL or JobType.STREAMING jobs'); + } + + const bucket = props.bucket || new s3.Bucket(this, 'SparkUIBucket'); + bucket.grantReadWrite(role); + args['--enable-spark-ui'] = 'true'; + args['--spark-event-logs-path'] = `s3://${bucket.bucketName}/${props.path || ''}`; + + return { + ...props, + bucket, + }; + } } diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index b489e02e78412..91a2f258a0032 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -1,6 +1,7 @@ import * as cdkassert from '@aws-cdk/assert-internal'; import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; import * as iam from '@aws-cdk/aws-iam'; +import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import '@aws-cdk/assert-internal/jest'; import * as glue from '../lib'; @@ -174,6 +175,204 @@ describe('Job', () => { }); }); + describe('enabling spark ui but no bucket or 
path provided', () => { + beforeEach(() => { + job = new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.etlScala({ + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), + sparkUI: {}, + }); + }); + + test('should create spark ui bucket', () => { + cdkassert.expect(stack).to(cdkassert.countResources('AWS::S3::Bucket', 1)); + }); + + test('should grant the role read/write permissions to the spark ui bucket', () => { + cdkassert.expect(stack).to(cdkassert.haveResource('AWS::IAM::Policy', { + PolicyDocument: { + Statement: [ + { + Action: [ + 's3:GetObject*', + 's3:GetBucket*', + 's3:List*', + 's3:DeleteObject*', + 's3:PutObject*', + 's3:Abort*', + ], + Effect: 'Allow', + Resource: [ + { + 'Fn::GetAtt': [ + 'JobSparkUIBucket8E6A0139', + 'Arn', + ], + }, + { + 'Fn::Join': [ + '', + [ + { + 'Fn::GetAtt': [ + 'JobSparkUIBucket8E6A0139', + 'Arn', + ], + }, + '/*', + ], + ], + }, + ], + }, + ], + Version: '2012-10-17', + }, + PolicyName: 'JobServiceRoleDefaultPolicy03F68F9D', + Roles: [ + { + Ref: 'JobServiceRole4F432993', + }, + ], + })); + }); + + test('should set spark arguments on the job', () => { + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + DefaultArguments: { + '--enable-spark-ui': 'true', + '--spark-event-logs-path': { + 'Fn::Join': [ + '', + [ + 's3://', + { + Ref: 'JobSparkUIBucket8E6A0139', + }, + '/', + ], + ], + }, + }, + })); + }); + }); + + describe('enabling spark ui with bucket provided', () => { + let bucketName: string; + let bucket: s3.IBucket; + + beforeEach(() => { + bucketName = 'BucketName'; + bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); + job = new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.etlScala({ + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), + sparkUI: { + bucket, + }, + }); + }); + + test('should grant the role read/write permissions to the provided spark ui bucket', () => { + 
cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::IAM::Policy', { + PolicyDocument: { + Statement: [ + { + Action: [ + 's3:GetObject*', + 's3:GetBucket*', + 's3:List*', + 's3:DeleteObject*', + 's3:PutObject*', + 's3:Abort*', + ], + Effect: 'Allow', + Resource: [ + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':s3:::BucketName', + ], + ], + }, + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':s3:::BucketName/*', + ], + ], + }, + ], + }, + ], + }, + Roles: [ + { + Ref: 'JobServiceRole4F432993', + }, + ], + })); + }); + + test('should set spark arguments on the job', () => { + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + DefaultArguments: { + '--enable-spark-ui': 'true', + '--spark-event-logs-path': `s3://${bucketName}/`, + }, + })); + }); + }); + + describe('enabling spark ui with bucket and path provided', () => { + let bucketName: string; + let bucket: s3.IBucket; + let path: string; + + beforeEach(() => { + bucketName = 'BucketName'; + bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); + path = 'some/path/'; + job = new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.etlScala({ + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), + sparkUI: { + bucket, + path, + }, + }); + }); + + test('should set spark arguments on the job', () => { + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + DefaultArguments: { + '--enable-spark-ui': 'true', + '--spark-event-logs-path': `s3://${bucketName}/${path}`, + }, + })); + }); + }); + describe('with extended props', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { @@ -270,7 +469,7 @@ describe('Job', () => { })); }); - test('with unsupported glue version', () => { + test('with unsupported glue version throws', () => { expect(() => new glue.Job(stack, 'Job', { executable: glue.JobExecutable.shellPython({ glueVersion: glue.GlueVersion.V0_9, @@ -280,6 
+479,17 @@ describe('Job', () => { })).toThrow('Specified GlueVersion 0.9 does not support Python Shell'); }); + test('with unsupported Spark UI prop throws', () => { + expect(() => new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.shellPython({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: PythonVersion.TWO, + scriptLocation, + }), + sparkUI: {}, + })).toThrow('Spark UI can only be configured for JobType.ETL or JobType.STREAMING jobs'); + }); + test('with all props should synthesize correctly', () => { new glue.Job(stack, 'Job', { executable: glue.JobExecutable.shellPython({ From 3235ae796bd457150e21b41f2e2882fb0f04953c Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Fri, 13 Aug 2021 11:39:06 +0100 Subject: [PATCH 21/50] add continuousLogging optional prop to JobProps --- packages/@aws-cdk/aws-glue/lib/job.ts | 67 ++++++++++++++ packages/@aws-cdk/aws-glue/package.json | 2 + packages/@aws-cdk/aws-glue/test/job.test.ts | 97 +++++++++++++++++++++ 3 files changed, 166 insertions(+) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 419f83ea4076c..3a242fa574f04 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -1,6 +1,7 @@ import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; +import * as logs from '@aws-cdk/aws-logs'; import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; @@ -403,6 +404,42 @@ export interface SparkUIConfig { readonly path?: string; } +/** + * Properties for enabling Continuous Logging for Glue Jobs. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-continuous-logging-enable.html + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ +export interface ContinuousLoggingProps { + /** + * Specify a custom CloudWatch log group name. 
+ * + * @default LogGroup named `/aws-glue/jobs/logs-v2/`. + */ + readonly logGroup?: logs.ILogGroup; + + /** + * Specify a custom CloudWatch log stream prefix. + * + * @default the job run ID. + */ + readonly logStreamPrefix?: string; + + /** + * Enable pruning out non-useful Apache Spark driver/executor and Apache Hadoop YARN heartbeat log messages. + * + * @default true + */ + readonly filter?: boolean; + + /** + * Apply the provided conversion pattern. + * + * @default `%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n` + */ + readonly conversionPattern?: string; +} + /** * Attributes for importing {@link Job}. */ @@ -541,6 +578,16 @@ export interface JobProps { * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ readonly sparkUI?: SparkUIProps, + + /** + * Enables Continuous Logging with the specified props. + * + * @default - Continuous Logging is disabled. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-continuous-logging-enable.html + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly continuousLogging?: ContinuousLoggingProps, } /** @@ -609,6 +656,9 @@ export class Job extends JobBase { if (props.sparkUI) { this.sparkUIConfig = this.setupSparkUI(executable, this.role, defaultArguments, props.sparkUI); } + if (props.continuousLogging) { + this.setupContinuousLogging(this.role, defaultArguments, props.continuousLogging); + } const jobResource = new CfnJob(this, 'Resource', { name: props.jobName, @@ -674,4 +724,21 @@ export class Job extends JobBase { bucket, }; } + + private setupContinuousLogging(role: iam.IRole, args: {[key: string]: string}, props: ContinuousLoggingProps) { + args['--enable-continuous-cloudwatch-log'] = 'true'; + args['--enable-continuous-log-filter'] = (props.filter?? 
true).toString(); + + if (props.logGroup) { + args['--continuous-log-logGroup'] = props.logGroup.logGroupName; + props.logGroup.grantWrite(role); + } + + if (props.logStreamPrefix) { + args['--continuous-log-logStreamPrefix'] = props.logStreamPrefix; + } + if (props.conversionPattern) { + args['--continuous-log-conversionPattern'] = props.conversionPattern; + } + } } diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index 7911c793165c8..67aedf47b6100 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -90,6 +90,7 @@ "@aws-cdk/aws-events": "0.0.0", "@aws-cdk/aws-ec2": "0.0.0", "@aws-cdk/aws-iam": "0.0.0", + "@aws-cdk/aws-logs": "0.0.0", "@aws-cdk/aws-kms": "0.0.0", "@aws-cdk/aws-s3": "0.0.0", "@aws-cdk/core": "0.0.0", @@ -101,6 +102,7 @@ "@aws-cdk/aws-events": "0.0.0", "@aws-cdk/aws-ec2": "0.0.0", "@aws-cdk/aws-iam": "0.0.0", + "@aws-cdk/aws-logs": "0.0.0", "@aws-cdk/aws-kms": "0.0.0", "@aws-cdk/aws-s3": "0.0.0", "@aws-cdk/core": "0.0.0", diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 91a2f258a0032..4d2b4b9df9ccc 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -1,6 +1,7 @@ import * as cdkassert from '@aws-cdk/assert-internal'; import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; import * as iam from '@aws-cdk/aws-iam'; +import * as logs from '@aws-cdk/aws-logs'; import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import '@aws-cdk/assert-internal/jest'; @@ -175,6 +176,102 @@ describe('Job', () => { }); }); + describe('enabling continuous logging with defaults', () => { + beforeEach(() => { + job = new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.etlScala({ + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), + continuousLogging: {}, + }); + }); + + test('should set minimal default 
arguments', () => { + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + DefaultArguments: { + '--enable-continuous-cloudwatch-log': 'true', + '--enable-continuous-log-filter': 'true', + }, + })); + }); + }); + + describe('enabling continuous logging with all props set', () => { + let logGroup; + + beforeEach(() => { + logGroup = logs.LogGroup.fromLogGroupName(stack, 'LogGroup', 'LogGroupName'); + job = new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.etlScala({ + glueVersion: glue.GlueVersion.V2_0, + className, + scriptLocation, + }), + continuousLogging: { + filter: false, + logStreamPrefix: 'LogStreamPrefix', + conversionPattern: '%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n', + logGroup, + }, + }); + }); + + test('should set all arguments', () => { + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + DefaultArguments: { + '--enable-continuous-cloudwatch-log': 'true', + '--enable-continuous-log-filter': 'false', + '--continuous-log-logGroup': 'LogGroupName', + '--continuous-log-logStreamPrefix': 'LogStreamPrefix', + '--continuous-log-conversionPattern': '%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n', + }, + })); + }); + + test('should grant cloudwatch log write permissions', () => { + cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::IAM::Policy', { + PolicyDocument: { + Statement: [ + { + Action: [ + 'logs:CreateLogStream', + 'logs:PutLogEvents', + ], + Effect: 'Allow', + Resource: { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':logs:', + { + Ref: 'AWS::Region', + }, + ':', + { + Ref: 'AWS::AccountId', + }, + ':log-group:LogGroupName:*', + ], + ], + }, + }, + ], + }, + Roles: [ + { + Ref: 'JobServiceRole4F432993', + }, + ], + })); + }); + }); + describe('enabling spark ui but no bucket or path provided', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { From 0ff9d011cf4549278a243c8f981ee033241a1e38 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Tue, 24 Aug 2021 
11:32:20 +0100 Subject: [PATCH 22/50] add GlueVersion.V3_0 https://aws.amazon.com/blogs/big-data/introducing-aws-glue-3-0-with-optimized-apache-spark-3-1-runtime-for-faster-data-integration/ --- .../@aws-cdk/aws-glue/lib/job-executable.ts | 8 ++++++++ .../aws-glue/test/integ.job.expected.json | 8 ++++---- packages/@aws-cdk/aws-glue/test/integ.job.ts | 8 ++++---- packages/@aws-cdk/aws-glue/test/job.test.ts | 19 ++++++++++--------- 4 files changed, 26 insertions(+), 17 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index 1e97dba2716a8..e7f29ca0745df 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -24,6 +24,11 @@ export class GlueVersion { */ public static readonly V2_0 = new GlueVersion('2.0'); + /** + * Glue version using Spark 3.1.1 and Python 3.7 + */ + public static readonly V3_0 = new GlueVersion('3.0'); + /** * Custom Glue version * @param version custom version @@ -398,6 +403,9 @@ export class JobExecutable { if (config.extraJarsFirst && [GlueVersion.V0_9, GlueVersion.V1_0].includes(config.glueVersion)) { throw new Error(`Specified GlueVersion ${config.glueVersion.name} does not support extraJarsFirst`); } + if (config.pythonVersion === PythonVersion.TWO && ![GlueVersion.V0_9, GlueVersion.V1_0].includes(config.glueVersion)) { + throw new Error(`Specified GlueVersion ${config.glueVersion.name} does not support PythonVersion ${config.pythonVersion}`); + } this.config = config; } diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json index 1610c0c3d8bb4..a4f289ebabaf1 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -107,7 +107,7 @@ "Properties": { "Command": { "Name": "glueetl", - "PythonVersion": "2", + "PythonVersion": "3", "ScriptLocation": { "Fn::Join": [ 
"", @@ -252,7 +252,7 @@ "Properties": { "Command": { "Name": "gluestreaming", - "PythonVersion": "2", + "PythonVersion": "3", "ScriptLocation": { "Fn::Join": [ "", @@ -397,7 +397,7 @@ "Properties": { "Command": { "Name": "pythonshell", - "PythonVersion": "2", + "PythonVersion": "3", "ScriptLocation": { "Fn::Join": [ "", @@ -542,7 +542,7 @@ "Properties": { "Command": { "Name": "glueetl", - "PythonVersion": "2", + "PythonVersion": "3", "ScriptLocation": { "Fn::Join": [ "", diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index 9cadac4c9c564..3a3cccf3568cb 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -14,7 +14,7 @@ const script = new s3_assets.Asset(stack, 'script', { const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { executable: glue.JobExecutable.etlPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.TWO, + pythonVersion: glue.PythonVersion.THREE, scriptLocation: script.s3ObjectUrl, }), }); @@ -23,7 +23,7 @@ script.bucket.grantRead(minimalEtlJob.role); const minimalStreamingJob = new glue.Job(stack, 'MinimalGlueStreamingJob', { executable: glue.JobExecutable.streamingPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.TWO, + pythonVersion: glue.PythonVersion.THREE, scriptLocation: script.s3ObjectUrl, }), }); @@ -32,7 +32,7 @@ script.bucket.grantRead(minimalStreamingJob.role); const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { executable: glue.JobExecutable.shellPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.TWO, + pythonVersion: glue.PythonVersion.THREE, scriptLocation: script.s3ObjectUrl, }), }); @@ -41,7 +41,7 @@ script.bucket.grantRead(minimalPythonShellJob.role); const etlJob = new glue.Job(stack, 'Job', { executable: glue.JobExecutable.etlPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: 
glue.PythonVersion.TWO, + pythonVersion: glue.PythonVersion.THREE, scriptLocation: script.s3ObjectUrl, }), workerType: glue.WorkerType.G_2X, diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 4d2b4b9df9ccc..507f5971b6eae 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -15,6 +15,8 @@ describe('GlueVersion', () => { test('.V2_0', () => expect(glue.GlueVersion.V2_0.name).toEqual('2.0')); + test('.V3_0', () => expect(glue.GlueVersion.V3_0.name).toEqual('3.0')); + test('of(customVersion) sets name correctly', () => expect(glue.GlueVersion.of('CustomVersion').name).toEqual('CustomVersion')); }); @@ -148,7 +150,7 @@ describe('Job', () => { job = new glue.Job(stack, 'JobWithRole', { executable: glue.JobExecutable.etlPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.TWO, + pythonVersion: PythonVersion.THREE, scriptLocation, }), role, @@ -477,7 +479,7 @@ describe('Job', () => { description: 'test job', executable: glue.JobExecutable.streamingPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.TWO, + pythonVersion: PythonVersion.THREE, scriptLocation, }), workerType: glue.WorkerType.G_2X, @@ -504,7 +506,7 @@ describe('Job', () => { Command: { Name: 'gluestreaming', ScriptLocation: 's3://bucketName/script', - PythonVersion: '2', + PythonVersion: '3', }, Role: { 'Fn::GetAtt': [ @@ -555,7 +557,6 @@ describe('Job', () => { }), }); - // check the job using the role cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { Command: { Name: 'pythonshell', @@ -580,7 +581,7 @@ describe('Job', () => { expect(() => new glue.Job(stack, 'Job', { executable: glue.JobExecutable.shellPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.TWO, + pythonVersion: PythonVersion.THREE, scriptLocation, }), sparkUI: {}, @@ -620,7 +621,7 @@ describe('Job', () => { new glue.Job(stack, 'Job', { 
executable: glue.JobExecutable.etlPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.TWO, + pythonVersion: PythonVersion.THREE, scriptLocation, }), }); @@ -631,7 +632,7 @@ describe('Job', () => { Command: { Name: 'glueetl', ScriptLocation: scriptLocation, - PythonVersion: '2', + PythonVersion: '3', }, Role: { 'Fn::GetAtt': [ @@ -649,7 +650,7 @@ describe('Job', () => { new glue.Job(stack, 'Job', { executable: glue.JobExecutable.etlPython({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.TWO, + pythonVersion: PythonVersion.THREE, extraJarsFirst: true, scriptLocation, extraPythonFiles, @@ -663,7 +664,7 @@ describe('Job', () => { Command: { Name: 'glueetl', ScriptLocation: scriptLocation, - PythonVersion: '2', + PythonVersion: '3', }, Role: { 'Fn::GetAtt': [ From fb7e376d7eaacf4048171bcaad463adc40c7341c Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Mon, 30 Aug 2021 22:47:13 +0100 Subject: [PATCH 23/50] address smaller comments --- .../@aws-cdk/aws-glue/lib/job-executable.ts | 16 +++-- packages/@aws-cdk/aws-glue/lib/job.ts | 63 ++++++++++++------- packages/@aws-cdk/aws-glue/test/integ.job.ts | 8 +-- packages/@aws-cdk/aws-glue/test/job.test.ts | 47 +++++++------- 4 files changed, 76 insertions(+), 58 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index e7f29ca0745df..3032e50d23e86 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -1,5 +1,3 @@ -import * as constructs from 'constructs'; - /** * AWS Glue version determines the versions of Apache Spark and Python that are available to the job. 
* @@ -320,7 +318,7 @@ export class JobExecutable { * * @param props Scala Apache Spark Job props */ - public static etlScala(props: ScalaJobExecutableProps): JobExecutable { + public static scalaEtl(props: ScalaJobExecutableProps): JobExecutable { return new JobExecutable({ ...props, type: JobType.ETL, @@ -333,7 +331,7 @@ export class JobExecutable { * * @param props Scala Apache Spark Job props */ - public static streamingScala(props: ScalaJobExecutableProps): JobExecutable { + public static scalaStreaming(props: ScalaJobExecutableProps): JobExecutable { return new JobExecutable({ ...props, type: JobType.STREAMING, @@ -346,7 +344,7 @@ export class JobExecutable { * * @param props Python Apache Spark Job props */ - public static etlPython(props: PythonJobExecutableProps): JobExecutable { + public static pythonEtl(props: PythonJobExecutableProps): JobExecutable { return new JobExecutable({ ...props, type: JobType.ETL, @@ -359,7 +357,7 @@ export class JobExecutable { * * @param props Python Apache Spark Job props */ - public static streamingPython(props: PythonJobExecutableProps): JobExecutable { + public static pythonStreaming(props: PythonJobExecutableProps): JobExecutable { return new JobExecutable({ ...props, type: JobType.STREAMING, @@ -372,7 +370,7 @@ export class JobExecutable { * * @param props Python Shell Job props. 
*/ - public static shellPython(props: PythonShellExecutableProps): JobExecutable { + public static pythonShell(props: PythonShellExecutableProps): JobExecutable { return new JobExecutable({ ...props, type: JobType.PYTHON_SHELL, @@ -409,7 +407,7 @@ export class JobExecutable { this.config = config; } - public bind(_scope: constructs.Construct): JobExecutableConfig { + public bind(): JobExecutableConfig { return this.config; } -} \ No newline at end of file +} diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 3a242fa574f04..135ec60793059 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -369,6 +369,11 @@ abstract class JobBase extends cdk.Resource implements IJob { * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ export interface SparkUIProps { + /** + * Enable Spark UI. + */ + readonly enabled: boolean + /** * The bucket where the Glue job stores the logs. * @@ -411,6 +416,11 @@ export interface SparkUIConfig { * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ export interface ContinuousLoggingProps { + /** + * Enable continuous logging. + */ + readonly enabled: boolean; + /** * Specify a custom CloudWatch log group name. * @@ -638,7 +648,7 @@ export class Job extends JobBase { physicalName: props.jobName, }); - const executable = props.executable.bind(this); + const executable = props.executable.bind(); // Create a basic service role if one is not provided https://docs.aws.amazon.com/glue/latest/dg/create-service-policy.html this.role = props.role ?? new iam.Role(this, 'ServiceRole', { @@ -646,19 +656,18 @@ export class Job extends JobBase { managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')], }); + const sparkUI = props.sparkUI?.enabled ? 
this.setupSparkUI(executable, this.role, props.sparkUI) : undefined; + this.sparkUIConfig = sparkUI?.config; + const continuousLoggingArgs = props.continuousLogging?.enabled ? this.setupContinuousLogging(this.role, props.continuousLogging) : {}; + const profilingMetricsArgs = props.enableProfilingMetrics ? { '--enable-metrics': '' } : {}; + const defaultArguments = { ...this.executableArguments(executable), + ...continuousLoggingArgs, + ...profilingMetricsArgs, + ...sparkUI?.args, ...props.defaultArguments, }; - if (props.enableProfilingMetrics) { - defaultArguments['--enable-metrics'] = ''; - } - if (props.sparkUI) { - this.sparkUIConfig = this.setupSparkUI(executable, this.role, defaultArguments, props.sparkUI); - } - if (props.continuousLogging) { - this.setupContinuousLogging(this.role, defaultArguments, props.continuousLogging); - } const jobResource = new CfnJob(this, 'Resource', { name: props.jobName, @@ -676,9 +685,9 @@ export class Job extends JobBase { maxRetries: props.maxRetries, executionProperty: props.maxConcurrentRuns ? { maxConcurrentRuns: props.maxConcurrentRuns } : undefined, notificationProperty: props.notifyDelayAfter ? { notifyDelayAfter: props.notifyDelayAfter.toMinutes() } : undefined, - timeout: props.timeout ? props.timeout.toMinutes() : undefined, + timeout: props.timeout?.toMinutes(), connections: props.connections ? { connections: props.connections.map((connection) => connection.connectionName) } : undefined, - securityConfiguration: props.securityConfiguration ? 
props.securityConfiguration.securityConfigurationName : undefined, + securityConfiguration: props.securityConfiguration?.securityConfigurationName, tags: props.tags, defaultArguments, }); @@ -709,25 +718,32 @@ export class Job extends JobBase { return args; } - private setupSparkUI(executable: JobExecutableConfig, role: iam.IRole, args: {[key: string]: string}, props: SparkUIProps): SparkUIConfig { - if (![JobType.ETL, JobType.STREAMING].includes(executable.type)) { - throw new Error('Spark UI can only be configured for JobType.ETL or JobType.STREAMING jobs'); + private setupSparkUI(executable: JobExecutableConfig, role: iam.IRole, props: SparkUIProps) { + if (JobType.PYTHON_SHELL === executable.type) { + throw new Error('Spark UI is not available for JobType.PYTHON_SHELL jobs'); } - const bucket = props.bucket || new s3.Bucket(this, 'SparkUIBucket'); + const bucket = props.bucket ?? new s3.Bucket(this, 'SparkUIBucket'); bucket.grantReadWrite(role); - args['--enable-spark-ui'] = 'true'; - args['--spark-event-logs-path'] = `s3://${bucket.bucketName}/${props.path || ''}`; + const args = { + '--enable-spark-ui': 'true', + '--spark-event-logs-path': `s3://${bucket.bucketName}/${props.path || ''}`, + }; return { - ...props, - bucket, + config: { + ...props, + bucket, + }, + args, }; } - private setupContinuousLogging(role: iam.IRole, args: {[key: string]: string}, props: ContinuousLoggingProps) { - args['--enable-continuous-cloudwatch-log'] = 'true'; - args['--enable-continuous-log-filter'] = (props.filter?? true).toString(); + private setupContinuousLogging(role: iam.IRole, props: ContinuousLoggingProps) { + const args: {[key: string]: string} = { + '--enable-continuous-cloudwatch-log': 'true', + '--enable-continuous-log-filter': (props.filter ?? 
true).toString(), + }; if (props.logGroup) { args['--continuous-log-logGroup'] = props.logGroup.logGroupName; @@ -740,5 +756,6 @@ export class Job extends JobBase { if (props.conversionPattern) { args['--continuous-log-conversionPattern'] = props.conversionPattern; } + return args; } } diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index 3a3cccf3568cb..c620455d5194a 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -12,7 +12,7 @@ const script = new s3_assets.Asset(stack, 'script', { }); const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { - executable: glue.JobExecutable.etlPython({ + executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, scriptLocation: script.s3ObjectUrl, @@ -21,7 +21,7 @@ const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { script.bucket.grantRead(minimalEtlJob.role); const minimalStreamingJob = new glue.Job(stack, 'MinimalGlueStreamingJob', { - executable: glue.JobExecutable.streamingPython({ + executable: glue.JobExecutable.pythonStreaming({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, scriptLocation: script.s3ObjectUrl, @@ -30,7 +30,7 @@ const minimalStreamingJob = new glue.Job(stack, 'MinimalGlueStreamingJob', { script.bucket.grantRead(minimalStreamingJob.role); const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { - executable: glue.JobExecutable.shellPython({ + executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, scriptLocation: script.s3ObjectUrl, @@ -39,7 +39,7 @@ const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { script.bucket.grantRead(minimalPythonShellJob.role); const etlJob = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlPython({ + executable: 
glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, scriptLocation: script.s3ObjectUrl, diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 507f5971b6eae..6ea7588ffe69c 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -79,7 +79,7 @@ describe('Job', () => { describe('with necessary props only', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlScala({ + executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, @@ -148,7 +148,7 @@ describe('Job', () => { test('with a custom role should use it and set it in CloudFormation', () => { const role = iam.Role.fromRoleArn(stack, 'Role', 'arn:aws:iam::123456789012:role/TestRole'); job = new glue.Job(stack, 'JobWithRole', { - executable: glue.JobExecutable.etlPython({ + executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, scriptLocation, @@ -164,7 +164,7 @@ describe('Job', () => { test('with a custom jobName should set it in CloudFormation', () => { job = new glue.Job(stack, 'JobWithName', { - executable: glue.JobExecutable.streamingScala({ + executable: glue.JobExecutable.scalaStreaming({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, @@ -181,12 +181,12 @@ describe('Job', () => { describe('enabling continuous logging with defaults', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlScala({ + executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, }), - continuousLogging: {}, + continuousLogging: { enabled: true }, }); }); @@ -206,12 +206,13 @@ describe('Job', () => { beforeEach(() => { logGroup = logs.LogGroup.fromLogGroupName(stack, 'LogGroup', 'LogGroupName'); job = new glue.Job(stack, 
'Job', { - executable: glue.JobExecutable.etlScala({ + executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, }), continuousLogging: { + enabled: true, filter: false, logStreamPrefix: 'LogStreamPrefix', conversionPattern: '%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n', @@ -277,12 +278,12 @@ describe('Job', () => { describe('enabling spark ui but no bucket or path provided', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlScala({ + executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, }), - sparkUI: {}, + sparkUI: { enabled: true }, }); }); @@ -368,12 +369,13 @@ describe('Job', () => { bucketName = 'BucketName'; bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlScala({ + executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, }), sparkUI: { + enabled: true, bucket, }, }); @@ -450,12 +452,13 @@ describe('Job', () => { bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); path = 'some/path/'; job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlScala({ + executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, }), sparkUI: { + enabled: true, bucket, path, }, @@ -477,7 +480,7 @@ describe('Job', () => { job = new glue.Job(stack, 'Job', { jobName, description: 'test job', - executable: glue.JobExecutable.streamingPython({ + executable: glue.JobExecutable.pythonStreaming({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, scriptLocation, @@ -550,7 +553,7 @@ describe('Job', () => { test('with minimal props should synthesize correctly', () => { new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.shellPython({ + executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, 
pythonVersion: PythonVersion.THREE, scriptLocation, @@ -569,7 +572,7 @@ describe('Job', () => { test('with unsupported glue version throws', () => { expect(() => new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.shellPython({ + executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V0_9, pythonVersion: PythonVersion.TWO, scriptLocation, @@ -579,18 +582,18 @@ describe('Job', () => { test('with unsupported Spark UI prop throws', () => { expect(() => new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.shellPython({ + executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, scriptLocation, }), - sparkUI: {}, - })).toThrow('Spark UI can only be configured for JobType.ETL or JobType.STREAMING jobs'); + sparkUI: { enabled: true }, + })).toThrow('Spark UI is not available for JobType.PYTHON_SHELL'); }); test('with all props should synthesize correctly', () => { new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.shellPython({ + executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, scriptLocation, @@ -619,7 +622,7 @@ describe('Job', () => { test('with minimal props should synthesize correctly', () => { new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlPython({ + executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, scriptLocation, @@ -648,7 +651,7 @@ describe('Job', () => { test('with all props should synthesize correctly', () => { new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlPython({ + executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, extraJarsFirst: true, @@ -687,7 +690,7 @@ describe('Job', () => { test('with minimal props should synthesize correctly', () => { new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.streamingScala({ + 
executable: glue.JobExecutable.scalaStreaming({ glueVersion: glue.GlueVersion.V2_0, scriptLocation, className, @@ -715,7 +718,7 @@ describe('Job', () => { test('with all props should synthesize correctly', () => { new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.streamingScala({ + executable: glue.JobExecutable.scalaStreaming({ glueVersion: glue.GlueVersion.V2_0, extraJarsFirst: true, className, @@ -751,7 +754,7 @@ describe('Job', () => { describe('event rules and rule-based metrics', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.etlScala({ + executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, scriptLocation, From 7072a6f1dd5c5ab86fbe40a78eb6331bd710c15b Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Tue, 31 Aug 2021 10:57:56 +0100 Subject: [PATCH 24/50] address metric comments --- packages/@aws-cdk/aws-glue/lib/job.ts | 79 +++--- .../aws-glue/test/integ.job.expected.json | 2 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 238 ++---------------- 3 files changed, 55 insertions(+), 264 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 135ec60793059..dce6b359e0fdc 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -137,21 +137,21 @@ export interface IJob extends cdk.IResource { * * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types */ - onSuccess(id?: string, options?: events.OnEventOptions): events.Rule; + onSuccess(id: string, options?: events.OnEventOptions): events.Rule; /** * Defines a CloudWatch event rule triggered when this job moves to the FAILED state. 
* * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types */ - onFailure(id?: string, options?: events.OnEventOptions): events.Rule; + onFailure(id: string, options?: events.OnEventOptions): events.Rule; /** * Defines a CloudWatch event rule triggered when this job moves to the TIMEOUT state. * * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types */ - onTimeout(id?: string, options?: events.OnEventOptions): events.Rule; + onTimeout(id: string, options?: events.OnEventOptions): events.Rule; /** * Create a CloudWatch metric. @@ -244,40 +244,31 @@ abstract class JobBase extends cdk.Resource implements IJob { /** * Return a CloudWatch Event Rule matching JobState.SUCCEEDED. * - * If no id or options are provided, the created rule is cached. Later no-args calls with retrieves from cache but ones with args. - * Later calls with args lead to the creation of a new Rule - * - * @param id optional construct id. default is SUCCEEDEDRule - * @param options optional event options. default is {} + * @param id construct id. + * @param options optional event options. default is {}. */ - public onSuccess(id?: string, options: events.OnEventOptions = {}): events.Rule { - return this.rule(JobState.SUCCEEDED, id, options); + public onSuccess(id: string, options: events.OnEventOptions = {}): events.Rule { + return this.jobStateRule(id, JobState.SUCCEEDED, options); } /** * Return a CloudWatch Event Rule matching FAILED state. * - * If no id or options are provided, the created rule is cached. Later no-args calls with retrieves from cache but ones with args. - * Later calls with args lead to the creation of a new Rule - * - * @param id optional construct id. default is FAILEDRule - * @param options optional event options. default is {} + * @param id construct id. + * @param options optional event options. default is {}. 
*/ - public onFailure(id?: string, options: events.OnEventOptions = {}): events.Rule { - return this.rule(JobState.FAILED, id, options); + public onFailure(id: string, options: events.OnEventOptions = {}): events.Rule { + return this.jobStateRule(id, JobState.FAILED, options); } /** * Return a CloudWatch Event Rule matching TIMEOUT state. * - * If no id or options are provided, the created rule is cached. Later no-args calls with retrieves from cache but ones with args. - * Later calls with args lead to the creation of a new Rule - * - * @param id optional construct id. default is TIMEOUTRule - * @param options optional event options. default is {} + * @param id construct id. + * @param options optional event options. default is {}. */ - public onTimeout(id?: string, options: events.OnEventOptions = {}): events.Rule { - return this.rule(JobState.TIMEOUT, id, options); + public onTimeout(id: string, options: events.OnEventOptions = {}): events.Rule { + return this.jobStateRule(id, JobState.TIMEOUT, options); } /** @@ -308,7 +299,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * This metric is based on the Rule returned by no-args onSuccess() call. */ public metricSuccess(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return JobBase.metricRule(this.onSuccess(), props); + return JobBase.metricRule(this.metricJobStateRule('SuccessMetricRule', JobState.SUCCEEDED), props); } /** @@ -317,7 +308,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * This metric is based on the Rule returned by no-args onFailure() call. */ public metricFailure(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return JobBase.metricRule(this.onFailure(), props); + return JobBase.metricRule(this.metricJobStateRule('FailureMetricRule', JobState.FAILED), props); } /** @@ -326,29 +317,30 @@ abstract class JobBase extends cdk.Resource implements IJob { * This metric is based on the Rule returned by no-args onTimeout() call. 
*/ public metricTimeout(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return JobBase.metricRule(this.onTimeout(), props); + return JobBase.metricRule(this.metricJobStateRule('TimeoutMetricRule', JobState.TIMEOUT), props); } /** - * Creates a new rule for a transition into the input jobState or attempt to create-if-necessary and retrieve the default rule - * - A new rule is created but not cached if the id parameter is specified - * - A create/retrieve from cache scenario happens when no explicit id (and options) are not provided - * The reason is that the default rule is used by onSuccess, onFailure and onTimeout methods which are in turn used by metrics methods. + * Creates or retrieves a singleton event rule for the input job state for use with the metric JobState methods. * - * @param jobState the job state - * @param id optional construct id - * @param options optional event options + * @param id construct id. + * @param jobState the job state. * @private */ - private rule(jobState: JobState, id?: string, options: events.OnEventOptions = {}): events.Rule { - // Caching (for metric methods and default arg-less event methods) - const cachedRuleId = `${jobState}Rule`; - const cachedRule = this.node.tryFindChild(cachedRuleId); - // Use the already created rule if no id is provided (arg-less event methods or events supporting metrics) - if (!id && cachedRule) { - return cachedRule as events.Rule; - } - const rule = this.onEvent(id || cachedRuleId, { + private metricJobStateRule(id: string, jobState: JobState): events.Rule { + return this.node.tryFindChild(id) as events.Rule ?? this.jobStateRule(id, jobState); + } + + /** + * Creates a new rule for a transition into the input jobState. + * + * @param id construct id. + * @param jobState the job state. + * @param options optional event options. 
+ * @private + */ + private jobStateRule(id: string, jobState: JobState, options: events.OnEventOptions = {}): events.Rule { + const rule = this.onEvent(id, { description: `Rule triggered when Glue job ${this.jobName} is in ${jobState} state`, ...options, }); @@ -359,7 +351,6 @@ abstract class JobBase extends cdk.Resource implements IJob { }); return rule; } - } /** diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json index a4f289ebabaf1..985721eac07c3 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -609,7 +609,7 @@ "WorkerType": "G.2X" } }, - "JobSUCCEEDEDRule682F039B": { + "JobSuccessMetricRule80747C33": { "Type": "AWS::Events::Rule", "Properties": { "Description": { diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 6ea7588ffe69c..58cc9fe1e3643 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -1,5 +1,6 @@ import * as cdkassert from '@aws-cdk/assert-internal'; import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; +import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; import * as logs from '@aws-cdk/aws-logs'; import * as s3 from '@aws-cdk/aws-s3'; @@ -787,12 +788,9 @@ describe('Job', () => { }); describe('.onSuccess()', () => { - test('with no-args and multiple calls should create one resource and cache it', () => { - const firstInvocationRule = job.onSuccess(); - const subsequentInvocationRule = job.onSuccess(); + test('should create a rule with correct properties', () => { + job.onSuccess('SuccessRule'); - expect(subsequentInvocationRule).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 1); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ @@ -828,81 +826,12 @@ 
describe('Job', () => { State: 'ENABLED', })); }); - - test('with args should ignore the cached rule and return a new one', () => { - const firstInvocationRule = job.onSuccess(); - const subsequentInvocationRuleWithNoArgs = job.onSuccess(); - job.onSuccess('noCache', { description: 'description override' }); - - expect(subsequentInvocationRuleWithNoArgs).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 2); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { - Description: { - 'Fn::Join': [ - '', - [ - 'Rule triggered when Glue job ', - { - Ref: 'JobB9D00F9F', - }, - ' is in SUCCEEDED state', - ], - ], - }, - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'SUCCEEDED', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, - }, - State: 'ENABLED', - })); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { - Description: 'description override', - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'SUCCEEDED', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, - }, - State: 'ENABLED', - })); - }); }); describe('.onFailure()', () => { - test('with no-args and multiple calls should create one resource and cache it', () => { - const firstInvocationRule = job.onFailure(); - const subsequentInvocationRule = job.onFailure(); + test('should create a rule with correct properties', () => { + job.onFailure('FailureRule'); - expect(subsequentInvocationRule).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 1); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ @@ -938,124 +867,12 @@ describe('Job', () => { State: 'ENABLED', })); }); - - test('with args should ignore the cached rule and return a new one', () => { - 
const firstInvocationRule = job.onFailure(); - const subsequentInvocationRuleWithNoArgs = job.onFailure(); - job.onFailure('noCache', { description: 'description override' }); - - expect(subsequentInvocationRuleWithNoArgs).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 2); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { - Description: { - 'Fn::Join': [ - '', - [ - 'Rule triggered when Glue job ', - { - Ref: 'JobB9D00F9F', - }, - ' is in FAILED state', - ], - ], - }, - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'FAILED', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, - }, - State: 'ENABLED', - })); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { - Description: 'description override', - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'FAILED', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, - }, - State: 'ENABLED', - })); - }); }); describe('.onTimeout()', () => { - test('with no-args and multiple calls should create one resource and cache it', () => { - const firstInvocationRule = job.onTimeout(); - const subsequentInvocationRule = job.onTimeout(); - - expect(subsequentInvocationRule).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 1); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { - Description: { - 'Fn::Join': [ - '', - [ - 'Rule triggered when Glue job ', - { - Ref: 'JobB9D00F9F', - }, - ' is in TIMEOUT state', - ], - ], - }, - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'TIMEOUT', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, - }, - State: 'ENABLED', - })); - }); - - 
test('with args should ignore the cached rule and return a new one', () => { - const firstInvocationRule = job.onTimeout(); - job.onTimeout('noCache', { description: 'description override' }); - const subsequentInvocationRuleWithNoArgs = job.onTimeout(); + test('should create a rule with correct properties', () => { + job.onTimeout('TimeoutRule'); - expect(subsequentInvocationRuleWithNoArgs).toEqual(firstInvocationRule); - cdkassert.countResources('AWS::Events::Rule', 2); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ @@ -1090,44 +907,23 @@ describe('Job', () => { }, State: 'ENABLED', })); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { - Description: 'description override', - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'TIMEOUT', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, - }, - State: 'ENABLED', - })); }); }); - test('.metricSuccess() creates the expected event rule and corresponding metric', () => { + test('.metricSuccess() creates the expected singleton event rule and corresponding metric', () => { const metric = job.metricSuccess(); + job.metricSuccess(); expect(metric).toEqual(new cloudwatch.Metric({ dimensions: { - RuleName: job.onSuccess().ruleName, + RuleName: (job.node.findChild('SuccessMetricRule') as events.Rule).ruleName, }, metricName: 'TriggeredRules', namespace: 'AWS/Events', statistic: 'Sum', })); + cdkassert.countResources('AWS::Events::Rule', 1); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ @@ -1164,18 +960,20 @@ describe('Job', () => { })); }); - test('.metricFailure() creates the expected event rule and corresponding metric', () => { + test('.metricFailure() creates the expected singleton event rule and corresponding metric', () => { const metric = job.metricFailure(); + 
job.metricFailure(); expect(metric).toEqual(new cloudwatch.Metric({ dimensions: { - RuleName: job.onFailure().ruleName, + RuleName: (job.node.findChild('FailureMetricRule') as events.Rule).ruleName, }, metricName: 'TriggeredRules', namespace: 'AWS/Events', statistic: 'Sum', })); + cdkassert.countResources('AWS::Events::Rule', 1); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ @@ -1212,18 +1010,20 @@ describe('Job', () => { })); }); - test('.metricTimeout() creates the expected event rule and corresponding metric', () => { + test('.metricTimeout() creates the expected singleton event rule and corresponding metric', () => { const metric = job.metricTimeout(); + job.metricTimeout(); expect(metric).toEqual(new cloudwatch.Metric({ dimensions: { - RuleName: job.onTimeout().ruleName, + RuleName: (job.node.findChild('TimeoutMetricRule') as events.Rule).ruleName, }, metricName: 'TriggeredRules', namespace: 'AWS/Events', statistic: 'Sum', })); + cdkassert.countResources('AWS::Events::Rule', 1); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { Description: { 'Fn::Join': [ From e860f4d31c7abd624c5e88663ddaaa798f9d9595 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Tue, 31 Aug 2021 12:32:34 +0100 Subject: [PATCH 25/50] take 1 at glue.Code (not fully tested) --- packages/@aws-cdk/aws-glue/lib/code.ts | 116 ++++++++ packages/@aws-cdk/aws-glue/lib/index.ts | 1 + .../@aws-cdk/aws-glue/lib/job-executable.ts | 47 ++-- packages/@aws-cdk/aws-glue/lib/job.ts | 17 +- packages/@aws-cdk/aws-glue/package.json | 4 + .../aws-glue/test/integ.job.expected.json | 256 +----------------- packages/@aws-cdk/aws-glue/test/integ.job.ts | 23 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 68 ++--- 8 files changed, 209 insertions(+), 323 deletions(-) create mode 100644 packages/@aws-cdk/aws-glue/lib/code.ts diff --git a/packages/@aws-cdk/aws-glue/lib/code.ts b/packages/@aws-cdk/aws-glue/lib/code.ts new file mode 100644
index 0000000000000..0d589b729d8a9 --- /dev/null +++ b/packages/@aws-cdk/aws-glue/lib/code.ts @@ -0,0 +1,116 @@ +import * as crypto from 'crypto'; +import * as fs from 'fs'; +import * as s3 from '@aws-cdk/aws-s3'; +import * as s3_assets from '@aws-cdk/aws-s3-assets'; +import * as cdk from '@aws-cdk/core'; +import * as constructs from 'constructs'; + +/** + * Represents a Glue Job's Code assets (an asset can be a scripts, a jar, a python file or any other file). + */ +export abstract class Code { + + /** + * Job code as an S3 object. + * @param bucket The S3 bucket + * @param key The object key + */ + public static fromBucket(bucket: s3.IBucket, key: string): S3Code { + return new S3Code(bucket, key); + } + + /** + * Job code from a local disk path. + * + * @param path code file (not a directory). + */ + public static fromAsset(path: string, options?: s3_assets.AssetOptions): AssetCode { + return new AssetCode(path, options); + } + + /** + * Called when the Job is initialized to allow this object to bind. + */ + public abstract bind(_scope: constructs.Construct): CodeConfig; +} + +/** + * Glue job Code from an S3 bucket. + */ +export class S3Code extends Code { + private bucketName: string; + + constructor(bucket: s3.IBucket, private key: string) { + super(); + + if (!bucket.bucketName) { + throw new Error('bucketName is undefined for the provided bucket'); + } + this.bucketName = bucket.bucketName; + } + + public bind(_scope: constructs.Construct): CodeConfig { + return { + s3Location: { + bucketName: this.bucketName, + objectKey: this.key, + }, + }; + } +} + +/** + * Job Code from a local file. + */ +export class AssetCode extends Code { + private asset?: s3_assets.Asset; + + /** + * @param path The path to the Code file. 
+ */ + constructor(public readonly path: string, private readonly options: s3_assets.AssetOptions = { }) { + super(); + } + + public bind(scope: constructs.Construct): CodeConfig { + if (fs.lstatSync(this.path).isDirectory()) { + throw new Error(`Code path ${this.path} is a directory. Only files are supported.`); + } + // If the same AssetCode is used multiple times, retain only the first instantiation. + if (!this.asset) { + this.asset = new s3_assets.Asset(scope, `Code${this.hashcode(this.path)}`, { + path: this.path, + ...this.options, + }); + } else if (cdk.Stack.of(this.asset) !== cdk.Stack.of(scope)) { + throw new Error(`Asset is already associated with another stack '${cdk.Stack.of(this.asset).stackName}'. ` + + 'Create a new Code instance for every stack.'); + } + + return { + s3Location: { + bucketName: this.asset.s3BucketName, + objectKey: this.asset.s3ObjectKey, + }, + }; + } + + /** + * Hash a string + */ + private hashcode(s: string): string { + const hash = crypto.createHash('md5'); + hash.update(s); + return hash.digest('hex'); + }; +} + +/** + * Result of binding `Code` into a `Job`. + */ +export interface CodeConfig { + /** + * The location of the code in S3. 
+ */ + readonly s3Location: s3.Location; +} \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/lib/index.ts b/packages/@aws-cdk/aws-glue/lib/index.ts index d0e13085804ae..d1da5e9385349 100644 --- a/packages/@aws-cdk/aws-glue/lib/index.ts +++ b/packages/@aws-cdk/aws-glue/lib/index.ts @@ -6,6 +6,7 @@ export * from './data-format'; export * from './database'; export * from './job'; export * from './job-executable'; +export * from './code'; export * from './schema'; export * from './security-configuration'; export * from './table'; \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index 3032e50d23e86..84cedc91ad9f5 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -1,3 +1,5 @@ +import { Code } from './code'; + /** * AWS Glue version determines the versions of Apache Spark and Python that are available to the job. * @@ -124,31 +126,29 @@ interface SharedJobExecutableProps { readonly glueVersion: GlueVersion; /** - * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + * The script that executes a job. */ - readonly scriptLocation: string; + readonly script: Code; /** - * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. - * Only individual files are supported, not a directory path. + * Additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. 
* * @default - no extra files and argument is not set * * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly extraFiles?: string[]; + readonly extraFiles?: Code[]; } interface SharedSparkJobExecutableProps extends SharedJobExecutableProps { /** - * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. - * Only individual files are supported, not a directory path. + * Additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. * * @default - no extra jars and argument is not set * * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly extraJars?: string[]; + readonly extraJars?: Code[]; /** * Setting this value to true prioritizes the customer's extra JAR files in the classpath. @@ -191,9 +191,9 @@ export interface JobExecutableConfig { readonly pythonVersion?: PythonVersion; /** - * Specifies the Amazon Simple Storage Service (Amazon S3) path to a script that executes a job. + * The script that executes a job. */ - readonly scriptLocation: string; + readonly script: Code; /** * The Scala class that serves as the entry point for the job. This applies only if your the job langauage is Scala. @@ -205,34 +205,31 @@ export interface JobExecutableConfig { readonly className?: string; /** - * The Amazon S3 paths to additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. - * Only individual files are supported, not a directory path. + * Additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. * * @default - no extra jars specified. 
* * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly extraJars?: string[]; + readonly extraJars?: Code[]; /** - * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. - * Only individual files are supported, not a directory path. + * Additional Python files that AWS Glue adds to the Python path before executing your script. * * @default - no extra python files specified. * * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly extraPythonFiles?: string[]; + readonly extraPythonFiles?: Code[]; /** - * The Amazon S3 paths to additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. - * Only individual files are supported, not a directory path. + * Additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. * * @default - no extra files specified. * * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly extraFiles?: string[]; + readonly extraFiles?: Code[]; /** * Setting this value to true prioritizes the customer's extra JAR files in the classpath. @@ -267,14 +264,13 @@ export interface PythonJobExecutableProps extends SharedSparkJobExecutableProps readonly pythonVersion: PythonVersion; /** - * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. - * Only individual files are supported, not a directory path. + * Additional Python files that AWS Glue adds to the Python path before executing your script. 
* * @default - no extra python files and argument is not set * * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly extraPythonFiles?: string[]; + readonly extraPythonFiles?: Code[]; /** * Setting this value to true prioritizes the customer's extra JAR files in the classpath. @@ -296,20 +292,17 @@ export interface PythonShellExecutableProps extends SharedJobExecutableProps { readonly pythonVersion: PythonVersion; /** - * The Amazon S3 paths to additional Python modules that AWS Glue adds to the Python path before executing your script. - * Only individual files are supported, not a directory path. + * Additional Python files that AWS Glue adds to the Python path before executing your script. * * @default - no extra python files and argument is not set * * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - readonly extraPythonFiles?: string[]; + readonly extraPythonFiles?: Code[]; } /** * The executable properties related to the Glue job's GlueVersion, JobType and code - * - * TODO test for exceptions */ export class JobExecutable { diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index dce6b359e0fdc..f033448b9d500 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -5,7 +5,7 @@ import * as logs from '@aws-cdk/aws-logs'; import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; -import { JobExecutable, JobExecutableConfig, JobType } from '.'; +import { Code, JobExecutable, JobExecutableConfig, JobType } from '.'; import { IConnection } from './connection'; import { CfnJob } from './glue.generated'; import { ISecurityConfiguration } from './security-configuration'; @@ -666,7 +666,7 @@ export class Job extends JobBase { role: this.role.roleArn, command: { name: 
executable.type.name, - scriptLocation: executable.scriptLocation, + scriptLocation: this.codeS3ObjectUrl(executable.script), pythonVersion: executable.pythonVersion, }, glueVersion: executable.glueVersion.name, @@ -694,14 +694,14 @@ export class Job extends JobBase { if (config.className) { args['--class'] = config.className; } - if (config.extraJars && config.extraJars.length > 0) { - args['--extra-jars'] = config.extraJars.join(','); + if (config.extraJars && config.extraJars?.length > 0) { + args['--extra-jars'] = config.extraJars.map(code => this.codeS3ObjectUrl(code)).join(','); } if (config.extraPythonFiles && config.extraPythonFiles.length > 0) { - args['--extra-py-files'] = config.extraPythonFiles.join(','); + args['--extra-py-files'] = config.extraPythonFiles.map(code => this.codeS3ObjectUrl(code)).join(','); } if (config.extraFiles && config.extraFiles.length > 0) { - args['--extra-files'] = config.extraFiles.join(','); + args['--extra-files'] = config.extraFiles.map(code => this.codeS3ObjectUrl(code)).join(','); } if (config.extraJarsFirst) { args['--user-jars-first'] = 'true'; @@ -749,4 +749,9 @@ export class Job extends JobBase { } return args; } + + private codeS3ObjectUrl(code: Code): string { + const s3Location = code.bind(this).s3Location; + return `s3://${s3Location.bucketName}/${s3Location.objectKey}`; + } } diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index 67aedf47b6100..38223678bd6ca 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -86,6 +86,7 @@ "pkglint": "0.0.0" }, "dependencies": { + "@aws-cdk/assets": "0.0.0", "@aws-cdk/aws-cloudwatch": "0.0.0", "@aws-cdk/aws-events": "0.0.0", "@aws-cdk/aws-ec2": "0.0.0", @@ -93,11 +94,13 @@ "@aws-cdk/aws-logs": "0.0.0", "@aws-cdk/aws-kms": "0.0.0", "@aws-cdk/aws-s3": "0.0.0", + "@aws-cdk/aws-s3-assets": "0.0.0", "@aws-cdk/core": "0.0.0", "constructs": "^3.3.69" }, "homepage": 
"https://github.com/aws/aws-cdk", "peerDependencies": { + "@aws-cdk/assets": "0.0.0", "@aws-cdk/aws-cloudwatch": "0.0.0", "@aws-cdk/aws-events": "0.0.0", "@aws-cdk/aws-ec2": "0.0.0", @@ -105,6 +108,7 @@ "@aws-cdk/aws-logs": "0.0.0", "@aws-cdk/aws-kms": "0.0.0", "@aws-cdk/aws-s3": "0.0.0", + "@aws-cdk/aws-s3-assets": "0.0.0", "@aws-cdk/core": "0.0.0", "constructs": "^3.3.69" }, diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json index 985721eac07c3..92d724caabae5 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -1,18 +1,4 @@ { - "Parameters": { - "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8": { - "Type": "String", - "Description": "S3 bucket for asset \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" - }, - "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377": { - "Type": "String", - "Description": "S3 key for asset version \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" - }, - "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bArtifactHashB9AA8E72": { - "Type": "String", - "Description": "Artifact hash for asset \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" - } - }, "Resources": { "MinimalGlueEtlJobServiceRole60989380": { "Type": "AWS::IAM::Role", @@ -45,63 +31,6 @@ ] } }, - "MinimalGlueEtlJobServiceRoleDefaultPolicyEDA57791": { - "Type": "AWS::IAM::Policy", - "Properties": { - "PolicyDocument": { - "Statement": [ - { - "Action": [ - "s3:GetObject*", - "s3:GetBucket*", - "s3:List*" - ], - "Effect": "Allow", - "Resource": [ - { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":s3:::", - { - "Ref": 
"AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - } - ] - ] - }, - { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":s3:::", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - }, - "/*" - ] - ] - } - ] - } - ], - "Version": "2012-10-17" - }, - "PolicyName": "MinimalGlueEtlJobServiceRoleDefaultPolicyEDA57791", - "Roles": [ - { - "Ref": "MinimalGlueEtlJobServiceRole60989380" - } - ] - } - }, "MinimalGlueEtlJobF8C90254": { "Type": "AWS::Glue::Job", "Properties": { @@ -190,63 +119,6 @@ ] } }, - "MinimalGlueStreamingJobServiceRoleDefaultPolicyCA892591": { - "Type": "AWS::IAM::Policy", - "Properties": { - "PolicyDocument": { - "Statement": [ - { - "Action": [ - "s3:GetObject*", - "s3:GetBucket*", - "s3:List*" - ], - "Effect": "Allow", - "Resource": [ - { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":s3:::", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - } - ] - ] - }, - { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":s3:::", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - }, - "/*" - ] - ] - } - ] - } - ], - "Version": "2012-10-17" - }, - "PolicyName": "MinimalGlueStreamingJobServiceRoleDefaultPolicyCA892591", - "Roles": [ - { - "Ref": "MinimalGlueStreamingJobServiceRole77973DB5" - } - ] - } - }, "MinimalGlueStreamingJobC58FD856": { "Type": "AWS::Glue::Job", "Properties": { @@ -335,63 +207,6 @@ ] } }, - "MinimalPythonShellJobServiceRoleDefaultPolicy0FFC6CE9": { - "Type": "AWS::IAM::Policy", - "Properties": { - "PolicyDocument": { - "Statement": [ - { - "Action": [ - "s3:GetObject*", - "s3:GetBucket*", - "s3:List*" - ], - "Effect": "Allow", - "Resource": [ - { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":s3:::", - { - 
"Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - } - ] - ] - }, - { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":s3:::", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - }, - "/*" - ] - ] - } - ] - } - ], - "Version": "2012-10-17" - }, - "PolicyName": "MinimalPythonShellJobServiceRoleDefaultPolicy0FFC6CE9", - "Roles": [ - { - "Ref": "MinimalPythonShellJobServiceRole4944649D" - } - ] - } - }, "MinimalPythonShellJob43B4A269": { "Type": "AWS::Glue::Job", "Properties": { @@ -480,63 +295,6 @@ ] } }, - "JobServiceRoleDefaultPolicy03F68F9D": { - "Type": "AWS::IAM::Policy", - "Properties": { - "PolicyDocument": { - "Statement": [ - { - "Action": [ - "s3:GetObject*", - "s3:GetBucket*", - "s3:List*" - ], - "Effect": "Allow", - "Resource": [ - { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":s3:::", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - } - ] - ] - }, - { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":s3:::", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - }, - "/*" - ] - ] - } - ] - } - ], - "Version": "2012-10-17" - }, - "PolicyName": "JobServiceRoleDefaultPolicy03F68F9D", - "Roles": [ - { - "Ref": "JobServiceRole4F432993" - } - ] - } - }, "JobB9D00F9F": { "Type": "AWS::Glue::Job", "Properties": { @@ -646,5 +404,19 @@ "State": "ENABLED" } } + }, + "Parameters": { + "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8": { + "Type": "String", + "Description": "S3 bucket for asset \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + }, + "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377": { + "Type": "String", + 
"Description": "S3 key for asset version \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + }, + "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bArtifactHashB9AA8E72": { + "Type": "String", + "Description": "Artifact hash for asset \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + } } } \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index c620455d5194a..0b607b66117dc 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -1,5 +1,4 @@ import * as path from 'path'; -import * as s3_assets from '@aws-cdk/aws-s3-assets'; import * as cdk from '@aws-cdk/core'; import * as glue from '../lib'; @@ -7,42 +6,37 @@ const app = new cdk.App(); const stack = new cdk.Stack(app, 'aws-glue-job'); -const script = new s3_assets.Asset(stack, 'script', { - path: path.join(__dirname, 'job-script/hello_world.py'), -}); +const script = glue.Code.fromAsset(path.join(__dirname, 'job-script/hello_world.py')); -const minimalEtlJob = new glue.Job(stack, 'MinimalGlueEtlJob', { +new glue.Job(stack, 'MinimalGlueEtlJob', { executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, - scriptLocation: script.s3ObjectUrl, + script, }), }); -script.bucket.grantRead(minimalEtlJob.role); -const minimalStreamingJob = new glue.Job(stack, 'MinimalGlueStreamingJob', { +new glue.Job(stack, 'MinimalGlueStreamingJob', { executable: glue.JobExecutable.pythonStreaming({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, - scriptLocation: script.s3ObjectUrl, + script, }), }); -script.bucket.grantRead(minimalStreamingJob.role); -const minimalPythonShellJob = new glue.Job(stack, 'MinimalPythonShellJob', { +new glue.Job(stack, 'MinimalPythonShellJob', { executable: glue.JobExecutable.pythonShell({ glueVersion: 
glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, - scriptLocation: script.s3ObjectUrl, + script, }), }); -script.bucket.grantRead(minimalPythonShellJob.role); const etlJob = new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, - scriptLocation: script.s3ObjectUrl, + script, }), workerType: glue.WorkerType.G_2X, numberOfWorkers: 10, @@ -58,7 +52,6 @@ const etlJob = new glue.Job(stack, 'Job', { key: 'value', }, }); -script.bucket.grantRead(etlJob.role); etlJob.metricSuccess(); diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 58cc9fe1e3643..d3372e29f7d75 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -62,19 +62,21 @@ describe('Job', () => { }); describe('new', () => { - let scriptLocation: string; - let extraJars: string[]; - let extraFiles: string[]; - let extraPythonFiles: string[]; + let codeBucket: s3.IBucket; + let script: glue.Code; + let extraJars: glue.Code[]; + let extraFiles: glue.Code[]; + let extraPythonFiles: glue.Code[]; let className: string; let job: glue.Job; beforeEach(() => { - scriptLocation = 's3://bucketName/script'; + codeBucket = s3.Bucket.fromBucketName(stack, 'CodeBucket', 'bucketName'); + script = glue.Code.fromBucket(codeBucket, 'script'); className = 'com.amazon.test.ClassName'; - extraJars = ['s3://bucketName/file1.jar', 's3://bucketName/file2.jar']; - extraPythonFiles = ['s3://bucketName/file1.py', 's3://bucketName/file2.py']; - extraFiles = ['s3://bucketName/file1.txt', 's3://bucketName/file2.txt']; + extraJars = [glue.Code.fromBucket(codeBucket, 'file1.jar'), glue.Code.fromBucket(codeBucket, 'file2.jar')]; + extraPythonFiles = [glue.Code.fromBucket(codeBucket, 'file1.py'), glue.Code.fromBucket(codeBucket, 'file2.py')]; + extraFiles = [glue.Code.fromBucket(codeBucket, 'file1.txt'), 
glue.Code.fromBucket(codeBucket, 'file2.txt')]; }); describe('with necessary props only', () => { @@ -83,7 +85,7 @@ describe('Job', () => { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, - scriptLocation, + script, }), }); }); @@ -124,7 +126,7 @@ describe('Job', () => { cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { Command: { Name: 'glueetl', - ScriptLocation: scriptLocation, + ScriptLocation: 's3://bucketName/script', }, Role: { 'Fn::GetAtt': [ @@ -152,7 +154,7 @@ describe('Job', () => { executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, - scriptLocation, + script, }), role, }); @@ -168,7 +170,7 @@ describe('Job', () => { executable: glue.JobExecutable.scalaStreaming({ glueVersion: glue.GlueVersion.V2_0, className, - scriptLocation, + script, }), jobName, }); @@ -185,7 +187,7 @@ describe('Job', () => { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, - scriptLocation, + script, }), continuousLogging: { enabled: true }, }); @@ -210,7 +212,7 @@ describe('Job', () => { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, - scriptLocation, + script, }), continuousLogging: { enabled: true, @@ -282,7 +284,7 @@ describe('Job', () => { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, - scriptLocation, + script, }), sparkUI: { enabled: true }, }); @@ -373,7 +375,7 @@ describe('Job', () => { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, - scriptLocation, + script, }), sparkUI: { enabled: true, @@ -456,7 +458,7 @@ describe('Job', () => { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, - scriptLocation, + script, }), sparkUI: { enabled: true, @@ -484,7 +486,7 @@ describe('Job', () => { executable: glue.JobExecutable.pythonStreaming({ glueVersion: 
glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, - scriptLocation, + script, }), workerType: glue.WorkerType.G_2X, numberOfWorkers: 10, @@ -557,14 +559,14 @@ describe('Job', () => { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, - scriptLocation, + script, }), }); cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { Command: { Name: 'pythonshell', - ScriptLocation: scriptLocation, + ScriptLocation: 's3://bucketName/script', PythonVersion: '3', }, GlueVersion: '2.0', @@ -576,7 +578,7 @@ describe('Job', () => { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V0_9, pythonVersion: PythonVersion.TWO, - scriptLocation, + script, }), })).toThrow('Specified GlueVersion 0.9 does not support Python Shell'); }); @@ -586,7 +588,7 @@ describe('Job', () => { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, - scriptLocation, + script, }), sparkUI: { enabled: true }, })).toThrow('Spark UI is not available for JobType.PYTHON_SHELL'); @@ -597,7 +599,7 @@ describe('Job', () => { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, - scriptLocation, + script, extraPythonFiles, extraFiles, }), @@ -606,7 +608,7 @@ describe('Job', () => { cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { Command: { Name: 'pythonshell', - ScriptLocation: scriptLocation, + ScriptLocation: 's3://bucketName/script', PythonVersion: '3', }, GlueVersion: '2.0', @@ -626,7 +628,7 @@ describe('Job', () => { executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, - scriptLocation, + script, }), }); @@ -635,7 +637,7 @@ describe('Job', () => { GlueVersion: '2.0', Command: { Name: 'glueetl', - ScriptLocation: scriptLocation, + ScriptLocation: 's3://bucketName/script', PythonVersion: '3', 
}, Role: { @@ -656,7 +658,7 @@ describe('Job', () => { glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.THREE, extraJarsFirst: true, - scriptLocation, + script, extraPythonFiles, extraJars, extraFiles, @@ -667,7 +669,7 @@ describe('Job', () => { GlueVersion: '2.0', Command: { Name: 'glueetl', - ScriptLocation: scriptLocation, + ScriptLocation: 's3://bucketName/script', PythonVersion: '3', }, Role: { @@ -693,7 +695,7 @@ describe('Job', () => { new glue.Job(stack, 'Job', { executable: glue.JobExecutable.scalaStreaming({ glueVersion: glue.GlueVersion.V2_0, - scriptLocation, + script, className, }), }); @@ -702,7 +704,7 @@ describe('Job', () => { GlueVersion: '2.0', Command: { Name: 'gluestreaming', - ScriptLocation: scriptLocation, + ScriptLocation: 's3://bucketName/script', }, Role: { 'Fn::GetAtt': [ @@ -723,7 +725,7 @@ describe('Job', () => { glueVersion: glue.GlueVersion.V2_0, extraJarsFirst: true, className, - scriptLocation, + script, extraJars, extraFiles, }), @@ -733,7 +735,7 @@ describe('Job', () => { GlueVersion: '2.0', Command: { Name: 'gluestreaming', - ScriptLocation: scriptLocation, + ScriptLocation: 's3://bucketName/script', }, Role: { 'Fn::GetAtt': [ @@ -758,7 +760,7 @@ describe('Job', () => { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, className, - scriptLocation, + script, }), }); }); From 1068229d5f5b0c2f6f6422882f938d991e488aaa Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Tue, 31 Aug 2021 13:55:24 +0100 Subject: [PATCH 26/50] test glue.Code --- packages/@aws-cdk/aws-glue/lib/code.ts | 13 +---- packages/@aws-cdk/aws-glue/test/code.test.ts | 61 ++++++++++++++++++++ 2 files changed, 64 insertions(+), 10 deletions(-) create mode 100644 packages/@aws-cdk/aws-glue/test/code.test.ts diff --git a/packages/@aws-cdk/aws-glue/lib/code.ts b/packages/@aws-cdk/aws-glue/lib/code.ts index 0d589b729d8a9..fb33663b9be1a 100644 --- a/packages/@aws-cdk/aws-glue/lib/code.ts +++ b/packages/@aws-cdk/aws-glue/lib/code.ts 
@@ -38,21 +38,14 @@ export abstract class Code { * Glue job Code from an S3 bucket. */ export class S3Code extends Code { - private bucketName: string; - - constructor(bucket: s3.IBucket, private key: string) { + constructor(private bucket: s3.IBucket, private key: string) { super(); - - if (!bucket.bucketName) { - throw new Error('bucketName is undefined for the provided bucket'); - } - this.bucketName = bucket.bucketName; } public bind(_scope: constructs.Construct): CodeConfig { return { s3Location: { - bucketName: this.bucketName, + bucketName: this.bucket.bucketName, objectKey: this.key, }, }; @@ -74,7 +67,7 @@ export class AssetCode extends Code { public bind(scope: constructs.Construct): CodeConfig { if (fs.lstatSync(this.path).isDirectory()) { - throw new Error(`Code path ${this.path} is a directory. Only files are supported.`); + throw new Error(`Code path ${this.path} is a directory. Only files are supported`); } // If the same AssetCode is used multiple times, retain only the first instantiation. 
if (!this.asset) { diff --git a/packages/@aws-cdk/aws-glue/test/code.test.ts b/packages/@aws-cdk/aws-glue/test/code.test.ts new file mode 100644 index 0000000000000..1874a141fb9ea --- /dev/null +++ b/packages/@aws-cdk/aws-glue/test/code.test.ts @@ -0,0 +1,61 @@ +import * as path from 'path'; +import * as s3 from '@aws-cdk/aws-s3'; +import * as cdk from '@aws-cdk/core'; +import { Code } from '../lib'; + +describe('Code', () => { + let stack: cdk.Stack; + + beforeEach(() => { + stack = new cdk.Stack(); + }); + + describe('.fromBucket()', () => { + let bucket: s3.IBucket; + let key: string; + + beforeEach(() => { + bucket = s3.Bucket.fromBucketName(stack, 'Bucket', 'bucketName'); + key = 'script'; + }); + + test('with valid bucket name and key and calling bind() returns correct s3 location', () => { + expect(Code.fromBucket(bucket, key).bind(stack)).toEqual({ + s3Location: { + bucketName: 'bucketName', + objectKey: 'script', + }, + }); + }); + }); + + describe('.fromAsset()', () => { + let filePath: string; + let directoryPath: string; + + beforeEach(() => { + filePath = path.join(__dirname, 'job-script/hello_world.py'); + directoryPath = path.join(__dirname, 'job-script'); + }); + + test('with valid and existing file path and calling bind() returns an s3 location', () => { + const codeConfig = Code.fromAsset(filePath).bind(stack); + expect(codeConfig.s3Location.bucketName).toBeDefined(); + expect(codeConfig.s3Location.objectKey).toBeDefined(); + }); + + test('with an unsupported directory path and calling bind() throws', () => { + expect(() => Code.fromAsset(directoryPath).bind(stack)) + .toThrow(/Only files are supported/); + }); + + test('throws if bound with another stack', () => { + const stack2 = new cdk.Stack(); + const asset = Code.fromAsset(filePath); + asset.bind(stack); + + expect(() => asset.bind(stack2)) + .toThrow(/associated with another stack/); + }); + }); +}); \ No newline at end of file From 9f5f85acd29d7b4b8631238df30e1ab7474ddf3e Mon Sep 17 
00:00:00 2001 From: Ahmed Kamel Date: Tue, 31 Aug 2021 22:19:26 +0100 Subject: [PATCH 27/50] address some comments Co-authored-by: Ben Chaimberg --- packages/@aws-cdk/aws-glue/lib/code.ts | 8 +-- .../@aws-cdk/aws-glue/lib/job-executable.ts | 10 ++-- packages/@aws-cdk/aws-glue/lib/job.ts | 54 +++++++++---------- 3 files changed, 36 insertions(+), 36 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/code.ts b/packages/@aws-cdk/aws-glue/lib/code.ts index fb33663b9be1a..c6c757c936a62 100644 --- a/packages/@aws-cdk/aws-glue/lib/code.ts +++ b/packages/@aws-cdk/aws-glue/lib/code.ts @@ -1,7 +1,7 @@ import * as crypto from 'crypto'; import * as fs from 'fs'; import * as s3 from '@aws-cdk/aws-s3'; -import * as s3_assets from '@aws-cdk/aws-s3-assets'; +import * as s3assets from '@aws-cdk/aws-s3-assets'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; @@ -31,14 +31,14 @@ export abstract class Code { /** * Called when the Job is initialized to allow this object to bind. */ - public abstract bind(_scope: constructs.Construct): CodeConfig; + public abstract bind(scope: constructs.Construct): CodeConfig; } /** * Glue job Code from an S3 bucket. */ export class S3Code extends Code { - constructor(private bucket: s3.IBucket, private key: string) { + constructor(private readonly bucket: s3.IBucket, private readonly key: string) { super(); } @@ -61,7 +61,7 @@ export class AssetCode extends Code { /** * @param path The path to the Code file. 
*/ - constructor(public readonly path: string, private readonly options: s3_assets.AssetOptions = { }) { + constructor(private readonly path: string, private readonly options: s3_assets.AssetOptions = { }) { super(); } diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index 84cedc91ad9f5..a136ede5ba219 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -133,7 +133,7 @@ interface SharedJobExecutableProps { /** * Additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. * - * @default - no extra files and argument is not set + * @default [] - no extra files are copied to the working directory * * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ @@ -144,7 +144,7 @@ interface SharedSparkJobExecutableProps extends SharedJobExecutableProps { /** * Additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. * - * @default - no extra jars and argument is not set + * @default [] - no extra jars are added to the classpath * * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ @@ -153,7 +153,7 @@ interface SharedSparkJobExecutableProps extends SharedJobExecutableProps { /** * Setting this value to true prioritizes the customer's extra JAR files in the classpath. * - * @default - priortiy is not given to extra jars and argument is not set + * @default false - priority is not given to user-provided jars * * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ @@ -174,7 +174,7 @@ export interface JobExecutableConfig { /** * The language of the job (Scala or Python). 
* - * @see `--job-languae` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + * @see `--job-language` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ readonly language: JobLanguage; @@ -191,7 +191,7 @@ export interface JobExecutableConfig { readonly pythonVersion?: PythonVersion; /** - * The script that executes a job. + * The script that is executed by a job. */ readonly script: Code; diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index f033448b9d500..6f3cfae8b64c6 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -375,7 +375,7 @@ export interface SparkUIProps { /** * The path inside the bucket (objects prefix) where the Glue job stores the logs. * - * @default - no path will be used, the logs will be written at the root of the bucket. + * @default '/' - the logs will be written at the root of the bucket */ readonly path?: string; } @@ -415,19 +415,19 @@ export interface ContinuousLoggingProps { /** * Specify a custom CloudWatch log group name. * - * @default LogGroup named `/aws-glue/jobs/logs-v2/`. + * @default - a log group is created with name `/aws-glue/jobs/logs-v2/`. */ readonly logGroup?: logs.ILogGroup; /** * Specify a custom CloudWatch log stream prefix. * - * @default the job run ID. + * @default - the job run ID. */ readonly logStreamPrefix?: string; /** - * Enable pruning out non-useful Apache Spark driver/executor and Apache Hadoop YARN heartbeat log messages. + * Filter out non-useful Apache Spark driver/executor and Apache Hadoop YARN heartbeat log messages. * * @default true */ @@ -463,14 +463,14 @@ export interface JobProps { /** * The name of the job. * - * @default cloudformation generated name. + * @default - a name is automatically generated */ readonly jobName?: string; /** * The description of the job. * - * @default no value. 
+ * @default - no value */ readonly description?: string; @@ -478,12 +478,12 @@ export interface JobProps { * The number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. * Cannot be used for Glue version 2.0 and later - workerType and numberOfWorkers should be used instead. * - * @default 10 when you specify an Apache Spark ETL or Sreaming job, 0.0625 DPU when you specify a Python shell job. + * @default - 10 when job type is Apache Spark ETL or streaming, 0.0625 when job type is Python shell */ readonly maxCapacity?: number; /** - * The maximum number of times to retry this job after a JobRun fails. + * The maximum number of times to retry this job after a job run fails. * * @default 0 */ @@ -491,6 +491,7 @@ export interface JobProps { /** * The maximum number of concurrent runs allowed for the job. + * * An error is returned when this threshold is reached. The maximum value you can specify is controlled by a service limit. * * @default 1 @@ -514,28 +515,28 @@ export interface JobProps { /** * The type of predefined worker that is allocated when a job runs. * - * @default differs based on specific glue version + * @default - differs based on specific Glue version */ readonly workerType?: WorkerType; /** * The number of workers of a defined {@link WorkerType} that are allocated when a job runs. * - * @default differs based on specific glue version/worker type + * @default - differs based on specific Glue version/worker type */ readonly numberOfWorkers?: number; /** * The {@link Connection}s used for this job. * - * @default no connection. + * @default [] - no connections are added to the job */ - readonly connections?: IConnection []; + readonly connections?: IConnection[]; /** * The {@link SecurityConfiguration} to use for this job. * - * @default no security configuration. + * @default - no security configuration. 
*/ readonly securityConfiguration?: ISecurityConfiguration; @@ -543,21 +544,21 @@ export interface JobProps { * The default arguments for this job, specified as name-value pairs. * * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html for a list of special parameters Used by AWS Glue - * @default no arguments + * @default - no arguments */ readonly defaultArguments?: { [key: string]: string }; /** - * The tags to use with this job. + * The tags to add to the resources on which the job runs * - * @default no tags + * @default {} - no tags */ readonly tags?: { [key: string]: string }; /** - * The IAM role associated with this job. + * The IAM role assumed by Glue to run this job. * - * @default an IAM role is generated + * @default - a role is automatically generated */ readonly role?: iam.IRole; @@ -581,9 +582,9 @@ export interface JobProps { readonly sparkUI?: SparkUIProps, /** - * Enables Continuous Logging with the specified props. + * Enables continuous logging with the specified props. * - * @default - Continuous Logging is disabled. + * @default - continuous logging is disabled. * * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-continuous-logging-enable.html * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html @@ -596,12 +597,12 @@ export interface JobProps { */ export class Job extends JobBase { /** - * Creates a Glue Job - * - * @param scope The scope creating construct (usually `this`). - * @param id The construct's id. - * @param attrs Import attributes - */ + * Creates a Glue Job + * + * @param scope The scope creating construct (usually `this`). + * @param id The construct's id. 
+ * @param attrs Import attributes + */ public static fromJobAttributes(scope: constructs.Construct, id: string, attrs: JobAttributes): IJob { class Import extends JobBase { public readonly jobName = attrs.jobName; @@ -641,7 +642,6 @@ export class Job extends JobBase { const executable = props.executable.bind(); - // Create a basic service role if one is not provided https://docs.aws.amazon.com/glue/latest/dg/create-service-policy.html this.role = props.role ?? new iam.Role(this, 'ServiceRole', { assumedBy: new iam.ServicePrincipal('glue.amazonaws.com'), managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')], From 0f587c9a685ac63a052c41c639b03e4930b78304 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Tue, 31 Aug 2021 22:23:12 +0100 Subject: [PATCH 28/50] fix build issues from previous round of comments --- packages/@aws-cdk/aws-glue/lib/code.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/code.ts b/packages/@aws-cdk/aws-glue/lib/code.ts index c6c757c936a62..4765817d25a76 100644 --- a/packages/@aws-cdk/aws-glue/lib/code.ts +++ b/packages/@aws-cdk/aws-glue/lib/code.ts @@ -24,7 +24,7 @@ export abstract class Code { * * @param path code file (not a directory). */ - public static fromAsset(path: string, options?: s3_assets.AssetOptions): AssetCode { + public static fromAsset(path: string, options?: s3assets.AssetOptions): AssetCode { return new AssetCode(path, options); } @@ -56,12 +56,12 @@ export class S3Code extends Code { * Job Code from a local file. */ export class AssetCode extends Code { - private asset?: s3_assets.Asset; + private asset?: s3assets.Asset; /** * @param path The path to the Code file. 
*/ - constructor(private readonly path: string, private readonly options: s3_assets.AssetOptions = { }) { + constructor(private readonly path: string, private readonly options: s3assets.AssetOptions = { }) { super(); } @@ -71,7 +71,7 @@ export class AssetCode extends Code { } // If the same AssetCode is used multiple times, retain only the first instantiation. if (!this.asset) { - this.asset = new s3_assets.Asset(scope, `Code${this.hashcode(this.path)}`, { + this.asset = new s3assets.Asset(scope, `Code${this.hashcode(this.path)}`, { path: this.path, ...this.options, }); From 87dee59b17b1d1e149c99148d3436530557a7acd Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 00:05:30 +0100 Subject: [PATCH 29/50] address comments - move AssetCode directory check to constructor - rename SparkUIConfig to SparkUILoggingLocation - rename filter to quiet - rename jobStateRule to onStateChange and make it public - check no reserved arguments used - rename JobProps.numberOfWorkers to JobProps.workerCount - move JobExecutableConfig to the bottom of the file - rename PythonJobExecutableProps to PythonSparkJobExecutableProps --- packages/@aws-cdk/aws-glue/lib/code.ts | 5 +- .../@aws-cdk/aws-glue/lib/job-executable.ts | 179 ++++++++------- packages/@aws-cdk/aws-glue/lib/job.ts | 205 +++++++++++------- packages/@aws-cdk/aws-glue/package.json | 3 +- packages/@aws-cdk/aws-glue/test/code.test.ts | 4 +- packages/@aws-cdk/aws-glue/test/integ.job.ts | 2 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 15 +- 7 files changed, 222 insertions(+), 191 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/code.ts b/packages/@aws-cdk/aws-glue/lib/code.ts index 4765817d25a76..bb0f7a3c7a465 100644 --- a/packages/@aws-cdk/aws-glue/lib/code.ts +++ b/packages/@aws-cdk/aws-glue/lib/code.ts @@ -63,12 +63,13 @@ export class AssetCode extends Code { */ constructor(private readonly path: string, private readonly options: s3assets.AssetOptions = { }) { super(); - } - public bind(scope: 
constructs.Construct): CodeConfig { if (fs.lstatSync(this.path).isDirectory()) { throw new Error(`Code path ${this.path} is a directory. Only files are supported`); } + } + + public bind(scope: constructs.Construct): CodeConfig { // If the same AssetCode is used multiple times, retain only the first instantiation. if (!this.asset) { this.asset = new s3assets.Asset(scope, `Code${this.hashcode(this.path)}`, { diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index a136ede5ba219..6e9b2e39b425d 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -160,87 +160,6 @@ interface SharedSparkJobExecutableProps extends SharedJobExecutableProps { readonly extraJarsFirst?: boolean; } -/** - * Result of binding a `JobExecutable` into a `Job`. - */ -export interface JobExecutableConfig { - /** - * Glue version. - * - * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html - */ - readonly glueVersion: GlueVersion; - - /** - * The language of the job (Scala or Python). - * - * @see `--job-language` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly language: JobLanguage; - - /** - * Specify the type of the job whether it's an Apache Spark ETL or streaming one or if it's a Python shell job. - */ - readonly type: JobType; - - /** - * The Python version to use. - * - * @default - no python version specified - */ - readonly pythonVersion?: PythonVersion; - - /** - * The script that is executed by a job. - */ - readonly script: Code; - - /** - * The Scala class that serves as the entry point for the job. This applies only if your the job langauage is Scala. 
- * - * @default - no scala className specified - * - * @see `--class` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly className?: string; - - /** - * Additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. - * - * @default - no extra jars specified. - * - * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraJars?: Code[]; - - /** - * Additional Python files that AWS Glue adds to the Python path before executing your script. - * - * @default - no extra python files specified. - * - * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraPythonFiles?: Code[]; - - /** - * Additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. - * - * @default - no extra files specified. - * - * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraFiles?: Code[]; - - /** - * Setting this value to true prioritizes the customer's extra JAR files in the classpath. - * - * @default - extra jars are not prioritized. - * - * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraJarsFirst?: boolean; -} - /** * Props for creating a Scala Spark (ETL or Streaming) job executable */ @@ -256,7 +175,7 @@ export interface ScalaJobExecutableProps extends SharedSparkJobExecutableProps { /** * Props for creating a Python Spark (ETL or Streaming) job executable */ -export interface PythonJobExecutableProps extends SharedSparkJobExecutableProps { +export interface PythonSparkJobExecutableProps extends SharedSparkJobExecutableProps { /** * The Python version to use. 
@@ -271,14 +190,6 @@ export interface PythonJobExecutableProps extends SharedSparkJobExecutableProps * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ readonly extraPythonFiles?: Code[]; - - /** - * Setting this value to true prioritizes the customer's extra JAR files in the classpath. - * - * @default - priortiy is not given to extra jars and argument is not set - * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraJarsFirst?: boolean; } /** @@ -337,7 +248,7 @@ export class JobExecutable { * * @param props Python Apache Spark Job props */ - public static pythonEtl(props: PythonJobExecutableProps): JobExecutable { + public static pythonEtl(props: PythonSparkJobExecutableProps): JobExecutable { return new JobExecutable({ ...props, type: JobType.ETL, @@ -350,7 +261,7 @@ export class JobExecutable { * * @param props Python Apache Spark Job props */ - public static pythonStreaming(props: PythonJobExecutableProps): JobExecutable { + public static pythonStreaming(props: PythonSparkJobExecutableProps): JobExecutable { return new JobExecutable({ ...props, type: JobType.STREAMING, @@ -400,7 +311,91 @@ export class JobExecutable { this.config = config; } + /** + * Called during Job initialization to get JobExecutableConfig. + */ public bind(): JobExecutableConfig { return this.config; } } + +/** + * Result of binding a `JobExecutable` into a `Job`. + */ +export interface JobExecutableConfig { + /** + * Glue version. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/release-notes.html + */ + readonly glueVersion: GlueVersion; + + /** + * The language of the job (Scala or Python). 
+ * + * @see `--job-language` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly language: JobLanguage; + + /** + * Specify the type of the job whether it's an Apache Spark ETL or streaming one or if it's a Python shell job. + */ + readonly type: JobType; + + /** + * The Python version to use. + * + * @default - no python version specified + */ + readonly pythonVersion?: PythonVersion; + + /** + * The script that is executed by a job. + */ + readonly script: Code; + + /** + * The Scala class that serves as the entry point for the job. This applies only if your the job langauage is Scala. + * + * @default - no scala className specified + * + * @see `--class` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly className?: string; + + /** + * Additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. + * + * @default - no extra jars specified. + * + * @see `--extra-jars` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJars?: Code[]; + + /** + * Additional Python files that AWS Glue adds to the Python path before executing your script. + * + * @default - no extra python files specified. + * + * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraPythonFiles?: Code[]; + + /** + * Additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * + * @default - no extra files specified. + * + * @see `--extra-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraFiles?: Code[]; + + /** + * Setting this value to true prioritizes the customer's extra JAR files in the classpath. + * + * @default - extra jars are not prioritized. 
+ * + * @see `--user-jars-first` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraJarsFirst?: boolean; +} diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 6f3cfae8b64c6..0446234d19bbb 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -112,7 +112,7 @@ export enum MetricType { /** * Interface representing a created or an imported {@link Job}. */ -export interface IJob extends cdk.IResource { +export interface IJob extends cdk.IResource, iam.IGrantable { /** * The name of the job. * @attribute @@ -132,6 +132,13 @@ export interface IJob extends cdk.IResource { */ onEvent(id: string, options?: events.OnEventOptions): events.Rule; + /** + * Defines a CloudWatch event rule triggered when this job moves to the input jobState. + * + * @see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types + */ + onStateChange(id: string, jobState: JobState, options?: events.OnEventOptions): events.Rule; + /** * Defines a CloudWatch event rule triggered when this job moves to the SUCCEEDED state. * @@ -182,40 +189,9 @@ export interface IJob extends cdk.IResource { abstract class JobBase extends cdk.Resource implements IJob { - /** - * Create a CloudWatch Metric that's based on Glue Job events - * {@see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types} - * The metric has namespace = 'AWS/Events', metricName = 'TriggeredRules' and RuleName = rule.ruleName dimension. 
- * - * @param rule for use in setting RuleName dimension value - * @param props metric properties - */ - protected static metricRule(rule: events.IRule, props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return new cloudwatch.Metric({ - namespace: 'AWS/Events', - metricName: 'TriggeredRules', - dimensions: { RuleName: rule.ruleName }, - statistic: cloudwatch.Statistic.SUM, - ...props, - }).attachTo(rule); - } - - - /** - * Returns the job arn - * @param scope - * @param jobName - */ - protected static buildJobArn(scope: constructs.Construct, jobName: string) : string { - return cdk.Stack.of(scope).formatArn({ - service: 'glue', - resource: 'job', - resourceName: jobName, - }); - } - public abstract readonly jobArn: string; public abstract readonly jobName: string; + public abstract readonly grantPrincipal: iam.IPrincipal; /** * Create a CloudWatch Event Rule for this Glue Job when it's in a given state @@ -242,13 +218,34 @@ abstract class JobBase extends cdk.Resource implements IJob { } /** - * Return a CloudWatch Event Rule matching JobState.SUCCEEDED. + * Create a CloudWatch Event Rule for the transition into the input jobState. + * + * @param id construct id. + * @param jobState the job state. + * @param options optional event options. + * @private + */ + public onStateChange(id: string, jobState: JobState, options: events.OnEventOptions = {}): events.Rule { + const rule = this.onEvent(id, { + description: `Rule triggered when Glue job ${this.jobName} is in ${jobState} state`, + ...options, + }); + rule.addEventPattern({ + detail: { + state: [jobState], + }, + }); + return rule; + } + + /** + * Create a CloudWatch Event Rule matching JobState.SUCCEEDED. * * @param id construct id. * @param options optional event options. default is {}. 
*/ public onSuccess(id: string, options: events.OnEventOptions = {}): events.Rule { - return this.jobStateRule(id, JobState.SUCCEEDED, options); + return this.onStateChange(id, JobState.SUCCEEDED, options); } /** @@ -258,7 +255,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * @param options optional event options. default is {}. */ public onFailure(id: string, options: events.OnEventOptions = {}): events.Rule { - return this.jobStateRule(id, JobState.FAILED, options); + return this.onStateChange(id, JobState.FAILED, options); } /** @@ -268,7 +265,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * @param options optional event options. default is {}. */ public onTimeout(id: string, options: events.OnEventOptions = {}): events.Rule { - return this.jobStateRule(id, JobState.TIMEOUT, options); + return this.onStateChange(id, JobState.TIMEOUT, options); } /** @@ -299,7 +296,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * This metric is based on the Rule returned by no-args onSuccess() call. */ public metricSuccess(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return JobBase.metricRule(this.metricJobStateRule('SuccessMetricRule', JobState.SUCCEEDED), props); + return metricRule(this.metricJobStateRule('SuccessMetricRule', JobState.SUCCEEDED), props); } /** @@ -308,7 +305,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * This metric is based on the Rule returned by no-args onFailure() call. */ public metricFailure(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return JobBase.metricRule(this.metricJobStateRule('FailureMetricRule', JobState.FAILED), props); + return metricRule(this.metricJobStateRule('FailureMetricRule', JobState.FAILED), props); } /** @@ -317,7 +314,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * This metric is based on the Rule returned by no-args onTimeout() call. 
*/ public metricTimeout(props?: cloudwatch.MetricOptions): cloudwatch.Metric { - return JobBase.metricRule(this.metricJobStateRule('TimeoutMetricRule', JobState.TIMEOUT), props); + return metricRule(this.metricJobStateRule('TimeoutMetricRule', JobState.TIMEOUT), props); } /** @@ -328,28 +325,7 @@ abstract class JobBase extends cdk.Resource implements IJob { * @private */ private metricJobStateRule(id: string, jobState: JobState): events.Rule { - return this.node.tryFindChild(id) as events.Rule ?? this.jobStateRule(id, jobState); - } - - /** - * Creates a new rule for a transition into the input jobState. - * - * @param id construct id. - * @param jobState the job state. - * @param options optional event options. - * @private - */ - private jobStateRule(id: string, jobState: JobState, options: events.OnEventOptions = {}): events.Rule { - const rule = this.onEvent(id, { - description: `Rule triggered when Glue job ${this.jobName} is in ${jobState} state`, - ...options, - }); - rule.addEventPattern({ - detail: { - state: [jobState], - }, - }); - return rule; + return this.node.tryFindChild(id) as events.Rule ?? this.onStateChange(id, jobState); } } @@ -377,16 +353,16 @@ export interface SparkUIProps { * * @default '/' - the logs will be written at the root of the bucket */ - readonly path?: string; + readonly prefix?: string; } /** - * The Spark UI monitoring configurations for Spark-based Glue jobs. + * The Spark UI logging location. * * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ -export interface SparkUIConfig { +export interface SparkUILoggingLocation { /** * The bucket where the Glue job stores the logs. */ @@ -395,9 +371,9 @@ export interface SparkUIConfig { /** * The path inside the bucket (objects prefix) where the Glue job stores the logs. * - * @default - no path will be used, the logs will be written at the root of the bucket. 
+ * @default '/' - the logs will be written at the root of the bucket */ - readonly path?: string; + readonly prefix?: string; } /** @@ -431,10 +407,11 @@ export interface ContinuousLoggingProps { * * @default true */ - readonly filter?: boolean; + readonly quiet?: boolean; /** * Apply the provided conversion pattern. + * This is a Log4j Conversion Pattern to customize driver and executor logs. * * @default `%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n` */ @@ -476,7 +453,7 @@ export interface JobProps { /** * The number of AWS Glue data processing units (DPUs) that can be allocated when this job runs. - * Cannot be used for Glue version 2.0 and later - workerType and numberOfWorkers should be used instead. + * Cannot be used for Glue version 2.0 and later - workerType and workerCount should be used instead. * * @default - 10 when job type is Apache Spark ETL or streaming, 0.0625 when job type is Python shell */ @@ -524,10 +501,11 @@ export interface JobProps { * * @default - differs based on specific Glue version/worker type */ - readonly numberOfWorkers?: number; + readonly workerCount?: number; /** * The {@link Connection}s used for this job. + * Connections are used to connect to other AWS Service or resources within a VPC. * * @default [] - no connections are added to the job */ @@ -543,7 +521,7 @@ export interface JobProps { /** * The default arguments for this job, specified as name-value pairs. * - * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html for a list of special parameters Used by AWS Glue + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html for a list of reserved parameters * @default - no arguments */ readonly defaultArguments?: { [key: string]: string }; @@ -557,6 +535,9 @@ export interface JobProps { /** * The IAM role assumed by Glue to run this job. + * If providing a custom role, it needs to trust Glue servie (glue.amazonaws.com). 
+ * + * @see https://docs.aws.amazon.com/glue/latest/dg/getting-started-access.html * * @default - a role is automatically generated */ @@ -606,7 +587,8 @@ export class Job extends JobBase { public static fromJobAttributes(scope: constructs.Construct, id: string, attrs: JobAttributes): IJob { class Import extends JobBase { public readonly jobName = attrs.jobName; - public readonly jobArn = JobBase.buildJobArn(scope, attrs.jobName); + public readonly jobArn = jobArn(scope, attrs.jobName) + public readonly grantPrincipal = new iam.UnknownPrincipal({ resource: this }); } return new Import(scope, id); @@ -623,17 +605,22 @@ export class Job extends JobBase { public readonly jobName: string; /** - * The IAM role associated with this job. + * The IAM role Glue assumes to run this job. */ public readonly role: iam.IRole; /** - * The Spark monitoring configuration (Bucket, path) if Spark UI monitoring and debugging is enabled. + * The principal this Glue Job is running as. + */ + public readonly grantPrincipal: iam.IPrincipal; + + /** + * The Spark UI logs location if Spark UI monitoring and debugging is enabled. * * @see https://docs.aws.amazon.com/glue/latest/dg/monitor-spark-ui-jobs.html * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html */ - public readonly sparkUIConfig?: SparkUIConfig; + public readonly sparkUILoggingLocation?: SparkUILoggingLocation; constructor(scope: constructs.Construct, id: string, props: JobProps) { super(scope, id, { @@ -646,9 +633,10 @@ export class Job extends JobBase { assumedBy: new iam.ServicePrincipal('glue.amazonaws.com'), managedPolicies: [iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole')], }); + this.grantPrincipal = this.role; const sparkUI = props.sparkUI?.enabled ? 
this.setupSparkUI(executable, this.role, props.sparkUI) : undefined; - this.sparkUIConfig = sparkUI?.config; + this.sparkUILoggingLocation = sparkUI?.location; const continuousLoggingArgs = props.continuousLogging?.enabled ? this.setupContinuousLogging(this.role, props.continuousLogging) : {}; const profilingMetricsArgs = props.enableProfilingMetrics ? { '--enable-metrics': '' } : {}; @@ -657,7 +645,7 @@ export class Job extends JobBase { ...continuousLoggingArgs, ...profilingMetricsArgs, ...sparkUI?.args, - ...props.defaultArguments, + ...this.checkNoReservedArgs(props.defaultArguments), }; const jobResource = new CfnJob(this, 'Resource', { @@ -671,7 +659,7 @@ export class Job extends JobBase { }, glueVersion: executable.glueVersion.name, workerType: props.workerType?.name, - numberOfWorkers: props.numberOfWorkers, + numberOfWorkers: props.workerCount, maxCapacity: props.maxCapacity, maxRetries: props.maxRetries, executionProperty: props.maxConcurrentRuns ? { maxConcurrentRuns: props.maxConcurrentRuns } : undefined, @@ -684,10 +672,27 @@ export class Job extends JobBase { }); const resourceName = this.getResourceNameAttribute(jobResource.ref); - this.jobArn = JobBase.buildJobArn(this, resourceName); + this.jobArn = jobArn(this, resourceName); this.jobName = resourceName; } + /** + * Check no usage of reserved arguments. + * + * @see https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + private checkNoReservedArgs(defaultArguments?: { [key: string]: string }) { + if (defaultArguments) { + const reservedArgs = new Set(['--conf', '--debug', '--mode', '--JOB_NAME']); + Object.keys(defaultArguments).forEach((arg) => { + if (reservedArgs.has(arg)) { + throw new Error(`${arg} is a reserved argument. 
Don't set it`); + } + }); + } + return defaultArguments; + } + private executableArguments(config: JobExecutableConfig) { const args: { [key: string]: string } = {}; args['--job-language'] = config.language; @@ -718,12 +723,12 @@ export class Job extends JobBase { bucket.grantReadWrite(role); const args = { '--enable-spark-ui': 'true', - '--spark-event-logs-path': `s3://${bucket.bucketName}/${props.path || ''}`, + '--spark-event-logs-path': bucket.s3UrlForObject(props.prefix), }; return { - config: { - ...props, + location: { + prefix: props.prefix, bucket, }, args, @@ -733,7 +738,7 @@ export class Job extends JobBase { private setupContinuousLogging(role: iam.IRole, props: ContinuousLoggingProps) { const args: {[key: string]: string} = { '--enable-continuous-cloudwatch-log': 'true', - '--enable-continuous-log-filter': (props.filter ?? true).toString(), + '--enable-continuous-log-filter': (props.quiet ?? true).toString(), }; if (props.logGroup) { @@ -750,8 +755,40 @@ export class Job extends JobBase { return args; } - private codeS3ObjectUrl(code: Code): string { + private codeS3ObjectUrl(code: Code) { const s3Location = code.bind(this).s3Location; return `s3://${s3Location.bucketName}/${s3Location.objectKey}`; } } + +/** + * Create a CloudWatch Metric that's based on Glue Job events + * {@see https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/EventTypes.html#glue-event-types} + * The metric has namespace = 'AWS/Events', metricName = 'TriggeredRules' and RuleName = rule.ruleName dimension. 
+ * + * @param rule for use in setting RuleName dimension value + * @param props metric properties + */ +function metricRule(rule: events.IRule, props?: cloudwatch.MetricOptions): cloudwatch.Metric { + return new cloudwatch.Metric({ + namespace: 'AWS/Events', + metricName: 'TriggeredRules', + dimensions: { RuleName: rule.ruleName }, + statistic: cloudwatch.Statistic.SUM, + ...props, + }).attachTo(rule); +} + + +/** + * Returns the job arn + * @param scope + * @param jobName + */ +function jobArn(scope: constructs.Construct, jobName: string) : string { + return cdk.Stack.of(scope).formatArn({ + service: 'glue', + resource: 'job', + resourceName: jobName, + }); +} diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index 38223678bd6ca..f4d14a9fce1e2 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -156,8 +156,7 @@ "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.PARQUET", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.ORC", "docs-public-apis:@aws-cdk/aws-glue.ClassificationString.value", - "docs-public-apis:@aws-cdk/aws-glue.JobExecutable.bind", - "no-unused-type:@aws-cdk/aws-glue.JobState" + "events-method-signature:@aws-cdk/aws-glue.Job.onStateChange" ] }, "awscdkio": { diff --git a/packages/@aws-cdk/aws-glue/test/code.test.ts b/packages/@aws-cdk/aws-glue/test/code.test.ts index 1874a141fb9ea..01732dd6538e7 100644 --- a/packages/@aws-cdk/aws-glue/test/code.test.ts +++ b/packages/@aws-cdk/aws-glue/test/code.test.ts @@ -44,8 +44,8 @@ describe('Code', () => { expect(codeConfig.s3Location.objectKey).toBeDefined(); }); - test('with an unsupported directory path and calling bind() throws', () => { - expect(() => Code.fromAsset(directoryPath).bind(stack)) + test('with an unsupported directory path throws', () => { + expect(() => Code.fromAsset(directoryPath)) .toThrow(/Only files are supported/); }); diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts 
b/packages/@aws-cdk/aws-glue/test/integ.job.ts index 0b607b66117dc..cae8c25913ec8 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -39,7 +39,7 @@ const etlJob = new glue.Job(stack, 'Job', { script, }), workerType: glue.WorkerType.G_2X, - numberOfWorkers: 10, + workerCount: 10, maxConcurrentRuns: 2, maxRetries: 2, timeout: cdk.Duration.minutes(5), diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index d3372e29f7d75..297bed6c7d54f 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -216,7 +216,7 @@ describe('Job', () => { }), continuousLogging: { enabled: true, - filter: false, + quiet: false, logStreamPrefix: 'LogStreamPrefix', conversionPattern: '%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n', logGroup, @@ -355,7 +355,6 @@ describe('Job', () => { { Ref: 'JobSparkUIBucket8E6A0139', }, - '/', ], ], }, @@ -439,7 +438,7 @@ describe('Job', () => { cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { DefaultArguments: { '--enable-spark-ui': 'true', - '--spark-event-logs-path': `s3://${bucketName}/`, + '--spark-event-logs-path': `s3://${bucketName}`, }, })); }); @@ -448,12 +447,12 @@ describe('Job', () => { describe('enabling spark ui with bucket and path provided', () => { let bucketName: string; let bucket: s3.IBucket; - let path: string; + let prefix: string; beforeEach(() => { bucketName = 'BucketName'; bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); - path = 'some/path/'; + prefix = 'some/path/'; job = new glue.Job(stack, 'Job', { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, @@ -463,7 +462,7 @@ describe('Job', () => { sparkUI: { enabled: true, bucket, - path, + prefix, }, }); }); @@ -472,7 +471,7 @@ describe('Job', () => { cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { DefaultArguments: { 
'--enable-spark-ui': 'true', - '--spark-event-logs-path': `s3://${bucketName}/${path}`, + '--spark-event-logs-path': `s3://${bucketName}/${prefix}`, }, })); }); @@ -489,7 +488,7 @@ describe('Job', () => { script, }), workerType: glue.WorkerType.G_2X, - numberOfWorkers: 10, + workerCount: 10, maxConcurrentRuns: 2, maxRetries: 2, timeout: cdk.Duration.minutes(5), From 0ded0f22e40d5f2386db975532f46649b4597856 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 00:37:19 +0100 Subject: [PATCH 30/50] refactor JobExecutableProps --- .../@aws-cdk/aws-glue/lib/job-executable.ts | 50 +++++++------------ 1 file changed, 18 insertions(+), 32 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index 6e9b2e39b425d..f2dec5993f968 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -117,6 +117,22 @@ export class JobType { } } +interface PythonExecutableProps { + /** + * The Python version to use. + */ + readonly pythonVersion: PythonVersion; + + /** + * Additional Python files that AWS Glue adds to the Python path before executing your script. + * + * @default - no extra python files and argument is not set + * + * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html + */ + readonly extraPythonFiles?: Code[]; +} + interface SharedJobExecutableProps { /** * Glue version. @@ -175,42 +191,12 @@ export interface ScalaJobExecutableProps extends SharedSparkJobExecutableProps { /** * Props for creating a Python Spark (ETL or Streaming) job executable */ -export interface PythonSparkJobExecutableProps extends SharedSparkJobExecutableProps { - - /** - * The Python version to use. - */ - readonly pythonVersion: PythonVersion; - - /** - * Additional Python files that AWS Glue adds to the Python path before executing your script. 
- * - * @default - no extra python files and argument is not set - * - * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraPythonFiles?: Code[]; -} +export interface PythonSparkJobExecutableProps extends SharedSparkJobExecutableProps, PythonExecutableProps {} /** * Props for creating a Python shell job executable */ -export interface PythonShellExecutableProps extends SharedJobExecutableProps { - - /** - * The Python version to use. - */ - readonly pythonVersion: PythonVersion; - - /** - * Additional Python files that AWS Glue adds to the Python path before executing your script. - * - * @default - no extra python files and argument is not set - * - * @see `--extra-py-files` in https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-glue-arguments.html - */ - readonly extraPythonFiles?: Code[]; -} +export interface PythonShellExecutableProps extends SharedJobExecutableProps, PythonExecutableProps {} /** * The executable properties related to the Glue job's GlueVersion, JobType and code From 82c1d98859bc5058a23da3324566148c37a9c550 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 00:41:31 +0100 Subject: [PATCH 31/50] drop @aws-cdk/aws-s3-assets from devDependencies --- packages/@aws-cdk/aws-glue/package.json | 1 - 1 file changed, 1 deletion(-) diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index f4d14a9fce1e2..f40f10a73573f 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -78,7 +78,6 @@ "@aws-cdk/assert-internal": "0.0.0", "@aws-cdk/cx-api": "0.0.0", "@types/nodeunit": "^0.0.32", - "@aws-cdk/aws-s3-assets": "0.0.0", "cdk-build-tools": "0.0.0", "cdk-integ-tools": "0.0.0", "cfn2ts": "0.0.0", From c81e7361c59c796cfb8a9797c28865b9c77ef015 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 00:48:09 +0100 Subject: [PATCH 32/50] restore docs about 
individual files support --- packages/@aws-cdk/aws-glue/lib/job-executable.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index f2dec5993f968..5d9dacf04386e 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -125,6 +125,7 @@ interface PythonExecutableProps { /** * Additional Python files that AWS Glue adds to the Python path before executing your script. + * Only individual files are supported, directories are not supported. * * @default - no extra python files and argument is not set * @@ -148,6 +149,7 @@ interface SharedJobExecutableProps { /** * Additional files, such as configuration files that AWS Glue copies to the working directory of your script before executing it. + * Only individual files are supported, directories are not supported. * * @default [] - no extra files are copied to the working directory * @@ -159,6 +161,7 @@ interface SharedJobExecutableProps { interface SharedSparkJobExecutableProps extends SharedJobExecutableProps { /** * Additional Java .jar files that AWS Glue adds to the Java classpath before executing your script. + * Only individual files are supported, directories are not supported. 
* * @default [] - no extra jars are added to the classpath * From 8ce0fe833c95f851b0f40edf0bcc6da6ab226bf8 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 08:16:03 +0100 Subject: [PATCH 33/50] apply suggestions from comments Co-authored-by: Ben Chaimberg --- packages/@aws-cdk/aws-glue/lib/job.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 0446234d19bbb..835c3172ede41 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -223,7 +223,6 @@ abstract class JobBase extends cdk.Resource implements IJob { * @param id construct id. * @param jobState the job state. * @param options optional event options. - * @private */ public onStateChange(id: string, jobState: JobState, options: events.OnEventOptions = {}): events.Rule { const rule = this.onEvent(id, { @@ -411,6 +410,7 @@ export interface ContinuousLoggingProps { /** * Apply the provided conversion pattern. + * * This is a Log4j Conversion Pattern to customize driver and executor logs. * * @default `%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n` @@ -505,6 +505,7 @@ export interface JobProps { /** * The {@link Connection}s used for this job. + * * Connections are used to connect to other AWS Service or resources within a VPC. * * @default [] - no connections are added to the job @@ -535,7 +536,8 @@ export interface JobProps { /** * The IAM role assumed by Glue to run this job. - * If providing a custom role, it needs to trust Glue servie (glue.amazonaws.com). + * + * If providing a custom role, it needs to trust the Glue service principal (glue.amazonaws.com) and be granted sufficient permissions. 
* * @see https://docs.aws.amazon.com/glue/latest/dg/getting-started-access.html * @@ -686,7 +688,7 @@ export class Job extends JobBase { const reservedArgs = new Set(['--conf', '--debug', '--mode', '--JOB_NAME']); Object.keys(defaultArguments).forEach((arg) => { if (reservedArgs.has(arg)) { - throw new Error(`${arg} is a reserved argument. Don't set it`); + throw new Error(`The ${arg} argument is reserved by Glue. Don't set it`); } }); } From 2ed79e1bc7cb557de4c166fd8bc70eaafe86f8b3 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 08:45:55 +0100 Subject: [PATCH 34/50] add optional role to JobAttributes --- packages/@aws-cdk/aws-glue/lib/job.ts | 21 ++++++++++-- packages/@aws-cdk/aws-glue/test/job.test.ts | 37 ++++++++++++++++----- 2 files changed, 46 insertions(+), 12 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 835c3172ede41..ad85f80657755 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -125,6 +125,12 @@ export interface IJob extends cdk.IResource, iam.IGrantable { */ readonly jobArn: string; + /** + * The IAM role assumed by Glue to run this job. + * @attribute + */ + readonly role?: iam.IRole; + /** * Defines a CloudWatch event rule triggered when something happens with this job. * @@ -191,6 +197,7 @@ abstract class JobBase extends cdk.Resource implements IJob { public abstract readonly jobArn: string; public abstract readonly jobName: string; + public abstract readonly role?: iam.IRole; public abstract readonly grantPrincipal: iam.IPrincipal; /** @@ -426,6 +433,13 @@ export interface JobAttributes { * The name of the job. */ readonly jobName: string; + + /** + * The IAM role assumed by Glue to run this job. 
+ * + * @default - undefined + */ + readonly role?: iam.IRole; } /** @@ -589,8 +603,9 @@ export class Job extends JobBase { public static fromJobAttributes(scope: constructs.Construct, id: string, attrs: JobAttributes): IJob { class Import extends JobBase { public readonly jobName = attrs.jobName; - public readonly jobArn = jobArn(scope, attrs.jobName) - public readonly grantPrincipal = new iam.UnknownPrincipal({ resource: this }); + public readonly jobArn = jobArn(scope, attrs.jobName); + public readonly role = attrs.role; + public readonly grantPrincipal = attrs.role ?? new iam.UnknownPrincipal({ resource: this }); } return new Import(scope, id); @@ -609,7 +624,7 @@ export class Job extends JobBase { /** * The IAM role Glue assumes to run this job. */ - public readonly role: iam.IRole; + public readonly role?: iam.IRole; /** * The principal this Glue Job is running as. diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 297bed6c7d54f..8b54b2a9e572f 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -50,17 +50,36 @@ describe('Job', () => { jobName = 'test-job'; }); - test('.fromJobAttributes() should return correct jobName and jobArn', () => { - const iJob = glue.Job.fromJobAttributes(stack, 'ImportedJob', { jobName }); - - expect(iJob.jobName).toEqual(jobName); - expect(iJob.jobArn).toEqual(stack.formatArn({ - service: 'glue', - resource: 'job', - resourceName: jobName, - })); + describe('.fromJobAttributes()', () => { + test('with required attrs only', () => { + const job = glue.Job.fromJobAttributes(stack, 'ImportedJob', { jobName }); + + expect(job.jobName).toEqual(jobName); + expect(job.jobArn).toEqual(stack.formatArn({ + service: 'glue', + resource: 'job', + resourceName: jobName, + })); + expect(job.role).toBeUndefined(); + expect(job.grantPrincipal).toEqual(new iam.UnknownPrincipal({ resource: job })); + }); + + test('with all attrs', () => { 
+ const role = iam.Role.fromRoleArn(stack, 'Role', 'arn:aws:iam::123456789012:role/TestRole'); + const job = glue.Job.fromJobAttributes(stack, 'ImportedJob', { jobName, role }); + + expect(job.jobName).toEqual(jobName); + expect(job.jobArn).toEqual(stack.formatArn({ + service: 'glue', + resource: 'job', + resourceName: jobName, + })); + expect(job.role).toEqual(role); + expect(job.grantPrincipal).toEqual(role); + }); }); + describe('new', () => { let codeBucket: s3.IBucket; let script: glue.Code; From 1eb9c2089c51599d03984fed1d85c80232327777 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 08:45:55 +0100 Subject: [PATCH 35/50] drop @aws-cdk/assert-internal in favour of @aws-cdk/assertions --- packages/@aws-cdk/aws-glue/package.json | 1 - packages/@aws-cdk/aws-glue/test/job.test.ts | 132 ++++++++++---------- 2 files changed, 65 insertions(+), 68 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/package.json b/packages/@aws-cdk/aws-glue/package.json index f40f10a73573f..daaba9400606c 100644 --- a/packages/@aws-cdk/aws-glue/package.json +++ b/packages/@aws-cdk/aws-glue/package.json @@ -75,7 +75,6 @@ "devDependencies": { "@types/jest": "^26.0.24", "@aws-cdk/assertions": "0.0.0", - "@aws-cdk/assert-internal": "0.0.0", "@aws-cdk/cx-api": "0.0.0", "@types/nodeunit": "^0.0.32", "cdk-build-tools": "0.0.0", diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 8b54b2a9e572f..4d5ed366bb0de 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -1,13 +1,11 @@ -import * as cdkassert from '@aws-cdk/assert-internal'; +import { Template } from '@aws-cdk/assertions'; import * as cloudwatch from '@aws-cdk/aws-cloudwatch'; import * as events from '@aws-cdk/aws-events'; import * as iam from '@aws-cdk/aws-iam'; import * as logs from '@aws-cdk/aws-logs'; import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; -import 
'@aws-cdk/assert-internal/jest'; import * as glue from '../lib'; -import { PythonVersion } from '../lib'; describe('GlueVersion', () => { test('.V0_9', () => expect(glue.GlueVersion.V0_9.name).toEqual('0.9')); @@ -112,7 +110,7 @@ describe('Job', () => { test('should create a role and use it with the job', () => { // check the role expect(job.role).toBeDefined(); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::IAM::Role', { + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Role', { AssumeRolePolicyDocument: { Statement: [ { @@ -139,10 +137,10 @@ describe('Job', () => { ], }, ], - })); + }); // check the job using the role - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Command: { Name: 'glueetl', ScriptLocation: 's3://bucketName/script', @@ -153,7 +151,7 @@ describe('Job', () => { 'Arn', ], }, - })); + }); }); test('should return correct jobName and jobArn from CloudFormation', () => { @@ -172,16 +170,16 @@ describe('Job', () => { job = new glue.Job(stack, 'JobWithRole', { executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.THREE, + pythonVersion: glue.PythonVersion.THREE, script, }), role, }); expect(job.role).toEqual(role); - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Role: role.roleArn, - })); + }); }); test('with a custom jobName should set it in CloudFormation', () => { @@ -194,9 +192,9 @@ describe('Job', () => { jobName, }); - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Name: jobName, - })); + }); }); }); @@ -213,12 +211,12 @@ describe('Job', () => { }); test('should set minimal default arguments', () => { - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + 
Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { DefaultArguments: { '--enable-continuous-cloudwatch-log': 'true', '--enable-continuous-log-filter': 'true', }, - })); + }); }); }); @@ -244,7 +242,7 @@ describe('Job', () => { }); test('should set all arguments', () => { - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { DefaultArguments: { '--enable-continuous-cloudwatch-log': 'true', '--enable-continuous-log-filter': 'false', @@ -252,11 +250,11 @@ describe('Job', () => { '--continuous-log-logStreamPrefix': 'LogStreamPrefix', '--continuous-log-conversionPattern': '%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n', }, - })); + }); }); test('should grant cloudwatch log write permissions', () => { - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::IAM::Policy', { + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { PolicyDocument: { Statement: [ { @@ -293,7 +291,7 @@ describe('Job', () => { Ref: 'JobServiceRole4F432993', }, ], - })); + }); }); }); @@ -310,11 +308,11 @@ describe('Job', () => { }); test('should create spark ui bucket', () => { - cdkassert.expect(stack).to(cdkassert.countResources('AWS::S3::Bucket', 1)); + Template.fromStack(stack).resourceCountIs('AWS::S3::Bucket', 1); }); test('should grant the role read/write permissions to the spark ui bucket', () => { - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::IAM::Policy', { + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { PolicyDocument: { Statement: [ { @@ -359,11 +357,11 @@ describe('Job', () => { Ref: 'JobServiceRole4F432993', }, ], - })); + }); }); test('should set spark arguments on the job', () => { - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { DefaultArguments: { '--enable-spark-ui': 'true', '--spark-event-logs-path': { @@ -378,7 +376,7 @@ 
describe('Job', () => { ], }, }, - })); + }); }); }); @@ -403,7 +401,7 @@ describe('Job', () => { }); test('should grant the role read/write permissions to the provided spark ui bucket', () => { - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::IAM::Policy', { + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { PolicyDocument: { Statement: [ { @@ -450,16 +448,16 @@ describe('Job', () => { Ref: 'JobServiceRole4F432993', }, ], - })); + }); }); test('should set spark arguments on the job', () => { - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { DefaultArguments: { '--enable-spark-ui': 'true', '--spark-event-logs-path': `s3://${bucketName}`, }, - })); + }); }); }); @@ -487,12 +485,12 @@ describe('Job', () => { }); test('should set spark arguments on the job', () => { - cdkassert.expect(stack).to(cdkassert.haveResourceLike('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { DefaultArguments: { '--enable-spark-ui': 'true', '--spark-event-logs-path': `s3://${bucketName}/${prefix}`, }, - })); + }); }); }); @@ -503,7 +501,7 @@ describe('Job', () => { description: 'test job', executable: glue.JobExecutable.pythonStreaming({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.THREE, + pythonVersion: glue.PythonVersion.THREE, script, }), workerType: glue.WorkerType.G_2X, @@ -526,7 +524,7 @@ describe('Job', () => { }); test('should synthesize correctly', () => { - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Command: { Name: 'gluestreaming', ScriptLocation: 's3://bucketName/script', @@ -566,7 +564,7 @@ describe('Job', () => { ], }, SecurityConfiguration: 'SecurityConfigurationName', - })); + }); }); }); @@ -576,26 +574,26 @@ describe('Job', () => { new glue.Job(stack, 'Job', { executable: 
glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.THREE, + pythonVersion: glue.PythonVersion.THREE, script, }), }); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Command: { Name: 'pythonshell', ScriptLocation: 's3://bucketName/script', PythonVersion: '3', }, GlueVersion: '2.0', - })); + }); }); test('with unsupported glue version throws', () => { expect(() => new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V0_9, - pythonVersion: PythonVersion.TWO, + pythonVersion: glue.PythonVersion.TWO, script, }), })).toThrow('Specified GlueVersion 0.9 does not support Python Shell'); @@ -605,7 +603,7 @@ describe('Job', () => { expect(() => new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.THREE, + pythonVersion: glue.PythonVersion.THREE, script, }), sparkUI: { enabled: true }, @@ -616,14 +614,14 @@ describe('Job', () => { new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.THREE, + pythonVersion: glue.PythonVersion.THREE, script, extraPythonFiles, extraFiles, }), }); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Command: { Name: 'pythonshell', ScriptLocation: 's3://bucketName/script', @@ -635,7 +633,7 @@ describe('Job', () => { '--extra-py-files': 's3://bucketName/file1.py,s3://bucketName/file2.py', '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', }, - })); + }); }); }); @@ -645,13 +643,13 @@ describe('Job', () => { new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.THREE, + pythonVersion: 
glue.PythonVersion.THREE, script, }), }); // check the job using the role - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { GlueVersion: '2.0', Command: { Name: 'glueetl', @@ -667,14 +665,14 @@ describe('Job', () => { DefaultArguments: { '--job-language': 'python', }, - })); + }); }); test('with all props should synthesize correctly', () => { new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.THREE, + pythonVersion: glue.PythonVersion.THREE, extraJarsFirst: true, script, extraPythonFiles, @@ -683,7 +681,7 @@ describe('Job', () => { }), }); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { GlueVersion: '2.0', Command: { Name: 'glueetl', @@ -703,7 +701,7 @@ describe('Job', () => { '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', '--user-jars-first': 'true', }, - })); + }); }); }); @@ -718,7 +716,7 @@ describe('Job', () => { }), }); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { GlueVersion: '2.0', Command: { Name: 'gluestreaming', @@ -734,7 +732,7 @@ describe('Job', () => { '--job-language': 'scala', '--class': 'com.amazon.test.ClassName', }, - })); + }); }); test('with all props should synthesize correctly', () => { @@ -749,7 +747,7 @@ describe('Job', () => { }), }); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Glue::Job', { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { GlueVersion: '2.0', Command: { Name: 'gluestreaming', @@ -768,7 +766,7 @@ describe('Job', () => { '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', '--user-jars-first': 'true', }, - })); + }); }); }); @@ -786,7 +784,7 @@ describe('Job', () => { 
test('.onEvent() creates the expected event rule', () => { job.onEvent('eventId', {}); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { EventPattern: { 'source': [ 'aws.glue', @@ -804,14 +802,14 @@ describe('Job', () => { }, }, State: 'ENABLED', - })); + }); }); describe('.onSuccess()', () => { test('should create a rule with correct properties', () => { job.onSuccess('SuccessRule'); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -844,7 +842,7 @@ describe('Job', () => { }, }, State: 'ENABLED', - })); + }); }); }); @@ -852,7 +850,7 @@ describe('Job', () => { test('should create a rule with correct properties', () => { job.onFailure('FailureRule'); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -885,7 +883,7 @@ describe('Job', () => { }, }, State: 'ENABLED', - })); + }); }); }); @@ -893,7 +891,7 @@ describe('Job', () => { test('should create a rule with correct properties', () => { job.onTimeout('TimeoutRule'); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -926,7 +924,7 @@ describe('Job', () => { }, }, State: 'ENABLED', - })); + }); }); }); @@ -943,8 +941,8 @@ describe('Job', () => { statistic: 'Sum', })); - cdkassert.countResources('AWS::Events::Rule', 1); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Template.fromStack(stack).resourceCountIs('AWS::Events::Rule', 1); + Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -977,7 +975,7 @@ describe('Job', () => { }, }, State: 
'ENABLED', - })); + }); }); test('.metricFailure() creates the expected singleton event rule and corresponding metric', () => { @@ -993,8 +991,8 @@ describe('Job', () => { statistic: 'Sum', })); - cdkassert.countResources('AWS::Events::Rule', 1); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Template.fromStack(stack).resourceCountIs('AWS::Events::Rule', 1); + Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -1027,7 +1025,7 @@ describe('Job', () => { }, }, State: 'ENABLED', - })); + }); }); test('.metricTimeout() creates the expected singleton event rule and corresponding metric', () => { @@ -1043,8 +1041,8 @@ describe('Job', () => { statistic: 'Sum', })); - cdkassert.countResources('AWS::Events::Rule', 1); - cdkassert.expect(stack).to(cdkassert.haveResource('AWS::Events::Rule', { + Template.fromStack(stack).resourceCountIs('AWS::Events::Rule', 1); + Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { Description: { 'Fn::Join': [ '', @@ -1077,7 +1075,7 @@ describe('Job', () => { }, }, State: 'ENABLED', - })); + }); }); }); From 7df0c1fdb8ef96a228d8ef109614580180e3cac4 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 14:04:23 +0100 Subject: [PATCH 36/50] update README --- packages/@aws-cdk/aws-glue/README.md | 86 +++++++++++++++++++++------- 1 file changed, 65 insertions(+), 21 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/README.md b/packages/@aws-cdk/aws-glue/README.md index e96f279383574..4fdadea2fd796 100644 --- a/packages/@aws-cdk/aws-glue/README.md +++ b/packages/@aws-cdk/aws-glue/README.md @@ -23,52 +23,87 @@ This module is part of the [AWS Cloud Development Kit](https://github.com/aws/aws-cdk) project. -## Connection +## Job -A `Connection` allows Glue jobs, crawlers and development endpoints to access certain types of data stores. 
For example, to create a network connection to connect to a data source within a VPC: +A `Job` encapsulates a script that connects to a data source, processes it, and then writes output to a data target. + +There are 3 types of jobs supported by AWS Glue: Spark ETL, Spark Streaming, and Python Shell jobs. + +The `glue.JobExecutable` allows you to specify the type of job, the language to use and the code assets required by the job. + +`glue.Code` allows you to refer to the different code assets required by the job, either from an existing S3 location or from a local file path. + +### Spark Jobs + +These jobs run in an Apache Spark environment managed by AWS Glue. + +#### ETL Jobs + +An ETL job processes data in batches using Apache Spark. ```ts -new glue.Connection(stack, 'MyConnection', { - connectionType: glue.ConnectionTypes.NETWORK, - // The security groups granting AWS Glue inbound access to the data source within the VPC - securityGroups: [securityGroup], - // The VPC subnet which contains the data source - subnet, +new glue.Job(stack, 'ScalaSparkEtlJob', { + executable: glue.JobExecutable.scalaEtl({ + glueVersion: glue.GlueVersion.V2_0, + script: glue.Code.fromBucket(bucket, 'src/com/example/HelloWorld.scala'), + className: 'com.example.HelloWorld', + extraJars: [glue.Code.fromBucket(bucket, 'jars/HelloWorld.jar')], + }), + description: 'an example Scala ETL job', }); ``` -If you need to use a connection type that doesn't exist as a static member on `ConnectionType`, you can instantiate a `ConnectionType` object, e.g: `new glue.ConnectionType('NEW_TYPE')`. +#### Streaming Jobs -See [Adding a Connection to Your Data Store](https://docs.aws.amazon.com/glue/latest/dg/populate-add-connection.html) and [Connection Structure](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-connections.html#aws-glue-api-catalog-connections-Connection) documentation for more information on the supported data stores and their configurations. 
+A Streaming job is similar to an ETL job, except that it performs ETL on data streams. It uses the Apache Spark Structured Streaming framework. Some Spark job features are not available to streaming ETL jobs. -## Job +```ts +new glue.Job(stack, 'PythonSparkStreamingJob', { + executable: glue.JobExecutable.pythonStreaming({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.THREE, + script: glue.Code.fromAsset(path.join(__dirname, 'job-script/hello_world.py')), + }), + description: 'an example Python Streaming job', +}); +``` -A `Job` encapsulates a script that connects to a data source, processes it, and then writes output to a data target. -Typically, a job runs extract, transform, and load (ETL) scripts. Jobs can also run general-purpose Python scripts (Python shell jobs). +### Python Shell Jobs + +A Python shell job runs Python scripts as a shell and supports a Python version that depends on the AWS Glue version you are using. +This can be used to schedule and run tasks that don't require an Apache Spark environment. ```ts -new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.shellPython({ +new glue.Job(stack, 'PythonShellJob', { + executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: PythonVersion.TWO, - scriptLocation: 's3://bucketName/script.py', + script: glue.Code.fromBucket(bucket, 'script.py'), }), - description: 'an example pythonshell job', + description: 'an example Python Shell job', }); ``` See [documentation](https://docs.aws.amazon.com/glue/latest/dg/add-job.html) for more information on adding jobs in Glue. -## Database +## Connection -A `Database` is a logical grouping of `Tables` in the Glue Catalog. +A `Connection` allows Glue jobs, crawlers and development endpoints to access certain types of data stores. 
For example, to create a network connection to connect to a data source within a VPC: ```ts -new glue.Database(stack, 'MyDatabase', { - databaseName: 'my_database' +new glue.Connection(stack, 'MyConnection', { + connectionType: glue.ConnectionTypes.NETWORK, + // The security groups granting AWS Glue inbound access to the data source within the VPC + securityGroups: [securityGroup], + // The VPC subnet which contains the data source + subnet, }); ``` +If you need to use a connection type that doesn't exist as a static member on `ConnectionType`, you can instantiate a `ConnectionType` object, e.g: `new glue.ConnectionType('NEW_TYPE')`. + +See [Adding a Connection to Your Data Store](https://docs.aws.amazon.com/glue/latest/dg/populate-add-connection.html) and [Connection Structure](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-connections.html#aws-glue-api-catalog-connections-Connection) documentation for more information on the supported data stores and their configurations. + ## SecurityConfiguration A `SecurityConfiguration` is a set of security properties that can be used by AWS Glue to encrypt data at rest. @@ -102,6 +137,15 @@ new glue.SecurityConfiguration(stack, 'MySecurityConfiguration', { See [documentation](https://docs.aws.amazon.com/glue/latest/dg/encryption-security-configuration.html) for more info for Glue encrypting data written by Crawlers, Jobs, and Development Endpoints. +## Database + +A `Database` is a logical grouping of `Tables` in the Glue Catalog. 
+ +```ts +new glue.Database(stack, 'MyDatabase', { + databaseName: 'my_database' +}); +``` ## Table From 8884ad777eab38c7c909f0f0928d4e5706983f3a Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 17:45:29 +0100 Subject: [PATCH 37/50] increase test coverage to 100% for the new files --- packages/@aws-cdk/aws-glue/test/job-executable.test.ts | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100644 packages/@aws-cdk/aws-glue/test/job-executable.test.ts diff --git a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts new file mode 100644 index 0000000000000..e69de29bb2d1d From 13b03e86ec6a1c43c6c8b73a0e86d551516e5125 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 1 Sep 2021 17:51:32 +0100 Subject: [PATCH 38/50] increase test coverage to 100% --- .../aws-glue/test/job-executable.test.ts | 72 +++++++++++++++++++ packages/@aws-cdk/aws-glue/test/job.test.ts | 16 +++++ 2 files changed, 88 insertions(+) diff --git a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts index e69de29bb2d1d..226227a2ffd3b 100644 --- a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts @@ -0,0 +1,72 @@ +import * as s3 from '@aws-cdk/aws-s3'; +import * as cdk from '@aws-cdk/core'; +import * as glue from '../lib'; + +describe('JobExecutable', () => { + let stack: cdk.Stack; + let bucket: s3.IBucket; + let script: glue.Code; + + beforeEach(() => { + stack = new cdk.Stack(); + bucket = s3.Bucket.fromBucketName(stack, 'Bucket', 'bucketName'); + script = glue.Code.fromBucket(bucket, 'script.py'); + }); + + describe('.of()', () => { + test('with valid config', () => { + expect(glue.JobExecutable.of({ + glueVersion: glue.GlueVersion.V2_0, + type: glue.JobType.PYTHON_SHELL, + language: glue.JobLanguage.PYTHON, + pythonVersion: glue.PythonVersion.THREE, + script, + })).toBeDefined(); + }); 
+ + test('python shell job with a language other than python throws', () => { + expect(() => glue.JobExecutable.of({ + glueVersion: glue.GlueVersion.V3_0, + type: glue.JobType.PYTHON_SHELL, + language: glue.JobLanguage.SCALA, + script, + })).toThrow(/Python shell requires the language to be set to Python/); + }); + + [glue.GlueVersion.V0_9, glue.GlueVersion.V1_0].forEach((glueVersion) => { + test(`python shell with glue version ${glueVersion} throws`, () => { + expect(() => glue.JobExecutable.of({ + type: glue.JobType.PYTHON_SHELL, + language: glue.JobLanguage.PYTHON, + pythonVersion: glue.PythonVersion.TWO, + script, + glueVersion, + })).toThrow(`Specified GlueVersion ${glueVersion.name} does not support Python Shell`); + }); + + test(`extraJarsFirst with glue version ${glueVersion} throws`, () => { + expect(() => glue.JobExecutable.of({ + type: glue.JobType.ETL, + language: glue.JobLanguage.PYTHON, + pythonVersion: glue.PythonVersion.TWO, + extraJarsFirst: true, + script, + glueVersion, + })).toThrow(`Specified GlueVersion ${glueVersion.name} does not support extraJarsFirst`); + }); + }); + + [glue.GlueVersion.V2_0, glue.GlueVersion.V3_0].forEach((glueVersion) => { + test(`PythonVersion.TWO with glue version ${glueVersion} throws`, () => { + expect(() => glue.JobExecutable.of({ + type: glue.JobType.PYTHON_SHELL, + language: glue.JobLanguage.PYTHON, + pythonVersion: glue.PythonVersion.TWO, + script, + glueVersion, + })).toThrow(`Specified GlueVersion ${glueVersion.name} does not support PythonVersion 2`); + }); + }); + + }); +}); \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 4d5ed366bb0de..6e459b3ad0790 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -568,6 +568,22 @@ describe('Job', () => { }); }); + test('with reserved args throws', () => { + ['--conf', '--debug', '--mode', '--JOB_NAME'].forEach((arg, index) => 
{ + const defaultArguments: {[key: string]: string} = {}; + defaultArguments[arg] = 'random value'; + + expect(() => new glue.Job(stack, `Job${index}`, { + executable: glue.JobExecutable.scalaEtl({ + glueVersion: glue.GlueVersion.V2_0, + className, + script, + }), + defaultArguments, + })).toThrow(/argument is reserved by Glue/); + }); + }); + describe('python shell job', () => { test('with minimal props should synthesize correctly', () => { From bc82d60c2239f2fd6b219be88bdcf62f118c7f18 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Fri, 3 Sep 2021 15:21:04 +0100 Subject: [PATCH 39/50] tweak tests --- .../aws-glue/test/job-executable.test.ts | 43 +++++++++++++--- packages/@aws-cdk/aws-glue/test/job.test.ts | 50 ++++--------------- 2 files changed, 46 insertions(+), 47 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts index 226227a2ffd3b..aca9876bfa4b7 100644 --- a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts @@ -2,6 +2,38 @@ import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import * as glue from '../lib'; +describe('GlueVersion', () => { + test('.V0_9 should set the name correctly', () => expect(glue.GlueVersion.V0_9.name).toEqual('0.9')); + + test('.V1_0 should set the name correctly', () => expect(glue.GlueVersion.V1_0.name).toEqual('1.0')); + + test('.V2_0 should set the name correctly', () => expect(glue.GlueVersion.V2_0.name).toEqual('2.0')); + + test('.V3_0 should set the name correctly', () => expect(glue.GlueVersion.V3_0.name).toEqual('3.0')); + + test('of(customVersion) should set the name correctly', () => expect(glue.GlueVersion.of('CustomVersion').name).toEqual('CustomVersion')); +}); + +describe('WorkerType', () => { + test('.STANDARD should set the name correctly', () => expect(glue.WorkerType.STANDARD.name).toEqual('Standard')); + + test('.G_1X should set the name 
correctly', () => expect(glue.WorkerType.G_1X.name).toEqual('G.1X')); + + test('.G_2X should set the name correctly', () => expect(glue.WorkerType.G_2X.name).toEqual('G.2X')); + + test('of(customType) should set name correctly', () => expect(glue.WorkerType.of('CustomType').name).toEqual('CustomType')); +}); + +describe('JobType', () => { + test('.ETL should set the name correctly', () => expect(glue.JobType.ETL.name).toEqual('glueetl')); + + test('.STREAMING should set the name correctly', () => expect(glue.JobType.STREAMING.name).toEqual('gluestreaming')); + + test('.PYTHON_SHELL should set the name correctly', () => expect(glue.JobType.PYTHON_SHELL.name).toEqual('pythonshell')); + + test('of(customName) should set the name correctly', () => expect(glue.JobType.of('CustomName').name).toEqual('CustomName')); +}); + describe('JobExecutable', () => { let stack: cdk.Stack; let bucket: s3.IBucket; @@ -14,7 +46,7 @@ describe('JobExecutable', () => { }); describe('.of()', () => { - test('with valid config', () => { + test('with valid config should succeed', () => { expect(glue.JobExecutable.of({ glueVersion: glue.GlueVersion.V2_0, type: glue.JobType.PYTHON_SHELL, @@ -24,7 +56,7 @@ describe('JobExecutable', () => { })).toBeDefined(); }); - test('python shell job with a language other than python throws', () => { + test('with JobType.PYTHON_SHELL and a language other than JobLanguage.PYTHON', () => { expect(() => glue.JobExecutable.of({ glueVersion: glue.GlueVersion.V3_0, type: glue.JobType.PYTHON_SHELL, @@ -34,7 +66,7 @@ describe('JobExecutable', () => { }); [glue.GlueVersion.V0_9, glue.GlueVersion.V1_0].forEach((glueVersion) => { - test(`python shell with glue version ${glueVersion} throws`, () => { + test(`with JobType.PYTHON_SHELL and GlueVersion ${glueVersion} should throw`, () => { expect(() => glue.JobExecutable.of({ type: glue.JobType.PYTHON_SHELL, language: glue.JobLanguage.PYTHON, @@ -44,7 +76,7 @@ describe('JobExecutable', () => { })).toThrow(`Specified 
GlueVersion ${glueVersion.name} does not support Python Shell`); }); - test(`extraJarsFirst with glue version ${glueVersion} throws`, () => { + test(`with extraJarsFirst set and GlueVersion ${glueVersion} should throw`, () => { expect(() => glue.JobExecutable.of({ type: glue.JobType.ETL, language: glue.JobLanguage.PYTHON, @@ -57,7 +89,7 @@ describe('JobExecutable', () => { }); [glue.GlueVersion.V2_0, glue.GlueVersion.V3_0].forEach((glueVersion) => { - test(`PythonVersion.TWO with glue version ${glueVersion} throws`, () => { + test(`with PythonVersion.TWO and GlueVersion ${glueVersion} should throw`, () => { expect(() => glue.JobExecutable.of({ type: glue.JobType.PYTHON_SHELL, language: glue.JobLanguage.PYTHON, @@ -67,6 +99,5 @@ describe('JobExecutable', () => { })).toThrow(`Specified GlueVersion ${glueVersion.name} does not support PythonVersion 2`); }); }); - }); }); \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 6e459b3ad0790..203f9c8109b23 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -7,38 +7,6 @@ import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import * as glue from '../lib'; -describe('GlueVersion', () => { - test('.V0_9', () => expect(glue.GlueVersion.V0_9.name).toEqual('0.9')); - - test('.V1_0', () => expect(glue.GlueVersion.V1_0.name).toEqual('1.0')); - - test('.V2_0', () => expect(glue.GlueVersion.V2_0.name).toEqual('2.0')); - - test('.V3_0', () => expect(glue.GlueVersion.V3_0.name).toEqual('3.0')); - - test('of(customVersion) sets name correctly', () => expect(glue.GlueVersion.of('CustomVersion').name).toEqual('CustomVersion')); -}); - -describe('WorkerType', () => { - test('.STANDARD', () => expect(glue.WorkerType.STANDARD.name).toEqual('Standard')); - - test('.G_1X', () => expect(glue.WorkerType.G_1X.name).toEqual('G.1X')); - - test('.G_2X', () => 
expect(glue.WorkerType.G_2X.name).toEqual('G.2X')); - - test('of(customType) sets name correctly', () => expect(glue.WorkerType.of('CustomType').name).toEqual('CustomType')); -}); - -describe('JobType', () => { - test('.ETL', () => expect(glue.JobType.ETL.name).toEqual('glueetl')); - - test('.STREAMING', () => expect(glue.JobType.STREAMING.name).toEqual('gluestreaming')); - - test('.PYTHON_SHELL', () => expect(glue.JobType.PYTHON_SHELL.name).toEqual('pythonshell')); - - test('of(customName) sets name correctly', () => expect(glue.JobType.of('CustomName').name).toEqual('CustomName')); -}); - describe('Job', () => { let stack: cdk.Stack; let jobName: string; @@ -568,7 +536,7 @@ describe('Job', () => { }); }); - test('with reserved args throws', () => { + test('with reserved args should throw', () => { ['--conf', '--debug', '--mode', '--JOB_NAME'].forEach((arg, index) => { const defaultArguments: {[key: string]: string} = {}; defaultArguments[arg] = 'random value'; @@ -605,7 +573,7 @@ describe('Job', () => { }); }); - test('with unsupported glue version throws', () => { + test('with unsupported glue version should throw', () => { expect(() => new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V0_9, @@ -615,7 +583,7 @@ describe('Job', () => { })).toThrow('Specified GlueVersion 0.9 does not support Python Shell'); }); - test('with unsupported Spark UI prop throws', () => { + test('with unsupported Spark UI prop should throw', () => { expect(() => new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, @@ -797,7 +765,7 @@ describe('Job', () => { }); }); - test('.onEvent() creates the expected event rule', () => { + test('.onEvent() should create the expected event rule', () => { job.onEvent('eventId', {}); Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { @@ -944,7 +912,7 @@ describe('Job', () => { }); }); - test('.metricSuccess() creates the 
expected singleton event rule and corresponding metric', () => { + test('.metricSuccess() should create the expected singleton event rule and corresponding metric', () => { const metric = job.metricSuccess(); job.metricSuccess(); @@ -994,7 +962,7 @@ describe('Job', () => { }); }); - test('.metricFailure() creates the expected singleton event rule and corresponding metric', () => { + test('.metricFailure() should create the expected singleton event rule and corresponding metric', () => { const metric = job.metricFailure(); job.metricFailure(); @@ -1044,7 +1012,7 @@ describe('Job', () => { }); }); - test('.metricTimeout() creates the expected singleton event rule and corresponding metric', () => { + test('.metricTimeout() should create the expected singleton event rule and corresponding metric', () => { const metric = job.metricTimeout(); job.metricTimeout(); @@ -1097,7 +1065,7 @@ describe('Job', () => { describe('.metric()', () => { - test('to create a count sum metric', () => { + test('with MetricType.COUNT should create a count sum metric', () => { const metricName = 'glue.driver.aggregate.bytesRead'; const props = { statistic: cloudwatch.Statistic.SUM }; @@ -1113,7 +1081,7 @@ describe('Job', () => { })); }); - test('to create a gauge average metric', () => { + test('with MetricType.GAUGE should create a gauge average metric', () => { const metricName = 'glue.driver.BlockManager.disk.diskSpaceUsed_MB'; const props = { statistic: cloudwatch.Statistic.AVERAGE }; From 70b3e24233e902e664df65d608707e26ac561710 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Fri, 3 Sep 2021 15:32:11 +0100 Subject: [PATCH 40/50] tweak tests #2 --- packages/@aws-cdk/aws-glue/test/job-executable.test.ts | 10 ---------- packages/@aws-cdk/aws-glue/test/job.test.ts | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts index aca9876bfa4b7..f08413403ee4f 
100644 --- a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts @@ -14,16 +14,6 @@ describe('GlueVersion', () => { test('of(customVersion) should set the name correctly', () => expect(glue.GlueVersion.of('CustomVersion').name).toEqual('CustomVersion')); }); -describe('WorkerType', () => { - test('.STANDARD should set the name correctly', () => expect(glue.WorkerType.STANDARD.name).toEqual('Standard')); - - test('.G_1X should set the name correctly', () => expect(glue.WorkerType.G_1X.name).toEqual('G.1X')); - - test('.G_2X should set the name correctly', () => expect(glue.WorkerType.G_2X.name).toEqual('G.2X')); - - test('of(customType) should set name correctly', () => expect(glue.WorkerType.of('CustomType').name).toEqual('CustomType')); -}); - describe('JobType', () => { test('.ETL should set the name correctly', () => expect(glue.JobType.ETL.name).toEqual('glueetl')); diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 203f9c8109b23..d1a50d3fcfeb9 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -7,6 +7,16 @@ import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import * as glue from '../lib'; +describe('WorkerType', () => { + test('.STANDARD should set the name correctly', () => expect(glue.WorkerType.STANDARD.name).toEqual('Standard')); + + test('.G_1X should set the name correctly', () => expect(glue.WorkerType.G_1X.name).toEqual('G.1X')); + + test('.G_2X should set the name correctly', () => expect(glue.WorkerType.G_2X.name).toEqual('G.2X')); + + test('of(customType) should set name correctly', () => expect(glue.WorkerType.of('CustomType').name).toEqual('CustomType')); +}); + describe('Job', () => { let stack: cdk.Stack; let jobName: string; From 32ba2ae6140aa4fb268e797f6ad0b1b518f99cd7 Mon Sep 17 00:00:00 2001 From: Ben Chaimberg Date: Tue, 7 Sep 2021 
20:59:25 -0400 Subject: [PATCH 41/50] remove role from IJob --- packages/@aws-cdk/aws-glue/lib/job.ts | 10 +--------- packages/@aws-cdk/aws-glue/test/job.test.ts | 5 +---- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index ad85f80657755..30c325f31ae80 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -125,12 +125,6 @@ export interface IJob extends cdk.IResource, iam.IGrantable { */ readonly jobArn: string; - /** - * The IAM role assumed by Glue to run this job. - * @attribute - */ - readonly role?: iam.IRole; - /** * Defines a CloudWatch event rule triggered when something happens with this job. * @@ -197,7 +191,6 @@ abstract class JobBase extends cdk.Resource implements IJob { public abstract readonly jobArn: string; public abstract readonly jobName: string; - public abstract readonly role?: iam.IRole; public abstract readonly grantPrincipal: iam.IPrincipal; /** @@ -604,7 +597,6 @@ export class Job extends JobBase { class Import extends JobBase { public readonly jobName = attrs.jobName; public readonly jobArn = jobArn(scope, attrs.jobName); - public readonly role = attrs.role; public readonly grantPrincipal = attrs.role ?? new iam.UnknownPrincipal({ resource: this }); } @@ -624,7 +616,7 @@ export class Job extends JobBase { /** * The IAM role Glue assumes to run this job. */ - public readonly role?: iam.IRole; + public readonly role: iam.IRole; /** * The principal this Glue Job is running as. 
diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index d1a50d3fcfeb9..7f041ebf2109a 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -36,7 +36,6 @@ describe('Job', () => { resource: 'job', resourceName: jobName, })); - expect(job.role).toBeUndefined(); expect(job.grantPrincipal).toEqual(new iam.UnknownPrincipal({ resource: job })); }); @@ -50,7 +49,6 @@ describe('Job', () => { resource: 'job', resourceName: jobName, })); - expect(job.role).toEqual(role); expect(job.grantPrincipal).toEqual(role); }); }); @@ -87,7 +85,6 @@ describe('Job', () => { test('should create a role and use it with the job', () => { // check the role - expect(job.role).toBeDefined(); Template.fromStack(stack).hasResourceProperties('AWS::IAM::Role', { AssumeRolePolicyDocument: { Statement: [ @@ -154,7 +151,7 @@ describe('Job', () => { role, }); - expect(job.role).toEqual(role); + expect(job.grantPrincipal).toEqual(role); Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Role: role.roleArn, }); From babd3ec0b383795f7c807db5e415118dbf0bde96 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 12:25:16 +0100 Subject: [PATCH 42/50] address some comments - update README - use const instead of let and beforeEach where relevant - assert on stack assets metadata - add test case of 2 jobs reusing asset - other minor things --- packages/@aws-cdk/aws-glue/README.md | 4 +- packages/@aws-cdk/aws-glue/test/code.test.ts | 113 ++++++++++++++---- .../aws-glue/test/job-executable.test.ts | 2 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 22 +--- 4 files changed, 102 insertions(+), 39 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/README.md b/packages/@aws-cdk/aws-glue/README.md index 4fdadea2fd796..7fa9f07c6d65e 100644 --- a/packages/@aws-cdk/aws-glue/README.md +++ b/packages/@aws-cdk/aws-glue/README.md @@ -25,7 +25,7 @@ This module is part of the [AWS Cloud 
Development Kit](https://github.com/aws/aw ## Job -A `Job` encapsulates a script that connects to a data source, processes it, and then writes output to a data target. +A `Job` encapsulates a script that connects to data sources, processes them, and then writes output to a data target. There are 3 types of jobs supported by AWS Glue: Spark ETL, Spark Streaming, and Python Shell jobs. @@ -77,7 +77,7 @@ This can be used to schedule and run tasks that don't require an Apache Spark en new glue.Job(stack, 'PythonShellJob', { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, - pythonVersion: PythonVersion.TWO, + pythonVersion: PythonVersion.THREE, script: glue.Code.fromBucket(bucket, 'script.py'), }), description: 'an example Python Shell job', diff --git a/packages/@aws-cdk/aws-glue/test/code.test.ts b/packages/@aws-cdk/aws-glue/test/code.test.ts index 01732dd6538e7..9e979dfd638ed 100644 --- a/packages/@aws-cdk/aws-glue/test/code.test.ts +++ b/packages/@aws-cdk/aws-glue/test/code.test.ts @@ -1,7 +1,8 @@ import * as path from 'path'; +import { Template } from '@aws-cdk/assertions'; import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; -import { Code } from '../lib'; +import * as glue from '../lib'; describe('Code', () => { let stack: cdk.Stack; @@ -11,16 +12,12 @@ describe('Code', () => { }); describe('.fromBucket()', () => { + const key = 'script'; let bucket: s3.IBucket; - let key: string; - - beforeEach(() => { - bucket = s3.Bucket.fromBucketName(stack, 'Bucket', 'bucketName'); - key = 'script'; - }); test('with valid bucket name and key and calling bind() returns correct s3 location', () => { - expect(Code.fromBucket(bucket, key).bind(stack)).toEqual({ + bucket = s3.Bucket.fromBucketName(stack, 'Bucket', 'bucketName'); + expect(glue.Code.fromBucket(bucket, key).bind(stack)).toEqual({ s3Location: { bucketName: 'bucketName', objectKey: 'script', @@ -30,28 +27,104 @@ describe('Code', () => { }); 
describe('.fromAsset()', () => { - let filePath: string; - let directoryPath: string; + const filePath = path.join(__dirname, 'job-script/hello_world.py'); + const directoryPath = path.join(__dirname, 'job-script'); - beforeEach(() => { - filePath = path.join(__dirname, 'job-script/hello_world.py'); - directoryPath = path.join(__dirname, 'job-script'); - }); - - test('with valid and existing file path and calling bind() returns an s3 location', () => { - const codeConfig = Code.fromAsset(filePath).bind(stack); + test('with valid and existing file path and calling bind() returns an s3 location and sets metadata', () => { + const codeConfig = glue.Code.fromAsset(filePath).bind(stack); expect(codeConfig.s3Location.bucketName).toBeDefined(); expect(codeConfig.s3Location.objectKey).toBeDefined(); + expect(stack.node.metadata.find(m => m.type === 'aws:cdk:asset')).toBeDefined(); }); test('with an unsupported directory path throws', () => { - expect(() => Code.fromAsset(directoryPath)) + expect(() => glue.Code.fromAsset(directoryPath)) .toThrow(/Only files are supported/); }); - test('throws if bound with another stack', () => { + test('used in more than 1 job in the same stack should be reused', () => { + const script = glue.Code.fromAsset(filePath); + new glue.Job(stack, 'Job1', { + executable: glue.JobExecutable.pythonShell({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.THREE, + script, + }), + }); + new glue.Job(stack, 'Job2', { + executable: glue.JobExecutable.pythonShell({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.THREE, + script, + }), + }); + const ScriptLocation = { + 'Fn::Join': [ + '', + [ + 's3://', + { + Ref: 'AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8', + }, + '/', + { + 'Fn::Select': [ + 0, + { + 'Fn::Split': [ + '||', + { + Ref: 'AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377', + }, + ], + }, 
+ ], + }, + { + 'Fn::Select': [ + 1, + { + 'Fn::Split': [ + '||', + { + Ref: 'AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377', + }, + ], + }, + ], + }, + ], + ], + }; + expect(stack.node.metadata.find(m => m.type === 'aws:cdk:asset')).toBeDefined(); + // Job1 and Job2 use reuse the asset + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + Command: { + ScriptLocation, + }, + Role: { + 'Fn::GetAtt': [ + 'Job1ServiceRole7AF34CCA', + 'Arn', + ], + }, + }); + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + Command: { + ScriptLocation, + }, + Role: { + 'Fn::GetAtt': [ + 'Job2ServiceRole5D2B98FE', + 'Arn', + ], + }, + }); + }); + + test('throws if used in more than 1 stack', () => { const stack2 = new cdk.Stack(); - const asset = Code.fromAsset(filePath); + const asset = glue.Code.fromAsset(filePath); asset.bind(stack); expect(() => asset.bind(stack2)) diff --git a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts index f08413403ee4f..36d8efb32b838 100644 --- a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts @@ -46,7 +46,7 @@ describe('JobExecutable', () => { })).toBeDefined(); }); - test('with JobType.PYTHON_SHELL and a language other than JobLanguage.PYTHON', () => { + test('with JobType.PYTHON_SHELL and a language other than JobLanguage.PYTHON should throw', () => { expect(() => glue.JobExecutable.of({ glueVersion: glue.GlueVersion.V3_0, type: glue.JobType.PYTHON_SHELL, diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 7f041ebf2109a..007edd0dc6ff7 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -18,12 +18,11 @@ describe('WorkerType', () => { }); describe('Job', () => { + const jobName = 'test-job'; let stack: cdk.Stack; - let 
jobName: string; beforeEach(() => { stack = new cdk.Stack(); - jobName = 'test-job'; }); describe('.fromJobAttributes()', () => { @@ -55,18 +54,17 @@ describe('Job', () => { describe('new', () => { + const className = 'com.amazon.test.ClassName'; let codeBucket: s3.IBucket; let script: glue.Code; let extraJars: glue.Code[]; let extraFiles: glue.Code[]; let extraPythonFiles: glue.Code[]; - let className: string; let job: glue.Job; beforeEach(() => { codeBucket = s3.Bucket.fromBucketName(stack, 'CodeBucket', 'bucketName'); script = glue.Code.fromBucket(codeBucket, 'script'); - className = 'com.amazon.test.ClassName'; extraJars = [glue.Code.fromBucket(codeBucket, 'file1.jar'), glue.Code.fromBucket(codeBucket, 'file2.jar')]; extraPythonFiles = [glue.Code.fromBucket(codeBucket, 'file1.py'), glue.Code.fromBucket(codeBucket, 'file2.py')]; extraFiles = [glue.Code.fromBucket(codeBucket, 'file1.txt'), glue.Code.fromBucket(codeBucket, 'file2.txt')]; @@ -84,7 +82,6 @@ describe('Job', () => { }); test('should create a role and use it with the job', () => { - // check the role Template.fromStack(stack).hasResourceProperties('AWS::IAM::Role', { AssumeRolePolicyDocument: { Statement: [ @@ -132,11 +129,7 @@ describe('Job', () => { test('should return correct jobName and jobArn from CloudFormation', () => { expect(stack.resolve(job.jobName)).toEqual({ Ref: 'JobB9D00F9F' }); expect(stack.resolve(job.jobArn)).toEqual({ - 'Fn::Join': ['', [ - 'arn:', { Ref: 'AWS::Partition' }, - ':glue:', { Ref: 'AWS::Region' }, ':', - { Ref: 'AWS::AccountId' }, ':job/', { Ref: 'JobB9D00F9F' }, - ]], + 'Fn::Join': ['', ['arn:', { Ref: 'AWS::Partition' }, ':glue:', { Ref: 'AWS::Region' }, ':', { Ref: 'AWS::AccountId' }, ':job/', { Ref: 'JobB9D00F9F' }]], }); }); @@ -356,11 +349,10 @@ describe('Job', () => { }); describe('enabling spark ui with bucket provided', () => { - let bucketName: string; + const bucketName = 'BucketName'; let bucket: s3.IBucket; beforeEach(() => { - bucketName = 'BucketName'; 
bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); job = new glue.Job(stack, 'Job', { executable: glue.JobExecutable.scalaEtl({ @@ -437,14 +429,12 @@ describe('Job', () => { }); describe('enabling spark ui with bucket and path provided', () => { - let bucketName: string; + const bucketName = 'BucketName'; + const prefix = 'some/path/'; let bucket: s3.IBucket; - let prefix: string; beforeEach(() => { - bucketName = 'BucketName'; bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); - prefix = 'some/path/'; job = new glue.Job(stack, 'Job', { executable: glue.JobExecutable.scalaEtl({ glueVersion: glue.GlueVersion.V2_0, From b62a8685ae4317ab6e2a558df81d2b971435ab9e Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 13:55:20 +0100 Subject: [PATCH 43/50] simplify job.test.ts --- packages/@aws-cdk/aws-glue/lib/job.ts | 4 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 308 ++++++-------------- 2 files changed, 87 insertions(+), 225 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 30c325f31ae80..f84acc449aaea 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -5,7 +5,7 @@ import * as logs from '@aws-cdk/aws-logs'; import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; -import { Code, JobExecutable, JobExecutableConfig, JobType } from '.'; +import { Code, JobExecutable, JobExecutableConfig, JobLanguage, JobType } from '.'; import { IConnection } from './connection'; import { CfnJob } from './glue.generated'; import { ISecurityConfiguration } from './security-configuration'; @@ -711,7 +711,7 @@ export class Job extends JobBase { if (config.extraJars && config.extraJars?.length > 0) { args['--extra-jars'] = config.extraJars.map(code => this.codeS3ObjectUrl(code)).join(','); } - if (config.extraPythonFiles && config.extraPythonFiles.length > 0) { + if 
(JobLanguage.PYTHON === config.language && config.extraPythonFiles && config.extraPythonFiles.length > 0) { args['--extra-py-files'] = config.extraPythonFiles.map(code => this.codeS3ObjectUrl(code)).join(','); } if (config.extraFiles && config.extraFiles.length > 0) { diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 007edd0dc6ff7..a742b9c195162 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -61,6 +61,7 @@ describe('Job', () => { let extraFiles: glue.Code[]; let extraPythonFiles: glue.Code[]; let job: glue.Job; + let defaultProps: glue.JobProps; beforeEach(() => { codeBucket = s3.Bucket.fromBucketName(stack, 'CodeBucket', 'bucketName'); @@ -68,17 +69,18 @@ describe('Job', () => { extraJars = [glue.Code.fromBucket(codeBucket, 'file1.jar'), glue.Code.fromBucket(codeBucket, 'file2.jar')]; extraPythonFiles = [glue.Code.fromBucket(codeBucket, 'file1.py'), glue.Code.fromBucket(codeBucket, 'file2.py')]; extraFiles = [glue.Code.fromBucket(codeBucket, 'file1.txt'), glue.Code.fromBucket(codeBucket, 'file2.txt')]; + defaultProps = { + executable: glue.JobExecutable.scalaEtl({ + glueVersion: glue.GlueVersion.V2_0, + className, + script, + }), + }; }); describe('with necessary props only', () => { beforeEach(() => { - job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scalaEtl({ - glueVersion: glue.GlueVersion.V2_0, - className, - script, - }), - }); + job = new glue.Job(stack, 'Job', defaultProps); }); test('should create a role and use it with the job', () => { @@ -136,11 +138,7 @@ describe('Job', () => { test('with a custom role should use it and set it in CloudFormation', () => { const role = iam.Role.fromRoleArn(stack, 'Role', 'arn:aws:iam::123456789012:role/TestRole'); job = new glue.Job(stack, 'JobWithRole', { - executable: glue.JobExecutable.pythonEtl({ - glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.THREE, 
- script, - }), + ...defaultProps, role, }); @@ -152,11 +150,7 @@ describe('Job', () => { test('with a custom jobName should set it in CloudFormation', () => { job = new glue.Job(stack, 'JobWithName', { - executable: glue.JobExecutable.scalaStreaming({ - glueVersion: glue.GlueVersion.V2_0, - className, - script, - }), + ...defaultProps, jobName, }); @@ -169,11 +163,7 @@ describe('Job', () => { describe('enabling continuous logging with defaults', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scalaEtl({ - glueVersion: glue.GlueVersion.V2_0, - className, - script, - }), + ...defaultProps, continuousLogging: { enabled: true }, }); }); @@ -194,11 +184,7 @@ describe('Job', () => { beforeEach(() => { logGroup = logs.LogGroup.fromLogGroupName(stack, 'LogGroup', 'LogGroupName'); job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scalaEtl({ - glueVersion: glue.GlueVersion.V2_0, - className, - script, - }), + ...defaultProps, continuousLogging: { enabled: true, quiet: false, @@ -266,11 +252,7 @@ describe('Job', () => { describe('enabling spark ui but no bucket or path provided', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scalaEtl({ - glueVersion: glue.GlueVersion.V2_0, - className, - script, - }), + ...defaultProps, sparkUI: { enabled: true }, }); }); @@ -355,11 +337,7 @@ describe('Job', () => { beforeEach(() => { bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scalaEtl({ - glueVersion: glue.GlueVersion.V2_0, - className, - script, - }), + ...defaultProps, sparkUI: { enabled: true, bucket, @@ -436,11 +414,7 @@ describe('Job', () => { beforeEach(() => { bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); job = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scalaEtl({ - glueVersion: glue.GlueVersion.V2_0, - className, - script, - }), + 
...defaultProps, sparkUI: { enabled: true, bucket, @@ -462,13 +436,9 @@ describe('Job', () => { describe('with extended props', () => { beforeEach(() => { job = new glue.Job(stack, 'Job', { + ...defaultProps, jobName, description: 'test job', - executable: glue.JobExecutable.pythonStreaming({ - glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.THREE, - script, - }), workerType: glue.WorkerType.G_2X, workerCount: 10, maxConcurrentRuns: 2, @@ -491,9 +461,8 @@ describe('Job', () => { test('should synthesize correctly', () => { Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Command: { - Name: 'gluestreaming', + Name: 'glueetl', ScriptLocation: 's3://bucketName/script', - PythonVersion: '3', }, Role: { 'Fn::GetAtt': [ @@ -502,7 +471,8 @@ describe('Job', () => { ], }, DefaultArguments: { - '--job-language': 'python', + '--job-language': 'scala', + '--class': 'com.amazon.test.ClassName', '--enable-metrics': '', 'arg1': 'value1', 'arg2': 'value2', @@ -549,27 +519,7 @@ describe('Job', () => { }); }); - describe('python shell job', () => { - - test('with minimal props should synthesize correctly', () => { - new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.THREE, - script, - }), - }); - - Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - Command: { - Name: 'pythonshell', - ScriptLocation: 's3://bucketName/script', - PythonVersion: '3', - }, - GlueVersion: '2.0', - }); - }); - + describe('shell job', () => { test('with unsupported glue version should throw', () => { expect(() => new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonShell({ @@ -590,164 +540,76 @@ describe('Job', () => { sparkUI: { enabled: true }, })).toThrow('Spark UI is not available for JobType.PYTHON_SHELL'); }); - - test('with all props should synthesize correctly', () => { - new glue.Job(stack, 'Job', { - executable: 
glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.THREE, - script, - extraPythonFiles, - extraFiles, - }), - }); - - Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - Command: { - Name: 'pythonshell', - ScriptLocation: 's3://bucketName/script', - PythonVersion: '3', - }, - GlueVersion: '2.0', - DefaultArguments: { - '--job-language': 'python', - '--extra-py-files': 's3://bucketName/file1.py,s3://bucketName/file2.py', - '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', - }, - }); - }); }); - describe('python etl job', () => { - - test('with minimal props should synthesize correctly', () => { - new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.pythonEtl({ - glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.THREE, - script, - }), - }); - - // check the job using the role - Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - GlueVersion: '2.0', - Command: { - Name: 'glueetl', - ScriptLocation: 's3://bucketName/script', - PythonVersion: '3', - }, - Role: { - 'Fn::GetAtt': [ - 'JobServiceRole4F432993', - 'Arn', - ], - }, - DefaultArguments: { - '--job-language': 'python', - }, - }); - }); - - test('with all props should synthesize correctly', () => { - new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.pythonEtl({ - glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.THREE, - extraJarsFirst: true, - script, - extraPythonFiles, - extraJars, - extraFiles, - }), - }); - Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - GlueVersion: '2.0', - Command: { - Name: 'glueetl', - ScriptLocation: 's3://bucketName/script', - PythonVersion: '3', - }, - Role: { - 'Fn::GetAtt': [ - 'JobServiceRole4F432993', - 'Arn', - ], - }, - DefaultArguments: { - '--job-language': 'python', - '--extra-jars': 's3://bucketName/file1.jar,s3://bucketName/file2.jar', - '--extra-py-files': 
's3://bucketName/file1.py,s3://bucketName/file2.py', - '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', - '--user-jars-first': 'true', - }, - }); + test('etl job with all props should synthesize correctly', () => { + new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.pythonEtl({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.THREE, + extraJarsFirst: true, + script, + extraPythonFiles, + extraJars, + extraFiles, + }), + }); + + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + GlueVersion: '2.0', + Command: { + Name: 'glueetl', + ScriptLocation: 's3://bucketName/script', + PythonVersion: '3', + }, + Role: { + 'Fn::GetAtt': [ + 'JobServiceRole4F432993', + 'Arn', + ], + }, + DefaultArguments: { + '--job-language': 'python', + '--extra-jars': 's3://bucketName/file1.jar,s3://bucketName/file2.jar', + '--extra-py-files': 's3://bucketName/file1.py,s3://bucketName/file2.py', + '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', + '--user-jars-first': 'true', + }, }); }); - describe('scala streaming job', () => { - - test('with minimal props should synthesize correctly', () => { - new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scalaStreaming({ - glueVersion: glue.GlueVersion.V2_0, - script, - className, - }), - }); - - Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - GlueVersion: '2.0', - Command: { - Name: 'gluestreaming', - ScriptLocation: 's3://bucketName/script', - }, - Role: { - 'Fn::GetAtt': [ - 'JobServiceRole4F432993', - 'Arn', - ], - }, - DefaultArguments: { - '--job-language': 'scala', - '--class': 'com.amazon.test.ClassName', - }, - }); - }); - - test('with all props should synthesize correctly', () => { - new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.scalaStreaming({ - glueVersion: glue.GlueVersion.V2_0, - extraJarsFirst: true, - className, - script, - extraJars, - extraFiles, - }), - }); - - 
Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - GlueVersion: '2.0', - Command: { - Name: 'gluestreaming', - ScriptLocation: 's3://bucketName/script', - }, - Role: { - 'Fn::GetAtt': [ - 'JobServiceRole4F432993', - 'Arn', - ], - }, - DefaultArguments: { - '--job-language': 'scala', - '--class': 'com.amazon.test.ClassName', - '--extra-jars': 's3://bucketName/file1.jar,s3://bucketName/file2.jar', - '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', - '--user-jars-first': 'true', - }, - }); + test('streaming job with all props should synthesize correctly', () => { + new glue.Job(stack, 'Job', { + executable: glue.JobExecutable.scalaStreaming({ + glueVersion: glue.GlueVersion.V2_0, + extraJarsFirst: true, + className, + script, + extraJars, + extraFiles, + }), + }); + + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + GlueVersion: '2.0', + Command: { + Name: 'gluestreaming', + ScriptLocation: 's3://bucketName/script', + }, + Role: { + 'Fn::GetAtt': [ + 'JobServiceRole4F432993', + 'Arn', + ], + }, + DefaultArguments: { + '--job-language': 'scala', + '--class': 'com.amazon.test.ClassName', + '--extra-jars': 's3://bucketName/file1.jar,s3://bucketName/file2.jar', + '--extra-files': 's3://bucketName/file1.txt,s3://bucketName/file2.txt', + '--user-jars-first': 'true', + }, }); }); From 80c3f153bc93c305828f6d40d505c3067d538b45 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 14:21:26 +0100 Subject: [PATCH 44/50] simplify testing success/failure/timeout rules and metrics --- packages/@aws-cdk/aws-glue/test/job.test.ts | 226 +++----------------- 1 file changed, 27 insertions(+), 199 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index a742b9c195162..55e67fd194bbc 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -648,9 +648,13 @@ describe('Job', () => { }); }); - 
describe('.onSuccess()', () => { - test('should create a rule with correct properties', () => { - job.onSuccess('SuccessRule'); + [ + { name: 'onSuccess()', invoke: (testJob: glue.Job) => testJob.onSuccess('SuccessRule'), state: 'SUCCEEDED' }, + { name: 'onFailure()', invoke: (testJob: glue.Job) => testJob.onFailure('FailureRule'), state: 'FAILED' }, + { name: 'onTimeout()', invoke: (testJob: glue.Job) => testJob.onTimeout('TimeoutRule'), state: 'TIMEOUT' }, + ].forEach((testCase) => { + test(`${testCase.name} should create a rule with correct properties`, () => { + testCase.invoke(job); Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { Description: { @@ -661,7 +665,7 @@ describe('Job', () => { { Ref: 'JobB9D00F9F', }, - ' is in SUCCEEDED state', + ` is in ${testCase.state} state`, ], ], }, @@ -675,7 +679,7 @@ describe('Job', () => { ], 'detail': { state: [ - 'SUCCEEDED', + testCase.state, ], jobName: [ { @@ -689,51 +693,25 @@ describe('Job', () => { }); }); - describe('.onFailure()', () => { - test('should create a rule with correct properties', () => { - job.onFailure('FailureRule'); + [ + { name: '.metricSuccess()', invoke: (testJob: glue.Job) => testJob.metricSuccess(), state: 'SUCCEEDED', ruleId: 'SuccessMetricRule' }, + { name: '.metricFailure()', invoke: (testJob: glue.Job) => testJob.metricFailure(), state: 'FAILED', ruleId: 'FailureMetricRule' }, + { name: '.metricTimeout()', invoke: (testJob: glue.Job) => testJob.metricTimeout(), state: 'TIMEOUT', ruleId: 'TimeoutMetricRule' }, + ].forEach((testCase) => { + test(`${testCase.name} should create the expected singleton event rule and corresponding metric`, () => { + const metric = testCase.invoke(job); + testCase.invoke(job); - Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { - Description: { - 'Fn::Join': [ - '', - [ - 'Rule triggered when Glue job ', - { - Ref: 'JobB9D00F9F', - }, - ' is in FAILED state', - ], - ], - }, - EventPattern: { - 'source': [ - 
'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'FAILED', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, + expect(metric).toEqual(new cloudwatch.Metric({ + dimensions: { + RuleName: (job.node.findChild(testCase.ruleId) as events.Rule).ruleName, }, - State: 'ENABLED', - }); - }); - }); - - describe('.onTimeout()', () => { - test('should create a rule with correct properties', () => { - job.onTimeout('TimeoutRule'); + metricName: 'TriggeredRules', + namespace: 'AWS/Events', + statistic: 'Sum', + })); + Template.fromStack(stack).resourceCountIs('AWS::Events::Rule', 1); Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { Description: { 'Fn::Join': [ @@ -743,7 +721,7 @@ describe('Job', () => { { Ref: 'JobB9D00F9F', }, - ' is in TIMEOUT state', + ` is in ${testCase.state} state`, ], ], }, @@ -757,7 +735,7 @@ describe('Job', () => { ], 'detail': { state: [ - 'TIMEOUT', + testCase.state, ], jobName: [ { @@ -770,156 +748,6 @@ describe('Job', () => { }); }); }); - - test('.metricSuccess() should create the expected singleton event rule and corresponding metric', () => { - const metric = job.metricSuccess(); - job.metricSuccess(); - - expect(metric).toEqual(new cloudwatch.Metric({ - dimensions: { - RuleName: (job.node.findChild('SuccessMetricRule') as events.Rule).ruleName, - }, - metricName: 'TriggeredRules', - namespace: 'AWS/Events', - statistic: 'Sum', - })); - - Template.fromStack(stack).resourceCountIs('AWS::Events::Rule', 1); - Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { - Description: { - 'Fn::Join': [ - '', - [ - 'Rule triggered when Glue job ', - { - Ref: 'JobB9D00F9F', - }, - ' is in SUCCEEDED state', - ], - ], - }, - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'SUCCEEDED', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, 
- }, - State: 'ENABLED', - }); - }); - - test('.metricFailure() should create the expected singleton event rule and corresponding metric', () => { - const metric = job.metricFailure(); - job.metricFailure(); - - expect(metric).toEqual(new cloudwatch.Metric({ - dimensions: { - RuleName: (job.node.findChild('FailureMetricRule') as events.Rule).ruleName, - }, - metricName: 'TriggeredRules', - namespace: 'AWS/Events', - statistic: 'Sum', - })); - - Template.fromStack(stack).resourceCountIs('AWS::Events::Rule', 1); - Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { - Description: { - 'Fn::Join': [ - '', - [ - 'Rule triggered when Glue job ', - { - Ref: 'JobB9D00F9F', - }, - ' is in FAILED state', - ], - ], - }, - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'FAILED', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, - }, - State: 'ENABLED', - }); - }); - - test('.metricTimeout() should create the expected singleton event rule and corresponding metric', () => { - const metric = job.metricTimeout(); - job.metricTimeout(); - - expect(metric).toEqual(new cloudwatch.Metric({ - dimensions: { - RuleName: (job.node.findChild('TimeoutMetricRule') as events.Rule).ruleName, - }, - metricName: 'TriggeredRules', - namespace: 'AWS/Events', - statistic: 'Sum', - })); - - Template.fromStack(stack).resourceCountIs('AWS::Events::Rule', 1); - Template.fromStack(stack).hasResourceProperties('AWS::Events::Rule', { - Description: { - 'Fn::Join': [ - '', - [ - 'Rule triggered when Glue job ', - { - Ref: 'JobB9D00F9F', - }, - ' is in TIMEOUT state', - ], - ], - }, - EventPattern: { - 'source': [ - 'aws.glue', - ], - 'detail-type': [ - 'Glue Job State Change', - 'Glue Job Run Status', - ], - 'detail': { - state: [ - 'TIMEOUT', - ], - jobName: [ - { - Ref: 'JobB9D00F9F', - }, - ], - }, - }, - State: 'ENABLED', - }); - }); }); describe('.metric()', () => { From 
094929cccc6f576044dbc0fedfc04789b1265bee Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 14:31:56 +0100 Subject: [PATCH 45/50] better handling for extraPythonFiles with non-Python jobs --- packages/@aws-cdk/aws-glue/lib/job-executable.ts | 3 +++ packages/@aws-cdk/aws-glue/lib/job.ts | 4 ++-- .../@aws-cdk/aws-glue/test/job-executable.test.ts | 11 +++++++++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index 5d9dacf04386e..597ce5140cf8c 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -297,6 +297,9 @@ export class JobExecutable { if (config.pythonVersion === PythonVersion.TWO && ![GlueVersion.V0_9, GlueVersion.V1_0].includes(config.glueVersion)) { throw new Error(`Specified GlueVersion ${config.glueVersion.name} does not support PythonVersion ${config.pythonVersion}`); } + if (JobLanguage.PYTHON !== config.language && config.extraPythonFiles) { + throw new Error('extraPythonFiles is not supported for languages other than JobLanguage.PYTHON'); + } this.config = config; } diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index f84acc449aaea..30c325f31ae80 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -5,7 +5,7 @@ import * as logs from '@aws-cdk/aws-logs'; import * as s3 from '@aws-cdk/aws-s3'; import * as cdk from '@aws-cdk/core'; import * as constructs from 'constructs'; -import { Code, JobExecutable, JobExecutableConfig, JobLanguage, JobType } from '.'; +import { Code, JobExecutable, JobExecutableConfig, JobType } from '.'; import { IConnection } from './connection'; import { CfnJob } from './glue.generated'; import { ISecurityConfiguration } from './security-configuration'; @@ -711,7 +711,7 @@ export class Job extends JobBase { if (config.extraJars && 
config.extraJars?.length > 0) { args['--extra-jars'] = config.extraJars.map(code => this.codeS3ObjectUrl(code)).join(','); } - if (JobLanguage.PYTHON === config.language && config.extraPythonFiles && config.extraPythonFiles.length > 0) { + if (config.extraPythonFiles && config.extraPythonFiles.length > 0) { args['--extra-py-files'] = config.extraPythonFiles.map(code => this.codeS3ObjectUrl(code)).join(','); } if (config.extraFiles && config.extraFiles.length > 0) { diff --git a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts index 36d8efb32b838..e9352a96bcd63 100644 --- a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts @@ -55,6 +55,17 @@ describe('JobExecutable', () => { })).toThrow(/Python shell requires the language to be set to Python/); }); + test('with a non JobLanguage.PYTHON and extraPythonFiles set should throw', () => { + expect(() => glue.JobExecutable.of({ + glueVersion: glue.GlueVersion.V3_0, + type: glue.JobType.ETL, + language: glue.JobLanguage.SCALA, + className: 'com.Test', + extraPythonFiles: [script], + script, + })).toThrow(/extraPythonFiles is not supported for languages other than JobLanguage.PYTHON/); + }); + [glue.GlueVersion.V0_9, glue.GlueVersion.V1_0].forEach((glueVersion) => { test(`with JobType.PYTHON_SHELL and GlueVersion ${glueVersion} should throw`, () => { expect(() => glue.JobExecutable.of({ From f01c0be40a44c58d41c7bf35691b839790a74e25 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 15:17:19 +0100 Subject: [PATCH 46/50] update integ.job.ts --- .../aws-glue/test/integ.job.expected.json | 264 +++++++++--------- packages/@aws-cdk/aws-glue/test/integ.job.ts | 60 ++-- 2 files changed, 170 insertions(+), 154 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json index 92d724caabae5..25520e55b1715 100644 --- 
a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -1,6 +1,6 @@ { "Resources": { - "MinimalGlueEtlJobServiceRole60989380": { + "EtlJobServiceRole837F781B": { "Type": "AWS::IAM::Role", "Properties": { "AssumeRolePolicyDocument": { @@ -31,99 +31,65 @@ ] } }, - "MinimalGlueEtlJobF8C90254": { - "Type": "AWS::Glue::Job", + "EtlJobServiceRoleDefaultPolicy8BFE343B": { + "Type": "AWS::IAM::Policy", "Properties": { - "Command": { - "Name": "glueetl", - "PythonVersion": "3", - "ScriptLocation": { - "Fn::Join": [ - "", - [ - "s3://", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" - }, - "/", + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "s3:GetObject*", + "s3:GetBucket*", + "s3:List*", + "s3:DeleteObject*", + "s3:PutObject", + "s3:Abort*" + ], + "Effect": "Allow", + "Resource": [ { - "Fn::Select": [ - 0, - { - "Fn::Split": [ - "||", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" - } - ] - } + "Fn::GetAtt": [ + "EtlJobSparkUIBucketBF23744B", + "Arn" ] }, { - "Fn::Select": [ - 1, - { - "Fn::Split": [ - "||", - { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" - } - ] - } + "Fn::Join": [ + "", + [ + { + "Fn::GetAtt": [ + "EtlJobSparkUIBucketBF23744B", + "Arn" + ] + }, + "/*" + ] ] } ] - ] - } - }, - "Role": { - "Fn::GetAtt": [ - "MinimalGlueEtlJobServiceRole60989380", - "Arn" - ] - }, - "DefaultArguments": { - "--job-language": "python" - }, - "GlueVersion": "2.0" - } - }, - "MinimalGlueStreamingJobServiceRole77973DB5": { - "Type": "AWS::IAM::Role", - "Properties": { - "AssumeRolePolicyDocument": { - "Statement": [ - { - "Action": "sts:AssumeRole", - "Effect": "Allow", - "Principal": { - "Service": "glue.amazonaws.com" - } } ], "Version": "2012-10-17" }, - "ManagedPolicyArns": [ + "PolicyName": 
"EtlJobServiceRoleDefaultPolicy8BFE343B", + "Roles": [ { - "Fn::Join": [ - "", - [ - "arn:", - { - "Ref": "AWS::Partition" - }, - ":iam::aws:policy/service-role/AWSGlueServiceRole" - ] - ] + "Ref": "EtlJobServiceRole837F781B" } ] } }, - "MinimalGlueStreamingJobC58FD856": { + "EtlJobSparkUIBucketBF23744B": { + "Type": "AWS::S3::Bucket", + "UpdateReplacePolicy": "Retain", + "DeletionPolicy": "Retain" + }, + "EtlJob7FC88E45": { "Type": "AWS::Glue::Job", "Properties": { "Command": { - "Name": "gluestreaming", + "Name": "glueetl", "PythonVersion": "3", "ScriptLocation": { "Fn::Join": [ @@ -166,17 +132,85 @@ }, "Role": { "Fn::GetAtt": [ - "MinimalGlueStreamingJobServiceRole77973DB5", + "EtlJobServiceRole837F781B", "Arn" ] }, "DefaultArguments": { - "--job-language": "python" + "--job-language": "python", + "--enable-continuous-cloudwatch-log": "true", + "--enable-continuous-log-filter": "true", + "--continuous-log-logStreamPrefix": "EtlJob", + "--enable-spark-ui": "true", + "--spark-event-logs-path": { + "Fn::Join": [ + "", + [ + "s3://", + { + "Ref": "EtlJobSparkUIBucketBF23744B" + } + ] + ] + }, + "arg1": "value1", + "arg2": "value2" + }, + "ExecutionProperty": { + "MaxConcurrentRuns": 2 + }, + "GlueVersion": "2.0", + "MaxRetries": 2, + "Name": "EtlJob", + "NotificationProperty": { + "NotifyDelayAfter": 1 + }, + "NumberOfWorkers": 10, + "Tags": { + "key": "value" }, - "GlueVersion": "2.0" + "Timeout": 5, + "WorkerType": "G.2X" } }, - "MinimalPythonShellJobServiceRole4944649D": { + "EtlJobSuccessMetricRuleA72A3EF6": { + "Type": "AWS::Events::Rule", + "Properties": { + "Description": { + "Fn::Join": [ + "", + [ + "Rule triggered when Glue job ", + { + "Ref": "EtlJob7FC88E45" + }, + " is in SUCCEEDED state" + ] + ] + }, + "EventPattern": { + "source": [ + "aws.glue" + ], + "detail-type": [ + "Glue Job State Change", + "Glue Job Run Status" + ], + "detail": { + "jobName": [ + { + "Ref": "EtlJob7FC88E45" + } + ], + "state": [ + "SUCCEEDED" + ] + } + }, + "State": "ENABLED" 
+ } + }, + "StreamingJobServiceRole1B4B8BF9": { "Type": "AWS::IAM::Role", "Properties": { "AssumeRolePolicyDocument": { @@ -207,11 +241,11 @@ ] } }, - "MinimalPythonShellJob43B4A269": { + "StreamingJob3783CC17": { "Type": "AWS::Glue::Job", "Properties": { "Command": { - "Name": "pythonshell", + "Name": "gluestreaming", "PythonVersion": "3", "ScriptLocation": { "Fn::Join": [ @@ -254,17 +288,23 @@ }, "Role": { "Fn::GetAtt": [ - "MinimalPythonShellJobServiceRole4944649D", + "StreamingJobServiceRole1B4B8BF9", "Arn" ] }, "DefaultArguments": { - "--job-language": "python" + "--job-language": "python", + "arg1": "value1", + "arg2": "value2" }, - "GlueVersion": "2.0" + "GlueVersion": "2.0", + "Name": "StreamingJob", + "Tags": { + "key": "value" + } } }, - "JobServiceRole4F432993": { + "ShellJobServiceRoleCF97BC4B": { "Type": "AWS::IAM::Role", "Properties": { "AssumeRolePolicyDocument": { @@ -295,11 +335,11 @@ ] } }, - "JobB9D00F9F": { + "ShellJob42E81F95": { "Type": "AWS::Glue::Job", "Properties": { "Command": { - "Name": "glueetl", + "Name": "pythonshell", "PythonVersion": "3", "ScriptLocation": { "Fn::Join": [ @@ -342,7 +382,7 @@ }, "Role": { "Fn::GetAtt": [ - "JobServiceRole4F432993", + "ShellJobServiceRoleCF97BC4B", "Arn" ] }, @@ -351,57 +391,11 @@ "arg1": "value1", "arg2": "value2" }, - "ExecutionProperty": { - "MaxConcurrentRuns": 2 - }, "GlueVersion": "2.0", - "MaxRetries": 2, - "NotificationProperty": { - "NotifyDelayAfter": 1 - }, - "NumberOfWorkers": 10, + "Name": "ShellJob", "Tags": { "key": "value" - }, - "Timeout": 5, - "WorkerType": "G.2X" - } - }, - "JobSuccessMetricRule80747C33": { - "Type": "AWS::Events::Rule", - "Properties": { - "Description": { - "Fn::Join": [ - "", - [ - "Rule triggered when Glue job ", - { - "Ref": "JobB9D00F9F" - }, - " is in SUCCEEDED state" - ] - ] - }, - "EventPattern": { - "source": [ - "aws.glue" - ], - "detail-type": [ - "Glue Job State Change", - "Glue Job Run Status" - ], - "detail": { - "jobName": [ - { - "Ref": 
"JobB9D00F9F" - } - ], - "state": [ - "SUCCEEDED" - ] - } - }, - "State": "ENABLED" + } } } }, diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index cae8c25913ec8..f4dfe7bfa40cc 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -2,48 +2,72 @@ import * as path from 'path'; import * as cdk from '@aws-cdk/core'; import * as glue from '../lib'; +/** + * To verify the ability to run those jobs + * + * Run the job using + * `aws glue start-job-run --job-name ` + */ const app = new cdk.App(); const stack = new cdk.Stack(app, 'aws-glue-job'); const script = glue.Code.fromAsset(path.join(__dirname, 'job-script/hello_world.py')); -new glue.Job(stack, 'MinimalGlueEtlJob', { +const etlJob = new glue.Job(stack, 'EtlJob', { + jobName: 'EtlJob', executable: glue.JobExecutable.pythonEtl({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, script, }), + workerType: glue.WorkerType.G_2X, + workerCount: 10, + maxConcurrentRuns: 2, + maxRetries: 2, + timeout: cdk.Duration.minutes(5), + notifyDelayAfter: cdk.Duration.minutes(1), + defaultArguments: { + arg1: 'value1', + arg2: 'value2', + }, + sparkUI: { + enabled: true, + }, + continuousLogging: { + enabled: true, + quiet: true, + logStreamPrefix: 'EtlJob', + }, + tags: { + key: 'value', + }, }); +etlJob.metricSuccess(); -new glue.Job(stack, 'MinimalGlueStreamingJob', { +new glue.Job(stack, 'StreamingJob', { + jobName: 'StreamingJob', executable: glue.JobExecutable.pythonStreaming({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, script, }), + defaultArguments: { + arg1: 'value1', + arg2: 'value2', + }, + tags: { + key: 'value', + }, }); -new glue.Job(stack, 'MinimalPythonShellJob', { +new glue.Job(stack, 'ShellJob', { + jobName: 'ShellJob', executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, pythonVersion: glue.PythonVersion.THREE, 
script, }), -}); - -const etlJob = new glue.Job(stack, 'Job', { - executable: glue.JobExecutable.pythonEtl({ - glueVersion: glue.GlueVersion.V2_0, - pythonVersion: glue.PythonVersion.THREE, - script, - }), - workerType: glue.WorkerType.G_2X, - workerCount: 10, - maxConcurrentRuns: 2, - maxRetries: 2, - timeout: cdk.Duration.minutes(5), - notifyDelayAfter: cdk.Duration.minutes(1), defaultArguments: { arg1: 'value1', arg2: 'value2', @@ -53,6 +77,4 @@ const etlJob = new glue.Job(stack, 'Job', { }, }); -etlJob.metricSuccess(); - app.synth(); From cd2d2eebd2ed373f48074d5972001c866468dce5 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 17:47:08 +0100 Subject: [PATCH 47/50] fix issues identified trying to run jobs from integ tests - fix bug with code bucket permissions - Glue role did not have permissions to read from the code bucket which manifested as DeniedAccess exceptions when attempting to run jobs - Update Code.bind to take a Job instead of Construct so it can grant it the necessary read permissions - Python shell failed saying Glue 2.0 is not supported - Console UI for adding a new Python shell jobs shows Glue 1.0 as the only supported version - Modified tests to reject using Glue 0.9 but didn't exclude Glue 2.0 or 3.0 for any future support by the new versions --- packages/@aws-cdk/aws-glue/README.md | 2 +- packages/@aws-cdk/aws-glue/lib/code.ts | 15 +- .../@aws-cdk/aws-glue/lib/job-executable.ts | 2 +- packages/@aws-cdk/aws-glue/test/code.test.ts | 208 +++++++++- .../aws-glue/test/integ.job.expected.json | 187 ++++++++- packages/@aws-cdk/aws-glue/test/integ.job.ts | 2 +- .../aws-glue/test/job-executable.test.ts | 2 +- .../aws-glue/test/job-script/hello_world.py | 2 +- .../aws-glue/test/job-script/hello_world_2.py | 1 + packages/@aws-cdk/aws-glue/test/job.test.ts | 362 ++++++++++-------- 10 files changed, 582 insertions(+), 201 deletions(-) create mode 100644 packages/@aws-cdk/aws-glue/test/job-script/hello_world_2.py diff --git 
a/packages/@aws-cdk/aws-glue/README.md b/packages/@aws-cdk/aws-glue/README.md index 7fa9f07c6d65e..f5e200f0465e7 100644 --- a/packages/@aws-cdk/aws-glue/README.md +++ b/packages/@aws-cdk/aws-glue/README.md @@ -76,7 +76,7 @@ This can be used to schedule and run tasks that don't require an Apache Spark en ```ts new glue.Job(stack, 'PythonShellJob', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: PythonVersion.THREE, script: glue.Code.fromBucket(bucket, 'script.py'), }), diff --git a/packages/@aws-cdk/aws-glue/lib/code.ts b/packages/@aws-cdk/aws-glue/lib/code.ts index bb0f7a3c7a465..0f4276199ace2 100644 --- a/packages/@aws-cdk/aws-glue/lib/code.ts +++ b/packages/@aws-cdk/aws-glue/lib/code.ts @@ -3,7 +3,7 @@ import * as fs from 'fs'; import * as s3 from '@aws-cdk/aws-s3'; import * as s3assets from '@aws-cdk/aws-s3-assets'; import * as cdk from '@aws-cdk/core'; -import * as constructs from 'constructs'; +import { Job } from './'; /** * Represents a Glue Job's Code assets (an asset can be a scripts, a jar, a python file or any other file). @@ -31,7 +31,7 @@ export abstract class Code { /** * Called when the Job is initialized to allow this object to bind. */ - public abstract bind(scope: constructs.Construct): CodeConfig; + public abstract bind(job: Job): CodeConfig; } /** @@ -42,7 +42,8 @@ export class S3Code extends Code { super(); } - public bind(_scope: constructs.Construct): CodeConfig { + public bind(job: Job): CodeConfig { + this.bucket.grantRead(job); return { s3Location: { bucketName: this.bucket.bucketName, @@ -69,18 +70,18 @@ export class AssetCode extends Code { } } - public bind(scope: constructs.Construct): CodeConfig { + public bind(job: Job): CodeConfig { // If the same AssetCode is used multiple times, retain only the first instantiation. 
if (!this.asset) { - this.asset = new s3assets.Asset(scope, `Code${this.hashcode(this.path)}`, { + this.asset = new s3assets.Asset(job, `Code${this.hashcode(this.path)}`, { path: this.path, ...this.options, }); - } else if (cdk.Stack.of(this.asset) !== cdk.Stack.of(scope)) { + } else if (cdk.Stack.of(this.asset) !== cdk.Stack.of(job)) { throw new Error(`Asset is already associated with another stack '${cdk.Stack.of(this.asset).stackName}'. ` + 'Create a new Code instance for every stack.'); } - + this.asset.grantRead(job); return { s3Location: { bucketName: this.asset.s3BucketName, diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index 597ce5140cf8c..f08902d9efdd0 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -287,7 +287,7 @@ export class JobExecutable { if (config.language !== JobLanguage.PYTHON) { throw new Error('Python shell requires the language to be set to Python'); } - if ([GlueVersion.V0_9, GlueVersion.V1_0].includes(config.glueVersion)) { + if ([GlueVersion.V0_9].includes(config.glueVersion)) { throw new Error(`Specified GlueVersion ${config.glueVersion.name} does not support Python Shell`); } } diff --git a/packages/@aws-cdk/aws-glue/test/code.test.ts b/packages/@aws-cdk/aws-glue/test/code.test.ts index 9e979dfd638ed..70a49340c93d1 100644 --- a/packages/@aws-cdk/aws-glue/test/code.test.ts +++ b/packages/@aws-cdk/aws-glue/test/code.test.ts @@ -6,6 +6,7 @@ import * as glue from '../lib'; describe('Code', () => { let stack: cdk.Stack; + let script: glue.Code; beforeEach(() => { stack = new cdk.Stack(); @@ -15,14 +16,69 @@ describe('Code', () => { const key = 'script'; let bucket: s3.IBucket; - test('with valid bucket name and key and calling bind() returns correct s3 location', () => { + test('with valid bucket name and key and bound by job sets the right path and grants the job permissions to read from it', () => { 
bucket = s3.Bucket.fromBucketName(stack, 'Bucket', 'bucketName'); - expect(glue.Code.fromBucket(bucket, key).bind(stack)).toEqual({ - s3Location: { - bucketName: 'bucketName', - objectKey: 'script', + script = glue.Code.fromBucket(bucket, key); + new glue.Job(stack, 'Job1', { + executable: glue.JobExecutable.pythonShell({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.THREE, + script, + }), + }); + + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + Command: { + ScriptLocation: 's3://bucketName/script', }, }); + + // Role policy should grant reading from the assets bucket + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: [ + { + Action: [ + 's3:GetObject*', + 's3:GetBucket*', + 's3:List*', + ], + Effect: 'Allow', + Resource: [ + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':s3:::bucketName', + ], + ], + }, + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':s3:::bucketName/*', + ], + ], + }, + ], + }, + ], + }, + Roles: [ + { + Ref: 'Job1ServiceRole7AF34CCA', + }, + ], + }); }); }); @@ -30,11 +86,115 @@ describe('Code', () => { const filePath = path.join(__dirname, 'job-script/hello_world.py'); const directoryPath = path.join(__dirname, 'job-script'); - test('with valid and existing file path and calling bind() returns an s3 location and sets metadata', () => { - const codeConfig = glue.Code.fromAsset(filePath).bind(stack); - expect(codeConfig.s3Location.bucketName).toBeDefined(); - expect(codeConfig.s3Location.objectKey).toBeDefined(); + beforeEach(() => { + script = glue.Code.fromAsset(filePath); + }); + + test("with valid and existing file path and bound to job sets job's script location and permissions stack metadata", () => { + new glue.Job(stack, 'Job1', { + executable: glue.JobExecutable.pythonShell({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.THREE, + script, + }), + }); 
+ expect(stack.node.metadata.find(m => m.type === 'aws:cdk:asset')).toBeDefined(); + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + Command: { + ScriptLocation: { + 'Fn::Join': [ + '', + [ + 's3://', + { + Ref: 'AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469', + }, + '/', + { + 'Fn::Select': [ + 0, + { + 'Fn::Split': [ + '||', + { + Ref: 'AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763', + }, + ], + }, + ], + }, + { + 'Fn::Select': [ + 1, + { + 'Fn::Split': [ + '||', + { + Ref: 'AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763', + }, + ], + }, + ], + }, + ], + ], + }, + }, + }); + // Role policy should grant reading from the assets bucket + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: [ + { + Action: [ + 's3:GetObject*', + 's3:GetBucket*', + 's3:List*', + ], + Effect: 'Allow', + Resource: [ + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':s3:::', + { + Ref: 'AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469', + }, + ], + ], + }, + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':s3:::', + { + Ref: 'AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469', + }, + '/*', + ], + ], + }, + ], + }, + ], + }, + Roles: [ + { + Ref: 'Job1ServiceRole7AF34CCA', + }, + ], + }); }); test('with an unsupported directory path throws', () => { @@ -43,7 +203,6 @@ describe('Code', () => { }); test('used in more than 1 job in the same stack should be reused', () => { - const script = glue.Code.fromAsset(filePath); new glue.Job(stack, 'Job1', { executable: glue.JobExecutable.pythonShell({ glueVersion: glue.GlueVersion.V2_0, @@ -64,7 +223,7 @@ describe('Code', () => { [ 's3://', { - Ref: 
'AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8', + Ref: 'AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469', }, '/', { @@ -74,7 +233,7 @@ describe('Code', () => { 'Fn::Split': [ '||', { - Ref: 'AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377', + Ref: 'AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763', }, ], }, @@ -87,7 +246,7 @@ describe('Code', () => { 'Fn::Split': [ '||', { - Ref: 'AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377', + Ref: 'AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763', }, ], }, @@ -96,6 +255,7 @@ describe('Code', () => { ], ], }; + expect(stack.node.metadata.find(m => m.type === 'aws:cdk:asset')).toBeDefined(); // Job1 and Job2 use reuse the asset Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { @@ -122,13 +282,23 @@ describe('Code', () => { }); }); - test('throws if used in more than 1 stack', () => { - const stack2 = new cdk.Stack(); - const asset = glue.Code.fromAsset(filePath); - asset.bind(stack); + test('throws if trying to rebind in another stack', () => { + new glue.Job(stack, 'Job1', { + executable: glue.JobExecutable.pythonShell({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.THREE, + script, + }), + }); + const differentStack = new cdk.Stack(); - expect(() => asset.bind(stack2)) - .toThrow(/associated with another stack/); + expect(() => new glue.Job(differentStack, 'Job2', { + executable: glue.JobExecutable.pythonShell({ + glueVersion: glue.GlueVersion.V2_0, + pythonVersion: glue.PythonVersion.THREE, + script: script, + }), + })).toThrow(/associated with another stack/); }); }); }); \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json 
b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json index 25520e55b1715..61f4f60434db1 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.expected.json +++ b/packages/@aws-cdk/aws-glue/test/integ.job.expected.json @@ -68,6 +68,47 @@ ] } ] + }, + { + "Action": [ + "s3:GetObject*", + "s3:GetBucket*", + "s3:List*" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" + }, + "/*" + ] + ] + } + ] } ], "Version": "2012-10-17" @@ -97,7 +138,7 @@ [ "s3://", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" }, "/", { @@ -107,7 +148,7 @@ "Fn::Split": [ "||", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763" } ] } @@ -120,7 +161,7 @@ "Fn::Split": [ "||", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763" } ] } @@ -241,6 +282,63 @@ ] } }, + "StreamingJobServiceRoleDefaultPolicyA0CC4C68": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "s3:GetObject*", + "s3:GetBucket*", + "s3:List*" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + 
{ + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" + }, + "/*" + ] + ] + } + ] + } + ], + "Version": "2012-10-17" + }, + "PolicyName": "StreamingJobServiceRoleDefaultPolicyA0CC4C68", + "Roles": [ + { + "Ref": "StreamingJobServiceRole1B4B8BF9" + } + ] + } + }, "StreamingJob3783CC17": { "Type": "AWS::Glue::Job", "Properties": { @@ -253,7 +351,7 @@ [ "s3://", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" }, "/", { @@ -263,7 +361,7 @@ "Fn::Split": [ "||", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763" } ] } @@ -276,7 +374,7 @@ "Fn::Split": [ "||", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763" } ] } @@ -335,6 +433,63 @@ ] } }, + "ShellJobServiceRoleDefaultPolicy7F22D315": { + "Type": "AWS::IAM::Policy", + "Properties": { + "PolicyDocument": { + "Statement": [ + { + "Action": [ + "s3:GetObject*", + "s3:GetBucket*", + "s3:List*" + ], + "Effect": "Allow", + "Resource": [ + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" + } + ] + ] + }, + { + "Fn::Join": [ + "", + [ + "arn:", + { + "Ref": "AWS::Partition" + }, + ":s3:::", + { + "Ref": 
"AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" + }, + "/*" + ] + ] + } + ] + } + ], + "Version": "2012-10-17" + }, + "PolicyName": "ShellJobServiceRoleDefaultPolicy7F22D315", + "Roles": [ + { + "Ref": "ShellJobServiceRoleCF97BC4B" + } + ] + } + }, "ShellJob42E81F95": { "Type": "AWS::Glue::Job", "Properties": { @@ -347,7 +502,7 @@ [ "s3://", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469" }, "/", { @@ -357,7 +512,7 @@ "Fn::Split": [ "||", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763" } ] } @@ -370,7 +525,7 @@ "Fn::Split": [ "||", { - "Ref": "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377" + "Ref": "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763" } ] } @@ -391,7 +546,7 @@ "arg1": "value1", "arg2": "value2" }, - "GlueVersion": "2.0", + "GlueVersion": "1.0", "Name": "ShellJob", "Tags": { "key": "value" @@ -400,17 +555,17 @@ } }, "Parameters": { - "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3Bucket252142A8": { + "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3Bucket4E517469": { "Type": "String", - "Description": "S3 bucket for asset \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + "Description": "S3 bucket for asset \"432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855\"" }, - "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bS3VersionKey7D45B377": { + 
"AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855S3VersionKeyF7753763": { "Type": "String", - "Description": "S3 key for asset version \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + "Description": "S3 key for asset version \"432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855\"" }, - "AssetParameters894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6bArtifactHashB9AA8E72": { + "AssetParameters432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855ArtifactHash0C610005": { "Type": "String", - "Description": "Artifact hash for asset \"894df8f835015940e27548bfbf722885cb247378af70effdc8ecbe342419fc6b\"" + "Description": "Artifact hash for asset \"432033e3218068a915d2532fa9be7858a12b228a2ae6e5c10faccd9097b1e855\"" } } } \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index f4dfe7bfa40cc..364d0818b050d 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -64,7 +64,7 @@ new glue.Job(stack, 'StreamingJob', { new glue.Job(stack, 'ShellJob', { jobName: 'ShellJob', executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: glue.PythonVersion.THREE, script, }), diff --git a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts index e9352a96bcd63..f4af2bb807730 100644 --- a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts @@ -66,7 +66,7 @@ describe('JobExecutable', () => { })).toThrow(/extraPythonFiles is not supported for languages other than JobLanguage.PYTHON/); }); - [glue.GlueVersion.V0_9, glue.GlueVersion.V1_0].forEach((glueVersion) => { + [glue.GlueVersion.V0_9].forEach((glueVersion) => { test(`with JobType.PYTHON_SHELL and GlueVersion 
${glueVersion} should throw`, () => { expect(() => glue.JobExecutable.of({ type: glue.JobType.PYTHON_SHELL, diff --git a/packages/@aws-cdk/aws-glue/test/job-script/hello_world.py b/packages/@aws-cdk/aws-glue/test/job-script/hello_world.py index 4c613bd3178b5..e75154b7c390f 100644 --- a/packages/@aws-cdk/aws-glue/test/job-script/hello_world.py +++ b/packages/@aws-cdk/aws-glue/test/job-script/hello_world.py @@ -1 +1 @@ -println("hello world") \ No newline at end of file +print("hello world") \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/job-script/hello_world_2.py b/packages/@aws-cdk/aws-glue/test/job-script/hello_world_2.py new file mode 100644 index 0000000000000..e75154b7c390f --- /dev/null +++ b/packages/@aws-cdk/aws-glue/test/job-script/hello_world_2.py @@ -0,0 +1 @@ +print("hello world") \ No newline at end of file diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 55e67fd194bbc..50e981c2a7091 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -55,6 +55,41 @@ describe('Job', () => { describe('new', () => { const className = 'com.amazon.test.ClassName'; + const codeBucketName = 'bucketName'; + const codeBucketAccessStatement = { + Action: [ + 's3:GetObject*', + 's3:GetBucket*', + 's3:List*', + ], + Effect: 'Allow', + Resource: [ + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + `:s3:::${codeBucketName}`, + ], + ], + }, + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + `:s3:::${codeBucketName}/*`, + ], + ], + }, + ], + }; let codeBucket: s3.IBucket; let script: glue.Code; let extraJars: glue.Code[]; @@ -64,7 +99,7 @@ describe('Job', () => { let defaultProps: glue.JobProps; beforeEach(() => { - codeBucket = s3.Bucket.fromBucketName(stack, 'CodeBucket', 'bucketName'); + codeBucket = s3.Bucket.fromBucketName(stack, 'CodeBucket', codeBucketName); script = 
glue.Code.fromBucket(codeBucket, 'script'); extraJars = [glue.Code.fromBucket(codeBucket, 'file1.jar'), glue.Code.fromBucket(codeBucket, 'file2.jar')]; extraPythonFiles = [glue.Code.fromBucket(codeBucket, 'file1.py'), glue.Code.fromBucket(codeBucket, 'file2.py')]; @@ -113,6 +148,20 @@ describe('Job', () => { ], }); + // Role policy should grant reading from the assets bucket + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: [ + codeBucketAccessStatement, + ], + }, + Roles: [ + { + Ref: 'JobServiceRole4F432993', + }, + ], + }); + // check the job using the role Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { Command: { @@ -238,6 +287,7 @@ describe('Job', () => { ], }, }, + codeBucketAccessStatement, ], }, Roles: [ @@ -249,186 +299,190 @@ describe('Job', () => { }); }); - describe('enabling spark ui but no bucket or path provided', () => { - beforeEach(() => { - job = new glue.Job(stack, 'Job', { - ...defaultProps, - sparkUI: { enabled: true }, + describe('enabling spark ui', () => { + describe('with no bucket or path provided', () => { + beforeEach(() => { + job = new glue.Job(stack, 'Job', { + ...defaultProps, + sparkUI: { enabled: true }, + }); }); - }); - test('should create spark ui bucket', () => { - Template.fromStack(stack).resourceCountIs('AWS::S3::Bucket', 1); - }); + test('should create spark ui bucket', () => { + Template.fromStack(stack).resourceCountIs('AWS::S3::Bucket', 1); + }); - test('should grant the role read/write permissions to the spark ui bucket', () => { - Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { - PolicyDocument: { - Statement: [ - { - Action: [ - 's3:GetObject*', - 's3:GetBucket*', - 's3:List*', - 's3:DeleteObject*', - 's3:PutObject*', - 's3:Abort*', - ], - Effect: 'Allow', - Resource: [ - { - 'Fn::GetAtt': [ - 'JobSparkUIBucket8E6A0139', - 'Arn', - ], - }, - { - 'Fn::Join': [ - '', - [ - { - 'Fn::GetAtt': [ - 'JobSparkUIBucket8E6A0139', 
- 'Arn', - ], - }, - '/*', + test('should grant the role read/write permissions to the spark ui bucket', () => { + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: [ + { + Action: [ + 's3:GetObject*', + 's3:GetBucket*', + 's3:List*', + 's3:DeleteObject*', + 's3:PutObject*', + 's3:Abort*', + ], + Effect: 'Allow', + Resource: [ + { + 'Fn::GetAtt': [ + 'JobSparkUIBucket8E6A0139', + 'Arn', ], - ], - }, - ], + }, + { + 'Fn::Join': [ + '', + [ + { + 'Fn::GetAtt': [ + 'JobSparkUIBucket8E6A0139', + 'Arn', + ], + }, + '/*', + ], + ], + }, + ], + }, + codeBucketAccessStatement, + ], + Version: '2012-10-17', + }, + PolicyName: 'JobServiceRoleDefaultPolicy03F68F9D', + Roles: [ + { + Ref: 'JobServiceRole4F432993', }, ], - Version: '2012-10-17', - }, - PolicyName: 'JobServiceRoleDefaultPolicy03F68F9D', - Roles: [ - { - Ref: 'JobServiceRole4F432993', - }, - ], + }); }); - }); - test('should set spark arguments on the job', () => { - Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - DefaultArguments: { - '--enable-spark-ui': 'true', - '--spark-event-logs-path': { - 'Fn::Join': [ - '', - [ - 's3://', - { - Ref: 'JobSparkUIBucket8E6A0139', - }, + test('should set spark arguments on the job', () => { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + DefaultArguments: { + '--enable-spark-ui': 'true', + '--spark-event-logs-path': { + 'Fn::Join': [ + '', + [ + 's3://', + { + Ref: 'JobSparkUIBucket8E6A0139', + }, + ], ], - ], + }, }, - }, + }); }); }); - }); - describe('enabling spark ui with bucket provided', () => { - const bucketName = 'BucketName'; - let bucket: s3.IBucket; + describe('with bucket provided', () => { + const sparkUIBucketName = 'sparkBucketName'; + let sparkUIBucket: s3.IBucket; - beforeEach(() => { - bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); - job = new glue.Job(stack, 'Job', { - ...defaultProps, - sparkUI: { - enabled: true, - bucket, - }, + 
beforeEach(() => { + sparkUIBucket = s3.Bucket.fromBucketName(stack, 'SparkBucketId', sparkUIBucketName); + job = new glue.Job(stack, 'Job', { + ...defaultProps, + sparkUI: { + enabled: true, + bucket: sparkUIBucket, + }, + }); }); - }); - test('should grant the role read/write permissions to the provided spark ui bucket', () => { - Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { - PolicyDocument: { - Statement: [ - { - Action: [ - 's3:GetObject*', - 's3:GetBucket*', - 's3:List*', - 's3:DeleteObject*', - 's3:PutObject*', - 's3:Abort*', - ], - Effect: 'Allow', - Resource: [ - { - 'Fn::Join': [ - '', - [ - 'arn:', - { - Ref: 'AWS::Partition', - }, - ':s3:::BucketName', + test('should grant the role read/write permissions to the provided spark ui bucket', () => { + Template.fromStack(stack).hasResourceProperties('AWS::IAM::Policy', { + PolicyDocument: { + Statement: [ + { + Action: [ + 's3:GetObject*', + 's3:GetBucket*', + 's3:List*', + 's3:DeleteObject*', + 's3:PutObject*', + 's3:Abort*', + ], + Effect: 'Allow', + Resource: [ + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':s3:::sparkBucketName', + ], ], - ], - }, - { - 'Fn::Join': [ - '', - [ - 'arn:', - { - Ref: 'AWS::Partition', - }, - ':s3:::BucketName/*', + }, + { + 'Fn::Join': [ + '', + [ + 'arn:', + { + Ref: 'AWS::Partition', + }, + ':s3:::sparkBucketName/*', + ], ], - ], - }, - ], + }, + ], + }, + codeBucketAccessStatement, + ], + }, + Roles: [ + { + Ref: 'JobServiceRole4F432993', }, ], - }, - Roles: [ - { - Ref: 'JobServiceRole4F432993', - }, - ], + }); }); - }); - test('should set spark arguments on the job', () => { - Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - DefaultArguments: { - '--enable-spark-ui': 'true', - '--spark-event-logs-path': `s3://${bucketName}`, - }, + test('should set spark arguments on the job', () => { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + DefaultArguments: { + 
'--enable-spark-ui': 'true', + '--spark-event-logs-path': `s3://${sparkUIBucketName}`, + }, + }); }); }); - }); - describe('enabling spark ui with bucket and path provided', () => { - const bucketName = 'BucketName'; - const prefix = 'some/path/'; - let bucket: s3.IBucket; + describe('with bucket and path provided', () => { + const sparkUIBucketName = 'sparkBucketName'; + const prefix = 'some/path/'; + let sparkUIBucket: s3.IBucket; - beforeEach(() => { - bucket = s3.Bucket.fromBucketName(stack, 'BucketId', bucketName); - job = new glue.Job(stack, 'Job', { - ...defaultProps, - sparkUI: { - enabled: true, - bucket, - prefix, - }, + beforeEach(() => { + sparkUIBucket = s3.Bucket.fromBucketName(stack, 'BucketId', sparkUIBucketName); + job = new glue.Job(stack, 'Job', { + ...defaultProps, + sparkUI: { + enabled: true, + bucket: sparkUIBucket, + prefix, + }, + }); }); - }); - test('should set spark arguments on the job', () => { - Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { - DefaultArguments: { - '--enable-spark-ui': 'true', - '--spark-event-logs-path': `s3://${bucketName}/${prefix}`, - }, + test('should set spark arguments on the job', () => { + Template.fromStack(stack).hasResourceProperties('AWS::Glue::Job', { + DefaultArguments: { + '--enable-spark-ui': 'true', + '--spark-event-logs-path': `s3://${sparkUIBucketName}/${prefix}`, + }, + }); }); }); }); @@ -533,7 +587,7 @@ describe('Job', () => { test('with unsupported Spark UI prop should throw', () => { expect(() => new glue.Job(stack, 'Job', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: glue.PythonVersion.THREE, script, }), From ea32eabd1242f4e37373d2f445bb6dd161ea902e Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 18:31:41 +0100 Subject: [PATCH 48/50] update integ test verification documentation --- packages/@aws-cdk/aws-glue/test/integ.job.ts | 13 +++++++++++-- 1 file changed, 11 
insertions(+), 2 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/test/integ.job.ts b/packages/@aws-cdk/aws-glue/test/integ.job.ts index 364d0818b050d..fedbc0b8b8428 100644 --- a/packages/@aws-cdk/aws-glue/test/integ.job.ts +++ b/packages/@aws-cdk/aws-glue/test/integ.job.ts @@ -3,10 +3,19 @@ import * as cdk from '@aws-cdk/core'; import * as glue from '../lib'; /** - * To verify the ability to run those jobs + * To verify the ability to run jobs created in this test * * Run the job using - * `aws glue start-job-run --job-name <job name>` + * `aws glue start-job-run --region us-east-1 --job-name <job name>` + * This will return a runId + * + * Get the status of the job run using + * `aws glue get-job-run --region us-east-1 --job-name <job name> --run-id <runId>` + * + * For example, to test the ShellJob + * - Run: `aws glue start-job-run --region us-east-1 --job-name ShellJob` + * - Get Status: `aws glue get-job-run --region us-east-1 --job-name ShellJob --run-id <runId>` + * - Check output: `aws logs get-log-events --region us-east-1 --log-group-name "/aws-glue/python-jobs/output" --log-stream-name "<runId>"` which should show "hello world" */ const app = new cdk.App(); From 98cc575401eb6349d8b6949b37ba72699cce29c9 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 19:53:04 +0100 Subject: [PATCH 49/50] update Code.bind signature and PythonShell supported glue versions --- packages/@aws-cdk/aws-glue/lib/code.ts | 17 +++++++++-------- .../@aws-cdk/aws-glue/lib/job-executable.ts | 2 +- packages/@aws-cdk/aws-glue/lib/job.ts | 2 +- packages/@aws-cdk/aws-glue/test/code.test.ts | 12 ++++++------ .../aws-glue/test/job-executable.test.ts | 10 ++++++---- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/code.ts b/packages/@aws-cdk/aws-glue/lib/code.ts index 0f4276199ace2..cb118f873019c 100644 --- a/packages/@aws-cdk/aws-glue/lib/code.ts +++ b/packages/@aws-cdk/aws-glue/lib/code.ts @@ -1,9 +1,10 @@ import * as crypto from 'crypto'; import * as fs from 'fs'; +import
* as iam from '@aws-cdk/aws-iam'; import * as s3 from '@aws-cdk/aws-s3'; import * as s3assets from '@aws-cdk/aws-s3-assets'; import * as cdk from '@aws-cdk/core'; -import { Job } from './'; +import * as constructs from 'constructs'; /** * Represents a Glue Job's Code assets (an asset can be a scripts, a jar, a python file or any other file). @@ -31,7 +32,7 @@ export abstract class Code { /** * Called when the Job is initialized to allow this object to bind. */ - public abstract bind(job: Job): CodeConfig; + public abstract bind(scope: constructs.Construct, grantable: iam.IGrantable): CodeConfig; } /** @@ -42,8 +43,8 @@ export class S3Code extends Code { super(); } - public bind(job: Job): CodeConfig { - this.bucket.grantRead(job); + public bind(_scope: constructs.Construct, grantable: iam.IGrantable): CodeConfig { + this.bucket.grantRead(grantable); return { s3Location: { bucketName: this.bucket.bucketName, @@ -70,18 +71,18 @@ export class AssetCode extends Code { } } - public bind(job: Job): CodeConfig { + public bind(scope: constructs.Construct, grantable: iam.IGrantable): CodeConfig { // If the same AssetCode is used multiple times, retain only the first instantiation. if (!this.asset) { - this.asset = new s3assets.Asset(job, `Code${this.hashcode(this.path)}`, { + this.asset = new s3assets.Asset(scope, `Code${this.hashcode(this.path)}`, { path: this.path, ...this.options, }); - } else if (cdk.Stack.of(this.asset) !== cdk.Stack.of(job)) { + } else if (cdk.Stack.of(this.asset) !== cdk.Stack.of(scope)) { throw new Error(`Asset is already associated with another stack '${cdk.Stack.of(this.asset).stackName}'. 
` + 'Create a new Code instance for every stack.'); } - this.asset.grantRead(job); + this.asset.grantRead(grantable); return { s3Location: { bucketName: this.asset.s3BucketName, diff --git a/packages/@aws-cdk/aws-glue/lib/job-executable.ts b/packages/@aws-cdk/aws-glue/lib/job-executable.ts index f08902d9efdd0..8fd7c39da5508 100644 --- a/packages/@aws-cdk/aws-glue/lib/job-executable.ts +++ b/packages/@aws-cdk/aws-glue/lib/job-executable.ts @@ -287,7 +287,7 @@ export class JobExecutable { if (config.language !== JobLanguage.PYTHON) { throw new Error('Python shell requires the language to be set to Python'); } - if ([GlueVersion.V0_9].includes(config.glueVersion)) { + if ([GlueVersion.V0_9, GlueVersion.V2_0, GlueVersion.V3_0].includes(config.glueVersion)) { throw new Error(`Specified GlueVersion ${config.glueVersion.name} does not support Python Shell`); } } diff --git a/packages/@aws-cdk/aws-glue/lib/job.ts b/packages/@aws-cdk/aws-glue/lib/job.ts index 30c325f31ae80..0233783f94869 100644 --- a/packages/@aws-cdk/aws-glue/lib/job.ts +++ b/packages/@aws-cdk/aws-glue/lib/job.ts @@ -765,7 +765,7 @@ export class Job extends JobBase { } private codeS3ObjectUrl(code: Code) { - const s3Location = code.bind(this).s3Location; + const s3Location = code.bind(this, this.role).s3Location; return `s3://${s3Location.bucketName}/${s3Location.objectKey}`; } } diff --git a/packages/@aws-cdk/aws-glue/test/code.test.ts b/packages/@aws-cdk/aws-glue/test/code.test.ts index 70a49340c93d1..f87ad46c9687f 100644 --- a/packages/@aws-cdk/aws-glue/test/code.test.ts +++ b/packages/@aws-cdk/aws-glue/test/code.test.ts @@ -21,7 +21,7 @@ describe('Code', () => { script = glue.Code.fromBucket(bucket, key); new glue.Job(stack, 'Job1', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: glue.PythonVersion.THREE, script, }), @@ -93,7 +93,7 @@ describe('Code', () => { test("with valid and existing file path and bound to 
job sets job's script location and permissions stack metadata", () => { new glue.Job(stack, 'Job1', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: glue.PythonVersion.THREE, script, }), @@ -205,14 +205,14 @@ describe('Code', () => { test('used in more than 1 job in the same stack should be reused', () => { new glue.Job(stack, 'Job1', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: glue.PythonVersion.THREE, script, }), }); new glue.Job(stack, 'Job2', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: glue.PythonVersion.THREE, script, }), @@ -285,7 +285,7 @@ describe('Code', () => { test('throws if trying to rebind in another stack', () => { new glue.Job(stack, 'Job1', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: glue.PythonVersion.THREE, script, }), @@ -294,7 +294,7 @@ describe('Code', () => { expect(() => new glue.Job(differentStack, 'Job2', { executable: glue.JobExecutable.pythonShell({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, pythonVersion: glue.PythonVersion.THREE, script: script, }), diff --git a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts index f4af2bb807730..481bd16dc8944 100644 --- a/packages/@aws-cdk/aws-glue/test/job-executable.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job-executable.test.ts @@ -38,7 +38,7 @@ describe('JobExecutable', () => { describe('.of()', () => { test('with valid config should succeed', () => { expect(glue.JobExecutable.of({ - glueVersion: glue.GlueVersion.V2_0, + glueVersion: glue.GlueVersion.V1_0, type: glue.JobType.PYTHON_SHELL, language: glue.JobLanguage.PYTHON, 
pythonVersion: glue.PythonVersion.THREE, @@ -66,7 +66,7 @@ describe('JobExecutable', () => { })).toThrow(/extraPythonFiles is not supported for languages other than JobLanguage.PYTHON/); }); - [glue.GlueVersion.V0_9].forEach((glueVersion) => { + [glue.GlueVersion.V0_9, glue.GlueVersion.V2_0, glue.GlueVersion.V3_0].forEach((glueVersion) => { test(`with JobType.PYTHON_SHELL and GlueVersion ${glueVersion} should throw`, () => { expect(() => glue.JobExecutable.of({ type: glue.JobType.PYTHON_SHELL, @@ -76,8 +76,10 @@ describe('JobExecutable', () => { glueVersion, })).toThrow(`Specified GlueVersion ${glueVersion.name} does not support Python Shell`); }); + }); - test(`with extraJarsFirst set and GlueVersion ${glueVersion} should throw`, () => { + [glue.GlueVersion.V0_9, glue.GlueVersion.V1_0].forEach((glueVersion) => { + test(`with extraJarsFirst set and GlueVersion ${glueVersion.name} should throw`, () => { expect(() => glue.JobExecutable.of({ type: glue.JobType.ETL, language: glue.JobLanguage.PYTHON, @@ -92,7 +94,7 @@ describe('JobExecutable', () => { [glue.GlueVersion.V2_0, glue.GlueVersion.V3_0].forEach((glueVersion) => { test(`with PythonVersion.TWO and GlueVersion ${glueVersion} should throw`, () => { expect(() => glue.JobExecutable.of({ - type: glue.JobType.PYTHON_SHELL, + type: glue.JobType.ETL, language: glue.JobLanguage.PYTHON, pythonVersion: glue.PythonVersion.TWO, script, From 03286152d6fcbfd216e8bd5917d62e741ea3a8d2 Mon Sep 17 00:00:00 2001 From: Ahmed Kamel Date: Wed, 8 Sep 2021 23:02:25 +0100 Subject: [PATCH 50/50] narrow the permissions granted by S3Code --- packages/@aws-cdk/aws-glue/lib/code.ts | 2 +- packages/@aws-cdk/aws-glue/test/code.test.ts | 2 +- packages/@aws-cdk/aws-glue/test/job.test.ts | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/packages/@aws-cdk/aws-glue/lib/code.ts b/packages/@aws-cdk/aws-glue/lib/code.ts index cb118f873019c..9f2f03d9884be 100644 --- a/packages/@aws-cdk/aws-glue/lib/code.ts +++ 
b/packages/@aws-cdk/aws-glue/lib/code.ts @@ -44,7 +44,7 @@ export class S3Code extends Code { } public bind(_scope: constructs.Construct, grantable: iam.IGrantable): CodeConfig { - this.bucket.grantRead(grantable); + this.bucket.grantRead(grantable, this.key); return { s3Location: { bucketName: this.bucket.bucketName, diff --git a/packages/@aws-cdk/aws-glue/test/code.test.ts b/packages/@aws-cdk/aws-glue/test/code.test.ts index f87ad46c9687f..061f6d26c351f 100644 --- a/packages/@aws-cdk/aws-glue/test/code.test.ts +++ b/packages/@aws-cdk/aws-glue/test/code.test.ts @@ -65,7 +65,7 @@ describe('Code', () => { { Ref: 'AWS::Partition', }, - ':s3:::bucketName/*', + ':s3:::bucketName/script', ], ], }, diff --git a/packages/@aws-cdk/aws-glue/test/job.test.ts b/packages/@aws-cdk/aws-glue/test/job.test.ts index 50e981c2a7091..625e4743570fd 100644 --- a/packages/@aws-cdk/aws-glue/test/job.test.ts +++ b/packages/@aws-cdk/aws-glue/test/job.test.ts @@ -84,7 +84,7 @@ describe('Job', () => { { Ref: 'AWS::Partition', }, - `:s3:::${codeBucketName}/*`, + `:s3:::${codeBucketName}/script`, ], ], },