Skip to content

Latest commit

 

History

History
482 lines (415 loc) · 35.3 KB

README.md

File metadata and controls

482 lines (415 loc) · 35.3 KB

AWS EMR Terraform module

Terraform module which creates AWS EMR resources.

SWUbanner

This module supports the creation of:

  • EMR clusters using instance fleets or instance groups deployed in public or private subnets

  • EMR Virtual clusters that run on Amazon EKS

  • EMR Serverless clusters

  • EMR Studios

  • Security groups for master, core, and task nodes

  • Security group for EMR service to support private clusters

  • IAM roles for autoscaling, EMR service, and EC2 instance profiles

    ℹ️ The appropriate resources have been tagged with { "for-use-with-amazon-emr-managed-policies" = true } to support the use of the recommended IAM policy "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2". Users are required to tag the appropriate VPC resources (VPC and subnets) as needed. See here for more details regarding v2 of managed EMR policies and their usage requirements.

Usage

Private Cluster w/ Instance Fleet

module "emr" {
  source = "terraform-aws-modules/emr/aws"

  name = "example-instance-fleet"

  release_label = "emr-6.9.0"
  applications  = ["spark", "trino"]
  auto_termination_policy = {
    idle_timeout = 3600
  }

  bootstrap_action = {
    example = {
      path = "file:/bin/echo",
      name = "Just an example",
      args = ["Hello World!"]
    }
  }

  configurations_json = jsonencode([
    {
      "Classification" : "spark-env",
      "Configurations" : [
        {
          "Classification" : "export",
          "Properties" : {
            "JAVA_HOME" : "/usr/lib/jvm/java-1.8.0"
          }
        }
      ],
      "Properties" : {}
    }
  ])

  master_instance_fleet = {
    name                      = "master-fleet"
    target_on_demand_capacity = 1
    instance_type_configs = [
      {
        instance_type = "m5.xlarge"
      }
    ]
  }

  core_instance_fleet = {
    name                      = "core-fleet"
    target_on_demand_capacity = 2
    target_spot_capacity      = 2
    instance_type_configs = [
      {
        instance_type     = "c4.large"
        weighted_capacity = 1
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        ebs_config = {
          size                 = 64
          type                 = "gp3"
          volumes_per_instance = 1
        }
        instance_type     = "c5.xlarge"
        weighted_capacity = 2
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        instance_type                              = "c6i.xlarge"
        weighted_capacity                          = 2
      }
    ]
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        block_duration_minutes   = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 5
      }
    }
  }

  task_instance_fleet = {
    name                      = "task-fleet"
    target_on_demand_capacity = 1
    target_spot_capacity      = 2
    instance_type_configs = [
      {
        instance_type     = "c4.large"
        weighted_capacity = 1
      },
      {
        bid_price_as_percentage_of_on_demand_price = 100
        ebs_config = {
          size                 = 64
          type                 = "gp3"
          volumes_per_instance = 1
        }
        instance_type     = "c5.xlarge"
        weighted_capacity = 2
      }
    ]
    launch_specifications = {
      spot_specification = {
        allocation_strategy      = "capacity-optimized"
        block_duration_minutes   = 0
        timeout_action           = "SWITCH_TO_ON_DEMAND"
        timeout_duration_minutes = 5
      }
    }
  }

  ebs_root_volume_size = 64
  ec2_attributes = {
    # Subnets should be private subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_ids = ["subnet-abcde012", "subnet-bcde012a", "subnet-fghi345a"]
  }
  vpc_id = "vpc-1234556abcdef"

  list_steps_states  = ["PENDING", "RUNNING", "FAILED", "INTERRUPTED"]
  log_uri            = "s3://my-elasticmapreduce-bucket/"

  scale_down_behavior    = "TERMINATE_AT_TASK_COMPLETION"
  step_concurrency_level = 3
  termination_protection = false
  visible_to_all_users   = true

  tags = {
    Terraform   = "true"
    Environment = "dev"
  }
}

Public Cluster w/ Instance Fleet

Configuration is the same as the private version shown above except for the changes noted below. Users should utilize S3 and EMR VPC endpoints for private connectivity and avoid data transfer charges across NAT gateways.

...
  ec2_attributes = {
    # Subnets should be public subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_ids = ["subnet-xyzde987", "subnet-slkjf456", "subnet-qeiru789"]
  }

  # Required for creating public cluster
  is_private_cluster = false
...

Private Cluster w/ Instance Group

module "emr" {
  source = "terraform-aws-modules/emr/aws"

  name = "example-instance-group"

  release_label = "emr-6.9.0"
  applications  = ["spark", "trino"]
  auto_termination_policy = {
    idle_timeout = 3600
  }

  bootstrap_action = {
    example = {
      name = "Just an example",
      path = "file:/bin/echo",
      args = ["Hello World!"]
    }
  }

  configurations_json = jsonencode([
    {
      "Classification" : "spark-env",
      "Configurations" : [
        {
          "Classification" : "export",
          "Properties" : {
            "JAVA_HOME" : "/usr/lib/jvm/java-1.8.0"
          }
        }
      ],
      "Properties" : {}
    }
  ])

  master_instance_group = {
    name           = "master-group"
    instance_count = 1
    instance_type  = "m5.xlarge"
  }

  core_instance_group = {
    name           = "core-group"
    instance_count = 2
    instance_type  = "c4.large"
  }

  task_instance_group = {
    name           = "task-group"
    instance_count = 2
    instance_type  = "c5.xlarge"
    bid_price      = "0.1"

    ebs_config = {
      size                 = 64
      type                 = "gp3"
      volumes_per_instance = 1
    }
    ebs_optimized = true
  }

  ebs_root_volume_size = 64
  ec2_attributes = {
    # Instance groups only support one Subnet/AZ
    # Subnets should be private subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_id = "subnet-abcde012"
  }
  vpc_id = "vpc-1234556abcdef"

  list_steps_states  = ["PENDING", "RUNNING", "FAILED", "INTERRUPTED"]
  log_uri            = "s3://my-elasticmapreduce-bucket/"

  scale_down_behavior    = "TERMINATE_AT_TASK_COMPLETION"
  step_concurrency_level = 3
  termination_protection = false
  visible_to_all_users   = true

  tags = {
    Terraform   = "true"
    Environment = "dev"
  }
}

Public Cluster w/ Instance Group

Configuration is the same as the private version shown above except for the changes noted below. Users should utilize S3 and EMR VPC endpoints for private connectivity and avoid data transfer charges across NAT gateways.

...
  ec2_attributes = {
    # Instance groups only support one Subnet/AZ
    # Subnets should be public subnets and tagged with
    # { "for-use-with-amazon-emr-managed-policies" = true }
    subnet_id = "subnet-xyzde987"
  }

  # Required for creating public cluster
  is_private_cluster = false
...

Conditional Creation

The following values are provided to toggle on/off creation of the associated resources as desired:

module "emr" {
  source = "terraform-aws-modules/emr/aws"

  # Disables all resources from being created
  create = false

  # Enables the creation of a security configuration for the cluster
  # Configuration should be supplied via the `security_configuration` variable
  create_security_configuration = true

  # Disables the creation of the role used by the service
  # An externally created role must be supplied via the `service_iam_role_arn` variable
  create_service_iam_role = false

  # Disables the creation of the role used for autoscaling
  # An externally created role can be supplied via the `autoscaling_iam_role_arn` variable
  create_autoscaling_iam_role = false

  # Disables the creation of the IAM role/instance profile used by the EC2 instances
  # An externally created IAM instance profile must be supplied
  # via the `iam_instance_profile_name` variable
  create_iam_instance_profile = false

  # Disables the creation of the security groups used by the EC2 instances. Users can supply
  # security groups for `master`, `slave`, and `service` security groups via the
  # `ec2_attributes` map variable. If not, the EMR service will create and associate
  # the necessary security groups. Note - the VPC will need to be tagged with
  # { "for-use-with-amazon-emr-managed-policies" = true } for EMR to create security groups
  # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-man-sec-groups.html
  create_managed_security_groups = false

  is_private_cluster = false
}

Examples

Examples codified under the `examples` directory are intended to give users references for how to use the module(s) as well as testing/validating changes to the source code of the module. If contributing to the project, please be sure to make any appropriate updates to the relevant examples to allow maintainers to test your changes and to keep the examples up to date for users. Thank you!

Requirements

Name Version
terraform >= 1.0
aws >= 4.42

Providers

Name Version
aws >= 4.42

Modules

No modules.

Resources

Name Type
aws_emr_cluster.this resource
aws_emr_instance_fleet.this resource
aws_emr_instance_group.this resource
aws_emr_managed_scaling_policy.this resource
aws_emr_security_configuration.this resource
aws_iam_instance_profile.this resource
aws_iam_policy.service_pass_role resource
aws_iam_role.autoscaling resource
aws_iam_role.instance_profile resource
aws_iam_role.service resource
aws_iam_role_policy_attachment.autoscaling resource
aws_iam_role_policy_attachment.instance_profile resource
aws_iam_role_policy_attachment.service resource
aws_iam_role_policy_attachment.service_pass_role resource
aws_security_group.master resource
aws_security_group.service resource
aws_security_group.slave resource
aws_security_group_rule.master resource
aws_security_group_rule.service resource
aws_security_group_rule.slave resource
aws_caller_identity.current data source
aws_emr_release_labels.this data source
aws_iam_policy_document.autoscaling data source
aws_iam_policy_document.instance_profile data source
aws_iam_policy_document.service data source
aws_iam_policy_document.service_pass_role data source
aws_partition.current data source
aws_region.current data source

Inputs

Name Description Type Default Required
additional_info JSON string for selecting additional features such as adding proxy information. Note: Currently there is no API to retrieve the value of this argument after EMR cluster creation from provider, therefore Terraform cannot detect drift from the actual EMR cluster if its value is changed outside Terraform string null no
applications A case-insensitive list of applications for Amazon EMR to install and configure when launching the cluster list(string) [] no
auto_termination_policy An auto-termination policy for an Amazon EMR cluster. An auto-termination policy defines the amount of idle time in seconds after which a cluster automatically terminates any {} no
autoscaling_iam_role_arn The ARN of an existing IAM role to use for autoscaling string null no
autoscaling_iam_role_description Description of the role string null no
autoscaling_iam_role_name Name to use on IAM role created string null no
bootstrap_action Ordered list of bootstrap actions that will be run before Hadoop is started on the cluster nodes any {} no
configurations List of configurations supplied for the EMR cluster you are creating. Supply a configuration object for applications to override their default configuration string null no
configurations_json JSON string for supplying list of configurations for the EMR cluster string null no
core_instance_fleet Configuration block to use an Instance Fleet for the core node type. Cannot be specified if any core_instance_group configuration blocks are set any {} no
core_instance_group Configuration block to use an Instance Group for the core node type any {} no
create Controls if resources should be created (affects nearly all resources) bool true no
create_autoscaling_iam_role Determines whether the autoscaling IAM role should be created bool true no
create_iam_instance_profile Determines whether the EC2 IAM role/instance profile should be created bool true no
create_managed_security_groups Determines whether managed security groups are created bool true no
create_security_configuration Determines whether a security configuration is created bool false no
create_service_iam_role Determines whether the service IAM role should be created bool true no
custom_ami_id Custom Amazon Linux AMI for the cluster (instead of an EMR-owned AMI). Available in Amazon EMR version 5.7.0 and later string null no
ebs_root_volume_size Size in GiB of the EBS root device volume of the Linux AMI that is used for each EC2 instance. Available in Amazon EMR version 4.x and later number null no
ec2_attributes Attributes for the EC2 instances running the job flow any {} no
iam_instance_profile_description Description of the EC2 IAM role/instance profile string null no
iam_instance_profile_name Name to use on EC2 IAM role/instance profile created string null no
iam_instance_profile_policies Map of IAM policies to attach to the EC2 IAM role/instance profile map(string)
{
"AmazonElasticMapReduceforEC2Role": "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
}
no
iam_role_path IAM role path string null no
iam_role_permissions_boundary ARN of the policy that is used to set the permissions boundary for the IAM role string null no
iam_role_tags A map of additional tags to add to the IAM role created map(string) {} no
iam_role_use_name_prefix Determines whether the IAM role name is used as a prefix bool true no
is_private_cluster Identifies whether the cluster is created in a private subnet bool true no
keep_job_flow_alive_when_no_steps Switch on/off run cluster with no steps or when all steps are complete (default is on) bool null no
kerberos_attributes Kerberos configuration for the cluster any {} no
list_steps_states List of step states used to filter returned steps list(string) [] no
log_encryption_kms_key_id AWS KMS customer master key (CMK) key ID or arn used for encrypting log files. This attribute is only available with EMR version 5.30.0 and later, excluding EMR 6.0.0 string null no
log_uri S3 bucket to write the log files of the job flow. If a value is not provided, logs are not created string null no
managed_scaling_policy Compute limit configuration for a Managed Scaling Policy any {} no
managed_security_group_name Name to use on managed security group created. Note - -master, -slave, and -service will be appended to this name to distinguish string null no
managed_security_group_tags A map of additional tags to add to the security group created map(string) {} no
managed_security_group_use_name_prefix Determines whether the security group name (managed_security_group_name) is used as a prefix bool true no
master_instance_fleet Configuration block to use an Instance Fleet for the master node type. Cannot be specified if any master_instance_group configuration blocks are set any {} no
master_instance_group Configuration block to use an Instance Group for the master node type any {} no
master_security_group_description Description of the security group created string "Managed master security group" no
master_security_group_rules Security group rules to add to the security group created any
{
"default": {
"cidr_blocks": [
"0.0.0.0/0"
],
"description": "Allow all egress traffic",
"from_port": 0,
"ipv6_cidr_blocks": [
"::/0"
],
"protocol": "-1",
"to_port": 0,
"type": "egress"
}
}
no
name Name of the job flow string "" no
release_label Release label for the Amazon EMR release string null no
release_label_filters Map of release label filters used to look up a release label any
{
"default": {
"prefix": "emr-6"
}
}
no
scale_down_behavior Way that individual Amazon EC2 instances terminate when an automatic scale-in activity occurs or an instance group is resized string "TERMINATE_AT_TASK_COMPLETION" no
security_configuration Security configuration to create, or attach if create_security_configuration is false. Only valid for EMR clusters with release_label 4.8.0 or greater string null no
security_configuration_name Name of the security configuration to create, or attach if create_security_configuration is false. Only valid for EMR clusters with release_label 4.8.0 or greater string null no
security_configuration_use_name_prefix Determines whether security_configuration_name is used as a prefix bool true no
service_iam_role_arn The ARN of an existing IAM role to use for the service string null no
service_iam_role_description Description of the role string null no
service_iam_role_name Name to use on IAM role created string null no
service_iam_role_policies Map of IAM policies to attach to the service role map(string)
{
"AmazonEMRServicePolicy_v2": "arn:aws:iam::aws:policy/service-role/AmazonEMRServicePolicy_v2"
}
no
service_pass_role_policy_description Description of the policy string null no
service_pass_role_policy_name Name to use on IAM policy created string null no
service_security_group_description Description of the security group created string "Managed service access security group" no
service_security_group_rules Security group rules to add to the security group created any {} no
slave_security_group_description Description of the security group created string "Managed slave security group" no
slave_security_group_rules Security group rules to add to the security group created any
{
"default": {
"cidr_blocks": [
"0.0.0.0/0"
],
"description": "Allow all egress traffic",
"from_port": 0,
"ipv6_cidr_blocks": [
"::/0"
],
"protocol": "-1",
"to_port": 0,
"type": "egress"
}
}
no
step Steps to run when creating the cluster any {} no
step_concurrency_level Number of steps that can be executed concurrently. You can specify a maximum of 256 steps. Only valid for EMR clusters with release_label 5.28.0 or greater (default is 1) number null no
tags A map of tags to add to all resources map(string) {} no
task_instance_fleet Configuration block to use an Instance Fleet for the task node type. Cannot be specified if any task_instance_group configuration blocks are set any {} no
task_instance_group Configuration block to use an Instance Group for the task node type any {} no
termination_protection Switch on/off termination protection (default is false, except when using multiple master nodes). Before attempting to destroy the resource when termination protection is enabled, this configuration must be applied with its value set to false bool null no
visible_to_all_users Whether the job flow is visible to all IAM users of the AWS account associated with the job flow. Default value is true bool null no
vpc_id The ID of the Amazon Virtual Private Cloud (Amazon VPC) where the security groups will be created string "" no

Outputs

Name Description
autoscaling_iam_role_arn Autoscaling IAM role ARN
autoscaling_iam_role_name Autoscaling IAM role name
autoscaling_iam_role_unique_id Stable and unique string identifying the autoscaling IAM role
cluster_arn The ARN of the cluster
cluster_core_instance_group_id Core node type Instance Group ID, if using Instance Group for this node type
cluster_id The ID of the cluster
cluster_master_instance_group_id Master node type Instance Group ID, if using Instance Group for this node type
cluster_master_public_dns The DNS name of the master node. If the cluster is on a private subnet, this is the private DNS name. On a public subnet, this is the public DNS name
iam_instance_profile_arn ARN assigned by AWS to the instance profile
iam_instance_profile_iam_role_arn Instance profile IAM role ARN
iam_instance_profile_iam_role_name Instance profile IAM role name
iam_instance_profile_iam_role_unique_id Stable and unique string identifying the instance profile IAM role
iam_instance_profile_id Instance profile's ID
iam_instance_profile_unique Stable and unique string identifying the IAM instance profile
managed_master_security_group_arn Amazon Resource Name (ARN) of the managed master security group
managed_master_security_group_id ID of the managed master security group
managed_service_access_security_group_arn Amazon Resource Name (ARN) of the managed service access security group
managed_service_access_security_group_id ID of the managed service access security group
managed_slave_security_group_arn Amazon Resource Name (ARN) of the managed slave security group
managed_slave_security_group_id ID of the managed slave security group
security_configuration_id The ID of the security configuration
security_configuration_name The name of the security configuration
service_iam_role_arn Service IAM role ARN
service_iam_role_name Service IAM role name
service_iam_role_unique_id Stable and unique string identifying the service IAM role

License

Apache-2.0 Licensed. See LICENSE.