Skip to content

Commit f07a2da

Browse files
added eks node auto-repair (#65) (#66)
Co-authored-by: ankush-sqops <ankush.upadhyay@squareops.com>
1 parent d882967 commit f07a2da

File tree

6 files changed

+86
-27
lines changed

6 files changed

+86
-27
lines changed

README.md

+4
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ module "managed_node_group_addons" {
7272
managed_ng_ebs_volume_size = 50
7373
managed_ng_instance_types = ["t3a.large", "t2.large", "t2.xlarge", "t3.large", "m5.large"]
7474
managed_ng_kms_policy_arn = module.eks.kms_policy_arn
75+
managed_ng_node_autorepair = {
76+
enabled = false
77+
enable_node_monitoring_agent_addon = true
78+
}
7579
eks_cluster_name = module.eks.eks_cluster_name
7680
worker_iam_role_name = module.eks.worker_iam_role_name
7781
worker_iam_role_arn = module.eks.worker_iam_role_arn

examples/complete/README.md

+2-2
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ This directory contains a complete example that demonstrates the usage of the Te
2626
| <a name="module_key_pair_vpn"></a> [key\_pair\_vpn](#module\_key\_pair\_vpn) | squareops/keypair/aws | 1.0.2 |
2727
| <a name="module_key_pair_eks"></a> [key\_pair\_eks](#module\_key\_pair\_eks) | squareops/keypair/aws | 1.0.2 |
2828
| <a name="module_vpc"></a> [vpc](#module\_vpc) | squareops/vpc/aws | 3.4.1 |
29-
| <a name="module_eks"></a> [eks](#module\_eks) | squareops/eks/aws | 5.1.1 |
30-
| <a name="module_managed_node_group_addons"></a> [managed\_node\_group\_addons](#module\_managed\_node\_group\_addons) | squareops/eks/aws//modules/managed-nodegroup | 5.1.1 |
29+
| <a name="module_eks"></a> [eks](#module\_eks) | squareops/eks/aws | 5.3.0 |
30+
| <a name="module_managed_node_group_addons"></a> [managed\_node\_group\_addons](#module\_managed\_node\_group\_addons) | squareops/eks/aws//modules/managed-nodegroup | 5.3.0 |
3131
| <a name="module_fargate_profle"></a> [fargate\_profle](#module\_fargate\_profle) | squareops/eks/aws//modules/fargate-profile | n/a |
3232

3333
## Resources

examples/complete/main.tf

+29-25
Original file line numberDiff line numberDiff line change
@@ -1,33 +1,33 @@
11
locals {
2-
region = "us-west-1"
2+
region = "us-east-2"
33
kms_deletion_window_in_days = 7
44
kms_key_rotation_enabled = true
55
is_enabled = true
66
multi_region = false
77
environment = "stage"
88
name = "sqops"
99
auto_assign_public_ip = true
10-
vpc_availability_zones = ["us-west-1a", "us-west-1b"]
10+
vpc_availability_zones = ["us-east-2a", "us-east-2b"]
1111
vpc_public_subnet_enabled = true
1212
vpc_private_subnet_enabled = true
1313
vpc_database_subnet_enabled = true
1414
vpc_intra_subnet_enabled = true
15-
vpc_one_nat_gateway_per_az = true
15+
vpc_one_nat_gateway_per_az = false
1616
vpn_server_instance_type = "t3a.small"
1717
vpc_flow_log_enabled = false
1818
kms_user = null
1919
vpc_cidr = "10.10.0.0/16"
20-
vpn_server_enabled = true
21-
cluster_version = "1.30"
20+
vpn_server_enabled = false
21+
cluster_version = "1.31"
2222
cluster_log_types = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
2323
cluster_log_retention_in_days = 30
2424
managed_ng_capacity_type = "SPOT" # Choose the capacity type ("SPOT" or "ON_DEMAND")
25-
cluster_endpoint_private_access = false
25+
cluster_endpoint_private_access = true
2626
cluster_endpoint_public_access = true
2727
cluster_endpoint_public_access_cidrs = ["0.0.0.0/0"]
2828
ebs_volume_size = 50
2929
fargate_profile_name = "app"
30-
vpc_s3_endpoint_enabled = true
30+
vpc_s3_endpoint_enabled = false
3131
vpc_ecr_endpoint_enabled = false
3232
vpc_public_subnets_counts = 2
3333
vpc_private_subnets_counts = 2
@@ -139,7 +139,7 @@ module "vpc" {
139139

140140
module "eks" {
141141
source = "squareops/eks/aws"
142-
version = "5.2.1"
142+
version = "5.3.0"
143143
access_entry_enabled = true
144144
access_entries = {
145145
"example" = {
@@ -185,22 +185,26 @@ module "eks" {
185185
}
186186

187187
module "managed_node_group_addons" {
188-
source = "squareops/eks/aws//modules/managed-nodegroup"
189-
version = "5.2.1"
190-
depends_on = [module.vpc, module.eks]
191-
managed_ng_name = "Infra"
192-
managed_ng_min_size = 2
193-
managed_ng_max_size = 5
194-
managed_ng_desired_size = 2
195-
vpc_subnet_ids = [module.vpc.private_subnets[0]]
196-
environment = local.environment
197-
managed_ng_kms_key_arn = module.kms.key_arn
198-
managed_ng_capacity_type = local.managed_ng_capacity_type
199-
managed_ng_ebs_volume_size = local.ebs_volume_size
200-
managed_ng_ebs_volume_type = "gp3"
201-
managed_ng_ebs_encrypted = true
202-
managed_ng_instance_types = ["t3a.large", "t2.large", "t2.xlarge", "t3.large", "m5.large"] # Pass instance type according to the ami architecture.
203-
managed_ng_kms_policy_arn = module.eks.kms_policy_arn
188+
source = "squareops/eks/aws//modules/managed-nodegroup"
189+
version = "5.3.0"
190+
depends_on = [module.vpc, module.eks]
191+
managed_ng_name = "Infra"
192+
managed_ng_min_size = 2
193+
managed_ng_max_size = 5
194+
managed_ng_desired_size = 2
195+
vpc_subnet_ids = [module.vpc.private_subnets[0]]
196+
environment = local.environment
197+
managed_ng_kms_key_arn = module.kms.key_arn
198+
managed_ng_capacity_type = local.managed_ng_capacity_type
199+
managed_ng_ebs_volume_size = local.ebs_volume_size
200+
managed_ng_ebs_volume_type = "gp3"
201+
managed_ng_ebs_encrypted = true
202+
managed_ng_instance_types = ["t3a.large", "t3.large", "t3.medium"] # Pass instance type according to the ami architecture.
203+
managed_ng_kms_policy_arn = module.eks.kms_policy_arn
204+
managed_ng_node_autorepair = {
205+
enabled = false
206+
enable_node_monitoring_agent_addon = true
207+
}
204208
eks_cluster_name = module.eks.cluster_name
205209
worker_iam_role_name = module.eks.worker_iam_role_name
206210
worker_iam_role_arn = module.eks.worker_iam_role_arn
@@ -212,7 +216,7 @@ module "managed_node_group_addons" {
212216
"Addons-Services" = "true"
213217
}
214218
tags = local.additional_aws_tags
215-
custom_ami_id = "" # Optional, if not passed terraform will automatically select the latest supported ami id
219+
custom_ami_id = "" # Optional; if not passed, Terraform automatically selects the latest supported AMI ID
216220
aws_managed_node_group_arch = local.aws_managed_node_group_arch # optional if "custom_ami_id" is passed
217221
enable_bottlerocket_ami = local.enable_bottlerocket_ami # Set it to false if using Amazon Linux AMIs
218222
bottlerocket_node_config = {

modules/managed-nodegroup/README.md

+2
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ No modules.
3131

3232
| Name | Type |
3333
|------|------|
34+
| [aws_eks_addon.node_monitoring_addon](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_addon) | resource |
3435
| [aws_eks_node_group.managed_ng](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/eks_node_group) | resource |
3536
| [aws_launch_template.eks_template](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/resources/launch_template) | resource |
3637
| [aws_ami.launch_template_ami](https://registry.terraform.io/providers/hashicorp/aws/latest/docs/data-sources/ami) | data source |
@@ -75,6 +76,7 @@ No modules.
7576
| <a name="input_enable_bottlerocket_ami"></a> [enable\_bottlerocket\_ami](#input\_enable\_bottlerocket\_ami) | Set to true to enable the use of Bottlerocket AMIs for instances. | `bool` | `false` | no |
7677
| <a name="input_bottlerocket_node_config"></a> [bottlerocket\_node\_config](#input\_bottlerocket\_node\_config) | Bottlerocket Node configurations for EKS. | `map(any)` | <pre>{<br> "bottlerocket_eks_enable_control_container": true,<br> "bottlerocket_eks_node_admin_container_enabled": false<br>}</pre> | no |
7778
| <a name="input_custom_ami_id"></a> [custom\_ami\_id](#input\_custom\_ami\_id) | worker node AMI id to be created | `string` | `""` | no |
79+
| <a name="input_managed_ng_node_autorepair"></a> [managed\_ng\_node\_autorepair](#input\_managed\_ng\_node\_autorepair) | Node auto-repair settings for the managed node group. `enabled` turns on node auto repair for the node group; `enable_node_monitoring_agent_addon` also installs the eks-node-monitoring-agent add-on, which is created only when `enabled` is true. | <pre>object({<br> enabled = bool<br> enable_node_monitoring_agent_addon = bool<br> })</pre> | <pre>{<br> "enable_node_monitoring_agent_addon": false,<br> "enabled": false<br>}</pre> | no |
7880

7981
## Outputs
8082

modules/managed-nodegroup/main.tf

+37
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ resource "aws_eks_node_group" "managed_ng" {
114114
capacity_type = var.managed_ng_capacity_type
115115
instance_types = var.managed_ng_instance_types
116116
force_update_version = true
117+
node_repair_config {
118+
enabled = var.managed_ng_node_autorepair.enabled
119+
}
117120
launch_template {
118121
id = aws_launch_template.eks_template.id
119122
version = aws_launch_template.eks_template.latest_version
@@ -128,3 +131,37 @@ resource "aws_eks_node_group" "managed_ng" {
128131
var.tags
129132
)
130133
}
134+
135+
resource "aws_eks_addon" "node_monitoring_addon" {
136+
count = var.managed_ng_node_autorepair.enabled == true && var.managed_ng_node_autorepair.enable_node_monitoring_agent_addon == true ? 1 : 0
137+
cluster_name = var.eks_cluster_name
138+
addon_name = "eks-node-monitoring-agent"
139+
addon_version = "v1.0.1-eksbuild.2"
140+
resolve_conflicts_on_update = "PRESERVE"
141+
configuration_values = jsonencode({
142+
dcgmAgent = {
143+
resources = {
144+
limits = {
145+
cpu = "50m"
146+
memory = "100Mi"
147+
}
148+
requests = {
149+
cpu = "10m"
150+
memory = "30Mi"
151+
}
152+
}
153+
}
154+
monitoringAgent = {
155+
resources = {
156+
limits = {
157+
cpu = "50m"
158+
memory = "100Mi"
159+
}
160+
requests = {
161+
cpu = "10m"
162+
memory = "30Mi"
163+
}
164+
}
165+
}
166+
})
167+
}

modules/managed-nodegroup/variables.tf

+12
Original file line numberDiff line numberDiff line change
@@ -204,3 +204,15 @@ variable "custom_ami_id" {
204204
description = "worker node AMI id to be created"
205205
default = ""
206206
}
207+
208+
variable "managed_ng_node_autorepair" {
209+
type = object({
210+
enabled = bool
211+
enable_node_monitoring_agent_addon = bool
212+
})
213+
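description = "Node auto-repair settings for the managed node group. 'enabled' turns on node auto repair for the node group; 'enable_node_monitoring_agent_addon' also installs the eks-node-monitoring-agent add-on, which is created only when 'enabled' is true."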
214+
default = {
215+
enabled = false
216+
enable_node_monitoring_agent_addon = false
217+
}
218+
}

0 commit comments

Comments
 (0)