diff --git a/README.md b/README.md index a9072bf4..f7558c6d 100644 --- a/README.md +++ b/README.md @@ -194,45 +194,12 @@ For a full list of configuration options see our [Helm readme](https://github.co The termination handler deployment requires some infrastructure to be setup before deploying the application. You'll need the following AWS infrastructure components: -1. AutoScaling Group Termination Lifecycle Hook -2. Amazon Simple Queue Service (SQS) Queue +1. Amazon Simple Queue Service (SQS) Queue +2. AutoScaling Group Termination Lifecycle Hook 3. Amazon EventBridge Rule 4. IAM Role for the aws-node-termination-handler Queue Processing Pods -#### 1. Setup a Termination Lifecycle Hook on an ASG: - -Here is the AWS CLI command to create a termination lifecycle hook on an existing ASG, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: - -``` -$ aws autoscaling put-lifecycle-hook \ - --lifecycle-hook-name=my-k8s-term-hook \ - --auto-scaling-group-name=my-k8s-asg \ - --lifecycle-transition=autoscaling:EC2_INSTANCE_TERMINATING \ - --default-result=CONTINUE \ - --heartbeat-timeout=300 -``` - -#### 2. Tag the ASGs: - -By default the aws-node-termination-handler will only manage terminations for ASGs tagged w/ `key=aws-node-termination-handler/managed` - -``` -$ aws autoscaling create-or-update-tags \ - --tags ResourceId=my-auto-scaling-group,ResourceType=auto-scaling-group,Key=aws-node-termination-handler/managed,Value=,PropagateAtLaunch=true -``` - -The value of the key does not matter. - -This functionality is helpful in accounts where there are ASGs that do not run kubernetes nodes or you do not want aws-node-termination-handler to manage their termination lifecycle. -However, if your account is dedicated to ASGs for your kubernetes cluster, then you can turn off the ASG tag check by setting the flag `--check-asg-tag-before-draining=false` or environment variable `CHECK_ASG_TAG_BEFORE_DRAINING=false`. - -You can also control what resources NTH manages by adding the resource ARNs to your Amazon EventBridge rules. - -Take a look at the docs on how to create rules that only manage certain ASGs [here](https://docs.aws.amazon.com/autoscaling/ec2/userguide/cloud-watch-events.html). - -See all the different events docs [here](https://docs.aws.amazon.com/eventbridge/latest/userguide/event-types.html#auto-scaling-event-types). - -#### 3. Create an SQS Queue: +#### 1. Create an SQS Queue: Here is the AWS CLI command to create an SQS queue to hold termination events from ASG and EC2, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: @@ -270,8 +237,59 @@ EOF $ aws sqs create-queue --queue-name "${SQS_QUEUE_NAME}" --attributes file:///tmp/queue-attributes.json ``` +If you are sending Lifecycle termination events from ASG directly to SQS, instead of through EventBridge, then you will also need to create an IAM service role to give Amazon EC2 Auto Scaling access to your SQS queue. Please follow [these linked instructions to create the IAM service role: link.](https://docs.aws.amazon.com/autoscaling/ec2/userguide/configuring-lifecycle-hook-notifications.html#sqs-notifications) +Note the ARNs for the SQS queue and the associated IAM role for Step 2. + +#### 2. Setup a Termination Lifecycle Hook on an ASG: + +Here is the AWS CLI command to create a termination lifecycle hook on an existing ASG when using EventBridge, although this should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: + +``` +$ aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-term-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_TERMINATING \ + --default-result=CONTINUE \ + --heartbeat-timeout=300 +``` + +If you want to avoid using EventBridge and instead send ASG Lifecycle events directly to SQS, instead use the following command, using the ARNs from Step 1: + +``` +$ aws autoscaling put-lifecycle-hook \ + --lifecycle-hook-name=my-k8s-term-hook \ + --auto-scaling-group-name=my-k8s-asg \ + --lifecycle-transition=autoscaling:EC2_INSTANCE_TERMINATING \ + --default-result=CONTINUE \ + --heartbeat-timeout=300 \ + --notification-target-arn \ + --role-arn +``` + +#### 3. Tag the ASGs: + +By default the aws-node-termination-handler will only manage terminations for ASGs tagged w/ `key=aws-node-termination-handler/managed` + +``` +$ aws autoscaling create-or-update-tags \ + --tags ResourceId=my-auto-scaling-group,ResourceType=auto-scaling-group,Key=aws-node-termination-handler/managed,Value=,PropagateAtLaunch=true +``` + +The value of the key does not matter. + +This functionality is helpful in accounts where there are ASGs that do not run kubernetes nodes or you do not want aws-node-termination-handler to manage their termination lifecycle. +However, if your account is dedicated to ASGs for your kubernetes cluster, then you can turn off the ASG tag check by setting the flag `--check-asg-tag-before-draining=false` or environment variable `CHECK_ASG_TAG_BEFORE_DRAINING=false`. + +You can also control what resources NTH manages by adding the resource ARNs to your Amazon EventBridge rules. + +Take a look at the docs on how to create rules that only manage certain ASGs [here](https://docs.aws.amazon.com/autoscaling/ec2/userguide/cloud-watch-events.html). + +See all the different events docs [here](https://docs.aws.amazon.com/eventbridge/latest/userguide/event-types.html#auto-scaling-event-types). + #### 4. Create Amazon EventBridge Rules +You may skip this step if sending events from ASG to SQS directly. + Here are AWS CLI commands to create Amazon EventBridge rules so that ASG termination events, Spot Interruptions, Instance state changes, Rebalance Recommendations, and AWS Health Scheduled Changes are sent to the SQS queue created in the previous step. This should really be configured via your favorite infrastructure-as-code tool like CloudFormation or Terraform: ``` diff --git a/pkg/monitor/sqsevent/asg-lifecycle-event.go b/pkg/monitor/sqsevent/asg-lifecycle-event.go index efe092b2..969d5234 100644 --- a/pkg/monitor/sqsevent/asg-lifecycle-event.go +++ b/pkg/monitor/sqsevent/asg-lifecycle-event.go @@ -32,11 +32,11 @@ import ( "id": "782d5b4c-0f6f-1fd6-9d62-ecf6aed0a470", "detail-type": "EC2 Instance-terminate Lifecycle Action", "source": "aws.autoscaling", - "account": "896453262834", + "account": "123456789012", "time": "2020-07-01T22:19:58Z", "region": "us-east-1", "resources": [ - "arn:aws:autoscaling:us-east-1:896453262834:autoScalingGroup:26e7234b-03a4-47fb-b0a9-2b241662774e:autoScalingGroupName/testt1.demo-0a20f32c.kops.sh" + "arn:aws:autoscaling:us-east-1:123456789012:autoScalingGroup:26e7234b-03a4-47fb-b0a9-2b241662774e:autoScalingGroupName/testt1.demo-0a20f32c.kops.sh" ], "detail": { "LifecycleActionToken": "0befcbdb-6ecd-498a-9ff7-ae9b54447cd6", @@ -55,6 +55,8 @@ type LifecycleDetail struct { LifecycleHookName string `json:"LifecycleHookName"` EC2InstanceID string `json:"EC2InstanceId"` LifecycleTransition string `json:"LifecycleTransition"` + RequestID string `json:"RequestId"` + Time string `json:"Time"` } func (m SQSMonitor) asgTerminationToInterruptionEvent(event *EventBridgeEvent, message *sqs.Message) (*monitor.InterruptionEvent, error) { diff --git a/pkg/monitor/sqsevent/sqs-monitor.go b/pkg/monitor/sqsevent/sqs-monitor.go index 5cc3f6fd..13c0afda 100644 --- a/pkg/monitor/sqsevent/sqs-monitor.go +++ b/pkg/monitor/sqsevent/sqs-monitor.go @@ -101,9 +101,38 @@ func (m SQSMonitor) processSQSMessage(message *sqs.Message) (*EventBridgeEvent, event := EventBridgeEvent{} err := json.Unmarshal([]byte(*message.Body), &event) + if err != nil { + return &event, err + } + + if len(event.DetailType) == 0 { + event, err = m.processLifecycleEventFromASG(message) + } + return &event, err } +// processLifecycleEventFromASG checks for a Lifecycle event from ASG to SQS, and wraps it in an EventBridgeEvent +func (m SQSMonitor) processLifecycleEventFromASG(message *sqs.Message) (EventBridgeEvent, error) { + eventBridgeEvent := EventBridgeEvent{} + lifecycleEvent := LifecycleDetail{} + err := json.Unmarshal([]byte(*message.Body), &lifecycleEvent) + + if err != nil || lifecycleEvent.LifecycleTransition != "autoscaling:EC2_INSTANCE_TERMINATING" { + log.Err(err).Msg("only lifecycle termination events from ASG to SQS are supported outside EventBridge") + err = fmt.Errorf("unsupported message type (%s)", message.String()) + return eventBridgeEvent, err + } + + eventBridgeEvent.Source = "aws.autoscaling" + eventBridgeEvent.Time = lifecycleEvent.Time + eventBridgeEvent.ID = lifecycleEvent.RequestID + eventBridgeEvent.Detail, err = json.Marshal(lifecycleEvent) + + log.Debug().Msg("processing lifecycle termination event from ASG") + return eventBridgeEvent, err +} + // processEventBridgeEvent processes an EventBridge event and returns interruption event wrappers func (m SQSMonitor) processEventBridgeEvent(eventBridgeEvent *EventBridgeEvent, message *sqs.Message) []InterruptionEventWrapper { interruptionEventWrappers := []InterruptionEventWrapper{} @@ -150,7 +179,7 @@ func (m SQSMonitor) processInterruptionEvents(interruptionEventWrappers []Interr case eventWrapper.Err != nil: // Log errors and record as failed events. Don't delete the message in order to allow retries log.Err(eventWrapper.Err).Msg("ignoring interruption event due to error") - failedInterruptionEventsCount++ // seems useless + failedInterruptionEventsCount++ case eventWrapper.InterruptionEvent == nil: log.Debug().Msg("dropping non-actionable interruption event") diff --git a/pkg/monitor/sqsevent/sqs-monitor_test.go b/pkg/monitor/sqsevent/sqs-monitor_test.go index e702c691..b236f405 100644 --- a/pkg/monitor/sqsevent/sqs-monitor_test.go +++ b/pkg/monitor/sqsevent/sqs-monitor_test.go @@ -67,6 +67,16 @@ var asgLifecycleEvent = sqsevent.EventBridgeEvent{ }`), } +var asgLifecycleEventFromSQS = sqsevent.LifecycleDetail{ + LifecycleHookName: "test-nth-asg-to-sqs", + RequestID: "3775fac9-93c3-7ead-8713-159816566000", + LifecycleTransition: "autoscaling:EC2_INSTANCE_TERMINATING", + AutoScalingGroupName: "my-asg", + Time: "2022-01-31T23:07:47.872Z", + EC2InstanceID: "i-040107f6ba000e5ee", + LifecycleActionToken: "b4dd0f5b-0ef2-4479-9dad-6c55f027000e", +} + var rebalanceRecommendationEvent = sqsevent.EventBridgeEvent{ Version: "0", ID: "5d5555d5-dd55-5555-5555-5555dd55d55d", @@ -87,7 +97,7 @@ func TestKind(t *testing.T) { h.Assert(t, sqsevent.SQSMonitor{}.Kind() == sqsevent.SQSTerminateKind, "SQSMonitor kind should return the kind constant for the event") } -func TestMonitor_Success(t *testing.T) { +func TestMonitor_EventBridgeSuccess(t *testing.T) { spotItnEventNoTime := spotItnEvent spotItnEventNoTime.Time = "" for _, event := range []sqsevent.EventBridgeEvent{spotItnEvent, asgLifecycleEvent, spotItnEventNoTime, rebalanceRecommendationEvent} { @@ -134,6 +144,53 @@ func TestMonitor_Success(t *testing.T) { } } +func TestMonitor_AsgDirectToSqsSuccess(t *testing.T) { + event := asgLifecycleEventFromSQS + eventBytes, err := json.Marshal(&event) + h.Ok(t, err) + eventStr := string(eventBytes) + msg := sqs.Message{Body: &eventStr} + h.Ok(t, err) + messages := []*sqs.Message{ + &msg, + } + sqsMock := h.MockedSQS{ + ReceiveMessageResp: sqs.ReceiveMessageOutput{Messages: messages}, + ReceiveMessageErr: nil, + } + dnsNodeName := "ip-10-0-0-157.us-east-2.compute.internal" + ec2Mock := h.MockedEC2{ + DescribeInstancesResp: getDescribeInstancesResp(dnsNodeName, true, true), + } + drainChan := make(chan monitor.InterruptionEvent, 1) + + sqsMonitor := sqsevent.SQSMonitor{ + SQS: sqsMock, + EC2: ec2Mock, + ManagedAsgTag: "aws-node-termination-handler/managed", + ASG: mockIsManagedTrue(nil), + CheckIfManaged: true, + QueueURL: "https://test-queue", + InterruptionChan: drainChan, + } + + err = sqsMonitor.Monitor() + h.Ok(t, err) + + select { + case result := <-drainChan: + h.Equals(t, sqsevent.SQSTerminateKind, result.Kind) + h.Equals(t, result.NodeName, dnsNodeName) + h.Assert(t, result.PostDrainTask != nil, "PostDrainTask should have been set") + h.Assert(t, result.PreDrainTask != nil, "PreDrainTask should have been set") + err = result.PostDrainTask(result, node.Node{}) + h.Ok(t, err) + default: + h.Ok(t, fmt.Errorf("Expected an event to be generated")) + } + +} + func TestMonitor_DrainTasks(t *testing.T) { testEvents := []sqsevent.EventBridgeEvent{spotItnEvent, asgLifecycleEvent, rebalanceRecommendationEvent} messages := make([]*sqs.Message, 0, len(testEvents))