Skip to content

Commit

Permalink
feat: Add Native Spot Termination Handling (#2546)
Browse files Browse the repository at this point in the history
  • Loading branch information
jonathan-innis authored Nov 3, 2022
1 parent 9acf3b2 commit bde080e
Show file tree
Hide file tree
Showing 50 changed files with 3,730 additions and 63 deletions.
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${K
--set clusterName=${CLUSTER_NAME} \
--set clusterEndpoint=${CLUSTER_ENDPOINT} \
--set aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \
--set aws.enableInterruptionHandling=true \
--create-namespace

# CR for local builds of Karpenter
Expand Down
2 changes: 1 addition & 1 deletion charts/karpenter/templates/clusterrole.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ rules:
# Read
- apiGroups: ["karpenter.k8s.aws"]
resources: ["awsnodetemplates"]
verbs: ["get", "list", "watch"]
verbs: ["get", "list", "watch", "patch"]
- apiGroups: ["admissionregistration.k8s.io"]
resources: ["validatingwebhookconfigurations"]
verbs: ["update"]
Expand Down
1 change: 1 addition & 0 deletions charts/karpenter/templates/configmap.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ metadata:
data:
"batchMaxDuration": "{{ .Values.controller.batchMaxDuration }}"
"batchIdleDuration": "{{ .Values.controller.batchIdleDuration }}"
"aws.enableInterruptionHandling": "{{ .Values.aws.enableInterruptionHandling }}"
{{- range $tag, $value := .Values.aws.tags }}
"aws.tags.{{ $tag }}": "{{ $value }}"
{{- end }}
3 changes: 3 additions & 0 deletions charts/karpenter/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -140,5 +140,8 @@ clusterEndpoint: ""
aws:
# -- The default instance profile to use when launching nodes on AWS
defaultInstanceProfile: ""
# -- enableInterruptionHandling is currently in BETA and is disabled by default. Enabling interruption handling may
# require additional permissions on the controller service account. Additional permissions are outlined in the docs
enableInterruptionHandling: false
# -- The global tags to use on all AWS infrastructure resources (launch templates, instances, SQS queue, etc.)
tags:
7 changes: 2 additions & 5 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@ package main

import (
"github.com/samber/lo"
"k8s.io/utils/clock"

awscloudprovider "github.com/aws/karpenter/pkg/cloudprovider"
"github.com/aws/karpenter/pkg/context"
Expand Down Expand Up @@ -46,22 +45,20 @@ func main() {
lo.Must0(operator.AddHealthzCheck("cloud-provider", awsCloudProvider.LivenessProbe))
cloudProvider := metrics.Decorate(awsCloudProvider)

clusterState := state.NewCluster(operator.SettingsStore.InjectSettings(ctx), operator.Clock, operator.GetClient(), cloudProvider)
operator.
WithControllers(ctx, corecontrollers.NewControllers(
ctx,
clock.RealClock{},
operator.Clock,
operator.GetClient(),
operator.KubernetesInterface,
clusterState,
state.NewCluster(operator.SettingsStore.InjectSettings(ctx), operator.Clock, operator.GetClient(), cloudProvider),
operator.EventRecorder,
operator.SettingsStore,
cloudProvider,
)...).
WithWebhooks(corewebhooks.NewWebhooks()...).
WithControllers(ctx, controllers.NewControllers(
awsCtx,
clusterState,
)...).
WithWebhooks(webhooks.NewWebhooks()...).
Start(ctx)
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,10 @@ require (
github.com/onsi/gomega v1.24.0
github.com/patrickmn/go-cache v2.1.0+incompatible
github.com/pelletier/go-toml/v2 v2.0.5
github.com/prometheus/client_golang v1.12.2
github.com/samber/lo v1.33.0
go.uber.org/multierr v1.8.0
go.uber.org/zap v1.23.0
k8s.io/api v0.25.2
k8s.io/apimachinery v0.25.2
k8s.io/client-go v0.25.2
Expand Down Expand Up @@ -70,7 +72,6 @@ require (
github.com/modern-go/reflect2 v1.0.2 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/pkg/errors v0.9.1 // indirect
github.com/prometheus/client_golang v1.13.0 // indirect
github.com/prometheus/client_model v0.2.0 // indirect
github.com/prometheus/common v0.37.0 // indirect
github.com/prometheus/procfs v0.8.0 // indirect
Expand All @@ -79,7 +80,6 @@ require (
go.opencensus.io v0.23.0 // indirect
go.uber.org/atomic v1.9.0 // indirect
go.uber.org/automaxprocs v1.4.0 // indirect
go.uber.org/zap v1.23.0 // indirect
golang.org/x/crypto v0.0.0-20220315160706-3147a52a75dd // indirect
golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 // indirect
golang.org/x/net v0.1.0 // indirect
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,8 @@ github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5Fsn
github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
github.com/prometheus/client_golang v1.11.0/go.mod h1:Z6t4BnS23TR94PD6BsDNk8yVqroYurpAkEiz0P2BEV0=
github.com/prometheus/client_golang v1.12.1/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY=
github.com/prometheus/client_golang v1.13.0 h1:b71QUfeo5M8gq2+evJdTPfZhYMAU0uKPkyPJ7TPsloU=
github.com/prometheus/client_golang v1.13.0/go.mod h1:vTeo+zgvILHsnnj/39Ou/1fPN5nJFOEMgftOUOmlvYQ=
github.com/prometheus/client_golang v1.12.2 h1:51L9cDoUHVrXx4zWYlcLQIZ+d+VXHgqnYKkIuq4g/34=
github.com/prometheus/client_golang v1.12.2/go.mod h1:3Z9XVyYiZYEO+YQWt3RD2R3jrbd179Rt297l4aS6nDY=
github.com/prometheus/client_model v0.0.0-20180712105110-5c3871d89910/go.mod h1:MbSGuTsp3dbXC40dX6PRTWyKYBIrTGTE9sqQNg2J8bo=
github.com/prometheus/client_model v0.0.0-20190129233127-fd36f4220a90/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
Expand Down
36 changes: 22 additions & 14 deletions pkg/apis/config/settings/settings.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,27 +35,34 @@ var Registration = &config.Registration{
DefaultData: lo.Must(defaultSettings.Data()),
}

var defaultSettings = Settings{}
var defaultSettings = Settings{
EnableInterruptionHandling: false,
Tags: map[string]string{},
}

type Settings struct {
Tags map[string]string
EnableInterruptionHandling bool `json:"aws.enableInterruptionHandling,string"`
Tags map[string]string `json:"aws.tags,omitempty"`
}

func (s Settings) UnmarshalJSON(raw []byte) error {
func (s Settings) MarshalJSON() ([]byte, error) {
type internal Settings
d := map[string]string{}
if err := json.Unmarshal(raw, &d); err != nil {
return err

// Store a value of tags locally, so we can marshal the rest of the struct
tags := s.Tags
s.Tags = nil

raw, err := json.Marshal(internal(s))
if err != nil {
return nil, fmt.Errorf("marshaling settings, %w", err)
}
if err := AsMap("aws.tags", &s.Tags)(d); err != nil {
return err
if err = json.Unmarshal(raw, &d); err != nil {
return nil, fmt.Errorf("unmarshalling settings into map, %w", err)
}
return nil
}

func (s Settings) MarshalJSON() ([]byte, error) {
d := map[string]string{}
if err := FromMap(s.Tags)("aws.tags", &d); err != nil {
return nil, err
// Rewind the tags from the map into separate values
if err = FromMap(tags)("aws.tags", &d); err != nil {
return nil, fmt.Errorf("rewinding tags into map, %w", err)
}
return json.Marshal(d)
}
Expand All @@ -74,6 +81,7 @@ func NewSettingsFromConfigMap(cm *v1.ConfigMap) (Settings, error) {
s := defaultSettings

if err := configmap.Parse(cm.Data,
configmap.AsBool("aws.enableInterruptionHandling", &s.EnableInterruptionHandling),
AsMap("aws.tags", &s.Tags),
); err != nil {
// Failing to parse means that there is some error in the Settings, so we should crash
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/v1alpha1/register.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,8 @@ var (
LabelInstanceGPUCount = LabelDomain + "/instance-gpu-count"
LabelInstanceGPUMemory = LabelDomain + "/instance-gpu-memory"
LabelInstanceAMIID = LabelDomain + "/instance-ami-id"

InterruptionInfrastructureFinalizer = Group + "/interruption-infrastructure"
)

var (
Expand Down
17 changes: 14 additions & 3 deletions pkg/controllers/controllers.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,22 @@ limitations under the License.
package controllers

import (
"github.com/aws/karpenter-core/pkg/controllers/state"
"github.com/aws/aws-sdk-go/service/eventbridge"
"github.com/aws/aws-sdk-go/service/sqs"

"github.com/aws/karpenter-core/pkg/operator/controller"
awscontext "github.com/aws/karpenter/pkg/context"
"github.com/aws/karpenter/pkg/controllers/interruption"
"github.com/aws/karpenter/pkg/controllers/nodetemplate"
"github.com/aws/karpenter/pkg/controllers/providers"
)

func NewControllers(ctx awscontext.Context, cluster *state.Cluster) []controller.Controller {
return []controller.Controller{}
func NewControllers(ctx awscontext.Context) []controller.Controller {
sqsProvider := providers.NewSQS(sqs.New(ctx.Session))
eventBridgeProvider := providers.NewEventBridge(eventbridge.New(ctx.Session), sqsProvider)

return []controller.Controller{
nodetemplate.NewController(ctx.KubeClient, sqsProvider, eventBridgeProvider),
interruption.NewController(ctx.KubeClient, ctx.Clock, ctx.EventRecorder, sqsProvider, ctx.UnavailableOfferingsCache),
}
}
Loading

0 comments on commit bde080e

Please sign in to comment.