From 9febd5f35c9481cbc55b3c4bf59ee0b18e647bd7 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 2 Sep 2022 17:35:34 -0700 Subject: [PATCH 01/55] Initial setup of SQS client and message handlers --- go.mod | 2 + pkg/cloudprovider/aws/cloudprovider.go | 22 ++- .../controllers/notification/controller.go | 143 ++++++++++++++++++ .../aggregatedparser/aggregatedparser.go | 69 +++++++++ .../notification/event/metadata.go | 49 ++++++ .../controllers/notification/event/noop.go | 45 ++++++ .../rebalancerecommendation/v0/handler.go | 47 ++++++ .../rebalancerecommendation/v0/parser.go | 50 ++++++ .../rebalancerecommendation/v0/unmarshal.go | 44 ++++++ .../event/scheduledchange/v1/handler.go | 51 +++++++ .../event/scheduledchange/v1/parser.go | 68 +++++++++ .../event/scheduledchange/v1/unmarshal.go | 89 +++++++++++ .../event/spotinterruption/v1/handler.go | 47 ++++++ .../event/spotinterruption/v1/parser.go | 50 ++++++ .../event/spotinterruption/v1/unmarshal.go | 46 ++++++ .../event/statechange/v1/handler.go | 47 ++++++ .../event/statechange/v1/parser.go | 62 ++++++++ .../event/statechange/v1/unmarshal.go | 46 ++++++ .../controllers/notification/event/types.go | 50 ++++++ .../aws/controllers/notification/sqs.go | 106 +++++++++++++ .../aws/controllers/notification/types.go | 30 ++++ pkg/cloudprovider/types.go | 5 +- 22 files changed, 1165 insertions(+), 3 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/notification/controller.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/metadata.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/noop.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/handler.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/unmarshal.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/handler.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/parser.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/unmarshal.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/handler.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/parser.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/unmarshal.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/handler.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/parser.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/unmarshal.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/event/types.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/sqs.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/types.go diff --git a/go.mod b/go.mod index 1fcb6e26f79f..d6fd41121238 100644 --- a/go.mod +++ b/go.mod @@ -29,6 +29,8 @@ require ( sigs.k8s.io/controller-runtime v0.13.0 ) +require k8s.io/utils v0.0.0-20210802155522-efc7438f0176 + require ( contrib.go.opencensus.io/exporter/ocagent v0.7.1-0.20200907061046-05415f1de66d // indirect 
contrib.go.opencensus.io/exporter/prometheus v0.4.0 // indirect diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 10258ed34f32..373bbefd5807 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -30,6 +30,7 @@ import ( "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" + "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/ssm" "github.com/patrickmn/go-cache" "github.com/samber/lo" @@ -37,6 +38,7 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/transport" + "k8s.io/utils/clock" "knative.dev/pkg/apis" "knative.dev/pkg/logging" "knative.dev/pkg/ptr" @@ -47,6 +49,8 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws/amifamily" "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" + "github.com/aws/karpenter/pkg/events" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/project" @@ -75,7 +79,9 @@ var _ cloudprovider.CloudProvider = (*CloudProvider)(nil) type CloudProvider struct { instanceTypeProvider *InstanceTypeProvider instanceProvider *InstanceProvider + sqsProvider *notification.SQSProvider kubeClient k8sClient.Client + recorder events.Recorder } func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider { @@ -105,8 +111,12 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud if err := checkEC2Connectivity(ec2api); err != nil { logging.FromContext(ctx).Errorf("Checking EC2 API connectivity, %s", err) } + sqsapi := sqs.New(sess) subnetProvider := NewSubnetProvider(ec2api) instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) + + // TODO: Change this queue url value to a useful value + sqsProvider := notification.NewSQSProvider(sqsapi, "dummyqueueurl") cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, @@ -120,10 +130,16 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud options.StartAsync, ), ), - kubeClient: options.KubeClient, + sqsProvider: sqsProvider, + kubeClient: options.KubeClient, } v1alpha5.ValidateHook = cloudprovider.Validate v1alpha5.DefaultHook = cloudprovider.Default + + // Inject all the controllers for this cloudprovider + // Controllers will start when signaled by the StartAsync channel + cloudprovider.injectControllers(ctx, options.StartAsync) + return cloudprovider } @@ -138,6 +154,10 @@ func checkEC2Connectivity(api *ec2.EC2) error { return err } +func (c *CloudProvider) injectControllers(ctx context.Context, startAsync <-chan struct{}) { + notification.NewController(ctx, clock.RealClock{}, c.kubeClient, c.sqsProvider, c.recorder, startAsync) +} + // Create a node given the constraints. 
func (c *CloudProvider) Create(ctx context.Context, nodeRequest *cloudprovider.NodeRequest) (*v1.Node, error) { aws, err := c.getProvider(ctx, nodeRequest.Template.Provider, nodeRequest.Template.ProviderRef) diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go new file mode 100644 index 000000000000..d4423207883a --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -0,0 +1,143 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package notification + +import ( + "context" + "fmt" + "time" + + sqsapi "github.com/aws/aws-sdk-go/service/sqs" + "go.uber.org/multierr" + "k8s.io/utils/clock" + "knative.dev/pkg/logging" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" + "github.com/aws/karpenter/pkg/events" +) + +// Controller is the consolidation controller. It is not a standard controller-runtime controller in that it doesn't +// have a reconcile method. +type Controller struct { + kubeClient client.Client + recorder events.Recorder + clock clock.Clock + provider *SQSProvider + parser event.Parser +} + +// pollingPeriod that we go to the SQS queue to check if there are any new events +const pollingPeriod = 2 * time.Second + +func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, + sqsProvider *SQSProvider, recorder events.Recorder, startAsync <-chan struct{}) *Controller { + c := &Controller{ + clock: clk, + kubeClient: kubeClient, + recorder: recorder, + provider: sqsProvider, + parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + } + + go func() { + select { + case <-ctx.Done(): + return + case <-startAsync: + c.run(ctx) + } + }() + + return c +} + +func (c *Controller) run(ctx context.Context) { + logger := logging.FromContext(ctx).Named("notification") + ctx = logging.WithLogger(ctx, logger) + for { + select { + case <-ctx.Done(): + logger.Infof("Shutting down") + return + case <-time.After(pollingPeriod): + logging.FromContext(ctx).Info("Here") + } + } +} + +func (c *Controller) Poll(ctx context.Context) error { + sqsMessages, err := c.provider.GetSQSMessages(ctx) + if err != nil { + return err + } + + for _, msg := range sqsMessages { + e := c.handleMessage(ctx, msg) + err = multierr.Append(err, e) + } + return nil +} + +func (c *Controller) handleMessage(ctx context.Context, msg *sqsapi.Message) (err error) { + fmt.Printf("Handling the message for %#v\n", msg) + + // No message to parse in this case + if msg == nil || msg.Body == nil { + return nil + } + evt := c.parser.Parse(ctx, *msg.Body) + evtAction := actionForEvent(evt) + + // TODO: hand some of this work off to a batcher that will handle the spinning up of a new node + // and the deletion of the old node separate from this reconciliation loop + if evtAction != Actions.NoAction { + for _, 
ec2InstanceID := range evt.EC2InstanceIDs() { + e := c.handleInstance(ctx, ec2InstanceID, evtAction) + err = multierr.Append(err, e) + } + } + if err != nil { + return err + } + return c.provider.DeleteSQSMessage(ctx, msg) +} + +// TODO: Handle the instance appropriately, this should be handled with a batcher +func (c *Controller) handleInstance(ctx context.Context, ec2InstanceID string, evtAction Action) error { + logging.FromContext(ctx).Infof("Got a message for ec2 instance id %s", ec2InstanceID) + return nil +} + +func actionForEvent(evt event.Interface) Action { + switch evt.Kind() { + case event.Kinds.RebalanceRecommendation: + return Actions.NoAction + + case event.Kinds.ScheduledChange: + return Actions.CordonAndDrain + + case event.Kinds.SpotInterruption: + return Actions.CordonAndDrain + + // TODO: understand what the state change action is + case event.Kinds.StateChange: + return Actions.NoAction + + default: + return Actions.NoAction + } +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go new file mode 100644 index 000000000000..10a3d35503d0 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go @@ -0,0 +1,69 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package aggregatedparser + +import ( + "context" + "encoding/json" + + "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" + rebalancerecommendationv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0" + scheduledchangev1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1" + spotinterruptionv1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1" + statechangev1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1" +) + +var ( + DefaultParsers = []event.Parser{ + statechangev1.Parser{}, + spotinterruptionv1.Parser{}, + scheduledchangev1.Parser{}, + rebalancerecommendationv0.Parser{}, + } +) + +type AggregatedParser []event.Parser + +func NewAggregatedParser(parsers ...event.Parser) AggregatedParser { + return parsers +} + +func (p AggregatedParser) Parse(ctx context.Context, str string) event.Interface { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("event.parser")) + + if str == "" { + logging.FromContext(ctx).Warn("nothing to parse") + return event.NoOp{} + } + + for _, parser := range p { + if a := parser.Parse(ctx, str); a != nil { + return a + } + } + + logging.FromContext(ctx).Error("failed to parse") + + md := event.AWSMetadata{} + if err := json.Unmarshal([]byte(str), &md); err != nil { + logging.FromContext(ctx). + With("error", err). 
+ Error("failed to unmarshal message metadata") + return event.NoOp{} + } + return event.NoOp(md) +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/metadata.go b/pkg/cloudprovider/aws/controllers/notification/event/metadata.go new file mode 100644 index 000000000000..9846e482e51d --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/metadata.go @@ -0,0 +1,49 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package event + +import ( + "time" + + "go.uber.org/zap/zapcore" +) + +type AWSMetadata struct { + Account string `json:"account"` + DetailType string `json:"detail-type"` + ID string `json:"id"` + Region string `json:"region"` + Resources []string `json:"resources"` + Source string `json:"source"` + Time time.Time `json:"time"` + Version string `json:"version"` +} + +func (e AWSMetadata) MarshalLogObject(enc zapcore.ObjectEncoder) (err error) { + enc.AddString("source", e.Source) + enc.AddString("detail-type", e.DetailType) + enc.AddString("id", e.ID) + enc.AddTime("time", e.Time) + enc.AddString("region", e.Region) + _ = enc.AddArray("resources", zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) error { + for _, resource := range e.Resources { + enc.AppendString(resource) + } + return nil + })) + enc.AddString("version", e.Version) + enc.AddString("account", e.Account) + return err +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/noop.go b/pkg/cloudprovider/aws/controllers/notification/event/noop.go new file mode 100644 index 000000000000..c2709c59353d --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/noop.go @@ -0,0 +1,45 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
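For reference, a minimal sketch (not part of the patch) of how the aggregated parser above dispatches a raw SQS message body: each registered parser unmarshals the body and accepts it only when the source, detail-type, and version match its schema; otherwise the next parser is tried, and an unparseable body falls back to event.NoOp. The payload below is shaped only to satisfy the spot interruption parser's checks as written in this patch; the account, region, timestamp, and instance ID are placeholder values, and the import path is the one introduced here.

package main

import (
	"context"
	"fmt"

	"github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser"
)

func main() {
	// Placeholder message body; field names mirror event.AWSMetadata and the
	// spot interruption detail struct from this patch, all values are made up.
	body := `{
		"version": "1",
		"id": "12345678-1234-1234-1234-123456789012",
		"detail-type": "EC2 Spot Instance Interruption Warning",
		"source": "aws.ec2",
		"account": "123456789012",
		"time": "2022-09-02T17:35:34Z",
		"region": "us-west-2",
		"resources": ["arn:aws:ec2:us-west-2:123456789012:instance/i-0123456789abcdef0"],
		"detail": {"instance-id": "i-0123456789abcdef0", "instance-action": "terminate"}
	}`

	parser := aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...)
	evt := parser.Parse(context.Background(), body)
	fmt.Println(evt.Kind(), evt.EC2InstanceIDs()) // spotInterruption [i-0123456789abcdef0]
}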
+*/ + +package event + +import ( + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" +) + +type NoOp AWSMetadata + +func (NoOp) EventID() string { + return "" +} + +func (NoOp) EC2InstanceIDs() []string { + return []string{} +} + +func (NoOp) Kind() Kind { + return Kinds.Noop +} + +func (n NoOp) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(AWSMetadata(n)).AddTo(enc) + return nil +} + +func (NoOp) StartTime() time.Time { + return time.Now() +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/handler.go new file mode 100644 index 000000000000..bc156bd8b722 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/handler.go @@ -0,0 +1,47 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v0 + +import ( + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +type EC2InstanceRebalanceRecommendation AWSEvent + +func (e EC2InstanceRebalanceRecommendation) EventID() string { + return e.ID +} + +func (e EC2InstanceRebalanceRecommendation) EC2InstanceIDs() []string { + return []string{e.Detail.InstanceID} +} + +func (EC2InstanceRebalanceRecommendation) Kind() event.Kind { + return event.Kinds.RebalanceRecommendation +} + +func (e EC2InstanceRebalanceRecommendation) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(AWSEvent(e)).AddTo(enc) + return nil +} + +func (e EC2InstanceRebalanceRecommendation) StartTime() time.Time { + return e.Time +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go new file mode 100644 index 000000000000..1b96574713bf --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go @@ -0,0 +1,50 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v0 + +import ( + "context" + "encoding/json" + + "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +const ( + source = "aws.ec2" + detailType = "EC2 Instance Rebalance Recommendation" + version = "0" +) + +type Parser struct{} + +func (Parser) Parse(ctx context.Context, str string) event.Interface { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("rebalanceRecommendation.v0")) + + evt := EC2InstanceRebalanceRecommendation{} + if err := json.Unmarshal([]byte(str), &evt); err != nil { + logging.FromContext(ctx). + With("error", err). + Error("failed to unmarshal EC2 instance rebalance recommendation event") + return nil + } + + if evt.Source != source || evt.DetailType != detailType || evt.Version != version { + return nil + } + + return evt +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/unmarshal.go new file mode 100644 index 000000000000..dee22372f1f6 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/unmarshal.go @@ -0,0 +1,44 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v0 + +import ( + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +// AWSEvent contains the properties defined by +// https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/rebalance-recommendations.html#monitor-rebalance-recommendations +type AWSEvent struct { + event.AWSMetadata + + Detail EC2InstanceRebalanceRecommendationDetail `json:"detail"` +} + +func (e AWSEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(e.AWSMetadata).AddTo(enc) + return enc.AddObject("detail", e.Detail) +} + +type EC2InstanceRebalanceRecommendationDetail struct { + InstanceID string `json:"instance-id"` +} + +func (e EC2InstanceRebalanceRecommendationDetail) MarshalLogObject(enc zapcore.ObjectEncoder) error { + enc.AddString("instance-id", e.InstanceID) + return nil +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/handler.go new file mode 100644 index 000000000000..aa5187b768b0 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/handler.go @@ -0,0 +1,51 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package v1 + +import ( + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +type AWSHealthEvent AWSEvent + +func (e AWSHealthEvent) EventID() string { + return e.ID +} + +func (e AWSHealthEvent) EC2InstanceIDs() []string { + ids := make([]string, len(e.Detail.AffectedEntities)) + for i, entity := range e.Detail.AffectedEntities { + ids[i] = entity.EntityValue + } + return ids +} + +func (AWSHealthEvent) Kind() event.Kind { + return event.Kinds.ScheduledChange +} + +func (e AWSHealthEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(AWSEvent(e)).AddTo(enc) + return nil +} + +func (e AWSHealthEvent) StartTime() time.Time { + return e.Time +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/parser.go new file mode 100644 index 000000000000..590c89987c74 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/parser.go @@ -0,0 +1,68 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "context" + "encoding/json" + + "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +const ( + source = "aws.health" + detailType = "AWS Health Event" + version = "1" + acceptedService = "EC2" + acceptedEventTypeCategory = "scheduledChange" +) + +type Parser struct{} + +func (Parser) Parse(ctx context.Context, str string) event.Interface { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("scheduledChange.v1")) + + evt := AWSHealthEvent{} + if err := json.Unmarshal([]byte(str), &evt); err != nil { + logging.FromContext(ctx). + With("error", err). + Error("failed to unmarshal AWS health event") + return nil + } + + if evt.Source != source || evt.DetailType != detailType || evt.Version != version { + return nil + } + + if evt.Detail.Service != acceptedService { + logging.FromContext(ctx). + With("eventDetails", evt). + With("acceptedService", acceptedService). + Warn("ignoring AWS health event") + return nil + } + + if evt.Detail.EventTypeCategory != acceptedEventTypeCategory { + logging.FromContext(ctx). + With("eventDetails", evt). + With("acceptedEventTypeCategory", acceptedEventTypeCategory). + Warn("ignoring AWS health event") + return nil + } + + return evt +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/unmarshal.go new file mode 100644 index 000000000000..c1fcde03803d --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/unmarshal.go @@ -0,0 +1,89 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "go.uber.org/multierr" + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +// AWSEvent contains the properties defined in AWS EventBridge schema +// aws.health@AWSHealthEvent v1. +type AWSEvent struct { + event.AWSMetadata + + Detail AWSHealthEventDetail `json:"detail"` +} + +func (e AWSEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(e.AWSMetadata).AddTo(enc) + return enc.AddObject("detail", e.Detail) +} + +type AWSHealthEventDetail struct { + EventARN string `json:"eventArn"` + EventTypeCode string `json:"eventTypeCode"` + Service string `json:"service"` + EventDescription []EventDescription `json:"eventDescription"` + StartTime string `json:"startTime"` + EndTime string `json:"endTime"` + EventTypeCategory string `json:"eventTypeCategory"` + AffectedEntities []AffectedEntity `json:"affectedEntities"` +} + +func (e AWSHealthEventDetail) MarshalLogObject(enc zapcore.ObjectEncoder) (err error) { + enc.AddString("eventArn", e.EventARN) + enc.AddString("eventTypeCode", e.EventTypeCode) + enc.AddString("eventTypeCategory", e.EventTypeCategory) + enc.AddString("service", e.Service) + enc.AddString("startTime", e.StartTime) + enc.AddString("endTime", e.EndTime) + err = multierr.Append(err, enc.AddArray("eventDescription", zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) (err error) { + for _, desc := range e.EventDescription { + err = multierr.Append(err, enc.AppendObject(desc)) + } + return err + }))) + err = multierr.Append(err, enc.AddArray("affectedEntities", zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) (err error) { + for _, entity := range e.AffectedEntities { + err = multierr.Append(err, enc.AppendObject(entity)) + } + return err + }))) + return err +} + +type EventDescription struct { + LatestDescription string `json:"latestDescription"` + Language string `json:"language"` +} + +func (e EventDescription) MarshalLogObject(enc zapcore.ObjectEncoder) error { + enc.AddString("latestDescription", e.LatestDescription) + enc.AddString("language", e.Language) + return nil +} + +type AffectedEntity struct { + EntityValue string `json:"entityValue"` +} + +func (e AffectedEntity) MarshalLogObject(enc zapcore.ObjectEncoder) error { + enc.AddString("entityValue", e.EntityValue) + return nil +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/handler.go new file mode 100644 index 000000000000..925ffd037cb5 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/handler.go @@ -0,0 +1,47 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +type EC2SpotInstanceInterruptionWarning AWSEvent + +func (e EC2SpotInstanceInterruptionWarning) EventID() string { + return e.ID +} + +func (e EC2SpotInstanceInterruptionWarning) EC2InstanceIDs() []string { + return []string{e.Detail.InstanceID} +} + +func (EC2SpotInstanceInterruptionWarning) Kind() event.Kind { + return event.Kinds.SpotInterruption +} + +func (e EC2SpotInstanceInterruptionWarning) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(AWSEvent(e)).AddTo(enc) + return nil +} + +func (e EC2SpotInstanceInterruptionWarning) StartTime() time.Time { + return e.Time +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/parser.go new file mode 100644 index 000000000000..58c30721267b --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/parser.go @@ -0,0 +1,50 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "context" + "encoding/json" + + "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +const ( + source = "aws.ec2" + detailType = "EC2 Spot Instance Interruption Warning" + version = "1" +) + +type Parser struct{} + +func (Parser) Parse(ctx context.Context, str string) event.Interface { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("spotInterruption.v1")) + + evt := EC2SpotInstanceInterruptionWarning{} + if err := json.Unmarshal([]byte(str), &evt); err != nil { + logging.FromContext(ctx). + With("error", err). + Error("failed to unmarshal EC2 spot instance interruption event") + return nil + } + + if evt.Source != source || evt.DetailType != detailType || evt.Version != version { + return nil + } + + return evt +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/unmarshal.go new file mode 100644 index 000000000000..7b3452af21d4 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/unmarshal.go @@ -0,0 +1,46 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +// AWSEvent contains the properties defined in AWS EventBridge schema +// aws.ec2@EC2SpotInstanceInterruptionWarning v1. +type AWSEvent struct { + event.AWSMetadata + + Detail EC2SpotInstanceInterruptionWarningDetail `json:"detail"` +} + +func (e AWSEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(e.AWSMetadata).AddTo(enc) + return enc.AddObject("detail", e.Detail) +} + +type EC2SpotInstanceInterruptionWarningDetail struct { + InstanceID string `json:"instance-id"` + InstanceAction string `json:"instance-action"` +} + +func (e EC2SpotInstanceInterruptionWarningDetail) MarshalLogObject(enc zapcore.ObjectEncoder) error { + enc.AddString("instance-id", e.InstanceID) + enc.AddString("instance-action", e.InstanceAction) + return nil +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/handler.go new file mode 100644 index 000000000000..3fe1ac8ca057 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/handler.go @@ -0,0 +1,47 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "time" + + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +type EC2InstanceStateChangeNotification AWSEvent + +func (e EC2InstanceStateChangeNotification) EventID() string { + return e.ID +} + +func (e EC2InstanceStateChangeNotification) EC2InstanceIDs() []string { + return []string{e.Detail.InstanceID} +} + +func (EC2InstanceStateChangeNotification) Kind() event.Kind { + return event.Kinds.StateChange +} + +func (e EC2InstanceStateChangeNotification) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(AWSEvent(e)).AddTo(enc) + return nil +} + +func (e EC2InstanceStateChangeNotification) StartTime() time.Time { + return e.Time +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/parser.go new file mode 100644 index 000000000000..00d02db74e97 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/parser.go @@ -0,0 +1,62 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "context" + "encoding/json" + "strings" + + "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +const ( + source = "aws.ec2" + detailType = "EC2 Instance State-change Notification" + version = "1" + acceptedStates = "stopping,stopped,shutting-down,terminated" +) + +var acceptedStatesList = strings.Split(acceptedStates, ",") + +type Parser struct{} + +func (Parser) Parse(ctx context.Context, str string) event.Interface { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("stateChange.v1")) + + evt := EC2InstanceStateChangeNotification{} + if err := json.Unmarshal([]byte(str), &evt); err != nil { + logging.FromContext(ctx). + With("error", err). + Error("failed to unmarshal EC2 state-change event") + return nil + } + + if evt.Source != source || evt.DetailType != detailType || evt.Version != version { + return nil + } + + if !strings.Contains(acceptedStates, strings.ToLower(evt.Detail.State)) { + logging.FromContext(ctx). + With("eventDetails", evt). + With("acceptedStates", acceptedStatesList). + Warn("ignorning EC2 state-change notification") + return nil + } + + return evt +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/unmarshal.go new file mode 100644 index 000000000000..c3a84fdb7ec4 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/unmarshal.go @@ -0,0 +1,46 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1 + +import ( + "go.uber.org/zap" + "go.uber.org/zap/zapcore" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" +) + +// AWSEvent contains the properties defined in AWS EventBridge schema +// aws.ec2@EC2InstanceStateChangeNotification v1. 
+type AWSEvent struct { + event.AWSMetadata + + Detail EC2InstanceStateChangeNotificationDetail `json:"detail"` +} + +func (e AWSEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { + zap.Inline(e.AWSMetadata).AddTo(enc) + return enc.AddObject("detail", e.Detail) +} + +type EC2InstanceStateChangeNotificationDetail struct { + InstanceID string `json:"instance-id"` + State string `json:"state"` +} + +func (e EC2InstanceStateChangeNotificationDetail) MarshalLogObject(enc zapcore.ObjectEncoder) error { + enc.AddString("instance-id", e.InstanceID) + enc.AddString("state", e.State) + return nil +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/types.go b/pkg/cloudprovider/aws/controllers/notification/event/types.go new file mode 100644 index 000000000000..09e8be62c20a --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/event/types.go @@ -0,0 +1,50 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package event + +import ( + "context" + + "go.uber.org/zap/zapcore" +) + +type Parser interface { + Parse(context.Context, string) Interface +} + +type Interface interface { + zapcore.ObjectMarshaler + + EC2InstanceIDs() []string + Kind() Kind +} + +type Kind = string + +var Kinds = struct { + AutoScalingTermination, + RebalanceRecommendation, + ScheduledChange, + SpotInterruption, + StateChange, + Noop Kind +}{ + AutoScalingTermination: Kind("autoScalingTermination"), + RebalanceRecommendation: Kind("rebalanceRecommendation"), + ScheduledChange: Kind("scheduledChange"), + SpotInterruption: Kind("spotInterruption"), + StateChange: Kind("stateChange"), + Noop: Kind("noop"), +} diff --git a/pkg/cloudprovider/aws/controllers/notification/sqs.go b/pkg/cloudprovider/aws/controllers/notification/sqs.go new file mode 100644 index 000000000000..756ab682b0d8 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/sqs.go @@ -0,0 +1,106 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package notification + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/sqs" + "github.com/aws/aws-sdk-go/service/sqs/sqsiface" + "knative.dev/pkg/logging" +) + +type SQSProvider struct { + receiveMessageInput *sqs.ReceiveMessageInput + deleteMessageInput *sqs.DeleteMessageInput + client sqsiface.SQSAPI +} + +func NewSQSProvider(client sqsiface.SQSAPI, queueURL string) *SQSProvider { + receiveMessageInput := &sqs.ReceiveMessageInput{ + QueueUrl: aws.String(queueURL), + MaxNumberOfMessages: aws.Int64(10), + VisibilityTimeout: aws.Int64(20), // Seconds + WaitTimeSeconds: aws.Int64(20), // Seconds, maximum for long polling + AttributeNames: []*string{ + aws.String(sqs.MessageSystemAttributeNameSentTimestamp), + }, + MessageAttributeNames: []*string{ + aws.String(sqs.QueueAttributeNameAll), + }, + } + + deleteMessageInput := &sqs.DeleteMessageInput{ + QueueUrl: aws.String(queueURL), + } + + return &SQSProvider{ + receiveMessageInput: receiveMessageInput, + deleteMessageInput: deleteMessageInput, + client: client, + } +} + +func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("sqsClient.getMessages")) + + result, err := s.client.ReceiveMessageWithContext(ctx, s.receiveMessageInput) + if err != nil { + logging.FromContext(ctx). + With("error", err). + Error("failed to fetch messages") + return nil, err + } + + return result.Messages, nil +} + +func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("sqsClient.deleteMessage")) + + input, err := deepCopyDeleteMessage(s.deleteMessageInput) + if err != nil { + return fmt.Errorf("error copying delete message input, %w", err) + } + input.ReceiptHandle = msg.ReceiptHandle + + _, err = s.client.DeleteMessageWithContext(ctx, input) + if err != nil { + logging.FromContext(ctx). + With("error", err). + Error("failed to delete message") + return err + } + + return nil +} + +func deepCopyDeleteMessage(input *sqs.DeleteMessageInput) (*sqs.DeleteMessageInput, error) { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + if err := enc.Encode(input); err != nil { + return nil, err + } + dec := json.NewDecoder(&buf) + var cp sqs.DeleteMessageInput + if err := dec.Decode(&cp); err != nil { + return nil, err + } + return &cp, nil +} diff --git a/pkg/cloudprovider/aws/controllers/notification/types.go b/pkg/cloudprovider/aws/controllers/notification/types.go new file mode 100644 index 000000000000..c77c1f3aa315 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/types.go @@ -0,0 +1,30 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
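A short usage sketch (not part of the patch) for the SQSProvider defined in sqs.go above: receive a batch with long polling, handle each message, and delete it only after it has been processed. The AWS session and queue URL are placeholders. A later patch in this series moves this type into the aws package, but the methods shown here (NewSQSProvider, GetSQSMessages, DeleteSQSMessage) are the same.

package main

import (
	"context"
	"fmt"

	"github.com/aws/aws-sdk-go/aws/session"
	sqsapi "github.com/aws/aws-sdk-go/service/sqs"

	"github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification"
)

func main() {
	ctx := context.Background()
	// Placeholder session and queue URL; in the cloud provider these come from
	// the shared session and the configured queue.
	client := sqsapi.New(session.Must(session.NewSession()))
	provider := notification.NewSQSProvider(client, "https://sqs.us-west-2.amazonaws.com/000000000000/example-queue")

	// ReceiveMessage waits up to WaitTimeSeconds (20s) and returns at most 10 messages.
	msgs, err := provider.GetSQSMessages(ctx)
	if err != nil {
		panic(err)
	}
	for _, msg := range msgs {
		fmt.Println(*msg.Body)
		// Delete only after the message has been handled successfully.
		_ = provider.DeleteSQSMessage(ctx, msg)
	}
}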
+*/ + +package notification + +type Action string + +var Actions = struct { + CordonAndDrain, + Cordon, + NoAction Action +}{ + CordonAndDrain: Action("CordonAndDrain"), + Cordon: Action("Cordon"), + NoAction: Action("NoAction"), +} + +type MessageParser interface { +} diff --git a/pkg/cloudprovider/types.go b/pkg/cloudprovider/types.go index 0006d115350c..1a0516ae7897 100644 --- a/pkg/cloudprovider/types.go +++ b/pkg/cloudprovider/types.go @@ -18,12 +18,12 @@ import ( "context" "github.com/samber/lo" - v1 "k8s.io/api/core/v1" "k8s.io/client-go/kubernetes" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/events" "github.com/aws/karpenter/pkg/scheduling" ) @@ -31,10 +31,11 @@ import ( type Options struct { ClientSet *kubernetes.Clientset KubeClient client.Client + Recorder events.Recorder // WebhookOnly is true if the cloud provider is being used for its validation/defaulting only by the webhook. In // this case it may not need to perform some initialization and the StartAsync channel will not be closed. WebhookOnly bool - // StartAsync is a channel that is closed when leader election has been won. This is a signal to start any async + // StartAsync is a channel that is closed when leader election has been won. This is a signal to start any async // processing that should only occur while the cloud provider is the leader. StartAsync <-chan struct{} } From 5f188d1ef7bdee093c0774829cab249ee4bf0edf Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 6 Sep 2022 15:39:30 -0700 Subject: [PATCH 02/55] Inject notification and infrastructure controller for AWS provider --- cmd/controller/main.go | 15 +- go.mod | 2 +- pkg/apis/provisioning/v1alpha5/labels.go | 1 + pkg/cloudprovider/aws/cloudprovider.go | 18 +- .../controllers/infrastructure/controller.go | 80 +++++++++ .../controllers/notification/controller.go | 156 ++++++++++++++---- .../aggregatedparser/aggregatedparser.go | 29 ++-- .../scheduledchange/{v1 => v0}/handler.go | 2 +- .../scheduledchange/{v1 => v0}/parser.go | 4 +- .../scheduledchange/{v1 => v0}/unmarshal.go | 2 +- .../spotinterruption/{v1 => v0}/handler.go | 2 +- .../spotinterruption/{v1 => v0}/parser.go | 4 +- .../spotinterruption/{v1 => v0}/unmarshal.go | 2 +- .../event/statechange/{v1 => v0}/handler.go | 2 +- .../event/statechange/{v1 => v0}/parser.go | 9 +- .../event/statechange/{v1 => v0}/unmarshal.go | 2 +- .../aws/controllers/notification/types.go | 30 ---- pkg/cloudprovider/aws/controllers/recorder.go | 53 ++++++ .../aws/{controllers/notification => }/sqs.go | 2 +- pkg/cloudprovider/fake/cloudprovider.go | 4 + pkg/cloudprovider/types.go | 2 - pkg/controllers/consolidation/controller.go | 67 +++++++- pkg/controllers/consolidation/types.go | 19 ++- pkg/controllers/controllers.go | 17 +- pkg/events/dedupe.go | 22 +-- pkg/events/loadshedding.go | 20 +-- pkg/events/recorder.go | 24 +-- pkg/test/eventrecorder.go | 2 + 28 files changed, 432 insertions(+), 160 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/infrastructure/controller.go rename pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/{v1 => v0}/handler.go (99%) rename pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/{v1 => v0}/parser.go (97%) rename pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/{v1 => v0}/unmarshal.go (99%) rename pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/{v1 => v0}/handler.go (99%) rename 
pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/{v1 => v0}/parser.go (97%) rename pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/{v1 => v0}/unmarshal.go (99%) rename pkg/cloudprovider/aws/controllers/notification/event/statechange/{v1 => v0}/handler.go (99%) rename pkg/cloudprovider/aws/controllers/notification/event/statechange/{v1 => v0}/parser.go (88%) rename pkg/cloudprovider/aws/controllers/notification/event/statechange/{v1 => v0}/unmarshal.go (99%) delete mode 100644 pkg/cloudprovider/aws/controllers/notification/types.go create mode 100644 pkg/cloudprovider/aws/controllers/recorder.go rename pkg/cloudprovider/aws/{controllers/notification => }/sqs.go (99%) diff --git a/cmd/controller/main.go b/cmd/controller/main.go index dfa93a732339..1d46e3c47f56 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -19,11 +19,22 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws" + awscontrollers "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/controllers" ) func main() { - controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) cloudprovider.CloudProvider { - return aws.NewCloudProvider(ctx, options) + controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, func(context.Context, *controllers.ControllerOptions)) { + provider := aws.NewCloudProvider(ctx, options) + injectControllers := func(ctx context.Context, opts *controllers.ControllerOptions) { + recorder := awscontrollers.NewRecorder(opts.Recorder) + + // Injecting the controllers that will start when opts.StartAsync is triggered + notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider, recorder, opts.Provisioner, opts.Cluster, opts.StartAsync) + infrastructure.NewController(ctx, opts.Clock, opts.KubeClient, recorder, opts.Cluster, opts.StartAsync) + } + return provider, injectControllers }) } diff --git a/go.mod b/go.mod index d6fd41121238..f63645ce8607 100644 --- a/go.mod +++ b/go.mod @@ -29,7 +29,7 @@ require ( sigs.k8s.io/controller-runtime v0.13.0 ) -require k8s.io/utils v0.0.0-20210802155522-efc7438f0176 +require k8s.io/utils v0.0.0-20210802155522-efc7438f0176 // indirect require ( contrib.go.opencensus.io/exporter/ocagent v0.7.1-0.20200907061046-05415f1de66d // indirect diff --git a/pkg/apis/provisioning/v1alpha5/labels.go b/pkg/apis/provisioning/v1alpha5/labels.go index b3ddf0a51189..5f38819d5d46 100644 --- a/pkg/apis/provisioning/v1alpha5/labels.go +++ b/pkg/apis/provisioning/v1alpha5/labels.go @@ -33,6 +33,7 @@ var ( ProvisionerNameLabelKey = Group + "/provisioner-name" DoNotEvictPodAnnotationKey = Group + "/do-not-evict" DoNotConsolidateNodeAnnotationKey = KarpenterLabelDomain + "/do-not-consolidate" + MarkedForDeletionAnnotationKey = KarpenterLabelDomain + "/marked-for-delete" EmptinessTimestampAnnotationKey = Group + "/emptiness-timestamp" TerminationFinalizer = Group + "/termination" diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 373bbefd5807..d67b2eb904c9 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -38,7 +38,6 @@ import ( "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" 
"k8s.io/client-go/transport" - "k8s.io/utils/clock" "knative.dev/pkg/apis" "knative.dev/pkg/logging" "knative.dev/pkg/ptr" @@ -49,8 +48,6 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws/amifamily" "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" - "github.com/aws/karpenter/pkg/events" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/project" @@ -79,9 +76,8 @@ var _ cloudprovider.CloudProvider = (*CloudProvider)(nil) type CloudProvider struct { instanceTypeProvider *InstanceTypeProvider instanceProvider *InstanceProvider - sqsProvider *notification.SQSProvider kubeClient k8sClient.Client - recorder events.Recorder + SQSProvider *SQSProvider } func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider { @@ -116,7 +112,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) // TODO: Change this queue url value to a useful value - sqsProvider := notification.NewSQSProvider(sqsapi, "dummyqueueurl") + sqsProvider := NewSQSProvider(sqsapi, "https://sqs.us-west-2.amazonaws.com/330700974597/test-stack-Queue-VimlxX8fIySZ") cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, @@ -130,16 +126,12 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud options.StartAsync, ), ), - sqsProvider: sqsProvider, + SQSProvider: sqsProvider, kubeClient: options.KubeClient, } v1alpha5.ValidateHook = cloudprovider.Validate v1alpha5.DefaultHook = cloudprovider.Default - // Inject all the controllers for this cloudprovider - // Controllers will start when signaled by the StartAsync channel - cloudprovider.injectControllers(ctx, options.StartAsync) - return cloudprovider } @@ -154,10 +146,6 @@ func checkEC2Connectivity(api *ec2.EC2) error { return err } -func (c *CloudProvider) injectControllers(ctx context.Context, startAsync <-chan struct{}) { - notification.NewController(ctx, clock.RealClock{}, c.kubeClient, c.sqsProvider, c.recorder, startAsync) -} - // Create a node given the constraints. func (c *CloudProvider) Create(ctx context.Context, nodeRequest *cloudprovider.NodeRequest) (*v1.Node, error) { aws, err := c.getProvider(ctx, nodeRequest.Template.Provider, nodeRequest.Template.ProviderRef) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go new file mode 100644 index 000000000000..3bcb2eb9a981 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -0,0 +1,80 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package infrastructure + +import ( + "context" + "time" + + "k8s.io/apimachinery/pkg/util/clock" + "knative.dev/pkg/logging" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" + "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/state" +) + +// Controller is the consolidation controller. It is not a standard controller-runtime controller in that it doesn't +// have a reconcile method. +type Controller struct { + kubeClient client.Client + provisioner *provisioning.Provisioner + cluster *state.Cluster + recorder controllers.Recorder + clock clock.Clock + parser event.Parser +} + +// pollingPeriod that we go to the SQS queue to check if there are any new events +const pollingPeriod = 2 * time.Second + +func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, recorder controllers.Recorder, + cluster *state.Cluster, startAsync <-chan struct{}) *Controller { + c := &Controller{ + kubeClient: kubeClient, + cluster: cluster, + recorder: recorder, + clock: clk, + parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + } + + go func() { + select { + case <-ctx.Done(): + return + case <-startAsync: + c.run(ctx) + } + }() + + return c +} + +func (c *Controller) run(ctx context.Context) { + logger := logging.FromContext(ctx).Named("notification") + ctx = logging.WithLogger(ctx, logger) + for { + select { + case <-ctx.Done(): + logger.Infof("Shutting down") + return + case <-time.After(pollingPeriod): + logging.FromContext(ctx).Infof("polled after the polling period") + } + } +} diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index d4423207883a..bad3535699d4 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -17,40 +17,62 @@ package notification import ( "context" "fmt" + "regexp" "time" sqsapi "github.com/aws/aws-sdk-go/service/sqs" "go.uber.org/multierr" - "k8s.io/utils/clock" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/clock" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" - "github.com/aws/karpenter/pkg/events" + "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/state" ) +type Action = string + +var Actions = struct { + CordonAndDrain, + Cordon, + NoAction Action +}{ + CordonAndDrain: "CordonAndDrain", + Cordon: "Cordon", + NoAction: "NoAction", +} + // Controller is the consolidation controller. It is not a standard controller-runtime controller in that it doesn't // have a reconcile method. 
type Controller struct { - kubeClient client.Client - recorder events.Recorder - clock clock.Clock - provider *SQSProvider - parser event.Parser + kubeClient client.Client + provisioner *provisioning.Provisioner + cluster *state.Cluster + recorder controllers.Recorder + clock clock.Clock + provider *aws.SQSProvider + parser event.Parser } // pollingPeriod that we go to the SQS queue to check if there are any new events const pollingPeriod = 2 * time.Second -func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, - sqsProvider *SQSProvider, recorder events.Recorder, startAsync <-chan struct{}) *Controller { +func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, sqsProvider *aws.SQSProvider, + recorder controllers.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, startAsync <-chan struct{}) *Controller { c := &Controller{ - clock: clk, - kubeClient: kubeClient, - recorder: recorder, - provider: sqsProvider, - parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + kubeClient: kubeClient, + provisioner: provisioner, + cluster: cluster, + recorder: recorder, + clock: clk, + provider: sqsProvider, + parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), } go func() { @@ -74,27 +96,32 @@ func (c *Controller) run(ctx context.Context) { logger.Infof("Shutting down") return case <-time.After(pollingPeriod): - logging.FromContext(ctx).Info("Here") + err := c.pollSQS(ctx) + if err != nil { + logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) + } } } } -func (c *Controller) Poll(ctx context.Context) error { +func (c *Controller) pollSQS(ctx context.Context) error { sqsMessages, err := c.provider.GetSQSMessages(ctx) if err != nil { return err } + if len(sqsMessages) == 0 { + return nil + } + instanceIDMap := c.makeInstanceIDMap() for _, msg := range sqsMessages { - e := c.handleMessage(ctx, msg) + e := c.handleMessage(ctx, instanceIDMap, msg) err = multierr.Append(err, e) } return nil } -func (c *Controller) handleMessage(ctx context.Context, msg *sqsapi.Message) (err error) { - fmt.Printf("Handling the message for %#v\n", msg) - +func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg *sqsapi.Message) (err error) { // No message to parse in this case if msg == nil || msg.Body == nil { return nil @@ -102,26 +129,56 @@ func (c *Controller) handleMessage(ctx context.Context, msg *sqsapi.Message) (er evt := c.parser.Parse(ctx, *msg.Body) evtAction := actionForEvent(evt) - // TODO: hand some of this work off to a batcher that will handle the spinning up of a new node - // and the deletion of the old node separate from this reconciliation loop - if evtAction != Actions.NoAction { - for _, ec2InstanceID := range evt.EC2InstanceIDs() { - e := c.handleInstance(ctx, ec2InstanceID, evtAction) + nodes := getInvolvedNodes(evt.EC2InstanceIDs(), instanceIDMap) + action := actionForEvent(evt) + + for i := range nodes { + node := nodes[i] + c.notificationForEvent(evt, node) + + if action != Actions.NoAction { + e := c.handleInstance(ctx, node, evtAction) err = multierr.Append(err, e) } } + + // If everything is successful, we can delete the notification associated with this event if err != nil { return err } return c.provider.DeleteSQSMessage(ctx, msg) } -// TODO: Handle the instance appropriately, this should be handled with a batcher -func (c *Controller) handleInstance(ctx context.Context, ec2InstanceID 
string, evtAction Action) error { - logging.FromContext(ctx).Infof("Got a message for ec2 instance id %s", ec2InstanceID) +// TODO: Handle the instance appropriately; this should potentially be handled with a batcher +func (c *Controller) handleInstance(ctx context.Context, node *v1.Node, _ Action) error { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) + logging.FromContext(ctx).Infof("Terminating node due to spot interruption warning") + if err := c.kubeClient.Delete(ctx, node); err != nil { + return fmt.Errorf("deleting the spot interrupted node, %w", err) + } return nil } +func (c *Controller) notificationForEvent(evt event.Interface, n *v1.Node) { + switch evt.Kind() { + case event.Kinds.RebalanceRecommendation: + c.recorder.EC2SpotRebalanceRecommendation(n) + + case event.Kinds.ScheduledChange: + c.recorder.EC2HealthWarning(n) + + case event.Kinds.SpotInterruption: + c.recorder.EC2SpotInterruptionWarning(n) + + // For now, we won't do anything with the state change action + case event.Kinds.StateChange: + return + + default: + return + } +} + func actionForEvent(evt event.Interface) Action { switch evt.Kind() { case event.Kinds.RebalanceRecommendation: @@ -141,3 +198,46 @@ func actionForEvent(evt event.Interface) Action { return Actions.NoAction } } + +func getInvolvedNodes(instanceIDs []string, instanceIDMap map[string]*v1.Node) []*v1.Node { + var nodes []*v1.Node + for _, id := range instanceIDs { + if node, ok := instanceIDMap[id]; ok { + nodes = append(nodes, node) + } + } + return nodes +} + +// makeInstanceIDMap builds a map from the EC2 instance ID that is stored in the +// node's .spec.providerID to the corresponding node object +func (c *Controller) makeInstanceIDMap() map[string]*v1.Node { + m := map[string]*v1.Node{} + c.cluster.ForEachNode(func(n *state.Node) bool { + // If this node isn't owned by a provisioner, we shouldn't handle it + if _, ok := n.Node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok { + return true + } + id := parseProviderID(n.Node.Spec.ProviderID) + if id == "" { + return true + } + m[id] = n.Node + return true + }) + return m +} + +func parseProviderID(pid string) string { + r := regexp.MustCompile(`aws:///(?P<AZ>.*)/(?P<InstanceID>.*)`) + matches := r.FindStringSubmatch(pid) + if matches == nil { + return "" + } + for i, name := range r.SubexpNames() { + if name == "InstanceID" { + return matches[i] + } + } + return "" +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go index 10a3d35503d0..8f99863ccbc5 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go @@ -20,15 +20,15 @@ import ( "knative.dev/pkg/logging" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" + event2 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" rebalancerecommendationv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0" - scheduledchangev1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1" - spotinterruptionv1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1" - statechangev1
"github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1" + scheduledchangev1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" + spotinterruptionv1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0" + statechangev1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" ) var ( - DefaultParsers = []event.Parser{ + DefaultParsers = []event2.Parser{ statechangev1.Parser{}, spotinterruptionv1.Parser{}, scheduledchangev1.Parser{}, @@ -36,34 +36,33 @@ var ( } ) -type AggregatedParser []event.Parser +type AggregatedParser []event2.Parser -func NewAggregatedParser(parsers ...event.Parser) AggregatedParser { +func NewAggregatedParser(parsers ...event2.Parser) AggregatedParser { return parsers } -func (p AggregatedParser) Parse(ctx context.Context, str string) event.Interface { +func (p AggregatedParser) Parse(ctx context.Context, str string) event2.Interface { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("event.parser")) if str == "" { - logging.FromContext(ctx).Warn("nothing to parse") - return event.NoOp{} + return event2.NoOp{} } + // We will go through all the parsers to see if we can parse + // If we aren't able to parse the message, we will just assume that it is a no-op for _, parser := range p { if a := parser.Parse(ctx, str); a != nil { return a } } - logging.FromContext(ctx).Error("failed to parse") - - md := event.AWSMetadata{} + md := event2.AWSMetadata{} if err := json.Unmarshal([]byte(str), &md); err != nil { logging.FromContext(ctx). With("error", err). Error("failed to unmarshal message metadata") - return event.NoOp{} + return event2.NoOp{} } - return event.NoOp(md) + return event2.NoOp(md) } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/handler.go similarity index 99% rename from pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/handler.go rename to pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/handler.go index aa5187b768b0..cfe006dbda22 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/handler.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v1 +package v0 import ( "time" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go similarity index 97% rename from pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/parser.go rename to pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go index 590c89987c74..a08164cdf122 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package v1 +package v0 import ( "context" @@ -26,7 +26,7 @@ import ( const ( source = "aws.health" detailType = "AWS Health Event" - version = "1" + version = "0" acceptedService = "EC2" acceptedEventTypeCategory = "scheduledChange" ) diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/unmarshal.go similarity index 99% rename from pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/unmarshal.go index c1fcde03803d..805e40fde657 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v1/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/unmarshal.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v1 +package v0 import ( "go.uber.org/multierr" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/handler.go similarity index 99% rename from pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/handler.go rename to pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/handler.go index 925ffd037cb5..23ffe6edcb55 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/handler.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v1 +package v0 import ( "time" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go similarity index 97% rename from pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/parser.go rename to pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go index 58c30721267b..3bcc1fc523d3 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v1 +package v0 import ( "context" @@ -26,7 +26,7 @@ import ( const ( source = "aws.ec2" detailType = "EC2 Spot Instance Interruption Warning" - version = "1" + version = "0" ) type Parser struct{} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/unmarshal.go similarity index 99% rename from pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/unmarshal.go index 7b3452af21d4..2fefe931e11b 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v1/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/unmarshal.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
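Each versioned parser keys off the EventBridge envelope: it only claims a message whose source, detail-type and version match its constants, such as aws.ec2 / EC2 Spot Instance Interruption Warning / 0 above. The sketch below shows that check against a hand-written payload; the envelope struct is illustrative, since the fields of event.AWSMetadata are not shown in this patch:

package main

import (
	"encoding/json"
	"fmt"
)

// envelope carries the EventBridge fields the parsers match on (illustrative struct).
type envelope struct {
	Source     string `json:"source"`
	DetailType string `json:"detail-type"`
	Version    string `json:"version"`
}

// matches reports whether the raw message has the expected source, detail-type and version.
func matches(raw, source, detailType, version string) bool {
	var md envelope
	if err := json.Unmarshal([]byte(raw), &md); err != nil {
		return false
	}
	return md.Source == source && md.DetailType == detailType && md.Version == version
}

func main() {
	msg := `{"version":"0","source":"aws.ec2","detail-type":"EC2 Spot Instance Interruption Warning"}`
	fmt.Println(matches(msg, "aws.ec2", "EC2 Spot Instance Interruption Warning", "0")) // true
	fmt.Println(matches(msg, "aws.health", "AWS Health Event", "0"))                    // false
}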
*/ -package v1 +package v0 import ( "go.uber.org/zap" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go similarity index 99% rename from pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/handler.go rename to pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go index 3fe1ac8ca057..d0eb84382b19 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v1 +package v0 import ( "time" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go similarity index 88% rename from pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/parser.go rename to pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go index 00d02db74e97..629944973e4b 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v1 +package v0 import ( "context" @@ -27,7 +27,7 @@ import ( const ( source = "aws.ec2" detailType = "EC2 Instance State-change Notification" - version = "1" + version = "0" acceptedStates = "stopping,stopped,shutting-down,terminated" ) @@ -50,11 +50,8 @@ func (Parser) Parse(ctx context.Context, str string) event.Interface { return nil } + // Do not log the information on instance state change if it isn't in accepted states if !strings.Contains(acceptedStates, strings.ToLower(evt.Detail.State)) { - logging.FromContext(ctx). - With("eventDetails", evt). - With("acceptedStates", acceptedStatesList). - Warn("ignorning EC2 state-change notification") return nil } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/unmarshal.go similarity index 99% rename from pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/unmarshal.go index c3a84fdb7ec4..bfeb1593d9de 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v1/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/unmarshal.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v1 +package v0 import ( "go.uber.org/zap" diff --git a/pkg/cloudprovider/aws/controllers/notification/types.go b/pkg/cloudprovider/aws/controllers/notification/types.go deleted file mode 100644 index c77c1f3aa315..000000000000 --- a/pkg/cloudprovider/aws/controllers/notification/types.go +++ /dev/null @@ -1,30 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package notification - -type Action string - -var Actions = struct { - CordonAndDrain, - Cordon, - NoAction Action -}{ - CordonAndDrain: Action("CordonAndDrain"), - Cordon: Action("Cordon"), - NoAction: Action("NoAction"), -} - -type MessageParser interface { -} diff --git a/pkg/cloudprovider/aws/controllers/recorder.go b/pkg/cloudprovider/aws/controllers/recorder.go new file mode 100644 index 000000000000..dd4ad33535d3 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/recorder.go @@ -0,0 +1,53 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controllers + +import ( + v1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/record" +) + +type recorder struct { + record.EventRecorder +} + +type Recorder interface { + record.EventRecorder + + // EC2SpotInterruptionWarning is called when EC2 sends a spot interruption 2-minute warning for the node from the SQS queue + EC2SpotInterruptionWarning(*v1.Node) + // EC2SpotRebalanceRecommendation is called when EC2 sends a rebalance recommendation for the node from the SQS queue + EC2SpotRebalanceRecommendation(*v1.Node) + // EC2HealthWarning is called when EC2 sends a health warning notification for a health issue for the node from the SQS queue + EC2HealthWarning(*v1.Node) +} + +func NewRecorder(r record.EventRecorder) Recorder { + return recorder{ + EventRecorder: r, + } +} + +func (r recorder) EC2SpotInterruptionWarning(node *v1.Node) { + r.Eventf(node, "Normal", "EC2SpotInterruptionWarning", "Node %s event: EC2 triggered a spot interruption warning for the node", node.Name) +} + +func (r recorder) EC2SpotRebalanceRecommendation(node *v1.Node) { + r.Eventf(node, "Normal", "EC2RebalanceRecommendation", "Node %s event: EC2 triggered a spot rebalance recommendation for the node", node.Name) +} + +func (r recorder) EC2HealthWarning(node *v1.Node) { + r.Eventf(node, "Normal", "EC2HealthWarning", "Node %s event: EC2 triggered a health warning for the node", node.Name) +} diff --git a/pkg/cloudprovider/aws/controllers/notification/sqs.go b/pkg/cloudprovider/aws/sqs.go similarity index 99% rename from pkg/cloudprovider/aws/controllers/notification/sqs.go rename to pkg/cloudprovider/aws/sqs.go index 756ab682b0d8..79a189190bee 100644 --- a/pkg/cloudprovider/aws/controllers/notification/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
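The new Recorder wraps a standard record.EventRecorder and adds the three EC2-specific events used by the notification controller. One way to exercise it in isolation is client-go's FakeRecorder, which buffers emitted events on a channel; a sketch against the package as it stands in this commit (it moves to an events package later in the series, and the node name here is only an example):

package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/tools/record"

	awscontrollers "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers"
)

func main() {
	// FakeRecorder captures events in memory instead of publishing them to the API server.
	fake := record.NewFakeRecorder(10)
	rec := awscontrollers.NewRecorder(fake)

	node := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "ip-192-168-1-1.us-west-2.compute.internal"}}
	rec.EC2SpotInterruptionWarning(node)

	// The formatted event is available on the channel for assertions.
	fmt.Println(<-fake.Events)
}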
*/ -package notification +package aws import ( "bytes" diff --git a/pkg/cloudprovider/fake/cloudprovider.go b/pkg/cloudprovider/fake/cloudprovider.go index c4d23193aed7..b624ebe3f464 100644 --- a/pkg/cloudprovider/fake/cloudprovider.go +++ b/pkg/cloudprovider/fake/cloudprovider.go @@ -45,6 +45,10 @@ type CloudProvider struct { var _ cloudprovider.CloudProvider = (*CloudProvider)(nil) var _ cloudprovider.InstanceType = (*InstanceType)(nil) +func (c *CloudProvider) InjectControllers(ctx context.Context, opts cloudprovider.ControllerOptions) error { + return nil +} + func (c *CloudProvider) Create(ctx context.Context, nodeRequest *cloudprovider.NodeRequest) (*v1.Node, error) { c.mu.Lock() c.CreateCalls = append(c.CreateCalls, nodeRequest) diff --git a/pkg/cloudprovider/types.go b/pkg/cloudprovider/types.go index 1a0516ae7897..0dea45be150f 100644 --- a/pkg/cloudprovider/types.go +++ b/pkg/cloudprovider/types.go @@ -23,7 +23,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/events" "github.com/aws/karpenter/pkg/scheduling" ) @@ -31,7 +30,6 @@ import ( type Options struct { ClientSet *kubernetes.Clientset KubeClient client.Client - Recorder events.Recorder // WebhookOnly is true if the cloud provider is being used for its validation/defaulting only by the webhook. In // this case it may not need to perform some initialization and the StartAsync channel will not be closed. WebhookOnly bool diff --git a/pkg/controllers/consolidation/controller.go b/pkg/controllers/consolidation/controller.go index 16d02c501a3d..020eb95ddf3b 100644 --- a/pkg/controllers/consolidation/controller.go +++ b/pkg/controllers/consolidation/controller.go @@ -165,6 +165,7 @@ func (c *Controller) ProcessCluster(ctx context.Context) (ProcessResult, error) // the remaining nodes are all non-empty, so we just consolidate the first one that we can sort.Slice(candidates, byNodeDisruptionCost(candidates)) + for _, node := range candidates { // is this a node that we can terminate? This check is meant to be fast so we can save the expense of simulated // scheduling unless its really needed @@ -381,13 +382,13 @@ func (c *Controller) launchReplacementNode(ctx context.Context, action consolida // cordon the node before we launch the replacement to prevent new pods from scheduling to the node if err := c.setNodeUnschedulable(ctx, action.oldNodes[0].Name, true); err != nil { - return fmt.Errorf("cordoning node %s, %w", oldNode.Name, err) + return fmt.Errorf("cordoning node %s, %w", action.oldNodes[0].Name, err) } - nodeNames, err := c.provisioner.LaunchNodes(ctx, provisioning.LaunchOptions{RecordPodNomination: false}, action.replacementNode) + nodeNames, err := c.provisioner.LaunchNodes(ctx, provisioning.LaunchOptions{RecordPodNomination: false}, action.replacementNodes...) if err != nil { // uncordon the node as the launch may fail (e.g. 
ICE or incompatible AMI) - err = multierr.Append(err, c.setNodeUnschedulable(ctx, oldNode.Name, false)) + err = multierr.Append(err, c.setNodeUnschedulable(ctx, action.oldNodes[0].Name, false)) return err } if len(nodeNames) != 1 { @@ -419,8 +420,7 @@ func (c *Controller) launchReplacementNode(ctx context.Context, action consolida return nil }, waitRetryOptions...); err != nil { // node never become ready, so uncordon the node we were trying to delete and report the error - c.cluster.UnmarkForDeletion(oldNode.Name) - return multierr.Combine(c.setNodeUnschedulable(ctx, oldNode.Name, false), + return multierr.Combine(c.setNodeUnschedulable(ctx, action.oldNodes[0].Name, false), fmt.Errorf("timed out checking node readiness, %w", err)) } return nil @@ -468,6 +468,54 @@ func (c *Controller) calculateLifetimeRemaining(node candidateNode) float64 { return remaining } +func (c *Controller) spotTerminationOptionReplace(ctx context.Context, nodes []candidateNode) (consolidationAction, error) { + var stateNodes []*state.Node + c.cluster.ForEachNode(func(n *state.Node) bool { + stateNodes = append(stateNodes, n.DeepCopy()) + return true + }) + var nodeNames []string + var pods []*v1.Pod + for _, node := range nodes { + nodeNames = append(nodeNames, node.Name) + pods = append(pods, node.pods...) + } + scheduler, err := c.provisioner.NewScheduler(ctx, pods, stateNodes, scheduling.SchedulerOptions{ + SimulationMode: true, + ExcludeNodes: nodeNames, + }) + if err != nil { + return consolidationAction{result: consolidateResultUnknown}, fmt.Errorf("creating scheduler, %w", err) + } + + newNodes, inflightNodes, err := scheduler.Solve(ctx, pods) + if err != nil { + return consolidationAction{result: consolidateResultUnknown}, fmt.Errorf("simulating scheduling, %w", err) + } + + // were we able to schedule all the pods on the inflight nodes? 
+ // delete all the nodes that are going to be deleted by spot interruption + if len(newNodes) == 0 { + schedulableCount := 0 + for _, inflight := range inflightNodes { + schedulableCount += len(inflight.Pods) + } + if len(pods) == schedulableCount { + return consolidationAction{ + oldNodes: lo.Map(nodes, func(n candidateNode, _ int) *v1.Node { return n.Node }), + disruptionCost: disruptionCost(ctx, pods), + result: consolidateResultDelete, + }, nil + } + } + return consolidationAction{ + oldNodes: lo.Map(nodes, func(n candidateNode, _ int) *v1.Node { return n.Node }), + disruptionCost: disruptionCost(ctx, pods), + result: consolidateResultReplace, + replacementNodes: newNodes, + }, nil +} + // nolint:gocyclo func (c *Controller) nodeConsolidationOptionReplaceOrDelete(ctx context.Context, node candidateNode) (consolidationAction, error) { defer metrics.Measure(consolidationDurationHistogram.WithLabelValues("Replace/Delete"))() @@ -555,11 +603,12 @@ func (c *Controller) nodeConsolidationOptionReplaceOrDelete(ctx context.Context, return consolidationAction{result: consolidateResultNotPossible}, nil } + // We know the length of newNodes is 1 from above so this should only launch a single node return consolidationAction{ - oldNodes: []*v1.Node{node.Node}, - disruptionCost: disruptionCost(ctx, node.pods), - result: consolidateResultReplace, - replacementNode: newNodes[0], + oldNodes: []*v1.Node{node.Node}, + disruptionCost: disruptionCost(ctx, node.pods), + result: consolidateResultReplace, + replacementNodes: newNodes, }, nil } diff --git a/pkg/controllers/consolidation/types.go b/pkg/controllers/consolidation/types.go index 5f73dec2060f..25fb9f4587d7 100644 --- a/pkg/controllers/consolidation/types.go +++ b/pkg/controllers/consolidation/types.go @@ -64,10 +64,10 @@ func (r consolidateResult) String() string { } type consolidationAction struct { - oldNodes []*v1.Node - disruptionCost float64 - result consolidateResult - replacementNode *scheduling.Node + oldNodes []*v1.Node + disruptionCost float64 + result consolidateResult + replacementNodes []*scheduling.Node } func (o consolidationAction) String() string { @@ -82,9 +82,14 @@ func (o consolidationAction) String() string { fmt.Fprintf(&buf, "/%s", instanceType) } } - if o.replacementNode != nil { - fmt.Fprintf(&buf, " and replacing with a node from types %s", - scheduling.InstanceTypeList(o.replacementNode.InstanceTypeOptions)) + // TODO: Improve the stringify method here for getting all the nodes + if o.replacementNodes != nil { + if len(o.replacementNodes) == 1 { + fmt.Fprintf(&buf, " and replacing with a node from types %s", + scheduling.InstanceTypeList(o.replacementNodes[0].InstanceTypeOptions)) + } else { + fmt.Fprintf(&buf, " and replacing with multiple nodes") + } } return buf.String() } diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 6bac474b915d..e7dbbf1094a3 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -84,7 +84,16 @@ type Controller interface { Register(context.Context, manager.Manager) error } -func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) cloudprovider.CloudProvider) { +type ControllerOptions struct { + Cluster *state.Cluster + KubeClient client.Client + Provisioner *provisioning.Provisioner + Recorder events.Recorder + StartAsync <-chan struct{} + Clock clock.Clock +} + +func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, func(context.Context, 
*ControllerOptions))) { opts := options.New().MustParse() // Setup Client controllerRuntimeConfig := controllerruntime.GetConfigOrDie() @@ -121,7 +130,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) utilruntime.Must(registerPprof(manager)) } - cloudProvider := injectCloudProvider(ctx, cloudprovider.Options{ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected()}) + cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected()}) if hp, ok := cloudProvider.(HealthCheck); ok { utilruntime.Must(manager.AddHealthzCheck("cloud-provider", hp.LivenessProbe)) } @@ -146,6 +155,10 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) provisioner := provisioning.NewProvisioner(ctx, cfg, manager.GetClient(), clientSet.CoreV1(), recorder, cloudProvider, cluster) consolidation.NewController(ctx, realClock, manager.GetClient(), provisioner, cloudProvider, recorder, cluster, manager.Elected()) + // Inject cloudprovider-specific controllers into the controller-set using the injectControllers function + // Inject the base cloud provider into the injection function rather than the decorated interface + injectControllers(ctx, &ControllerOptions{Cluster: cluster, KubeClient: manager.GetClient(), Provisioner: provisioner, Recorder: recorder, StartAsync: manager.Elected(), Clock: realClock}) + metricsstate.StartMetricScraper(ctx, cluster) if err := RegisterControllers(ctx, diff --git a/pkg/events/dedupe.go b/pkg/events/dedupe.go index cbc2c0562100..9c2a60ddca1c 100644 --- a/pkg/events/dedupe.go +++ b/pkg/events/dedupe.go @@ -24,13 +24,13 @@ import ( func NewDedupeRecorder(r Recorder) Recorder { return &dedupe{ - rec: r, - cache: cache.New(120*time.Second, 10*time.Second), + Recorder: r, + cache: cache.New(120*time.Second, 10*time.Second), } } type dedupe struct { - rec Recorder + Recorder cache *cache.Cache } @@ -38,35 +38,35 @@ func (d *dedupe) WaitingOnDeletionForConsolidation(node *v1.Node) { if !d.shouldCreateEvent(fmt.Sprintf("wait-node-consolidate-delete-%s", node.UID)) { return } - d.rec.WaitingOnDeletionForConsolidation(node) + d.Recorder.WaitingOnDeletionForConsolidation(node) } func (d *dedupe) WaitingOnReadinessForConsolidation(node *v1.Node) { if !d.shouldCreateEvent(fmt.Sprintf("wait-node-consolidate-ready-%s", node.UID)) { return } - d.rec.WaitingOnReadinessForConsolidation(node) + d.Recorder.WaitingOnReadinessForConsolidation(node) } func (d *dedupe) TerminatingNodeForConsolidation(node *v1.Node, reason string) { if !d.shouldCreateEvent(fmt.Sprintf("terminate-node-consolidate-%s-%s", node.UID, reason)) { return } - d.rec.TerminatingNodeForConsolidation(node, reason) + d.Recorder.TerminatingNodeForConsolidation(node, reason) } func (d *dedupe) LaunchingNodeForConsolidation(node *v1.Node, reason string) { if !d.shouldCreateEvent(fmt.Sprintf("launch-node-consolidate-%s-%s", node.UID, reason)) { return } - d.rec.LaunchingNodeForConsolidation(node, reason) + d.Recorder.LaunchingNodeForConsolidation(node, reason) } func (d *dedupe) NominatePod(pod *v1.Pod, node *v1.Node) { if !d.shouldCreateEvent(fmt.Sprintf("nominate-node-%s-%s", pod.UID, node.UID)) { return } - d.rec.NominatePod(pod, node) + d.Recorder.NominatePod(pod, node) } func (d *dedupe) EvictPod(pod *v1.Pod) { @@ -75,21 +75,21 @@ func (d *dedupe) EvictPod(pod *v1.Pod) { return } d.cache.SetDefault(key, nil) - d.rec.EvictPod(pod) + 
d.Recorder.EvictPod(pod) } func (d *dedupe) PodFailedToSchedule(pod *v1.Pod, err error) { if !d.shouldCreateEvent(fmt.Sprintf("failed-to-schedule-%s-%s", pod.UID, err)) { return } - d.rec.PodFailedToSchedule(pod, err) + d.Recorder.PodFailedToSchedule(pod, err) } func (d *dedupe) NodeFailedToDrain(node *v1.Node, err error) { if !d.shouldCreateEvent(fmt.Sprintf("failed-to-drain-%s", node.Name)) { return } - d.rec.NodeFailedToDrain(node, err) + d.Recorder.NodeFailedToDrain(node, err) } func (d *dedupe) shouldCreateEvent(key string) bool { diff --git a/pkg/events/loadshedding.go b/pkg/events/loadshedding.go index 9ecfc5a8f735..04d08f13a60c 100644 --- a/pkg/events/loadshedding.go +++ b/pkg/events/loadshedding.go @@ -21,13 +21,13 @@ import ( func NewLoadSheddingRecorder(r Recorder) Recorder { return &loadshedding{ - rec: r, + Recorder: r, nominationBucket: flowcontrol.NewTokenBucketRateLimiter(5, 10), } } type loadshedding struct { - rec Recorder + Recorder nominationBucket flowcontrol.RateLimiter } @@ -39,33 +39,33 @@ func (l *loadshedding) NominatePod(pod *v1.Pod, node *v1.Node) { if !l.nominationBucket.TryAccept() { return } - l.rec.NominatePod(pod, node) + l.Recorder.NominatePod(pod, node) } func (l *loadshedding) EvictPod(pod *v1.Pod) { - l.rec.EvictPod(pod) + l.Recorder.EvictPod(pod) } func (l *loadshedding) PodFailedToSchedule(pod *v1.Pod, err error) { - l.rec.PodFailedToSchedule(pod, err) + l.Recorder.PodFailedToSchedule(pod, err) } func (l *loadshedding) NodeFailedToDrain(node *v1.Node, err error) { - l.rec.NodeFailedToDrain(node, err) + l.Recorder.NodeFailedToDrain(node, err) } func (l *loadshedding) TerminatingNodeForConsolidation(node *v1.Node, reason string) { - l.rec.TerminatingNodeForConsolidation(node, reason) + l.Recorder.TerminatingNodeForConsolidation(node, reason) } func (l *loadshedding) LaunchingNodeForConsolidation(node *v1.Node, reason string) { - l.rec.LaunchingNodeForConsolidation(node, reason) + l.Recorder.LaunchingNodeForConsolidation(node, reason) } func (l *loadshedding) WaitingOnReadinessForConsolidation(node *v1.Node) { - l.rec.WaitingOnReadinessForConsolidation(node) + l.Recorder.WaitingOnReadinessForConsolidation(node) } func (l *loadshedding) WaitingOnDeletionForConsolidation(node *v1.Node) { - l.rec.WaitingOnDeletionForConsolidation(node) + l.Recorder.WaitingOnDeletionForConsolidation(node) } diff --git a/pkg/events/recorder.go b/pkg/events/recorder.go index 3eb399eb01e8..d329132e61c3 100644 --- a/pkg/events/recorder.go +++ b/pkg/events/recorder.go @@ -22,10 +22,12 @@ import ( // Recorder is used to record events that occur about pods so they can be viewed by looking at the pod's events so our // actions are more observable without requiring log inspection type Recorder interface { + record.EventRecorder + // NominatePod is called when we have determined that a pod should schedule against an existing node and don't // currently need to provision new capacity for the pod. NominatePod(*v1.Pod, *v1.Node) - // EvictedPod is called when a pod is evicted + // EvictPod is called when a pod is evicted EvictPod(*v1.Pod) // PodFailedToSchedule is called when a pod has failed to schedule entirely. 
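The change running through dedupe.go and loadshedding.go swaps the named rec Recorder field for an embedded Recorder, so a decorator only spells out the methods it wants to intercept and every other call, including the embedded record.EventRecorder methods, passes through automatically. A small, self-contained illustration of that embedding pattern with stand-in types:

package main

import "fmt"

// Notifier stands in for the Recorder interface.
type Notifier interface {
	Nominate(name string)
	Evict(name string)
}

type base struct{}

func (base) Nominate(name string) { fmt.Println("nominate", name) }
func (base) Evict(name string)    { fmt.Println("evict", name) }

// dedupe embeds Notifier: Nominate is overridden, Evict passes through untouched.
type dedupe struct {
	Notifier
	seen map[string]bool
}

func (d *dedupe) Nominate(name string) {
	if d.seen[name] {
		return // drop duplicates
	}
	d.seen[name] = true
	d.Notifier.Nominate(name)
}

func main() {
	n := &dedupe{Notifier: base{}, seen: map[string]bool{}}
	n.Nominate("pod-a")
	n.Nominate("pod-a") // deduplicated, nothing printed
	n.Evict("pod-a")    // falls through to the embedded Notifier
}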
PodFailedToSchedule(*v1.Pod, error) @@ -45,40 +47,40 @@ type Recorder interface { } type recorder struct { - rec record.EventRecorder + record.EventRecorder } func NewRecorder(rec record.EventRecorder) Recorder { - return &recorder{rec: rec} + return &recorder{EventRecorder: rec} } func (r recorder) WaitingOnDeletionForConsolidation(node *v1.Node) { - r.rec.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on deletion to continue consolidation") + r.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on deletion to continue consolidation") } func (r recorder) WaitingOnReadinessForConsolidation(node *v1.Node) { - r.rec.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on readiness to continue consolidation") + r.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on readiness to continue consolidation") } func (r recorder) TerminatingNodeForConsolidation(node *v1.Node, reason string) { - r.rec.Eventf(node, "Normal", "ConsolidateTerminateNode", "Consolidating node via %s", reason) + r.Eventf(node, "Normal", "ConsolidateTerminateNode", "Consolidating node via %s", reason) } func (r recorder) LaunchingNodeForConsolidation(node *v1.Node, reason string) { - r.rec.Eventf(node, "Normal", "ConsolidateLaunchNode", "Launching node for %s", reason) + r.Eventf(node, "Normal", "ConsolidateLaunchNode", "Launching node for %s", reason) } func (r recorder) NominatePod(pod *v1.Pod, node *v1.Node) { - r.rec.Eventf(pod, "Normal", "Nominate", "Pod should schedule on %s", node.Name) + r.Eventf(pod, "Normal", "Nominate", "Pod should schedule on %s", node.Name) } func (r recorder) EvictPod(pod *v1.Pod) { - r.rec.Eventf(pod, "Normal", "Evict", "Evicted pod") + r.Eventf(pod, "Normal", "Evict", "Evicted pod") } func (r recorder) PodFailedToSchedule(pod *v1.Pod, err error) { - r.rec.Eventf(pod, "Warning", "FailedProvisioning", "Failed to provision new node, %s", err) + r.Eventf(pod, "Warning", "FailedProvisioning", "Failed to provision new node, %s", err) } func (r recorder) NodeFailedToDrain(node *v1.Node, err error) { - r.rec.Eventf(node, "Warning", "FailedDraining", "Failed to drain node, %s", err) + r.Eventf(node, "Warning", "FailedDraining", "Failed to drain node, %s", err) } diff --git a/pkg/test/eventrecorder.go b/pkg/test/eventrecorder.go index ab6008cc62bf..250b48cb886e 100644 --- a/pkg/test/eventrecorder.go +++ b/pkg/test/eventrecorder.go @@ -18,6 +18,7 @@ import ( "sync" v1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/record" "github.com/aws/karpenter/pkg/events" ) @@ -30,6 +31,7 @@ type Binding struct { // EventRecorder is a mock event recorder that is used to facilitate testing. 
type EventRecorder struct { + record.EventRecorder mu sync.Mutex bindings []Binding } From f2ddcc757c42c4f2d33e50e51bb74aae651d0a27 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 6 Sep 2022 16:27:40 -0700 Subject: [PATCH 03/55] Add an in-memory cache to SQS provider --- cmd/controller/main.go | 3 + go.mod | 5 +- go.sum | 9 +- pkg/cloudprovider/aws/cloudprovider.go | 10 ++- .../controllers/infrastructure/controller.go | 37 ++++----- .../controllers/notification/controller.go | 9 +- pkg/cloudprovider/aws/createfleetbatcher.go | 16 ---- pkg/cloudprovider/aws/helpers.go | 20 +++++ pkg/cloudprovider/aws/sqs.go | 82 ++++++++++++------- 9 files changed, 111 insertions(+), 80 deletions(-) create mode 100644 pkg/cloudprovider/aws/helpers.go diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 1d46e3c47f56..bbc06ee7934f 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -17,6 +17,8 @@ package main import ( "context" + "knative.dev/pkg/logging" + "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws" awscontrollers "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" @@ -30,6 +32,7 @@ func main() { provider := aws.NewCloudProvider(ctx, options) injectControllers := func(ctx context.Context, opts *controllers.ControllerOptions) { recorder := awscontrollers.NewRecorder(opts.Recorder) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) // Injecting the controllers that will start when opts.StartAsync is triggered notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider, recorder, opts.Provisioner, opts.Cluster, opts.StartAsync) diff --git a/go.mod b/go.mod index f63645ce8607..7875fb692d32 100644 --- a/go.mod +++ b/go.mod @@ -29,8 +29,6 @@ require ( sigs.k8s.io/controller-runtime v0.13.0 ) -require k8s.io/utils v0.0.0-20210802155522-efc7438f0176 // indirect - require ( contrib.go.opencensus.io/exporter/ocagent v0.7.1-0.20200907061046-05415f1de66d // indirect contrib.go.opencensus.io/exporter/prometheus v0.4.0 // indirect @@ -79,10 +77,11 @@ require ( go.uber.org/atomic v1.9.0 // indirect go.uber.org/automaxprocs v1.4.0 // indirect golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 // indirect + golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 // indirect golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 // indirect - golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f // indirect + golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab // indirect golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect golang.org/x/text v0.3.7 // indirect gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect diff --git a/go.sum b/go.sum index 2e99ca13c978..0f3efe6b3834 100644 --- a/go.sum +++ b/go.sum @@ -441,7 +441,9 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 h1:6zppjxzCulZykYSLyVDYbneBfbaBIQPYMevg0bEwv2s= +golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod 
h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -517,6 +519,7 @@ golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -574,9 +577,9 @@ golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211124211545-fe61309f8881/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f h1:v4INt8xihDGvnrfjMDVXGxw9wrfxYyCjk0KbXjhR55s= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220209214540-3681064d5158/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab h1:2QkjZIsXupsJbJIdSjjUOgWK3aEtzyuh2mPt3l/CkeU= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index d67b2eb904c9..6ce125a913ae 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -77,7 +77,7 @@ type CloudProvider struct { instanceTypeProvider *InstanceTypeProvider instanceProvider *InstanceProvider kubeClient k8sClient.Client - SQSProvider *SQSProvider + sqsProvider *SQSProvider } func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider { @@ -112,7 +112,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) // TODO: Change this queue url value to a useful value - sqsProvider := NewSQSProvider(sqsapi, "https://sqs.us-west-2.amazonaws.com/330700974597/test-stack-Queue-VimlxX8fIySZ") + sqsProvider := NewSQSProvider(sqsapi, "test-stack-Queue-VimlxX8fIySZ") 
cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, @@ -126,7 +126,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud options.StartAsync, ), ), - SQSProvider: sqsProvider, + sqsProvider: sqsProvider, kubeClient: options.KubeClient, } v1alpha5.ValidateHook = cloudprovider.Validate @@ -220,6 +220,10 @@ func (*CloudProvider) Validate(ctx context.Context, provisioner *v1alpha5.Provis return provider.Validate() } +func (c *CloudProvider) SQSProvider() *SQSProvider { + return c.sqsProvider +} + // Default the provisioner func (*CloudProvider) Default(ctx context.Context, provisioner *v1alpha5.Provisioner) { defaultLabels(provisioner) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 3bcb2eb9a981..a926a0f2a600 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -20,37 +20,28 @@ import ( "k8s.io/apimachinery/pkg/util/clock" "knative.dev/pkg/logging" - "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" - "github.com/aws/karpenter/pkg/controllers/provisioning" - "github.com/aws/karpenter/pkg/controllers/state" ) -// Controller is the consolidation controller. It is not a standard controller-runtime controller in that it doesn't +// Controller is the AWS infrastructure controller. It is not a standard controller-runtime controller in that it doesn't // have a reconcile method. 
type Controller struct { - kubeClient client.Client - provisioner *provisioning.Provisioner - cluster *state.Cluster + sqsProvider *aws.SQSProvider recorder controllers.Recorder clock clock.Clock - parser event.Parser } -// pollingPeriod that we go to the SQS queue to check if there are any new events -const pollingPeriod = 2 * time.Second +// pollingPeriod that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned +const pollingPeriod = 15 * time.Minute -func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, recorder controllers.Recorder, - cluster *state.Cluster, startAsync <-chan struct{}) *Controller { +func NewController(ctx context.Context, clk clock.Clock, recorder controllers.Recorder, + sqsProvider *aws.SQSProvider, startAsync <-chan struct{}) *Controller { c := &Controller{ - kubeClient: kubeClient, - cluster: cluster, - recorder: recorder, - clock: clk, - parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + recorder: recorder, + clock: clk, + sqsProvider: sqsProvider, } go func() { @@ -66,7 +57,7 @@ func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Clien } func (c *Controller) run(ctx context.Context) { - logger := logging.FromContext(ctx).Named("notification") + logger := logging.FromContext(ctx).Named("infrastructure") ctx = logging.WithLogger(ctx, logger) for { select { @@ -74,7 +65,11 @@ func (c *Controller) run(ctx context.Context) { logger.Infof("Shutting down") return case <-time.After(pollingPeriod): - logging.FromContext(ctx).Infof("polled after the polling period") + c.ensureInfrastructure(ctx) } } } + +func (c *Controller) ensureInfrastructure(ctx context.Context) error { + return nil +} diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index bad3535699d4..d8147b3c6a3b 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -48,7 +48,7 @@ var Actions = struct { NoAction: "NoAction", } -// Controller is the consolidation controller. It is not a standard controller-runtime controller in that it doesn't +// Controller is the notification controller. It is not a standard controller-runtime controller in that it doesn't // have a reconcile method. 
type Controller struct { kubeClient client.Client @@ -112,7 +112,6 @@ func (c *Controller) pollSQS(ctx context.Context) error { if len(sqsMessages) == 0 { return nil } - instanceIDMap := c.makeInstanceIDMap() for _, msg := range sqsMessages { e := c.handleMessage(ctx, instanceIDMap, msg) @@ -134,15 +133,13 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string for i := range nodes { node := nodes[i] - c.notificationForEvent(evt, node) + c.notifyForEvent(evt, node) if action != Actions.NoAction { e := c.handleInstance(ctx, node, evtAction) err = multierr.Append(err, e) } } - - // If everything is successful, we can delete the notification associated with this event if err != nil { return err } @@ -159,7 +156,7 @@ func (c *Controller) handleInstance(ctx context.Context, node *v1.Node, _ Action return nil } -func (c *Controller) notificationForEvent(evt event.Interface, n *v1.Node) { +func (c *Controller) notifyForEvent(evt event.Interface, n *v1.Node) { switch evt.Kind() { case event.Kinds.RebalanceRecommendation: c.recorder.EC2SpotRebalanceRecommendation(n) diff --git a/pkg/cloudprovider/aws/createfleetbatcher.go b/pkg/cloudprovider/aws/createfleetbatcher.go index 70426f6a9394..27d98075f53f 100644 --- a/pkg/cloudprovider/aws/createfleetbatcher.go +++ b/pkg/cloudprovider/aws/createfleetbatcher.go @@ -15,9 +15,7 @@ limitations under the License. package aws import ( - "bytes" "context" - "encoding/json" "fmt" "sync" "time" @@ -205,17 +203,3 @@ func (b *CreateFleetBatcher) runCalls() { } } } - -func deepCopy(v *ec2.CreateFleetInput) (*ec2.CreateFleetInput, error) { - var buf bytes.Buffer - enc := json.NewEncoder(&buf) - if err := enc.Encode(v); err != nil { - return nil, err - } - dec := json.NewDecoder(&buf) - var cp ec2.CreateFleetInput - if err := dec.Decode(&cp); err != nil { - return nil, err - } - return &cp, nil -} diff --git a/pkg/cloudprovider/aws/helpers.go b/pkg/cloudprovider/aws/helpers.go new file mode 100644 index 000000000000..a2dbd7114df5 --- /dev/null +++ b/pkg/cloudprovider/aws/helpers.go @@ -0,0 +1,20 @@ +package aws + +import ( + "bytes" + "encoding/json" +) + +func deepCopy[T any](v *T) (*T, error) { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + if err := enc.Encode(v); err != nil { + return nil, err + } + dec := json.NewDecoder(&buf) + var cp T + if err := dec.Decode(&cp); err != nil { + return nil, err + } + return &cp, nil +} diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index 79a189190bee..1e2f2e49deb9 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -15,10 +15,9 @@ limitations under the License. 
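The new helpers.go replaces the request-specific copy helpers with one generic deepCopy that round-trips any JSON-serializable value, which is what lets the SQS provider reuse a shared ReceiveMessageInput template safely. A quick, self-contained illustration of how such a copy behaves (the request struct and URL here are illustrative):

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
)

// deepCopy mirrors the generic helper above: encode to JSON and decode into a fresh value,
// so the copy shares no pointers with the original.
func deepCopy[T any](v *T) (*T, error) {
	var buf bytes.Buffer
	if err := json.NewEncoder(&buf).Encode(v); err != nil {
		return nil, err
	}
	var cp T
	if err := json.NewDecoder(&buf).Decode(&cp); err != nil {
		return nil, err
	}
	return &cp, nil
}

type request struct {
	QueueURL *string
	MaxCount int64
}

func main() {
	url := "https://example.invalid/queue"
	in := &request{QueueURL: &url, MaxCount: 10}

	out, _ := deepCopy(in)
	*out.QueueURL = "modified"

	fmt.Println(*in.QueueURL)  // unchanged original
	fmt.Println(*out.QueueURL) // modified copy
}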
package aws import ( - "bytes" "context" - "encoding/json" "fmt" + "sync" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/sqs" @@ -27,14 +26,25 @@ import ( ) type SQSProvider struct { + createQueueInput *sqs.CreateQueueInput + getQueueURLInput *sqs.GetQueueUrlInput receiveMessageInput *sqs.ReceiveMessageInput - deleteMessageInput *sqs.DeleteMessageInput client sqsiface.SQSAPI + mutex *sync.RWMutex + queueURL string } -func NewSQSProvider(client sqsiface.SQSAPI, queueURL string) *SQSProvider { +func NewSQSProvider(client sqsiface.SQSAPI, queueName string) *SQSProvider { + createQueueInput := &sqs.CreateQueueInput{ + Attributes: map[string]*string{ + sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("300"), + }, + QueueName: aws.String(queueName), + } + getQueueURLInput := &sqs.GetQueueUrlInput{ + QueueName: aws.String(queueName), + } receiveMessageInput := &sqs.ReceiveMessageInput{ - QueueUrl: aws.String(queueURL), MaxNumberOfMessages: aws.Int64(10), VisibilityTimeout: aws.Int64(20), // Seconds WaitTimeSeconds: aws.Int64(20), // Seconds, maximum for long polling @@ -46,21 +56,47 @@ func NewSQSProvider(client sqsiface.SQSAPI, queueURL string) *SQSProvider { }, } - deleteMessageInput := &sqs.DeleteMessageInput{ - QueueUrl: aws.String(queueURL), - } - return &SQSProvider{ + createQueueInput: createQueueInput, + getQueueURLInput: getQueueURLInput, receiveMessageInput: receiveMessageInput, - deleteMessageInput: deleteMessageInput, client: client, + mutex: &sync.RWMutex{}, + } +} + +func (s *SQSProvider) DiscoverQueueURL(ctx context.Context) (string, error) { + s.mutex.RLock() + defer s.mutex.RUnlock() + if s.queueURL != "" { + return s.queueURL, nil + } + result, err := s.client.GetQueueUrlWithContext(ctx, s.getQueueURLInput) + if err != nil { + return "", fmt.Errorf("failed fetching queue url, %w", err) } + s.mutex.Lock() + defer s.mutex.Unlock() + s.queueURL = aws.StringValue(result.QueueUrl) + return aws.StringValue(result.QueueUrl), nil } func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("sqsClient.getMessages")) - result, err := s.client.ReceiveMessageWithContext(ctx, s.receiveMessageInput) + queueURL, err := s.DiscoverQueueURL(ctx) + if err != nil { + return nil, fmt.Errorf("failed getting sqs messages, %w", err) + } + + // Copy the input template and add the discovered queue url + input, err := deepCopy(s.receiveMessageInput) + if err != nil { + return nil, fmt.Errorf("error copying input, %w", err) + } + input.QueueUrl = aws.String(queueURL) + + result, err := s.client.ReceiveMessageWithContext(ctx, input) if err != nil { logging.FromContext(ctx). With("error", err). 
@@ -74,11 +110,15 @@ func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("sqsClient.deleteMessage")) - input, err := deepCopyDeleteMessage(s.deleteMessageInput) + queueURL, err := s.DiscoverQueueURL(ctx) if err != nil { - return fmt.Errorf("error copying delete message input, %w", err) + return fmt.Errorf("failed getting sqs messages, %w", err) + } + + input := &sqs.DeleteMessageInput{ + QueueUrl: aws.String(queueURL), + ReceiptHandle: msg.ReceiptHandle, } - input.ReceiptHandle = msg.ReceiptHandle _, err = s.client.DeleteMessageWithContext(ctx, input) if err != nil { @@ -90,17 +130,3 @@ func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) er return nil } - -func deepCopyDeleteMessage(input *sqs.DeleteMessageInput) (*sqs.DeleteMessageInput, error) { - var buf bytes.Buffer - enc := json.NewEncoder(&buf) - if err := enc.Encode(input); err != nil { - return nil, err - } - dec := json.NewDecoder(&buf) - var cp sqs.DeleteMessageInput - if err := dec.Decode(&cp); err != nil { - return nil, err - } - return &cp, nil -} From e369569ace80e3e160bfd319d451f21f891f115d Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 8 Sep 2022 16:14:42 -0700 Subject: [PATCH 04/55] Add queue policy to receive messages --- cmd/controller/main.go | 14 +-- pkg/cloudprovider/aws/cloudprovider.go | 21 ++-- .../controllers/infrastructure/controller.go | 11 +- .../controllers/notification/controller.go | 6 +- pkg/cloudprovider/aws/controllers/register.go | 22 ++++ .../aws/{controllers => events}/recorder.go | 6 +- pkg/cloudprovider/aws/iam.go | 15 +++ pkg/cloudprovider/aws/metadata.go | 46 ++++++++ pkg/cloudprovider/aws/sqs.go | 109 +++++++++++++++--- 9 files changed, 204 insertions(+), 46 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/register.go rename pkg/cloudprovider/aws/{controllers => events}/recorder.go (94%) create mode 100644 pkg/cloudprovider/aws/iam.go create mode 100644 pkg/cloudprovider/aws/metadata.go diff --git a/cmd/controller/main.go b/cmd/controller/main.go index bbc06ee7934f..e3924431cbc9 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -17,27 +17,17 @@ package main import ( "context" - "knative.dev/pkg/logging" - "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws" awscontrollers "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/controllers" ) func main() { controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, func(context.Context, *controllers.ControllerOptions)) { provider := aws.NewCloudProvider(ctx, options) - injectControllers := func(ctx context.Context, opts *controllers.ControllerOptions) { - recorder := awscontrollers.NewRecorder(opts.Recorder) - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) - - // Injecting the controllers that will start when opts.StartAsync is triggered - notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider, recorder, opts.Provisioner, opts.Cluster, opts.StartAsync) - infrastructure.NewController(ctx, opts.Clock, opts.KubeClient, recorder, opts.Cluster, opts.StartAsync) + return provider, func(c 
context.Context, opts *controllers.ControllerOptions) { + awscontrollers.Register(c, provider, opts) } - return provider, injectControllers }) } diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 6ce125a913ae..6d25156e82da 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -25,11 +25,11 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/client" - "github.com/aws/aws-sdk-go/aws/ec2metadata" "github.com/aws/aws-sdk-go/aws/endpoints" "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" + "github.com/aws/aws-sdk-go/service/iam" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/ssm" "github.com/patrickmn/go-cache" @@ -78,6 +78,7 @@ type CloudProvider struct { instanceProvider *InstanceProvider kubeClient k8sClient.Client sqsProvider *SQSProvider + iamProvider *IAMProvider } func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider { @@ -97,9 +98,10 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, ), ))) + metadata := NewMetadataProvider(sess) if *sess.Config.Region == "" { logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") - *sess.Config.Region = getRegionFromIMDS(sess) + *sess.Config.Region = metadata.Region(ctx) } logging.FromContext(ctx).Debugf("Using AWS region %s", *sess.Config.Region) @@ -108,11 +110,12 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud logging.FromContext(ctx).Errorf("Checking EC2 API connectivity, %s", err) } sqsapi := sqs.New(sess) + iamapi := iam.New(sess) subnetProvider := NewSubnetProvider(ec2api) instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) - // TODO: Change this queue url value to a useful value - sqsProvider := NewSQSProvider(sqsapi, "test-stack-Queue-VimlxX8fIySZ") + sqsProvider := NewSQSProvider(sqsapi, "new-queue5", *sess.Config.Region, metadata.AccountID(ctx)) + iamProvider := NewIAMProvider(iamapi) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, @@ -127,6 +130,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud ), ), sqsProvider: sqsProvider, + iamProvider: iamProvider, kubeClient: options.KubeClient, } v1alpha5.ValidateHook = cloudprovider.Validate @@ -256,15 +260,6 @@ func (c *CloudProvider) Name() string { return "aws" } -// get the current region from EC2 IMDS -func getRegionFromIMDS(sess *session.Session) string { - region, err := ec2metadata.New(sess).Region() - if err != nil { - panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) - } - return region -} - // withUserAgent adds a karpenter specific user-agent string to AWS session func withUserAgent(sess *session.Session) *session.Session { userAgent := fmt.Sprintf("karpenter.sh-%s", project.Version) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index a926a0f2a600..0cbf9b9b428e 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ 
-22,21 +22,21 @@ import ( "knative.dev/pkg/logging" "github.com/aws/karpenter/pkg/cloudprovider/aws" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" + "github.com/aws/karpenter/pkg/cloudprovider/aws/events" ) // Controller is the AWS infrastructure controller. It is not a standard controller-runtime controller in that it doesn't // have a reconcile method. type Controller struct { sqsProvider *aws.SQSProvider - recorder controllers.Recorder + recorder events.Recorder clock clock.Clock } // pollingPeriod that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned const pollingPeriod = 15 * time.Minute -func NewController(ctx context.Context, clk clock.Clock, recorder controllers.Recorder, +func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, startAsync <-chan struct{}) *Controller { c := &Controller{ recorder: recorder, @@ -44,6 +44,11 @@ func NewController(ctx context.Context, clk clock.Clock, recorder controllers.Re sqsProvider: sqsProvider, } + err := sqsProvider.CreateQueue(ctx) + if err != nil { + logging.FromContext(ctx).Errorf("Creating SQS queue with policy, %v", err) + } + go func() { select { case <-ctx.Done(): diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index d8147b3c6a3b..9f1f9ec2a39e 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -29,9 +29,9 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" + "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/controllers/provisioning" "github.com/aws/karpenter/pkg/controllers/state" ) @@ -54,7 +54,7 @@ type Controller struct { kubeClient client.Client provisioner *provisioning.Provisioner cluster *state.Cluster - recorder controllers.Recorder + recorder events.Recorder clock clock.Clock provider *aws.SQSProvider parser event.Parser @@ -64,7 +64,7 @@ type Controller struct { const pollingPeriod = 2 * time.Second func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, sqsProvider *aws.SQSProvider, - recorder controllers.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, startAsync <-chan struct{}) *Controller { + recorder events.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, startAsync <-chan struct{}) *Controller { c := &Controller{ kubeClient: kubeClient, provisioner: provisioner, diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go new file mode 100644 index 000000000000..f5e1ff69f534 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -0,0 +1,22 @@ +package controllers + +import ( + "context" + + "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" + "github.com/aws/karpenter/pkg/cloudprovider/aws/events" + "github.com/aws/karpenter/pkg/controllers" +) + +func Register(ctx context.Context, provider 
*aws.CloudProvider, opts *controllers.ControllerOptions) { + rec := events.NewRecorder(opts.Recorder) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) + + // Injecting the controllers that will start when opts.StartAsync is triggered + notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider(), rec, opts.Provisioner, opts.Cluster, opts.StartAsync) + infrastructure.NewController(ctx, opts.Clock, rec, provider.SQSProvider(), opts.StartAsync) +} diff --git a/pkg/cloudprovider/aws/controllers/recorder.go b/pkg/cloudprovider/aws/events/recorder.go similarity index 94% rename from pkg/cloudprovider/aws/controllers/recorder.go rename to pkg/cloudprovider/aws/events/recorder.go index dd4ad33535d3..cfc3f0943d73 100644 --- a/pkg/cloudprovider/aws/controllers/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -12,11 +12,13 @@ See the License for the specific language governing permissions and limitations under the License. */ -package controllers +package events import ( v1 "k8s.io/api/core/v1" "k8s.io/client-go/tools/record" + + "github.com/aws/karpenter/pkg/events" ) type recorder struct { @@ -34,7 +36,7 @@ type Recorder interface { EC2HealthWarning(*v1.Node) } -func NewRecorder(r record.EventRecorder) Recorder { +func NewRecorder(r events.Recorder) Recorder { return recorder{ EventRecorder: r, } diff --git a/pkg/cloudprovider/aws/iam.go b/pkg/cloudprovider/aws/iam.go new file mode 100644 index 000000000000..069c91d6b938 --- /dev/null +++ b/pkg/cloudprovider/aws/iam.go @@ -0,0 +1,15 @@ +package aws + +import ( + "github.com/aws/aws-sdk-go/service/iam/iamiface" +) + +type IAMProvider struct { + client iamiface.IAMAPI +} + +func NewIAMProvider(api iamiface.IAMAPI) *IAMProvider { + return &IAMProvider{ + client: api, + } +} diff --git a/pkg/cloudprovider/aws/metadata.go b/pkg/cloudprovider/aws/metadata.go new file mode 100644 index 000000000000..0322dd09fa7e --- /dev/null +++ b/pkg/cloudprovider/aws/metadata.go @@ -0,0 +1,46 @@ +package aws + +import ( + "context" + "fmt" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/ec2metadata" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/sts" + "github.com/aws/aws-sdk-go/service/sts/stsiface" +) + +type MetadataProvider struct { + imdsClient *ec2metadata.EC2Metadata + stsClient stsiface.STSAPI +} + +func NewMetadataProvider(sess *session.Session) *MetadataProvider { + return &MetadataProvider{ + imdsClient: ec2metadata.New(sess), + stsClient: sts.New(sess), + } +} + +// Region gets the current region from EC2 IMDS +func (i *MetadataProvider) Region(ctx context.Context) string { + region, err := i.imdsClient.RegionWithContext(ctx) + if err != nil { + panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) + } + return region +} + +func (i *MetadataProvider) AccountID(ctx context.Context) string { + doc, err := i.imdsClient.GetInstanceIdentityDocumentWithContext(ctx) + if err != nil { + // Fallback to using the STS provider if IMDS fails + result, err := i.stsClient.GetCallerIdentity(&sts.GetCallerIdentityInput{}) + if err != nil { + panic(fmt.Sprintf("Failed to get account ID from IMDS or STS, %s", err)) + } + return aws.StringValue(result.Account) + } + return doc.AccountID +} diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index 1e2f2e49deb9..aca8869268e1 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -16,12 +16,14 @@ package aws import ( "context" + "encoding/json" "fmt" 
"sync" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/sqs/sqsiface" + "github.com/samber/lo" "knative.dev/pkg/logging" ) @@ -32,19 +34,50 @@ type SQSProvider struct { client sqsiface.SQSAPI mutex *sync.RWMutex queueURL string + queueName string + metadata *AccountMetadata } -func NewSQSProvider(client sqsiface.SQSAPI, queueName string) *SQSProvider { - createQueueInput := &sqs.CreateQueueInput{ - Attributes: map[string]*string{ - sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("300"), +type AccountMetadata struct { + region string + accountID string +} + +type QueuePolicy struct { + Version string `json:"Version"` + Id string `json:"Id"` + Statement []QueuePolicyStatement `json:"Statement"` +} + +type QueuePolicyStatement struct { + Effect string `json:"Effect"` + Principal Principal `json:"Principal"` + Action []string `json:"Action"` + Resource string `json:"Resource"` +} + +type Principal struct { + Service []string `json:"Service"` +} + +func NewSQSProvider(client sqsiface.SQSAPI, queueName, region, accountID string) *SQSProvider { + provider := &SQSProvider{ + client: client, + mutex: &sync.RWMutex{}, + queueName: queueName, + metadata: &AccountMetadata{ + region: region, + accountID: accountID, }, - QueueName: aws.String(queueName), } - getQueueURLInput := &sqs.GetQueueUrlInput{ + provider.createQueueInput = &sqs.CreateQueueInput{ + Attributes: provider.getQueueAttributes(), + QueueName: aws.String(queueName), + } + provider.getQueueURLInput = &sqs.GetQueueUrlInput{ QueueName: aws.String(queueName), } - receiveMessageInput := &sqs.ReceiveMessageInput{ + provider.receiveMessageInput = &sqs.ReceiveMessageInput{ MaxNumberOfMessages: aws.Int64(10), VisibilityTimeout: aws.Int64(20), // Seconds WaitTimeSeconds: aws.Int64(20), // Seconds, maximum for long polling @@ -55,16 +88,38 @@ func NewSQSProvider(client sqsiface.SQSAPI, queueName string) *SQSProvider { aws.String(sqs.QueueAttributeNameAll), }, } + return provider +} - return &SQSProvider{ - createQueueInput: createQueueInput, - getQueueURLInput: getQueueURLInput, - receiveMessageInput: receiveMessageInput, - client: client, - mutex: &sync.RWMutex{}, +func (s *SQSProvider) CreateQueue(ctx context.Context) error { + result, err := s.client.CreateQueueWithContext(ctx, s.createQueueInput) + if err != nil { + return fmt.Errorf("failed to create SQS queue, %w", err) } + s.mutex.Lock() + defer s.mutex.Unlock() + s.queueURL = aws.StringValue(result.QueueUrl) + return nil } +func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { + return nil +} + +//func (s *SQSProvider) CreateQueuePolicy(ctx context.Context) error { +// queueURL, err := s.DiscoverQueueURL(ctx) +// if err != nil { +// return fmt.Errorf("failed getting sqs messages, %w", err) +// } +// _, err = s.client.SetQueueAttributesWithContext(ctx, &sqs.SetQueueAttributesInput{ +// Attributes: +// }) +// if err != nil { +// return fmt.Errorf("failed to create SQS policy, %w", err) +// } +// return nil +//} + func (s *SQSProvider) DiscoverQueueURL(ctx context.Context) (string, error) { s.mutex.RLock() defer s.mutex.RUnlock() @@ -130,3 +185,31 @@ func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) er return nil } + +func (s *SQSProvider) getQueueAttributes() map[string]*string { + policy := lo.Must(json.Marshal(s.getQueuePolicy())) + return map[string]*string{ + sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("300"), + sqs.QueueAttributeNamePolicy: 
aws.String(string(policy)), + } +} + +func (s *SQSProvider) getQueuePolicy() *QueuePolicy { + return &QueuePolicy{ + Version: "2008-10-17", + Id: "EC2NotificationPolicy", + Statement: []QueuePolicyStatement{ + { + Effect: "Allow", + Principal: Principal{ + Service: []string{ + "events.amazonaws.com", + "sqs.amazonaws.com", + }, + }, + Action: []string{"sqs:SendMessage"}, + Resource: fmt.Sprintf("arn:aws:sqs:%s:%s:%s", s.metadata.region, s.metadata.accountID, s.queueName), + }, + }, + } +} From 6c568fa610874cf6565d2ea4a8142afa171d1e56 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 8 Sep 2022 17:08:04 -0700 Subject: [PATCH 05/55] Add SQS and EventBridge providers with SQS policy --- pkg/apis/provisioning/v1alpha5/labels.go | 2 +- pkg/cloudprovider/aws/cloudprovider.go | 29 ++-- .../controllers/infrastructure/controller.go | 20 ++- pkg/cloudprovider/aws/controllers/register.go | 2 +- pkg/cloudprovider/aws/eventbridge.go | 136 ++++++++++++++++++ pkg/cloudprovider/aws/iam.go | 15 -- pkg/cloudprovider/aws/sqs.go | 76 +++++----- 7 files changed, 209 insertions(+), 71 deletions(-) create mode 100644 pkg/cloudprovider/aws/eventbridge.go delete mode 100644 pkg/cloudprovider/aws/iam.go diff --git a/pkg/apis/provisioning/v1alpha5/labels.go b/pkg/apis/provisioning/v1alpha5/labels.go index 5f38819d5d46..712145e2f4d1 100644 --- a/pkg/apis/provisioning/v1alpha5/labels.go +++ b/pkg/apis/provisioning/v1alpha5/labels.go @@ -31,9 +31,9 @@ var ( KarpenterLabelDomain = "karpenter.sh" ProvisionerNameLabelKey = Group + "/provisioner-name" + DiscoveryLabelKey = Group + "/discovery" DoNotEvictPodAnnotationKey = Group + "/do-not-evict" DoNotConsolidateNodeAnnotationKey = KarpenterLabelDomain + "/do-not-consolidate" - MarkedForDeletionAnnotationKey = KarpenterLabelDomain + "/marked-for-delete" EmptinessTimestampAnnotationKey = Group + "/emptiness-timestamp" TerminationFinalizer = Group + "/termination" diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 6d25156e82da..3e992f3b393e 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -29,7 +29,7 @@ import ( "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" - "github.com/aws/aws-sdk-go/service/iam" + "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/ssm" "github.com/patrickmn/go-cache" @@ -78,7 +78,12 @@ type CloudProvider struct { instanceProvider *InstanceProvider kubeClient k8sClient.Client sqsProvider *SQSProvider - iamProvider *IAMProvider + eventBridgeProvider *EventBridgeProvider +} + +type Metadata struct { + region string + accountID string } func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider { @@ -109,13 +114,15 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud if err := checkEC2Connectivity(ec2api); err != nil { logging.FromContext(ctx).Errorf("Checking EC2 API connectivity, %s", err) } - sqsapi := sqs.New(sess) - iamapi := iam.New(sess) subnetProvider := NewSubnetProvider(ec2api) instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) - sqsProvider := NewSQSProvider(sqsapi, "new-queue5", *sess.Config.Region, metadata.AccountID(ctx)) - iamProvider := NewIAMProvider(iamapi) + m := &Metadata{ + region: *sess.Config.Region, + accountID: metadata.AccountID(ctx), + } + sqsProvider := NewSQSProvider(ctx, 
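
Marshalled, the getQueuePolicy document attached to the queue comes out roughly as follows (pretty-printed; the region, account ID, and queue name shown are placeholder values, not taken from this patch):

{
  "Version": "2008-10-17",
  "Id": "EC2NotificationPolicy",
  "Statement": [
    {
      "Effect": "Allow",
      "Principal": { "Service": ["events.amazonaws.com", "sqs.amazonaws.com"] },
      "Action": ["sqs:SendMessage"],
      "Resource": "arn:aws:sqs:us-west-2:111122223333:new-queue5"
    }
  ]
}

This grants the events.amazonaws.com and sqs.amazonaws.com service principals permission to send messages to the queue; the controller's own receive and delete permissions come from the IAM policy added to the CloudFormation template later in this series.
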
sqs.New(sess), m) + eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), m, sqsProvider.queueName) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, @@ -129,9 +136,9 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud options.StartAsync, ), ), - sqsProvider: sqsProvider, - iamProvider: iamProvider, - kubeClient: options.KubeClient, + sqsProvider: sqsProvider, + eventBridgeProvider: eventBridgeProvider, + kubeClient: options.KubeClient, } v1alpha5.ValidateHook = cloudprovider.Validate v1alpha5.DefaultHook = cloudprovider.Default @@ -228,6 +235,10 @@ func (c *CloudProvider) SQSProvider() *SQSProvider { return c.sqsProvider } +func (c *CloudProvider) EventBridgeProvider() *EventBridgeProvider { + return c.eventBridgeProvider +} + // Default the provisioner func (*CloudProvider) Default(ctx context.Context, provisioner *v1alpha5.Provisioner) { defaultLabels(provisioner) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 0cbf9b9b428e..7959e253ac2b 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -28,26 +28,32 @@ import ( // Controller is the AWS infrastructure controller. It is not a standard controller-runtime controller in that it doesn't // have a reconcile method. type Controller struct { - sqsProvider *aws.SQSProvider - recorder events.Recorder - clock clock.Clock + sqsProvider *aws.SQSProvider + eventBridgeProvider *aws.EventBridgeProvider + recorder events.Recorder + clock clock.Clock } // pollingPeriod that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned const pollingPeriod = 15 * time.Minute func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorder, - sqsProvider *aws.SQSProvider, startAsync <-chan struct{}) *Controller { + sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}) *Controller { c := &Controller{ - recorder: recorder, - clock: clk, - sqsProvider: sqsProvider, + recorder: recorder, + clock: clk, + sqsProvider: sqsProvider, + eventBridgeProvider: eventBridgeProvider, } err := sqsProvider.CreateQueue(ctx) if err != nil { logging.FromContext(ctx).Errorf("Creating SQS queue with policy, %v", err) } + err = eventBridgeProvider.CreateEC2NotificationRules(ctx) + if err != nil { + logging.FromContext(ctx).Errorf("Creating event bridge notification rules, %v", err) + } go func() { select { diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index f5e1ff69f534..419e40deff16 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -18,5 +18,5 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller // Injecting the controllers that will start when opts.StartAsync is triggered notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider(), rec, opts.Provisioner, opts.Cluster, opts.StartAsync) - infrastructure.NewController(ctx, opts.Clock, rec, provider.SQSProvider(), opts.StartAsync) + infrastructure.NewController(ctx, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) } diff --git a/pkg/cloudprovider/aws/eventbridge.go 
b/pkg/cloudprovider/aws/eventbridge.go new file mode 100644 index 000000000000..80689f101be2 --- /dev/null +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -0,0 +1,136 @@ +package aws + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/request" + "github.com/aws/aws-sdk-go/service/eventbridge" + "github.com/samber/lo" + "go.uber.org/multierr" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/utils/injection" +) + +type EventBridgeClient interface { + PutRuleWithContext(context.Context, *eventbridge.PutRuleInput, ...request.Option) (*eventbridge.PutRuleOutput, error) + PutTargetsWithContext(context.Context, *eventbridge.PutTargetsInput, ...request.Option) (*eventbridge.PutTargetsOutput, error) +} + +type EventBridgeProvider struct { + EventBridgeClient + queueName string + metadata *Metadata +} + +type EventRule struct { + Name string + Pattern *EventPattern + Target *EventTarget +} + +type EventTarget struct { + ID string + ARN string +} + +type EventPattern struct { + Source []string `json:"source,omitempty"` + DetailType []string `json:"detailType,omitempty"` +} + +func (ep *EventPattern) Serialize() []byte { + return lo.Must(json.Marshal(ep)) +} + +func NewEventBridgeProvider(eb EventBridgeClient, metadata *Metadata, queueName string) *EventBridgeProvider { + return &EventBridgeProvider{ + EventBridgeClient: eb, + metadata: metadata, + queueName: queueName, + } +} + +func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) error { + var err error + for _, rule := range eb.getEC2NotificationEventRules(ctx) { + _, e := eb.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ + Name: aws.String(rule.Name), + EventPattern: aws.String(string(rule.Pattern.Serialize())), + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryLabelKey), + Value: aws.String(injection.GetOptions(ctx).ClusterName), + }, + }, + }) + err = multierr.Append(err, e) + _, e = eb.PutTargetsWithContext(ctx, &eventbridge.PutTargetsInput{ + Rule: aws.String(rule.Name), + Targets: []*eventbridge.Target{ + { + Id: aws.String(rule.Target.ID), + Arn: aws.String(rule.Target.ARN), + }, + }, + }) + err = multierr.Append(err, e) + } + return err +} + +func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) []EventRule { + return []EventRule{ + { + Name: fmt.Sprintf("%s-ScheduledChangeRule", injection.GetOptions(ctx).ClusterName), + Pattern: &EventPattern{ + Source: []string{"aws.health"}, + DetailType: []string{"AWS Health Event"}, + }, + Target: &EventTarget{ + ID: "1", + ARN: eb.getQueueARN(), + }, + }, + { + Name: fmt.Sprintf("%s-SpotTerminationRule", injection.GetOptions(ctx).ClusterName), + Pattern: &EventPattern{ + Source: []string{"aws.ec2"}, + DetailType: []string{"EC2 Spot Instance Interruption Warning"}, + }, + Target: &EventTarget{ + ID: "1", + ARN: eb.getQueueARN(), + }, + }, + { + Name: fmt.Sprintf("%s-RebalanceRule", injection.GetOptions(ctx).ClusterName), + Pattern: &EventPattern{ + Source: []string{"aws.ec2"}, + DetailType: []string{"EC2 Instance Rebalance Recommendation"}, + }, + Target: &EventTarget{ + ID: "1", + ARN: eb.getQueueARN(), + }, + }, + { + Name: fmt.Sprintf("%s-InstanceStateChangeRule", injection.GetOptions(ctx).ClusterName), + Pattern: &EventPattern{ + Source: []string{"aws.ec2"}, + DetailType: []string{"EC2 Instance State-change Notification"}, + }, + Target: &EventTarget{ + ID: "1", + ARN: eb.getQueueARN(), + }, + }, + } +} + +func (eb 
*EventBridgeProvider) getQueueARN() string { + return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", eb.metadata.region, eb.metadata.accountID, eb.queueName) +} diff --git a/pkg/cloudprovider/aws/iam.go b/pkg/cloudprovider/aws/iam.go deleted file mode 100644 index 069c91d6b938..000000000000 --- a/pkg/cloudprovider/aws/iam.go +++ /dev/null @@ -1,15 +0,0 @@ -package aws - -import ( - "github.com/aws/aws-sdk-go/service/iam/iamiface" -) - -type IAMProvider struct { - client iamiface.IAMAPI -} - -func NewIAMProvider(api iamiface.IAMAPI) *IAMProvider { - return &IAMProvider{ - client: api, - } -} diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index aca8869268e1..964a9b1513ed 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -21,31 +21,37 @@ import ( "sync" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/sqs" - "github.com/aws/aws-sdk-go/service/sqs/sqsiface" "github.com/samber/lo" "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/utils/injection" ) +type SQSClient interface { + CreateQueueWithContext(context.Context, *sqs.CreateQueueInput, ...request.Option) (*sqs.CreateQueueOutput, error) + GetQueueUrlWithContext(context.Context, *sqs.GetQueueUrlInput, ...request.Option) (*sqs.GetQueueUrlOutput, error) + ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) + DeleteMessageWithContext(context.Context, *sqs.DeleteMessageInput, ...request.Option) (*sqs.DeleteMessageOutput, error) +} + type SQSProvider struct { + SQSClient + createQueueInput *sqs.CreateQueueInput getQueueURLInput *sqs.GetQueueUrlInput receiveMessageInput *sqs.ReceiveMessageInput - client sqsiface.SQSAPI mutex *sync.RWMutex queueURL string queueName string - metadata *AccountMetadata -} - -type AccountMetadata struct { - region string - accountID string + metadata *Metadata } type QueuePolicy struct { Version string `json:"Version"` - Id string `json:"Id"` + ID string `json:"Id"` Statement []QueuePolicyStatement `json:"Statement"` } @@ -60,22 +66,22 @@ type Principal struct { Service []string `json:"Service"` } -func NewSQSProvider(client sqsiface.SQSAPI, queueName, region, accountID string) *SQSProvider { +func NewSQSProvider(ctx context.Context, client SQSClient, metadata *Metadata) *SQSProvider { provider := &SQSProvider{ - client: client, + SQSClient: client, mutex: &sync.RWMutex{}, - queueName: queueName, - metadata: &AccountMetadata{ - region: region, - accountID: accountID, - }, + queueName: getName(ctx), + metadata: metadata, } provider.createQueueInput = &sqs.CreateQueueInput{ Attributes: provider.getQueueAttributes(), - QueueName: aws.String(queueName), + QueueName: aws.String(provider.queueName), + Tags: map[string]*string{ + v1alpha5.DiscoveryLabelKey: aws.String(injection.GetOptions(ctx).ClusterName), + }, } provider.getQueueURLInput = &sqs.GetQueueUrlInput{ - QueueName: aws.String(queueName), + QueueName: aws.String(provider.queueName), } provider.receiveMessageInput = &sqs.ReceiveMessageInput{ MaxNumberOfMessages: aws.Int64(10), @@ -92,7 +98,7 @@ func NewSQSProvider(client sqsiface.SQSAPI, queueName, region, accountID string) } func (s *SQSProvider) CreateQueue(ctx context.Context) error { - result, err := s.client.CreateQueueWithContext(ctx, s.createQueueInput) + result, err := s.CreateQueueWithContext(ctx, s.createQueueInput) if err != nil { return fmt.Errorf("failed to 
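
Replacing the broad sqsiface.SQSAPI dependency with the narrow SQSClient interface limits the provider's surface to the handful of calls it actually makes, which also keeps test doubles small. A hedged sketch of such a fake (not part of this patch, and the repository's real test fakes may differ; imports of context, aws/request, and service/sqs are elided):

type fakeSQS struct {
	SQSClient // embed the interface; methods the test does not override panic if called
	messages  []*sqs.Message
}

func (f *fakeSQS) ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) {
	return &sqs.ReceiveMessageOutput{Messages: f.messages}, nil
}
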
create SQS queue, %w", err) } @@ -106,27 +112,13 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { return nil } -//func (s *SQSProvider) CreateQueuePolicy(ctx context.Context) error { -// queueURL, err := s.DiscoverQueueURL(ctx) -// if err != nil { -// return fmt.Errorf("failed getting sqs messages, %w", err) -// } -// _, err = s.client.SetQueueAttributesWithContext(ctx, &sqs.SetQueueAttributesInput{ -// Attributes: -// }) -// if err != nil { -// return fmt.Errorf("failed to create SQS policy, %w", err) -// } -// return nil -//} - func (s *SQSProvider) DiscoverQueueURL(ctx context.Context) (string, error) { s.mutex.RLock() defer s.mutex.RUnlock() if s.queueURL != "" { return s.queueURL, nil } - result, err := s.client.GetQueueUrlWithContext(ctx, s.getQueueURLInput) + result, err := s.GetQueueUrlWithContext(ctx, s.getQueueURLInput) if err != nil { return "", fmt.Errorf("failed fetching queue url, %w", err) } @@ -151,7 +143,7 @@ func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error } input.QueueUrl = aws.String(queueURL) - result, err := s.client.ReceiveMessageWithContext(ctx, input) + result, err := s.ReceiveMessageWithContext(ctx, input) if err != nil { logging.FromContext(ctx). With("error", err). @@ -175,7 +167,7 @@ func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) er ReceiptHandle: msg.ReceiptHandle, } - _, err = s.client.DeleteMessageWithContext(ctx, input) + _, err = s.DeleteMessageWithContext(ctx, input) if err != nil { logging.FromContext(ctx). With("error", err). @@ -197,7 +189,7 @@ func (s *SQSProvider) getQueueAttributes() map[string]*string { func (s *SQSProvider) getQueuePolicy() *QueuePolicy { return &QueuePolicy{ Version: "2008-10-17", - Id: "EC2NotificationPolicy", + ID: "EC2NotificationPolicy", Statement: []QueuePolicyStatement{ { Effect: "Allow", @@ -208,8 +200,16 @@ func (s *SQSProvider) getQueuePolicy() *QueuePolicy { }, }, Action: []string{"sqs:SendMessage"}, - Resource: fmt.Sprintf("arn:aws:sqs:%s:%s:%s", s.metadata.region, s.metadata.accountID, s.queueName), + Resource: s.getQueueARN(), }, }, } } + +func (s *SQSProvider) getQueueARN() string { + return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", s.metadata.region, s.metadata.accountID, s.queueName) +} + +func getName(ctx context.Context) string { + return fmt.Sprintf("Karpenter-%s-Queue", injection.GetOptions(ctx).ClusterName) +} From 803527836b879ab41029f20ce4edfcf237478531 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 8 Sep 2022 17:14:05 -0700 Subject: [PATCH 06/55] Add new required policy to cloudformation for controller --- .../cloudformation.yaml | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml index c091c5c2d807..d6c7cc347265 100644 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml @@ -64,3 +64,30 @@ Resources: Action: - iam:PassRole Resource: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/KarpenterNodeRole-${ClusterName}" + KarpenterEventPolicy: + Type: AWS::IAM::ManagedPolicy + Properties: + ManagedPolicyName: !Sub "KarpenterEventPolicy-${ClusterName}" + PolicyDocument: + Version: "2012-10-17" + Statement: + - Effect: Allow + Resource: "*" + Action: + # Write Operations + - 
sqs:CreateQueue + - sqs:DeleteMessage + - sqs:DeleteQueue + - events:PutRule + - events:PutTargets + - events:DeleteRule + - events:RemoveTargets + # Read Operations + - sqs:GetQueueUrl + - sqs:ReceiveMessage + - events:ListRules + - events:DescribeRule + - Effect: Allow + Action: + - iam:PassRole + Resource: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/KarpenterNodeRole-${ClusterName}" From df8702249fcf66ba56b732014d70e2fcff36ae27 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 15 Sep 2022 15:23:38 -0700 Subject: [PATCH 07/55] Some code cleanup in notification controller --- .../controllers/infrastructure/controller.go | 5 +- .../controllers/notification/controller.go | 18 ++++--- .../event/statechange/v0/parser.go | 2 +- .../controllers/notification/event/types.go | 12 ++--- pkg/cloudprovider/aws/controllers/register.go | 14 ++++++ pkg/cloudprovider/aws/eventbridge.go | 14 ++++++ pkg/cloudprovider/aws/helpers.go | 14 ++++++ pkg/cloudprovider/aws/metadata.go | 14 ++++++ pkg/cloudprovider/fake/cloudprovider.go | 4 -- pkg/controllers/consolidation/controller.go | 48 ------------------- 10 files changed, 73 insertions(+), 72 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 7959e253ac2b..9b41de8a869a 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -18,7 +18,7 @@ import ( "context" "time" - "k8s.io/apimachinery/pkg/util/clock" + "k8s.io/utils/clock" "knative.dev/pkg/logging" "github.com/aws/karpenter/pkg/cloudprovider/aws" @@ -81,6 +81,5 @@ func (c *Controller) run(ctx context.Context) { } } -func (c *Controller) ensureInfrastructure(ctx context.Context) error { - return nil +func (c *Controller) ensureInfrastructure(ctx context.Context) { } diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 9f1f9ec2a39e..e45262e53784 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -23,7 +23,7 @@ import ( sqsapi "github.com/aws/aws-sdk-go/service/sqs" "go.uber.org/multierr" v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/util/clock" + "k8s.io/utils/clock" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" @@ -40,11 +40,9 @@ type Action = string var Actions = struct { CordonAndDrain, - Cordon, NoAction Action }{ CordonAndDrain: "CordonAndDrain", - Cordon: "Cordon", NoAction: "NoAction", } @@ -126,7 +124,7 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string return nil } evt := c.parser.Parse(ctx, *msg.Body) - evtAction := actionForEvent(evt) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("event", evt.Kind())) nodes := getInvolvedNodes(evt.EC2InstanceIDs(), instanceIDMap) action := actionForEvent(evt) @@ -134,9 +132,8 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string for i := range nodes { node := nodes[i] c.notifyForEvent(evt, node) - if action != Actions.NoAction { - e := c.handleInstance(ctx, node, evtAction) + e := c.deleteInstance(ctx, node) err = multierr.Append(err, e) } } @@ -146,10 +143,10 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string return c.provider.DeleteSQSMessage(ctx, msg) } -// TODO: Handle the instance appropriately, this should be handled with a 
batcher potentially -func (c *Controller) handleInstance(ctx context.Context, node *v1.Node, _ Action) error { +func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) - logging.FromContext(ctx).Infof("Terminating node due to spot interruption warning") + logging.FromContext(ctx).Infof("Queue notification triggered ") + if err := c.kubeClient.Delete(ctx, node); err != nil { return fmt.Errorf("deleting the spot interrupted node, %w", err) } @@ -187,7 +184,6 @@ func actionForEvent(evt event.Interface) Action { case event.Kinds.SpotInterruption: return Actions.CordonAndDrain - // TODO: understand what the state change action is case event.Kinds.StateChange: return Actions.NoAction @@ -196,6 +192,8 @@ func actionForEvent(evt event.Interface) Action { } } +// getInvolvedNodes gets all the nodes that are involved in an event based +// on the instanceIDs passed in from the event func getInvolvedNodes(instanceIDs []string, instanceIDMap map[string]*v1.Node) []*v1.Node { var nodes []*v1.Node for _, id := range instanceIDs { diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go index 629944973e4b..113c86e97079 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go @@ -31,7 +31,7 @@ const ( acceptedStates = "stopping,stopped,shutting-down,terminated" ) -var acceptedStatesList = strings.Split(acceptedStates, ",") +//var acceptedStatesList = strings.Split(acceptedStates, ",") type Parser struct{} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/types.go b/pkg/cloudprovider/aws/controllers/notification/event/types.go index 09e8be62c20a..3abe899f9d1e 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/types.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/types.go @@ -41,10 +41,10 @@ var Kinds = struct { StateChange, Noop Kind }{ - AutoScalingTermination: Kind("autoScalingTermination"), - RebalanceRecommendation: Kind("rebalanceRecommendation"), - ScheduledChange: Kind("scheduledChange"), - SpotInterruption: Kind("spotInterruption"), - StateChange: Kind("stateChange"), - Noop: Kind("noop"), + AutoScalingTermination: "autoScalingTermination", + RebalanceRecommendation: "rebalanceRecommendation", + ScheduledChange: "scheduledChange", + SpotInterruption: "spotInterruption", + StateChange: "stateChange", + Noop: "noop", } diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 419e40deff16..caf8ba0af9ca 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -1,3 +1,17 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
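
handleMessage resolves the EC2 instance IDs carried by an event to nodes through the instanceIDMap passed in; that map is built outside this hunk. For AWS nodes the instance ID is the last path segment of spec.providerID (for example aws:///us-west-2a/i-0123456789abcdef0), so a minimal extraction helper could look like the sketch below (illustrative only, not the committed helper):

// instanceIDFromProviderID pulls the EC2 instance ID out of a node's
// spec.providerID; uses the standard library "strings" package.
func instanceIDFromProviderID(providerID string) string {
	parts := strings.Split(providerID, "/")
	return parts[len(parts)-1]
}
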
+*/ + package controllers import ( diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 80689f101be2..d299cc65f9e1 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -1,3 +1,17 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package aws import ( diff --git a/pkg/cloudprovider/aws/helpers.go b/pkg/cloudprovider/aws/helpers.go index a2dbd7114df5..0838e0ef17da 100644 --- a/pkg/cloudprovider/aws/helpers.go +++ b/pkg/cloudprovider/aws/helpers.go @@ -1,3 +1,17 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package aws import ( diff --git a/pkg/cloudprovider/aws/metadata.go b/pkg/cloudprovider/aws/metadata.go index 0322dd09fa7e..acde880e599f 100644 --- a/pkg/cloudprovider/aws/metadata.go +++ b/pkg/cloudprovider/aws/metadata.go @@ -1,3 +1,17 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + package aws import ( diff --git a/pkg/cloudprovider/fake/cloudprovider.go b/pkg/cloudprovider/fake/cloudprovider.go index b624ebe3f464..c4d23193aed7 100644 --- a/pkg/cloudprovider/fake/cloudprovider.go +++ b/pkg/cloudprovider/fake/cloudprovider.go @@ -45,10 +45,6 @@ type CloudProvider struct { var _ cloudprovider.CloudProvider = (*CloudProvider)(nil) var _ cloudprovider.InstanceType = (*InstanceType)(nil) -func (c *CloudProvider) InjectControllers(ctx context.Context, opts cloudprovider.ControllerOptions) error { - return nil -} - func (c *CloudProvider) Create(ctx context.Context, nodeRequest *cloudprovider.NodeRequest) (*v1.Node, error) { c.mu.Lock() c.CreateCalls = append(c.CreateCalls, nodeRequest) diff --git a/pkg/controllers/consolidation/controller.go b/pkg/controllers/consolidation/controller.go index 020eb95ddf3b..2a78a81a4a7d 100644 --- a/pkg/controllers/consolidation/controller.go +++ b/pkg/controllers/consolidation/controller.go @@ -468,54 +468,6 @@ func (c *Controller) calculateLifetimeRemaining(node candidateNode) float64 { return remaining } -func (c *Controller) spotTerminationOptionReplace(ctx context.Context, nodes []candidateNode) (consolidationAction, error) { - var stateNodes []*state.Node - c.cluster.ForEachNode(func(n *state.Node) bool { - stateNodes = append(stateNodes, n.DeepCopy()) - return true - }) - var nodeNames []string - var pods []*v1.Pod - for _, node := range nodes { - nodeNames = append(nodeNames, node.Name) - pods = append(pods, node.pods...) - } - scheduler, err := c.provisioner.NewScheduler(ctx, pods, stateNodes, scheduling.SchedulerOptions{ - SimulationMode: true, - ExcludeNodes: nodeNames, - }) - if err != nil { - return consolidationAction{result: consolidateResultUnknown}, fmt.Errorf("creating scheduler, %w", err) - } - - newNodes, inflightNodes, err := scheduler.Solve(ctx, pods) - if err != nil { - return consolidationAction{result: consolidateResultUnknown}, fmt.Errorf("simulating scheduling, %w", err) - } - - // were we able to schedule all the pods on the inflight nodes? 
- // delete all the nodes that are going to be deleted by spot interruption - if len(newNodes) == 0 { - schedulableCount := 0 - for _, inflight := range inflightNodes { - schedulableCount += len(inflight.Pods) - } - if len(pods) == schedulableCount { - return consolidationAction{ - oldNodes: lo.Map(nodes, func(n candidateNode, _ int) *v1.Node { return n.Node }), - disruptionCost: disruptionCost(ctx, pods), - result: consolidateResultDelete, - }, nil - } - } - return consolidationAction{ - oldNodes: lo.Map(nodes, func(n candidateNode, _ int) *v1.Node { return n.Node }), - disruptionCost: disruptionCost(ctx, pods), - result: consolidateResultReplace, - replacementNodes: newNodes, - }, nil -} - // nolint:gocyclo func (c *Controller) nodeConsolidationOptionReplaceOrDelete(ctx context.Context, node candidateNode) (consolidationAction, error) { defer metrics.Measure(consolidationDurationHistogram.WithLabelValues("Replace/Delete"))() From 15c07956e2836e3d96283c307c352c436b0ae4cb Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 15 Sep 2022 16:32:50 -0700 Subject: [PATCH 08/55] Add a signaler for infrastructure setup --- .../controllers/infrastructure/controller.go | 90 +++++++++++++++---- .../controllers/notification/controller.go | 36 +++++--- pkg/cloudprovider/aws/controllers/register.go | 4 +- pkg/cloudprovider/aws/sqs.go | 14 +++ 4 files changed, 113 insertions(+), 31 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 9b41de8a869a..ec76f4dc2412 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -16,8 +16,11 @@ package infrastructure import ( "context" + "fmt" + "sync" "time" + "golang.org/x/sync/errgroup" "k8s.io/utils/clock" "knative.dev/pkg/logging" @@ -32,10 +35,17 @@ type Controller struct { eventBridgeProvider *aws.EventBridgeProvider recorder events.Recorder clock clock.Clock + + ready bool + mutex *sync.RWMutex + readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state } -// pollingPeriod that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned -const pollingPeriod = 15 * time.Minute +// pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned +const pollingPeriod = time.Hour + +// backoffPeriod is the period that we go to AWS APIs if we fail to ensure the infrastructure exists +const backoffPeriod = time.Minute func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}) *Controller { @@ -44,15 +54,9 @@ func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorde clock: clk, sqsProvider: sqsProvider, eventBridgeProvider: eventBridgeProvider, - } - - err := sqsProvider.CreateQueue(ctx) - if err != nil { - logging.FromContext(ctx).Errorf("Creating SQS queue with policy, %v", err) - } - err = eventBridgeProvider.CreateEC2NotificationRules(ctx) - if err != nil { - logging.FromContext(ctx).Errorf("Creating event bridge notification rules, %v", err) + ready: false, + mutex: &sync.RWMutex{}, + readinessChan: make(chan struct{}), } go func() { @@ -70,16 +74,72 @@ func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorde func (c *Controller) run(ctx context.Context) { logger := 
logging.FromContext(ctx).Named("infrastructure") ctx = logging.WithLogger(ctx, logger) + + defer func() { + logger.Infof("Shutting down") + }() for { + if err := c.ensureInfrastructure(ctx); err != nil { + logging.FromContext(ctx).Errorf("ensuring infrastructure established, %v", err) + c.setReady(false) + + // Backoff with a shorter polling interval if we fail to ensure the infrastructure + select { + case <-ctx.Done(): + return + case <-c.clock.After(backoffPeriod): + continue + } + } + c.setReady(true) select { case <-ctx.Done(): - logger.Infof("Shutting down") return - case <-time.After(pollingPeriod): - c.ensureInfrastructure(ctx) + case <-c.clock.After(pollingPeriod): } } } -func (c *Controller) ensureInfrastructure(ctx context.Context) { +func (c *Controller) Ready() <-chan struct{} { + c.mutex.RLock() + defer c.mutex.RUnlock() + return c.readinessChan +} + +func (c *Controller) setReady(ready bool) { + c.mutex.Lock() + defer c.mutex.Unlock() + c.ready = ready + if ready { + close(c.readinessChan) + } else { + c.readinessChan = make(chan struct{}) + } +} + +func (c *Controller) ensureInfrastructure(ctx context.Context) error { + g, ctx := errgroup.WithContext(ctx) + g.Go(func() error { return c.ensureQueue(ctx) }) + g.Go(func() error { return c.ensureEventBridge(ctx) }) + if err := g.Wait(); err != nil { + return err + } + return nil +} + +func (c *Controller) ensureQueue(ctx context.Context) error { + if err := c.sqsProvider.CreateQueue(ctx); err != nil { + return fmt.Errorf("creating SQS queue with policy, %w", err) + } + if err := c.sqsProvider.SetQueueAttributes(ctx); err != nil { + return fmt.Errorf("setting queue attributes for queue, %w", err) + } + return nil +} + +func (c *Controller) ensureEventBridge(ctx context.Context) error { + if err := c.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { + return fmt.Errorf("creating event bridge notification rules, %w", err) + } + return nil } diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index e45262e53784..ce5a41c69a27 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -29,6 +29,7 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" @@ -56,21 +57,25 @@ type Controller struct { clock clock.Clock provider *aws.SQSProvider parser event.Parser + + infraController *infrastructure.Controller } // pollingPeriod that we go to the SQS queue to check if there are any new events -const pollingPeriod = 2 * time.Second +const pollingPeriod = 5 * time.Second func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, sqsProvider *aws.SQSProvider, - recorder events.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, startAsync <-chan struct{}) *Controller { + recorder events.Recorder, provisioner *provisioning.Provisioner, infraController *infrastructure.Controller, + cluster *state.Cluster, startAsync <-chan struct{}) *Controller { c := &Controller{ - kubeClient: kubeClient, - provisioner: provisioner, - cluster: cluster, - recorder: 
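
The readiness signal is broadcast by closing readinessChan and re-creating it when the infrastructure becomes unhealthy again. On the consumer side, a dependent controller can wait for it alongside shutdown handling; the step below is a sketch (the notification controller in this patch blocks on Ready() without the ctx.Done() case, so treat that extra case as an illustrative variation, and infraController as an assumed variable name):

select {
case <-ctx.Done():
	return // shutting down before the infrastructure ever became ready
case <-infraController.Ready():
	// infrastructure is provisioned; safe to start polling the queue
}
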
recorder, - clock: clk, - provider: sqsProvider, - parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + kubeClient: kubeClient, + provisioner: provisioner, + cluster: cluster, + recorder: recorder, + clock: clk, + provider: sqsProvider, + parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + infraController: infraController, } go func() { @@ -89,20 +94,23 @@ func (c *Controller) run(ctx context.Context) { logger := logging.FromContext(ctx).Named("notification") ctx = logging.WithLogger(ctx, logger) for { + <-c.infraController.Ready() // block until the infrastructure is up and ready + err := c.pollSQS(ctx) + if err != nil { + logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) + } + select { case <-ctx.Done(): logger.Infof("Shutting down") return - case <-time.After(pollingPeriod): - err := c.pollSQS(ctx) - if err != nil { - logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) - } + case <-c.clock.After(pollingPeriod): } } } func (c *Controller) pollSQS(ctx context.Context) error { + logging.FromContext(ctx).Infof("Polling SQS") sqsMessages, err := c.provider.GetSQSMessages(ctx) if err != nil { return err diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index caf8ba0af9ca..d25305533d69 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -31,6 +31,6 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) // Injecting the controllers that will start when opts.StartAsync is triggered - notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider(), rec, opts.Provisioner, opts.Cluster, opts.StartAsync) - infrastructure.NewController(ctx, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) + infraController := infrastructure.NewController(ctx, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) + notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider(), rec, opts.Provisioner, infraController, opts.Cluster, opts.StartAsync) } diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index 964a9b1513ed..34650837d42a 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -33,6 +33,7 @@ import ( type SQSClient interface { CreateQueueWithContext(context.Context, *sqs.CreateQueueInput, ...request.Option) (*sqs.CreateQueueOutput, error) GetQueueUrlWithContext(context.Context, *sqs.GetQueueUrlInput, ...request.Option) (*sqs.GetQueueUrlOutput, error) + SetQueueAttributesWithContext(context.Context, *sqs.SetQueueAttributesInput, ...request.Option) (*sqs.SetQueueAttributesOutput, error) ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) DeleteMessageWithContext(context.Context, *sqs.DeleteMessageInput, ...request.Option) (*sqs.DeleteMessageOutput, error) } @@ -109,6 +110,19 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { } func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { + queueURL, err := s.DiscoverQueueURL(ctx) + if err != nil { + return fmt.Errorf("failed setting queue attributes, %w", err) + } + + setQueueAttributesInput := &sqs.SetQueueAttributesInput{ + Attributes: 
s.getQueueAttributes(), + QueueUrl: aws.String(queueURL), + } + _, err = s.SetQueueAttributesWithContext(ctx, setQueueAttributesInput) + if err != nil { + return fmt.Errorf("failed setting queue attributes, %w", err) + } return nil } From 0606ceb33c59f3aba3fd5b11d2e149a1e79ab5f9 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 15 Sep 2022 17:29:48 -0700 Subject: [PATCH 09/55] Update queue creation logic --- .../controllers/infrastructure/controller.go | 62 ++++++++++++++++--- .../aggregatedparser/aggregatedparser.go | 30 ++++----- pkg/cloudprovider/aws/errors.go | 1 + pkg/cloudprovider/aws/sqs.go | 10 +-- .../cloudformation.yaml | 16 ++--- .../scripts/step05-controller-iam.sh | 1 + 6 files changed, 84 insertions(+), 36 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index ec76f4dc2412..c8d1c807bdd2 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -16,10 +16,13 @@ package infrastructure import ( "context" + "errors" "fmt" "sync" "time" + "github.com/aws/aws-sdk-go/aws/awserr" + "github.com/aws/aws-sdk-go/service/sqs" "golang.org/x/sync/errgroup" "k8s.io/utils/clock" "knative.dev/pkg/logging" @@ -36,7 +39,6 @@ type Controller struct { recorder events.Recorder clock clock.Clock - ready bool mutex *sync.RWMutex readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state } @@ -44,9 +46,6 @@ type Controller struct { // pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned const pollingPeriod = time.Hour -// backoffPeriod is the period that we go to AWS APIs if we fail to ensure the infrastructure exists -const backoffPeriod = time.Minute - func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}) *Controller { c := &Controller{ @@ -54,7 +53,6 @@ func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorde clock: clk, sqsProvider: sqsProvider, eventBridgeProvider: eventBridgeProvider, - ready: false, mutex: &sync.RWMutex{}, readinessChan: make(chan struct{}), } @@ -82,6 +80,7 @@ func (c *Controller) run(ctx context.Context) { if err := c.ensureInfrastructure(ctx); err != nil { logging.FromContext(ctx).Errorf("ensuring infrastructure established, %v", err) c.setReady(false) + backoffPeriod := c.getBackoff(err) // Backoff with a shorter polling interval if we fail to ensure the infrastructure select { @@ -109,7 +108,10 @@ func (c *Controller) Ready() <-chan struct{} { func (c *Controller) setReady(ready bool) { c.mutex.Lock() defer c.mutex.Unlock() - c.ready = ready + + // If the infrastructure we close the readiness channel to let all + // other channels that are waiting on Ready() proceed; otherwise, open + // a channel to tell the other goroutines to wait if ready { close(c.readinessChan) } else { @@ -128,8 +130,25 @@ func (c *Controller) ensureInfrastructure(ctx context.Context) error { } func (c *Controller) ensureQueue(ctx context.Context) error { - if err := c.sqsProvider.CreateQueue(ctx); err != nil { - return fmt.Errorf("creating SQS queue with policy, %w", err) + // Attempt to find the queue. 
If we can't find it, assume it isn't created and try to create it + // If we did find it, then just set the queue attributes on the existing queue + if _, err := c.sqsProvider.DiscoverQueueURL(ctx, true); err != nil { + var awsErr awserr.Error + if !errors.As(err, &awsErr) { + // This shouldn't happen, but if it does we should capture it + return fmt.Errorf("failed conversion to AWS error, %w", err) + } + switch awsErr.Code() { + case sqs.ErrCodeQueueDoesNotExist: + if err := c.sqsProvider.CreateQueue(ctx); err != nil { + return fmt.Errorf("creating sqs queue with policy, %w", err) + } + return nil + case aws.AccessDeniedCode: + return fmt.Errorf("failed obtaining permission to discover sqs queue url, %w", err) + default: + return fmt.Errorf("failed discovering sqs queue url, %w", err) + } } if err := c.sqsProvider.SetQueueAttributes(ctx); err != nil { return fmt.Errorf("setting queue attributes for queue, %w", err) @@ -139,7 +158,32 @@ func (c *Controller) ensureQueue(ctx context.Context) error { func (c *Controller) ensureEventBridge(ctx context.Context) error { if err := c.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { - return fmt.Errorf("creating event bridge notification rules, %w", err) + var awsErr awserr.Error + if !errors.As(err, &awsErr) { + // This shouldn't happen, but if it does we should capture it + return fmt.Errorf("failed conversion to AWS error, %w", err) + } + switch awsErr.Code() { + case aws.AccessDeniedCode: + return fmt.Errorf("obtaining permission to eventbridge, %w", err) + default: + return fmt.Errorf("creating event bridge notification rules, %w", err) + } } return nil } + +// getBackoff gets a dynamic backoff timeframe based on the error +// that we receive from the AWS API +func (c *Controller) getBackoff(err error) time.Duration { + var awsErr awserr.Error + if !errors.As(err, &awsErr) { + return time.Minute + } + switch awsErr.Code() { + case aws.AccessDeniedCode: + return time.Minute * 10 + default: + return time.Minute + } +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go index 8f99863ccbc5..570315605391 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go @@ -20,33 +20,33 @@ import ( "knative.dev/pkg/logging" - event2 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" rebalancerecommendationv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0" - scheduledchangev1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" - spotinterruptionv1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0" - statechangev1 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" + scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" + spotinterruptionv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0" + statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" ) var ( - DefaultParsers = []event2.Parser{ - 
statechangev1.Parser{}, - spotinterruptionv1.Parser{}, - scheduledchangev1.Parser{}, + DefaultParsers = []event.Parser{ + statechangev0.Parser{}, + spotinterruptionv0.Parser{}, + scheduledchangev0.Parser{}, rebalancerecommendationv0.Parser{}, } ) -type AggregatedParser []event2.Parser +type AggregatedParser []event.Parser -func NewAggregatedParser(parsers ...event2.Parser) AggregatedParser { +func NewAggregatedParser(parsers ...event.Parser) AggregatedParser { return parsers } -func (p AggregatedParser) Parse(ctx context.Context, str string) event2.Interface { +func (p AggregatedParser) Parse(ctx context.Context, str string) event.Interface { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("event.parser")) if str == "" { - return event2.NoOp{} + return event.NoOp{} } // We will go through all the parsers to see if we can parse @@ -57,12 +57,12 @@ func (p AggregatedParser) Parse(ctx context.Context, str string) event2.Interfac } } - md := event2.AWSMetadata{} + md := event.AWSMetadata{} if err := json.Unmarshal([]byte(str), &md); err != nil { logging.FromContext(ctx). With("error", err). Error("failed to unmarshal message metadata") - return event2.NoOp{} + return event.NoOp{} } - return event2.NoOp(md) + return event.NoOp(md) } diff --git a/pkg/cloudprovider/aws/errors.go b/pkg/cloudprovider/aws/errors.go index 41ae3ce43e44..e64c35328b4f 100644 --- a/pkg/cloudprovider/aws/errors.go +++ b/pkg/cloudprovider/aws/errors.go @@ -24,6 +24,7 @@ import ( const ( launchTemplateNotFoundCode = "InvalidLaunchTemplateName.NotFoundException" + AccessDeniedCode = "AccessDenied" ) var ( diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index 34650837d42a..b185942bf9fd 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -110,7 +110,7 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { } func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { - queueURL, err := s.DiscoverQueueURL(ctx) + queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("failed setting queue attributes, %w", err) } @@ -126,10 +126,10 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { return nil } -func (s *SQSProvider) DiscoverQueueURL(ctx context.Context) (string, error) { +func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (string, error) { s.mutex.RLock() defer s.mutex.RUnlock() - if s.queueURL != "" { + if s.queueURL != "" && !ignoreCache { return s.queueURL, nil } result, err := s.GetQueueUrlWithContext(ctx, s.getQueueURLInput) @@ -145,7 +145,7 @@ func (s *SQSProvider) DiscoverQueueURL(ctx context.Context) (string, error) { func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("sqsClient.getMessages")) - queueURL, err := s.DiscoverQueueURL(ctx) + queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return nil, fmt.Errorf("failed getting sqs messages, %w", err) } @@ -171,7 +171,7 @@ func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("sqsClient.deleteMessage")) - queueURL, err := s.DiscoverQueueURL(ctx) + queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("failed getting sqs messages, %w", err) } diff --git 
a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml index d6c7cc347265..e58fa4b32294 100644 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml @@ -72,22 +72,24 @@ Resources: Version: "2012-10-17" Statement: - Effect: Allow - Resource: "*" + Resource: !Sub "arn:${AWS::Partition}:sqs:${AWS::Region}:${AWS::AccountId}:Karpenter-${ClusterName}-Queue" Action: # Write Operations - sqs:CreateQueue + - sqs:TagResource + - sqs:SetQueueAttributes - sqs:DeleteMessage - sqs:DeleteQueue + # Read Operations + - sqs:GetQueueUrl + - sqs:ReceiveMessage + - Effect: Allow + Resource: !Sub "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:Karpenter-${ClusterName}-*" - events:PutRule - events:PutTargets + - events:TagResource - events:DeleteRule - events:RemoveTargets # Read Operations - - sqs:GetQueueUrl - - sqs:ReceiveMessage - events:ListRules - events:DescribeRule - - Effect: Allow - Action: - - iam:PassRole - Resource: !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/KarpenterNodeRole-${ClusterName}" diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step05-controller-iam.sh b/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step05-controller-iam.sh index 32673a49c9e0..0940503acf26 100755 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step05-controller-iam.sh +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step05-controller-iam.sh @@ -2,6 +2,7 @@ eksctl create iamserviceaccount \ --cluster "${CLUSTER_NAME}" --name karpenter --namespace karpenter \ --role-name "${CLUSTER_NAME}-karpenter" \ --attach-policy-arn "arn:aws:iam::${AWS_ACCOUNT_ID}:policy/KarpenterControllerPolicy-${CLUSTER_NAME}" \ + --attach-policy-arn "arn:aws:iam::${AWS_ACCOUNT_ID}:policy/KarpenterEventPolicy-${CLUSTER_NAME}" \ --role-only \ --approve From 696b41cbbba5c5507cbf954e7e659b4e2403358f Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 19 Sep 2022 16:03:56 -0700 Subject: [PATCH 10/55] Add deletion options for cleanup of infra --- .../controllers/infrastructure/controller.go | 48 +++++++++--- pkg/cloudprovider/aws/eventbridge.go | 74 +++++++++++++------ pkg/cloudprovider/aws/sqs.go | 41 +++++----- 3 files changed, 115 insertions(+), 48 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index c8d1c807bdd2..b44c45e2037d 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -23,7 +23,7 @@ import ( "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/sqs" - "golang.org/x/sync/errgroup" + "go.uber.org/multierr" "k8s.io/utils/clock" "knative.dev/pkg/logging" @@ -99,6 +99,26 @@ func (c *Controller) run(ctx context.Context) { } } +func (c *Controller) cleanup(ctx context.Context) (err error) { + wg := &sync.WaitGroup{} + m := &sync.Mutex{} + + go func() { + e := c.sqsProvider.DeleteQueue(ctx) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }() + go func() { + e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }() 
+ wg.Wait() + return err +} + func (c *Controller) Ready() <-chan struct{} { c.mutex.RLock() defer c.mutex.RUnlock() @@ -119,14 +139,24 @@ func (c *Controller) setReady(ready bool) { } } -func (c *Controller) ensureInfrastructure(ctx context.Context) error { - g, ctx := errgroup.WithContext(ctx) - g.Go(func() error { return c.ensureQueue(ctx) }) - g.Go(func() error { return c.ensureEventBridge(ctx) }) - if err := g.Wait(); err != nil { - return err - } - return nil +func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { + wg := &sync.WaitGroup{} + m := &sync.Mutex{} + + go func() { + e := c.ensureQueue(ctx) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }() + go func() { + e := c.ensureEventBridge(ctx) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }() + wg.Wait() + return err } func (c *Controller) ensureQueue(ctx context.Context) error { diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index d299cc65f9e1..222f2edcd304 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -18,6 +18,7 @@ import ( "context" "encoding/json" "fmt" + "sync" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/request" @@ -32,6 +33,7 @@ import ( type EventBridgeClient interface { PutRuleWithContext(context.Context, *eventbridge.PutRuleInput, ...request.Option) (*eventbridge.PutRuleOutput, error) PutTargetsWithContext(context.Context, *eventbridge.PutTargetsInput, ...request.Option) (*eventbridge.PutTargetsOutput, error) + DeleteRuleWithContext(context.Context, *eventbridge.DeleteRuleInput, ...request.Option) (*eventbridge.DeleteRuleOutput, error) } type EventBridgeProvider struct { @@ -68,31 +70,61 @@ func NewEventBridgeProvider(eb EventBridgeClient, metadata *Metadata, queueName } } -func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) error { - var err error +func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) (err error) { + wg := &sync.WaitGroup{} + m := &sync.Mutex{} for _, rule := range eb.getEC2NotificationEventRules(ctx) { - _, e := eb.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ - Name: aws.String(rule.Name), - EventPattern: aws.String(string(rule.Pattern.Serialize())), - Tags: []*eventbridge.Tag{ - { - Key: aws.String(v1alpha5.DiscoveryLabelKey), - Value: aws.String(injection.GetOptions(ctx).ClusterName), + wg.Add(1) + go func(r EventRule) { + defer wg.Done() + _, e := eb.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ + Name: aws.String(r.Name), + EventPattern: aws.String(string(r.Pattern.Serialize())), + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryLabelKey), + Value: aws.String(injection.GetOptions(ctx).ClusterName), + }, }, - }, - }) - err = multierr.Append(err, e) - _, e = eb.PutTargetsWithContext(ctx, &eventbridge.PutTargetsInput{ - Rule: aws.String(rule.Name), - Targets: []*eventbridge.Target{ - { - Id: aws.String(rule.Target.ID), - Arn: aws.String(rule.Target.ARN), + }) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + _, e = eb.PutTargetsWithContext(ctx, &eventbridge.PutTargetsInput{ + Rule: aws.String(r.Name), + Targets: []*eventbridge.Target{ + { + Id: aws.String(r.Target.ID), + Arn: aws.String(r.Target.ARN), + }, }, - }, - }) - err = multierr.Append(err, e) + }) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }(rule) + } + wg.Wait() + return err +} + +func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) (err error) { + wg := 
&sync.WaitGroup{} + m := &sync.Mutex{} + for _, rule := range eb.getEC2NotificationEventRules(ctx) { + wg.Add(1) + go func(r EventRule) { + defer wg.Done() + input := &eventbridge.DeleteRuleInput{ + Name: aws.String(r.Name), + } + _, e := eb.DeleteRuleWithContext(ctx, input) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }(rule) } + wg.Wait() return err } diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index b185942bf9fd..112f46c527a1 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -24,7 +24,6 @@ import ( "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/sqs" "github.com/samber/lo" - "knative.dev/pkg/logging" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/utils/injection" @@ -36,6 +35,7 @@ type SQSClient interface { SetQueueAttributesWithContext(context.Context, *sqs.SetQueueAttributesInput, ...request.Option) (*sqs.SetQueueAttributesOutput, error) ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) DeleteMessageWithContext(context.Context, *sqs.DeleteMessageInput, ...request.Option) (*sqs.DeleteMessageOutput, error) + DeleteQueueWithContext(context.Context, *sqs.DeleteQueueInput, ...request.Option) (*sqs.DeleteQueueOutput, error) } type SQSProvider struct { @@ -101,7 +101,7 @@ func NewSQSProvider(ctx context.Context, client SQSClient, metadata *Metadata) * func (s *SQSProvider) CreateQueue(ctx context.Context) error { result, err := s.CreateQueueWithContext(ctx, s.createQueueInput) if err != nil { - return fmt.Errorf("failed to create SQS queue, %w", err) + return fmt.Errorf("failed creating sqs queue, %w", err) } s.mutex.Lock() defer s.mutex.Unlock() @@ -112,7 +112,7 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { - return fmt.Errorf("failed setting queue attributes, %w", err) + return fmt.Errorf("failed fetching queue url, %w", err) } setQueueAttributesInput := &sqs.SetQueueAttributesInput{ @@ -143,37 +143,30 @@ func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (s } func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("sqsClient.getMessages")) - queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { - return nil, fmt.Errorf("failed getting sqs messages, %w", err) + return nil, fmt.Errorf("failed fetching queue url, %w", err) } // Copy the input template and add the discovered queue url input, err := deepCopy(s.receiveMessageInput) if err != nil { - return nil, fmt.Errorf("error copying input, %w", err) + return nil, fmt.Errorf("failed copying input, %w", err) } input.QueueUrl = aws.String(queueURL) result, err := s.ReceiveMessageWithContext(ctx, input) if err != nil { - logging.FromContext(ctx). - With("error", err). 
- Error("failed to fetch messages") - return nil, err + return nil, fmt.Errorf("failed receiving sqs messages, %w", err) } return result.Messages, nil } func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("sqsClient.deleteMessage")) - queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { - return fmt.Errorf("failed getting sqs messages, %w", err) + return fmt.Errorf("failed fetching queue url, %w", err) } input := &sqs.DeleteMessageInput{ @@ -183,12 +176,24 @@ func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) er _, err = s.DeleteMessageWithContext(ctx, input) if err != nil { - logging.FromContext(ctx). - With("error", err). - Error("failed to delete message") - return err + return fmt.Errorf("failed deleting messages from sqs queue, %w", err) + } + return nil +} + +func (s *SQSProvider) DeleteQueue(ctx context.Context) error { + queueURL, err := s.DiscoverQueueURL(ctx, false) + if err != nil { + return fmt.Errorf("failed fetching queue url, %w", err) } + input := &sqs.DeleteQueueInput{ + QueueUrl: aws.String(queueURL), + } + _, err = s.DeleteQueueWithContext(ctx, input) + if err != nil { + return fmt.Errorf("failed deleting sqs queue, %w", err) + } return nil } From 614e2bfa52bf0742189704012ceb95c689ac8b3e Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 19 Sep 2022 16:28:54 -0700 Subject: [PATCH 11/55] Inject cleanup channel into controller --- .../controllers/infrastructure/controller.go | 16 +++++++-- pkg/cloudprovider/aws/controllers/register.go | 4 ++- pkg/controllers/controllers.go | 36 ++++++++++++++----- 3 files changed, 45 insertions(+), 11 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index b44c45e2037d..3bac5c38a081 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -18,7 +18,10 @@ import ( "context" "errors" "fmt" + "os" + "os/signal" "sync" + "syscall" "time" "github.com/aws/aws-sdk-go/aws/awserr" @@ -46,8 +49,9 @@ type Controller struct { // pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned const pollingPeriod = time.Hour -func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorder, - sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}) *Controller { +func NewController(ctx context.Context, cleanupCtx context.Context, clk clock.Clock, recorder events.Recorder, + sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, + startAsync <-chan struct{}, cleanupAsync <-chan os.Signal) *Controller { c := &Controller{ recorder: recorder, clock: clk, @@ -57,6 +61,14 @@ func NewController(ctx context.Context, clk clock.Clock, recorder events.Recorde readinessChan: make(chan struct{}), } + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT) + + go func() { + <-cleanupAsync + c.cleanup(cleanupCtx) + }() + go func() { select { case <-ctx.Done(): diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index d25305533d69..1e6ff40764ed 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -29,8 +29,10 @@ import ( func Register(ctx context.Context, provider 
*aws.CloudProvider, opts *controllers.ControllerOptions) { rec := events.NewRecorder(opts.Recorder) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) + cleanupContext := opts.BaseContext() + cleanupContext = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) // Injecting the controllers that will start when opts.StartAsync is triggered - infraController := infrastructure.NewController(ctx, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) + infraController := infrastructure.NewController(ctx, cleanupContext, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider(), rec, opts.Provisioner, infraController, opts.Cluster, opts.StartAsync) } diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index e7dbbf1094a3..357335a2d44c 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -19,7 +19,10 @@ import ( "fmt" "net/http" "net/http/pprof" + "os" + "os/signal" "runtime/debug" + "syscall" "github.com/go-logr/logr" "github.com/go-logr/zapr" @@ -85,12 +88,14 @@ type Controller interface { } type ControllerOptions struct { - Cluster *state.Cluster - KubeClient client.Client - Provisioner *provisioning.Provisioner - Recorder events.Recorder - StartAsync <-chan struct{} - Clock clock.Clock + BaseContext func() context.Context + Cluster *state.Cluster + KubeClient client.Client + Provisioner *provisioning.Provisioner + Recorder events.Recorder + StartAsync <-chan struct{} + CleanupAsync <-chan os.Signal + Clock clock.Clock } func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, func(context.Context, *ControllerOptions))) { @@ -129,7 +134,6 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) if opts.EnableProfiling { utilruntime.Must(registerPprof(manager)) } - cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected()}) if hp, ok := cloudProvider.(HealthCheck); ok { utilruntime.Must(manager.AddHealthzCheck("cloud-provider", hp.LivenessProbe)) @@ -157,7 +161,17 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) // Inject cloudprovider-specific controllers into the controller-set using the injectControllers function // Inject the base cloud provider into the injection function rather than the decorated interface - injectControllers(ctx, &ControllerOptions{Cluster: cluster, KubeClient: manager.GetClient(), Provisioner: provisioner, Recorder: recorder, StartAsync: manager.Elected(), Clock: realClock}) + controllerOptions := &ControllerOptions{ + BaseContext: newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx)), + Cluster: cluster, + KubeClient: manager.GetClient(), + Provisioner: provisioner, + Recorder: recorder, + StartAsync: manager.Elected(), + CleanupAsync: Cleanup(), + Clock: realClock, + } + injectControllers(ctx, controllerOptions) metricsstate.StartMetricScraper(ctx, cluster) @@ -269,6 +283,12 @@ func ignoreDebugEvents(logger logr.Logger) logr.Logger { return logr.New(&ignoreDebugEventsSink{sink: logger.GetSink()}) } +func Cleanup() <-chan os.Signal { + c := make(chan os.Signal, 1) + signal.Notify(c, syscall.SIGINT) + return c +} + func newRunnableContext(config *rest.Config, options 
*options.Options, logger *zap.SugaredLogger) func() context.Context { return func() context.Context { ctx := context.Background() From fac2d0eb1bf84e8ba2f98be450c36d3cc4273842 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 19 Sep 2022 16:48:27 -0700 Subject: [PATCH 12/55] Add deployment details into container env vars --- charts/karpenter/templates/deployment.yaml | 4 + .../controllers/infrastructure/controller.go | 76 +++++++++++++------ .../controllers/notification/controller.go | 27 ++++--- pkg/cloudprovider/aws/controllers/register.go | 7 +- pkg/controllers/controllers.go | 3 +- pkg/utils/options/options.go | 6 ++ 6 files changed, 81 insertions(+), 42 deletions(-) diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml index 6da4354990d6..ab688d428b2d 100644 --- a/charts/karpenter/templates/deployment.yaml +++ b/charts/karpenter/templates/deployment.yaml @@ -138,6 +138,10 @@ spec: {{- end }} - name: KARPENTER_SERVICE value: {{ include "karpenter.fullname" . }} + - name: DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: SYSTEM_NAMESPACE valueFrom: fieldRef: diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 3bac5c38a081..5f0d33c15717 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -27,32 +27,40 @@ import ( "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/sqs" "go.uber.org/multierr" + appsv1 "k8s.io/api/apps/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/utils/clock" "knative.dev/pkg/logging" + "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" + "github.com/aws/karpenter/pkg/utils/injection" ) // Controller is the AWS infrastructure controller. It is not a standard controller-runtime controller in that it doesn't // have a reconcile method. 
type Controller struct { + kubeClient client.Client + recorder events.Recorder + clock clock.Clock + sqsProvider *aws.SQSProvider eventBridgeProvider *aws.EventBridgeProvider - recorder events.Recorder - clock clock.Clock mutex *sync.RWMutex readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state } // pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned +// This period can be reduced to a backoffPeriod if there is an error in ensuring the infrastructure const pollingPeriod = time.Hour -func NewController(ctx context.Context, cleanupCtx context.Context, clk clock.Clock, recorder events.Recorder, - sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, +func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient client.Client, clk clock.Clock, + recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}, cleanupAsync <-chan os.Signal) *Controller { c := &Controller{ + kubeClient: kubeClient, recorder: recorder, clock: clk, sqsProvider: sqsProvider, @@ -111,24 +119,27 @@ func (c *Controller) run(ctx context.Context) { } } -func (c *Controller) cleanup(ctx context.Context) (err error) { - wg := &sync.WaitGroup{} - m := &sync.Mutex{} +func (c *Controller) cleanup(ctx context.Context) { + logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure.cleanup")) - go func() { - e := c.sqsProvider.DeleteQueue(ctx) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - }() - go func() { - e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - }() - wg.Wait() - return err + dep := &appsv1.Deployment{} + nn := types.NamespacedName{ + Name: injection.GetOptions(ctx).DeploymentName, + Namespace: injection.GetOptions(ctx).DeploymentNamespace, + } + + err := c.kubeClient.Get(ctx, nn, dep) + if err != nil { + logging.FromContext(ctx).Errorf("Getting the deployment %s for cleanup, %v", nn, err) + } + + // Deployment is deleting so we should cleanup the infrastructure + if !dep.DeletionTimestamp.IsZero() { + err = c.deleteInfrastructure(ctx) + if err != nil { + logging.FromContext(ctx).Errorf("Deleting the infrastructure, %v", err) + } + } } func (c *Controller) Ready() <-chan struct{} { @@ -171,6 +182,27 @@ func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { return err } +func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { + logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") + wg := &sync.WaitGroup{} + m := &sync.Mutex{} + + go func() { + e := c.sqsProvider.DeleteQueue(ctx) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }() + go func() { + e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }() + wg.Wait() + return err +} + func (c *Controller) ensureQueue(ctx context.Context) error { // Attempt to find the queue. 
If we can't find it, assume it isn't created and try to create it // If we did find it, then just set the queue attributes on the existing queue @@ -202,7 +234,7 @@ func (c *Controller) ensureEventBridge(ctx context.Context) error { if err := c.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { var awsErr awserr.Error if !errors.As(err, &awsErr) { - // This shouldn't happen, but if it does we should capture it + // This shouldn't happen, but if it does, we should capture it return fmt.Errorf("failed conversion to AWS error, %w", err) } switch awsErr.Code() { diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index ce5a41c69a27..bde2f55cbb5b 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -29,7 +29,6 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" @@ -58,24 +57,24 @@ type Controller struct { provider *aws.SQSProvider parser event.Parser - infraController *infrastructure.Controller + infraReady func() <-chan struct{} } // pollingPeriod that we go to the SQS queue to check if there are any new events const pollingPeriod = 5 * time.Second -func NewController(ctx context.Context, clk clock.Clock, kubeClient client.Client, sqsProvider *aws.SQSProvider, - recorder events.Recorder, provisioner *provisioning.Provisioner, infraController *infrastructure.Controller, - cluster *state.Cluster, startAsync <-chan struct{}) *Controller { +func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, sqsProvider *aws.SQSProvider, + recorder events.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, + startAsync <-chan struct{}, infraReady func() <-chan struct{}) *Controller { c := &Controller{ - kubeClient: kubeClient, - provisioner: provisioner, - cluster: cluster, - recorder: recorder, - clock: clk, - provider: sqsProvider, - parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), - infraController: infraController, + kubeClient: kubeClient, + provisioner: provisioner, + cluster: cluster, + recorder: recorder, + clock: clk, + provider: sqsProvider, + parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + infraReady: infraReady, } go func() { @@ -94,7 +93,7 @@ func (c *Controller) run(ctx context.Context) { logger := logging.FromContext(ctx).Named("notification") ctx = logging.WithLogger(ctx, logger) for { - <-c.infraController.Ready() // block until the infrastructure is up and ready + <-c.infraReady() // block until the infrastructure is up and ready err := c.pollSQS(ctx) if err != nil { logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 1e6ff40764ed..94588deb10a9 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -29,10 +29,9 @@ import ( func Register(ctx context.Context, provider *aws.CloudProvider, opts 
*controllers.ControllerOptions) { rec := events.NewRecorder(opts.Recorder) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) - cleanupContext := opts.BaseContext() - cleanupContext = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) + cleanupContext := logging.WithLogger(opts.BaseContext(), logging.FromContext(ctx).Named("aws")) // Injecting the controllers that will start when opts.StartAsync is triggered - infraController := infrastructure.NewController(ctx, cleanupContext, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) - notification.NewController(ctx, opts.Clock, opts.KubeClient, provider.SQSProvider(), rec, opts.Provisioner, infraController, opts.Cluster, opts.StartAsync) + infraController := infrastructure.NewController(ctx, cleanupContext, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) + notification.NewController(ctx, opts.KubeClient, opts.Clock, provider.SQSProvider(), rec, opts.Provisioner, opts.Cluster, opts.StartAsync, infraController.Ready) } diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 357335a2d44c..ba00f4a83b29 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -109,8 +109,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) // Set up logger and watch for changes to log level cmw := informer.NewInformedWatcher(clientSet, system.Namespace()) ctx := injection.LoggingContextOrDie(component, controllerRuntimeConfig, cmw) - ctx = injection.WithConfig(ctx, controllerRuntimeConfig) - ctx = injection.WithOptions(ctx, *opts) + ctx = newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx))() logging.FromContext(ctx).Infof("Initializing with version %s", project.Version) diff --git a/pkg/utils/options/options.go b/pkg/utils/options/options.go index ab1e43cec3f6..58402aaaa522 100644 --- a/pkg/utils/options/options.go +++ b/pkg/utils/options/options.go @@ -22,6 +22,7 @@ import ( "os" "go.uber.org/multierr" + "knative.dev/pkg/system" "github.com/aws/karpenter/pkg/utils/env" ) @@ -44,6 +45,9 @@ type Options struct { EnableProfiling bool EnableLeaderElection bool MemoryLimit int64 + // Metadata information + DeploymentName string + DeploymentNamespace string // AWS Specific ClusterName string ClusterEndpoint string @@ -96,6 +100,8 @@ func (o *Options) MustParse() *Options { if err := o.Validate(); err != nil { panic(err) } + o.DeploymentName = env.WithDefaultString("DEPLOYMENT_NAME", "karpenter") + o.DeploymentNamespace = system.Namespace() return o } From bd5bf9c97117ab648ac26ee3cdae41e9cdac71f5 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 19 Sep 2022 17:02:56 -0700 Subject: [PATCH 13/55] Permission fixes --- .../aws/controllers/infrastructure/controller.go | 4 +++- pkg/cloudprovider/aws/errors.go | 1 + pkg/cloudprovider/aws/eventbridge.go | 8 ++++---- .../getting-started-with-eksctl/cloudformation.yaml | 4 ++-- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 5f0d33c15717..916719fe0b0a 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -238,7 +238,7 @@ func (c *Controller) ensureEventBridge(ctx context.Context) error { return fmt.Errorf("failed 
conversion to AWS error, %w", err) } switch awsErr.Code() { - case aws.AccessDeniedCode: + case aws.AccessDeniedException: return fmt.Errorf("obtaining permission to eventbridge, %w", err) default: return fmt.Errorf("creating event bridge notification rules, %w", err) @@ -255,6 +255,8 @@ func (c *Controller) getBackoff(err error) time.Duration { return time.Minute } switch awsErr.Code() { + case aws.AccessDeniedException: + return time.Minute * 10 case aws.AccessDeniedCode: return time.Minute * 10 default: diff --git a/pkg/cloudprovider/aws/errors.go b/pkg/cloudprovider/aws/errors.go index e64c35328b4f..0e13d68430d5 100644 --- a/pkg/cloudprovider/aws/errors.go +++ b/pkg/cloudprovider/aws/errors.go @@ -25,6 +25,7 @@ import ( const ( launchTemplateNotFoundCode = "InvalidLaunchTemplateName.NotFoundException" AccessDeniedCode = "AccessDenied" + AccessDeniedException = "AccessDeniedException" ) var ( diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 222f2edcd304..5106854b60a3 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -131,7 +131,7 @@ func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) ( func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) []EventRule { return []EventRule{ { - Name: fmt.Sprintf("%s-ScheduledChangeRule", injection.GetOptions(ctx).ClusterName), + Name: fmt.Sprintf("Karpenter-%s-ScheduledChangeRule", injection.GetOptions(ctx).ClusterName), Pattern: &EventPattern{ Source: []string{"aws.health"}, DetailType: []string{"AWS Health Event"}, @@ -142,7 +142,7 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) }, }, { - Name: fmt.Sprintf("%s-SpotTerminationRule", injection.GetOptions(ctx).ClusterName), + Name: fmt.Sprintf("Karpenter-%s-SpotTerminationRule", injection.GetOptions(ctx).ClusterName), Pattern: &EventPattern{ Source: []string{"aws.ec2"}, DetailType: []string{"EC2 Spot Instance Interruption Warning"}, @@ -153,7 +153,7 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) }, }, { - Name: fmt.Sprintf("%s-RebalanceRule", injection.GetOptions(ctx).ClusterName), + Name: fmt.Sprintf("Karpenter-%s-RebalanceRule", injection.GetOptions(ctx).ClusterName), Pattern: &EventPattern{ Source: []string{"aws.ec2"}, DetailType: []string{"EC2 Instance Rebalance Recommendation"}, @@ -164,7 +164,7 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) }, }, { - Name: fmt.Sprintf("%s-InstanceStateChangeRule", injection.GetOptions(ctx).ClusterName), + Name: fmt.Sprintf("Karpenter-%s-InstanceStateChangeRule", injection.GetOptions(ctx).ClusterName), Pattern: &EventPattern{ Source: []string{"aws.ec2"}, DetailType: []string{"EC2 Instance State-change Notification"}, diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml index e58fa4b32294..481e01c0f716 100644 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml @@ -76,7 +76,7 @@ Resources: Action: # Write Operations - sqs:CreateQueue - - sqs:TagResource + - sqs:TagQueue - sqs:SetQueueAttributes - sqs:DeleteMessage - sqs:DeleteQueue @@ -84,7 +84,7 @@ Resources: - sqs:GetQueueUrl - sqs:ReceiveMessage - Effect: Allow - Resource: !Sub 
"arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:Karpenter-${ClusterName}-*" + Resource: !Sub "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/Karpenter-${ClusterName}-*" - events:PutRule - events:PutTargets - events:TagResource From 6ebcb9efcd9e1032c563884bc9f7571e5451508b Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 20 Sep 2022 12:48:08 -0700 Subject: [PATCH 14/55] Separate out the metrics into packages --- pkg/cloudprovider/aws/cloudprovider.go | 24 ++--- .../controllers/infrastructure/controller.go | 42 +++++--- .../aws/controllers/metrics/metrics.go | 82 ++++++++++++++ .../controllers/notification/controller.go | 6 +- pkg/cloudprovider/aws/eventbridge.go | 5 +- pkg/cloudprovider/aws/helpers.go | 34 ------ .../aws/{ => metadata}/metadata.go | 12 +-- pkg/cloudprovider/aws/metadata/types.go | 21 ++++ pkg/cloudprovider/aws/sqs/metrics.go | 82 ++++++++++++++ pkg/cloudprovider/aws/{ => sqs}/sqs.go | 102 +++++++----------- pkg/cloudprovider/aws/sqs/types.go | 52 +++++++++ pkg/utils/functional/functional.go | 16 +++ 12 files changed, 336 insertions(+), 142 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/metrics/metrics.go delete mode 100644 pkg/cloudprovider/aws/helpers.go rename pkg/cloudprovider/aws/{ => metadata}/metadata.go (84%) create mode 100644 pkg/cloudprovider/aws/metadata/types.go create mode 100644 pkg/cloudprovider/aws/sqs/metrics.go rename pkg/cloudprovider/aws/{ => sqs}/sqs.go (59%) create mode 100644 pkg/cloudprovider/aws/sqs/types.go diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 3e992f3b393e..0425da1a2bb0 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -48,6 +48,8 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws/amifamily" "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/cloudprovider/aws/metadata" + cloudprovidersqs "github.com/aws/karpenter/pkg/cloudprovider/aws/sqs" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/project" @@ -77,15 +79,10 @@ type CloudProvider struct { instanceTypeProvider *InstanceTypeProvider instanceProvider *InstanceProvider kubeClient k8sClient.Client - sqsProvider *SQSProvider + sqsProvider *cloudprovidersqs.Provider eventBridgeProvider *EventBridgeProvider } -type Metadata struct { - region string - accountID string -} - func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider { // if performing validation only, then only the Validate()/Default() methods will be called which // don't require any other setup @@ -103,10 +100,10 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, ), ))) - metadata := NewMetadataProvider(sess) + metadataProvider := metadata.NewMetadataProvider(sess) if *sess.Config.Region == "" { logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") - *sess.Config.Region = metadata.Region(ctx) + *sess.Config.Region = metadataProvider.Region(ctx) } logging.FromContext(ctx).Debugf("Using AWS region %s", *sess.Config.Region) @@ -117,12 +114,9 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud subnetProvider := NewSubnetProvider(ec2api) instanceTypeProvider := 
NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) - m := &Metadata{ - region: *sess.Config.Region, - accountID: metadata.AccountID(ctx), - } - sqsProvider := NewSQSProvider(ctx, sqs.New(sess), m) - eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), m, sqsProvider.queueName) + m := metadata.NewInfo(*sess.Config.Region, metadataProvider.AccountID(ctx)) + sqsProvider := cloudprovidersqs.NewProvider(ctx, sqs.New(sess), m) + eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), m, sqsProvider.QueueName()) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, @@ -231,7 +225,7 @@ func (*CloudProvider) Validate(ctx context.Context, provisioner *v1alpha5.Provis return provider.Validate() } -func (c *CloudProvider) SQSProvider() *SQSProvider { +func (c *CloudProvider) SQSProvider() *sqs2.SQSProvider { return c.sqsProvider } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 916719fe0b0a..c32e680746c6 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -19,9 +19,7 @@ import ( "errors" "fmt" "os" - "os/signal" "sync" - "syscall" "time" "github.com/aws/aws-sdk-go/aws/awserr" @@ -35,6 +33,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" + sqs2 "github.com/aws/karpenter/pkg/cloudprovider/aws/sqs" "github.com/aws/karpenter/pkg/utils/injection" ) @@ -45,7 +44,7 @@ type Controller struct { recorder events.Recorder clock clock.Clock - sqsProvider *aws.SQSProvider + sqsProvider *sqs2.SQSProvider eventBridgeProvider *aws.EventBridgeProvider mutex *sync.RWMutex @@ -56,8 +55,12 @@ type Controller struct { // This period can be reduced to a backoffPeriod if there is an error in ensuring the infrastructure const pollingPeriod = time.Hour +// defaultBackoffPeriod is the default period that we go to AWS APIs to ensure that the appropriate AWS infrastructure +// is provisioned if there is an error in the reconciliation loop +const defaultBackoffPeriod = time.Minute * 10 + func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient client.Client, clk clock.Clock, - recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, + recorder events.Recorder, sqsProvider *sqs2.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}, cleanupAsync <-chan os.Signal) *Controller { c := &Controller{ kubeClient: kubeClient, @@ -69,9 +72,6 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c readinessChan: make(chan struct{}), } - sigChan := make(chan os.Signal, 1) - signal.Notify(sigChan, syscall.SIGINT) - go func() { <-cleanupAsync c.cleanup(cleanupCtx) @@ -99,7 +99,7 @@ func (c *Controller) run(ctx context.Context) { for { if err := c.ensureInfrastructure(ctx); err != nil { logging.FromContext(ctx).Errorf("ensuring infrastructure established, %v", err) - c.setReady(false) + c.setReady(ctx, false) backoffPeriod := c.getBackoff(err) // Backoff with a shorter polling interval if we fail to ensure the infrastructure @@ -110,7 +110,7 @@ func (c *Controller) run(ctx context.Context) { continue } } - c.setReady(true) + c.setReady(ctx, true) select { case <-ctx.Done(): return @@ -148,7 +148,7 @@ func 
(c *Controller) Ready() <-chan struct{} { return c.readinessChan } -func (c *Controller) setReady(ready bool) { +func (c *Controller) setReady(ctx context.Context, ready bool) { c.mutex.Lock() defer c.mutex.Unlock() @@ -156,8 +156,10 @@ func (c *Controller) setReady(ready bool) { // other channels that are waiting on Ready() proceed; otherwise, open // a channel to tell the other goroutines to wait if ready { + logging.FromContext(ctx).Infof("Reconciled infrastructure is healthy") close(c.readinessChan) } else { + logging.FromContext(ctx).Infof("Reconciled infrastructure is unhealthy") c.readinessChan = make(chan struct{}) } } @@ -166,13 +168,16 @@ func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { wg := &sync.WaitGroup{} m := &sync.Mutex{} + wg.Add(2) go func() { + defer wg.Done() e := c.ensureQueue(ctx) m.Lock() err = multierr.Append(err, e) m.Unlock() }() go func() { + defer wg.Done() e := c.ensureEventBridge(ctx) m.Lock() err = multierr.Append(err, e) @@ -187,19 +192,23 @@ func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { wg := &sync.WaitGroup{} m := &sync.Mutex{} + wg.Add(2) go func() { + defer wg.Done() e := c.sqsProvider.DeleteQueue(ctx) m.Lock() err = multierr.Append(err, e) m.Unlock() }() go func() { + defer wg.Done() e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) m.Lock() err = multierr.Append(err, e) m.Unlock() }() wg.Wait() + time.Sleep(time.Minute) return err } @@ -209,11 +218,12 @@ func (c *Controller) ensureQueue(ctx context.Context) error { if _, err := c.sqsProvider.DiscoverQueueURL(ctx, true); err != nil { var awsErr awserr.Error if !errors.As(err, &awsErr) { - // This shouldn't happen, but if it does we should capture it + // This shouldn't happen, but if it does, we should capture it return fmt.Errorf("failed conversion to AWS error, %w", err) } switch awsErr.Code() { case sqs.ErrCodeQueueDoesNotExist: + logging.FromContext(ctx).Infof("Creating the SQS queue for EC2 notifications...") if err := c.sqsProvider.CreateQueue(ctx); err != nil { return fmt.Errorf("creating sqs queue with policy, %w", err) } @@ -252,14 +262,12 @@ func (c *Controller) ensureEventBridge(ctx context.Context) error { func (c *Controller) getBackoff(err error) time.Duration { var awsErr awserr.Error if !errors.As(err, &awsErr) { - return time.Minute + return defaultBackoffPeriod } switch awsErr.Code() { - case aws.AccessDeniedException: - return time.Minute * 10 - case aws.AccessDeniedCode: - return time.Minute * 10 + case sqs.ErrCodeQueueDeletedRecently: + return time.Minute * 2 default: - return time.Minute + return defaultBackoffPeriod } } diff --git a/pkg/cloudprovider/aws/controllers/metrics/metrics.go b/pkg/cloudprovider/aws/controllers/metrics/metrics.go new file mode 100644 index 000000000000..226f716b3dc1 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/metrics/metrics.go @@ -0,0 +1,82 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "context" + + "github.com/prometheus/client_golang/prometheus" + v1 "k8s.io/api/core/v1" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider" + "github.com/aws/karpenter/pkg/metrics" + "github.com/aws/karpenter/pkg/utils/injection" +) + +const ( + metricLabelController = "controller" + metricLabelMethod = "method" + metricLabelProvider = "provider" +) + +var methodDurationHistogramVec = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: "cloudprovider", + Name: "duration_seconds", + Help: "Duration of cloud provider method calls. Labeled by the controller, method name and provider.", + }, + []string{ + metricLabelController, + metricLabelMethod, + metricLabelProvider, + }, +) + +func init() { + crmetrics.Registry.MustRegister(methodDurationHistogramVec) +} + +type decorator struct { + cloudprovider.CloudProvider +} + +// Decorate returns a new `CloudProvider` instance that will delegate all method +// calls to the argument, `cloudProvider`, and publish aggregated latency metrics. The +// value used for the metric label, "controller", is taken from the `Context` object +// passed to the methods of `CloudProvider`. +// +// Do not decorate a `CloudProvider` multiple times or published metrics will contain +// duplicated method call counts and latencies. +func Decorate(cloudProvider cloudprovider.CloudProvider) cloudprovider.CloudProvider { + return &decorator{cloudProvider} +} + +func (d *decorator) Create(ctx context.Context, nodeRequest *cloudprovider.NodeRequest) (*v1.Node, error) { + defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "Create", d.Name()))() + return d.CloudProvider.Create(ctx, nodeRequest) +} + +func (d *decorator) Delete(ctx context.Context, node *v1.Node) error { + defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "Delete", d.Name()))() + return d.CloudProvider.Delete(ctx, node) +} + +func (d *decorator) GetInstanceTypes(ctx context.Context, provisioner *v1alpha5.Provisioner) ([]cloudprovider.InstanceType, error) { + defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "GetInstanceTypes", d.Name()))() + return d.CloudProvider.GetInstanceTypes(ctx, provisioner) +} diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index bde2f55cbb5b..d66434f7795c 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -28,10 +28,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" + "github.com/aws/karpenter/pkg/cloudprovider/aws/sqs" "github.com/aws/karpenter/pkg/controllers/provisioning" "github.com/aws/karpenter/pkg/controllers/state" ) @@ -54,7 +54,7 @@ type Controller struct { cluster *state.Cluster recorder events.Recorder clock clock.Clock - provider *aws.SQSProvider + provider *sqs.SQSProvider parser event.Parser infraReady func() 
<-chan struct{} @@ -63,7 +63,7 @@ type Controller struct { // pollingPeriod that we go to the SQS queue to check if there are any new events const pollingPeriod = 5 * time.Second -func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, sqsProvider *aws.SQSProvider, +func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, sqsProvider *sqs.SQSProvider, recorder events.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, startAsync <-chan struct{}, infraReady func() <-chan struct{}) *Controller { c := &Controller{ diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 5106854b60a3..838734a6e94c 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -27,6 +27,7 @@ import ( "go.uber.org/multierr" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws/metadata" "github.com/aws/karpenter/pkg/utils/injection" ) @@ -39,7 +40,7 @@ type EventBridgeClient interface { type EventBridgeProvider struct { EventBridgeClient queueName string - metadata *Metadata + metadata *metadata.Info } type EventRule struct { @@ -62,7 +63,7 @@ func (ep *EventPattern) Serialize() []byte { return lo.Must(json.Marshal(ep)) } -func NewEventBridgeProvider(eb EventBridgeClient, metadata *Metadata, queueName string) *EventBridgeProvider { +func NewEventBridgeProvider(eb EventBridgeClient, metadata *metadata.Info, queueName string) *EventBridgeProvider { return &EventBridgeProvider{ EventBridgeClient: eb, metadata: metadata, diff --git a/pkg/cloudprovider/aws/helpers.go b/pkg/cloudprovider/aws/helpers.go deleted file mode 100644 index 0838e0ef17da..000000000000 --- a/pkg/cloudprovider/aws/helpers.go +++ /dev/null @@ -1,34 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package aws - -import ( - "bytes" - "encoding/json" -) - -func deepCopy[T any](v *T) (*T, error) { - var buf bytes.Buffer - enc := json.NewEncoder(&buf) - if err := enc.Encode(v); err != nil { - return nil, err - } - dec := json.NewDecoder(&buf) - var cp T - if err := dec.Decode(&cp); err != nil { - return nil, err - } - return &cp, nil -} diff --git a/pkg/cloudprovider/aws/metadata.go b/pkg/cloudprovider/aws/metadata/metadata.go similarity index 84% rename from pkg/cloudprovider/aws/metadata.go rename to pkg/cloudprovider/aws/metadata/metadata.go index acde880e599f..cd25979e8f2a 100644 --- a/pkg/cloudprovider/aws/metadata.go +++ b/pkg/cloudprovider/aws/metadata/metadata.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package aws +package metadata import ( "context" @@ -25,20 +25,20 @@ import ( "github.com/aws/aws-sdk-go/service/sts/stsiface" ) -type MetadataProvider struct { +type Provider struct { imdsClient *ec2metadata.EC2Metadata stsClient stsiface.STSAPI } -func NewMetadataProvider(sess *session.Session) *MetadataProvider { - return &MetadataProvider{ +func NewMetadataProvider(sess *session.Session) *Provider { + return &Provider{ imdsClient: ec2metadata.New(sess), stsClient: sts.New(sess), } } // Region gets the current region from EC2 IMDS -func (i *MetadataProvider) Region(ctx context.Context) string { +func (i *Provider) Region(ctx context.Context) string { region, err := i.imdsClient.RegionWithContext(ctx) if err != nil { panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) @@ -46,7 +46,7 @@ func (i *MetadataProvider) Region(ctx context.Context) string { return region } -func (i *MetadataProvider) AccountID(ctx context.Context) string { +func (i *Provider) AccountID(ctx context.Context) string { doc, err := i.imdsClient.GetInstanceIdentityDocumentWithContext(ctx) if err != nil { // Fallback to using the STS provider if IMDS fails diff --git a/pkg/cloudprovider/aws/metadata/types.go b/pkg/cloudprovider/aws/metadata/types.go new file mode 100644 index 000000000000..4ac30a2f4a0c --- /dev/null +++ b/pkg/cloudprovider/aws/metadata/types.go @@ -0,0 +1,21 @@ +package metadata + +type Info struct { + region string + accountID string +} + +func NewInfo(region, accountID string) *Info { + return &Info{ + region: region, + accountID: accountID, + } +} + +func (i *Info) Region() string { + return i.region +} + +func (i *Info) AccountID() string { + return i.accountID +} diff --git a/pkg/cloudprovider/aws/sqs/metrics.go b/pkg/cloudprovider/aws/sqs/metrics.go new file mode 100644 index 000000000000..abbc3dd44bf7 --- /dev/null +++ b/pkg/cloudprovider/aws/sqs/metrics.go @@ -0,0 +1,82 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package sqs + +import ( + "context" + + "github.com/prometheus/client_golang/prometheus" + v1 "k8s.io/api/core/v1" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider" + "github.com/aws/karpenter/pkg/metrics" + "github.com/aws/karpenter/pkg/utils/injection" +) + +const ( + metricLabelController = "controller" + metricLabelMethod = "method" + metricLabelProvider = "provider" +) + +var methodDurationHistogramVec = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: "cloudprovider.sqs", + Name: "duration_seconds", + Help: "Duration of cloud provider method calls. 
Labeled by the controller, method name and provider.", + }, + []string{ + metricLabelController, + metricLabelMethod, + metricLabelProvider, + }, +) + +func init() { + crmetrics.Registry.MustRegister(methodDurationHistogramVec) +} + +type decorator struct { + Provider +} + +// Decorate returns a new `CloudProvider` instance that will delegate all method +// calls to the argument, `cloudProvider`, and publish aggregated latency metrics. The +// value used for the metric label, "controller", is taken from the `Context` object +// passed to the methods of `CloudProvider`. +// +// Do not decorate a `CloudProvider` multiple times or published metrics will contain +// duplicated method call counts and latencies. +func Decorate(provider Provider) Provider { + return &decorator{provider} +} + +func (d *decorator) Create(ctx context.Context, nodeRequest *cloudprovider.NodeRequest) (*v1.Node, error) { + defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "Create", d.Name()))() + return d.CloudProvider.Create(ctx, nodeRequest) +} + +func (d *decorator) Delete(ctx context.Context, node *v1.Node) error { + defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "Delete", d.Name()))() + return d.CloudProvider.Delete(ctx, node) +} + +func (d *decorator) GetInstanceTypes(ctx context.Context, provisioner *v1alpha5.Provisioner) ([]cloudprovider.InstanceType, error) { + defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "GetInstanceTypes", d.Name()))() + return d.CloudProvider.GetInstanceTypes(ctx, provisioner) +} diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs/sqs.go similarity index 59% rename from pkg/cloudprovider/aws/sqs.go rename to pkg/cloudprovider/aws/sqs/sqs.go index 112f46c527a1..193a096dacbb 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs/sqs.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package aws +package sqs import ( "context" @@ -21,58 +21,21 @@ import ( "sync" "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/sqs" "github.com/samber/lo" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws/metadata" + "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" ) -type SQSClient interface { - CreateQueueWithContext(context.Context, *sqs.CreateQueueInput, ...request.Option) (*sqs.CreateQueueOutput, error) - GetQueueUrlWithContext(context.Context, *sqs.GetQueueUrlInput, ...request.Option) (*sqs.GetQueueUrlOutput, error) - SetQueueAttributesWithContext(context.Context, *sqs.SetQueueAttributesInput, ...request.Option) (*sqs.SetQueueAttributesOutput, error) - ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) - DeleteMessageWithContext(context.Context, *sqs.DeleteMessageInput, ...request.Option) (*sqs.DeleteMessageOutput, error) - DeleteQueueWithContext(context.Context, *sqs.DeleteQueueInput, ...request.Option) (*sqs.DeleteQueueOutput, error) -} - -type SQSProvider struct { - SQSClient - - createQueueInput *sqs.CreateQueueInput - getQueueURLInput *sqs.GetQueueUrlInput - receiveMessageInput *sqs.ReceiveMessageInput - mutex *sync.RWMutex - queueURL string - queueName string - metadata *Metadata -} - -type QueuePolicy struct { - Version string `json:"Version"` - ID string `json:"Id"` - Statement []QueuePolicyStatement `json:"Statement"` -} - -type QueuePolicyStatement struct { - Effect string `json:"Effect"` - Principal Principal `json:"Principal"` - Action []string `json:"Action"` - Resource string `json:"Resource"` -} - -type Principal struct { - Service []string `json:"Service"` -} - -func NewSQSProvider(ctx context.Context, client SQSClient, metadata *Metadata) *SQSProvider { - provider := &SQSProvider{ - SQSClient: client, +func NewProvider(ctx context.Context, client Client, metadata *metadata.Info) *Provider { + provider := &Provider{ + client: client, mutex: &sync.RWMutex{}, - queueName: getName(ctx), metadata: metadata, + queueName: getQueueName(ctx), } provider.createQueueInput = &sqs.CreateQueueInput{ Attributes: provider.getQueueAttributes(), @@ -98,8 +61,12 @@ func NewSQSProvider(ctx context.Context, client SQSClient, metadata *Metadata) * return provider } -func (s *SQSProvider) CreateQueue(ctx context.Context) error { - result, err := s.CreateQueueWithContext(ctx, s.createQueueInput) +func (s *Provider) QueueName() string { + return s.queueName +} + +func (s *Provider) CreateQueue(ctx context.Context) error { + result, err := s.client.CreateQueueWithContext(ctx, s.createQueueInput) if err != nil { return fmt.Errorf("failed creating sqs queue, %w", err) } @@ -109,7 +76,7 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { return nil } -func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { +func (s *Provider) SetQueueAttributes(ctx context.Context) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("failed fetching queue url, %w", err) @@ -119,43 +86,48 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { Attributes: s.getQueueAttributes(), QueueUrl: aws.String(queueURL), } - _, err = s.SetQueueAttributesWithContext(ctx, setQueueAttributesInput) + _, err = s.client.SetQueueAttributesWithContext(ctx, setQueueAttributesInput) if err != nil { return 
fmt.Errorf("failed setting queue attributes, %w", err) } return nil } -func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (string, error) { +func (s *Provider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (string, error) { s.mutex.RLock() - defer s.mutex.RUnlock() + queueURL := s.queueURL + s.mutex.RUnlock() + if queueURL != "" && !ignoreCache { + return queueURL, nil + } + s.mutex.Lock() + defer s.mutex.Unlock() + // We have to check if the queueUrl is set again here in case multiple threads make it past the read-locked section if s.queueURL != "" && !ignoreCache { return s.queueURL, nil } - result, err := s.GetQueueUrlWithContext(ctx, s.getQueueURLInput) + result, err := s.client.GetQueueUrlWithContext(ctx, s.getQueueURLInput) if err != nil { return "", fmt.Errorf("failed fetching queue url, %w", err) } - s.mutex.Lock() - defer s.mutex.Unlock() s.queueURL = aws.StringValue(result.QueueUrl) return aws.StringValue(result.QueueUrl), nil } -func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { +func (s *Provider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return nil, fmt.Errorf("failed fetching queue url, %w", err) } // Copy the input template and add the discovered queue url - input, err := deepCopy(s.receiveMessageInput) + input, err := functional.DeepCopy(s.receiveMessageInput) if err != nil { return nil, fmt.Errorf("failed copying input, %w", err) } input.QueueUrl = aws.String(queueURL) - result, err := s.ReceiveMessageWithContext(ctx, input) + result, err := s.client.ReceiveMessageWithContext(ctx, input) if err != nil { return nil, fmt.Errorf("failed receiving sqs messages, %w", err) } @@ -163,7 +135,7 @@ func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error return result.Messages, nil } -func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { +func (s *Provider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("failed fetching queue url, %w", err) @@ -174,14 +146,14 @@ func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) er ReceiptHandle: msg.ReceiptHandle, } - _, err = s.DeleteMessageWithContext(ctx, input) + _, err = s.client.DeleteMessageWithContext(ctx, input) if err != nil { return fmt.Errorf("failed deleting messages from sqs queue, %w", err) } return nil } -func (s *SQSProvider) DeleteQueue(ctx context.Context) error { +func (s *Provider) DeleteQueue(ctx context.Context) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("failed fetching queue url, %w", err) @@ -190,14 +162,14 @@ func (s *SQSProvider) DeleteQueue(ctx context.Context) error { input := &sqs.DeleteQueueInput{ QueueUrl: aws.String(queueURL), } - _, err = s.DeleteQueueWithContext(ctx, input) + _, err = s.client.DeleteQueueWithContext(ctx, input) if err != nil { return fmt.Errorf("failed deleting sqs queue, %w", err) } return nil } -func (s *SQSProvider) getQueueAttributes() map[string]*string { +func (s *Provider) getQueueAttributes() map[string]*string { policy := lo.Must(json.Marshal(s.getQueuePolicy())) return map[string]*string{ sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("300"), @@ -205,7 +177,7 @@ func (s *SQSProvider) getQueueAttributes() map[string]*string { } } -func (s *SQSProvider) getQueuePolicy() *QueuePolicy { +func (s 
*Provider) getQueuePolicy() *QueuePolicy { return &QueuePolicy{ Version: "2008-10-17", ID: "EC2NotificationPolicy", @@ -225,10 +197,10 @@ func (s *SQSProvider) getQueuePolicy() *QueuePolicy { } } -func (s *SQSProvider) getQueueARN() string { - return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", s.metadata.region, s.metadata.accountID, s.queueName) +func (s *Provider) getQueueARN() string { + return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", s.metadata.Region(), s.metadata.AccountID(), s.queueName) } -func getName(ctx context.Context) string { +func getQueueName(ctx context.Context) string { return fmt.Sprintf("Karpenter-%s-Queue", injection.GetOptions(ctx).ClusterName) } diff --git a/pkg/cloudprovider/aws/sqs/types.go b/pkg/cloudprovider/aws/sqs/types.go new file mode 100644 index 000000000000..9ed1c3aed5bc --- /dev/null +++ b/pkg/cloudprovider/aws/sqs/types.go @@ -0,0 +1,52 @@ +package sqs + +import ( + "context" + "sync" + + "github.com/aws/aws-sdk-go/aws/request" + "github.com/aws/aws-sdk-go/service/sqs" + + "github.com/aws/karpenter/pkg/cloudprovider/aws/metadata" +) + +type Client interface { + CreateQueueWithContext(context.Context, *sqs.CreateQueueInput, ...request.Option) (*sqs.CreateQueueOutput, error) + GetQueueUrlWithContext(context.Context, *sqs.GetQueueUrlInput, ...request.Option) (*sqs.GetQueueUrlOutput, error) + SetQueueAttributesWithContext(context.Context, *sqs.SetQueueAttributesInput, ...request.Option) (*sqs.SetQueueAttributesOutput, error) + ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) + DeleteMessageWithContext(context.Context, *sqs.DeleteMessageInput, ...request.Option) (*sqs.DeleteMessageOutput, error) + DeleteQueueWithContext(context.Context, *sqs.DeleteQueueInput, ...request.Option) (*sqs.DeleteQueueOutput, error) +} + +type Interface interface { +} + +type Provider struct { + client Client + + createQueueInput *sqs.CreateQueueInput + getQueueURLInput *sqs.GetQueueUrlInput + receiveMessageInput *sqs.ReceiveMessageInput + mutex *sync.RWMutex + queueURL string + queueName string + metadata *metadata.Info +} + +type QueuePolicy struct { + Version string `json:"Version"` + ID string `json:"Id"` + Statement []QueuePolicyStatement `json:"Statement"` +} + +type QueuePolicyStatement struct { + Effect string `json:"Effect"` + Principal Principal `json:"Principal"` + Action []string `json:"Action"` + Resource string `json:"Resource"` +} + +type Principal struct { + Service []string `json:"Service"` +} diff --git a/pkg/utils/functional/functional.go b/pkg/utils/functional/functional.go index 754818e7e551..eab082235caf 100644 --- a/pkg/utils/functional/functional.go +++ b/pkg/utils/functional/functional.go @@ -15,6 +15,8 @@ limitations under the License. 
package functional import ( + "bytes" + "encoding/json" "strings" ) @@ -37,3 +39,17 @@ func SplitCommaSeparatedString(value string) []string { } return result } + +func DeepCopy[T any](v *T) (*T, error) { + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + if err := enc.Encode(v); err != nil { + return nil, err + } + dec := json.NewDecoder(&buf) + var cp T + if err := dec.Decode(&cp); err != nil { + return nil, err + } + return &cp, nil +} From fadbcb2faec8d69248495d0051ecdda9ed8098bc Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 20 Sep 2022 14:20:33 -0700 Subject: [PATCH 15/55] Add metrics to controllers --- pkg/cloudprovider/aws/cloudprovider.go | 12 ++- .../controllers/infrastructure/controller.go | 22 +++-- .../aws/controllers/infrastructure/metrics.go | 50 +++++++++++ .../aws/controllers/metrics/metrics.go | 82 ------------------- .../controllers/notification/controller.go | 32 ++++++-- .../aws/controllers/notification/metrics.go | 73 +++++++++++++++++ pkg/cloudprovider/aws/createfleetbatcher.go | 4 +- pkg/cloudprovider/aws/eventbridge.go | 7 +- pkg/cloudprovider/aws/events/recorder.go | 36 ++++++++ .../aws/{metadata => }/metadata.go | 32 ++++++-- pkg/cloudprovider/aws/metadata/types.go | 21 ----- pkg/cloudprovider/aws/{sqs => }/sqs.go | 66 +++++++++++---- pkg/cloudprovider/aws/sqs/metrics.go | 82 ------------------- pkg/cloudprovider/aws/sqs/types.go | 52 ------------ 14 files changed, 288 insertions(+), 283 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/infrastructure/metrics.go delete mode 100644 pkg/cloudprovider/aws/controllers/metrics/metrics.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/metrics.go rename pkg/cloudprovider/aws/{metadata => }/metadata.go (72%) delete mode 100644 pkg/cloudprovider/aws/metadata/types.go rename pkg/cloudprovider/aws/{sqs => }/sqs.go (68%) delete mode 100644 pkg/cloudprovider/aws/sqs/metrics.go delete mode 100644 pkg/cloudprovider/aws/sqs/types.go diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 0425da1a2bb0..028846e71b14 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -48,8 +48,6 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws/amifamily" "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" - "github.com/aws/karpenter/pkg/cloudprovider/aws/metadata" - cloudprovidersqs "github.com/aws/karpenter/pkg/cloudprovider/aws/sqs" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/project" @@ -79,7 +77,7 @@ type CloudProvider struct { instanceTypeProvider *InstanceTypeProvider instanceProvider *InstanceProvider kubeClient k8sClient.Client - sqsProvider *cloudprovidersqs.Provider + sqsProvider *SQSProvider eventBridgeProvider *EventBridgeProvider } @@ -100,7 +98,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, ), ))) - metadataProvider := metadata.NewMetadataProvider(sess) + metadataProvider := NewMetadataProvider(sess) if *sess.Config.Region == "" { logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") *sess.Config.Region = metadataProvider.Region(ctx) @@ -114,8 +112,8 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud subnetProvider := NewSubnetProvider(ec2api) 
instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) - m := metadata.NewInfo(*sess.Config.Region, metadataProvider.AccountID(ctx)) - sqsProvider := cloudprovidersqs.NewProvider(ctx, sqs.New(sess), m) + m := NewMetadata(*sess.Config.Region, metadataProvider.AccountID(ctx)) + sqsProvider := NewProvider(ctx, sqs.New(sess), m) eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), m, sqsProvider.QueueName()) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, @@ -225,7 +223,7 @@ func (*CloudProvider) Validate(ctx context.Context, provisioner *v1alpha5.Provis return provider.Validate() } -func (c *CloudProvider) SQSProvider() *sqs2.SQSProvider { +func (c *CloudProvider) SQSProvider() *SQSProvider { return c.sqsProvider } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index c32e680746c6..f45d0924c203 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -33,7 +33,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" - sqs2 "github.com/aws/karpenter/pkg/cloudprovider/aws/sqs" + "github.com/aws/karpenter/pkg/metrics" "github.com/aws/karpenter/pkg/utils/injection" ) @@ -44,11 +44,12 @@ type Controller struct { recorder events.Recorder clock clock.Clock - sqsProvider *sqs2.SQSProvider + sqsProvider *aws.SQSProvider eventBridgeProvider *aws.EventBridgeProvider mutex *sync.RWMutex readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state + ready bool } // pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned @@ -60,7 +61,7 @@ const pollingPeriod = time.Hour const defaultBackoffPeriod = time.Minute * 10 func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient client.Client, clk clock.Clock, - recorder events.Recorder, sqsProvider *sqs2.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, + recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}, cleanupAsync <-chan os.Signal) *Controller { c := &Controller{ kubeClient: kubeClient, @@ -156,15 +157,26 @@ func (c *Controller) setReady(ctx context.Context, ready bool) { // other channels that are waiting on Ready() proceed; otherwise, open // a channel to tell the other goroutines to wait if ready { - logging.FromContext(ctx).Infof("Reconciled infrastructure is healthy") + healthy.Set(1) + if c.ready != ready { + logging.FromContext(ctx).Infof("Infrastructure is healthy") + c.recorder.InfrastructureHealthy(ctx, c.kubeClient) + } close(c.readinessChan) } else { - logging.FromContext(ctx).Infof("Reconciled infrastructure is unhealthy") + healthy.Set(0) + if c.ready != ready { + logging.FromContext(ctx).Infof("Infrastructure is unhealthy") + c.recorder.InfrastructureUnhealthy(ctx, c.kubeClient) + } c.readinessChan = make(chan struct{}) } + c.ready = ready } func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { + defer metrics.Measure(reconcileDuration)() + wg := &sync.WaitGroup{} m := &sync.Mutex{} diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/metrics.go b/pkg/cloudprovider/aws/controllers/infrastructure/metrics.go new file mode 100644 index 000000000000..2ce1e2a22b4e --- /dev/null +++ 
b/pkg/cloudprovider/aws/controllers/infrastructure/metrics.go @@ -0,0 +1,50 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package infrastructure + +import ( + "github.com/prometheus/client_golang/prometheus" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/aws/karpenter/pkg/metrics" +) + +const ( + subSystem = "aws_infrastructure_controller" +) + +var ( + reconcileDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: subSystem, + Name: "reconcile_duration_seconds", + Help: "Duration of scheduling process in seconds. Broken down by provisioner and error.", + Buckets: metrics.DurationBuckets(), + }, + ) + healthy = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: subSystem, + Name: "healthy", + Help: "Whether the infrastructure that should be up for this controller is in a healthy state.", + }, + ) +) + +func init() { + crmetrics.Registry.MustRegister(reconcileDuration, healthy) +} diff --git a/pkg/cloudprovider/aws/controllers/metrics/metrics.go b/pkg/cloudprovider/aws/controllers/metrics/metrics.go deleted file mode 100644 index 226f716b3dc1..000000000000 --- a/pkg/cloudprovider/aws/controllers/metrics/metrics.go +++ /dev/null @@ -1,82 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package metrics - -import ( - "context" - - "github.com/prometheus/client_golang/prometheus" - v1 "k8s.io/api/core/v1" - crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" - - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider" - "github.com/aws/karpenter/pkg/metrics" - "github.com/aws/karpenter/pkg/utils/injection" -) - -const ( - metricLabelController = "controller" - metricLabelMethod = "method" - metricLabelProvider = "provider" -) - -var methodDurationHistogramVec = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Namespace: metrics.Namespace, - Subsystem: "cloudprovider", - Name: "duration_seconds", - Help: "Duration of cloud provider method calls. Labeled by the controller, method name and provider.", - }, - []string{ - metricLabelController, - metricLabelMethod, - metricLabelProvider, - }, -) - -func init() { - crmetrics.Registry.MustRegister(methodDurationHistogramVec) -} - -type decorator struct { - cloudprovider.CloudProvider -} - -// Decorate returns a new `CloudProvider` instance that will delegate all method -// calls to the argument, `cloudProvider`, and publish aggregated latency metrics. 
The -// value used for the metric label, "controller", is taken from the `Context` object -// passed to the methods of `CloudProvider`. -// -// Do not decorate a `CloudProvider` multiple times or published metrics will contain -// duplicated method call counts and latencies. -func Decorate(cloudProvider cloudprovider.CloudProvider) cloudprovider.CloudProvider { - return &decorator{cloudProvider} -} - -func (d *decorator) Create(ctx context.Context, nodeRequest *cloudprovider.NodeRequest) (*v1.Node, error) { - defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "Create", d.Name()))() - return d.CloudProvider.Create(ctx, nodeRequest) -} - -func (d *decorator) Delete(ctx context.Context, node *v1.Node) error { - defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "Delete", d.Name()))() - return d.CloudProvider.Delete(ctx, node) -} - -func (d *decorator) GetInstanceTypes(ctx context.Context, provisioner *v1alpha5.Provisioner) ([]cloudprovider.InstanceType, error) { - defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "GetInstanceTypes", d.Name()))() - return d.CloudProvider.GetInstanceTypes(ctx, provisioner) -} diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index d66434f7795c..210adced1da0 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -28,12 +28,13 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" - "github.com/aws/karpenter/pkg/cloudprovider/aws/sqs" "github.com/aws/karpenter/pkg/controllers/provisioning" "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/metrics" ) type Action = string @@ -54,7 +55,7 @@ type Controller struct { cluster *state.Cluster recorder events.Recorder clock clock.Clock - provider *sqs.SQSProvider + provider *aws.SQSProvider parser event.Parser infraReady func() <-chan struct{} @@ -63,7 +64,7 @@ type Controller struct { // pollingPeriod that we go to the SQS queue to check if there are any new events const pollingPeriod = 5 * time.Second -func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, sqsProvider *sqs.SQSProvider, +func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, sqsProvider *aws.SQSProvider, recorder events.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, startAsync <-chan struct{}, infraReady func() <-chan struct{}) *Controller { c := &Controller{ @@ -109,7 +110,8 @@ func (c *Controller) run(ctx context.Context) { } func (c *Controller) pollSQS(ctx context.Context) error { - logging.FromContext(ctx).Infof("Polling SQS") + defer metrics.Measure(reconcileDuration.WithLabelValues())() + sqsMessages, err := c.provider.GetSQSMessages(ctx) if err != nil { return err @@ -134,20 +136,35 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("event", evt.Kind())) nodes := 
getInvolvedNodes(evt.EC2InstanceIDs(), instanceIDMap) + // There's no action to take here since the event doesn't pertain to any of our instances + if len(nodes) == 0 { + receivedMessages.WithLabelValues(evt.Kind(), "false").Inc() + return + } action := actionForEvent(evt) for i := range nodes { node := nodes[i] + + // Record metrics and events for this action c.notifyForEvent(evt, node) + receivedMessages.WithLabelValues(evt.Kind(), "true").Inc() + actionsTaken.WithLabelValues(action).Inc() + if action != Actions.NoAction { e := c.deleteInstance(ctx, node) err = multierr.Append(err, e) } } if err != nil { - return err + return fmt.Errorf("failed to act on nodes [%s], %w", nodes[:3], err) } - return c.provider.DeleteSQSMessage(ctx, msg) + err = c.provider.DeleteSQSMessage(ctx, msg) + if err != nil { + return fmt.Errorf("failed to delete message from queue, %w", err) + } + deletedMessages.WithLabelValues().Inc() + return nil } func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { @@ -173,10 +190,7 @@ func (c *Controller) notifyForEvent(evt event.Interface, n *v1.Node) { // For now, we won't do anything with the state change action case event.Kinds.StateChange: - return - default: - return } } diff --git a/pkg/cloudprovider/aws/controllers/notification/metrics.go b/pkg/cloudprovider/aws/controllers/notification/metrics.go new file mode 100644 index 000000000000..41ce9d1fdb6a --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/metrics.go @@ -0,0 +1,73 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package notification + +import ( + "github.com/prometheus/client_golang/prometheus" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + + "github.com/aws/karpenter/pkg/metrics" +) + +const ( + subSystem = "aws_notification_controller" + messageTypeLabel = "message_type" + actionableTypeLabel = "actionable" + actionTypeLabel = "action_type" +) + +var ( + reconcileDuration = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: subSystem, + Name: "reconcile_duration_seconds", + Help: "Duration of notification reconciliation process in seconds.", + Buckets: metrics.DurationBuckets(), + }, + []string{}, + ) + receivedMessages = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: subSystem, + Name: "received_messages", + Help: "Count of messages received from the SQS queue. Broken down by message type and whether the message was actionable.", + }, + []string{messageTypeLabel, actionableTypeLabel}, + ) + deletedMessages = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: subSystem, + Name: "deleted_messages", + Help: "Count of messages deleted from the SQS queue.", + }, + []string{}, + ) + actionsTaken = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: subSystem, + Name: "actions_taken", + Help: "Count of actions taken based on notifications from the SQS queue. 
Broken down by action type", + }, + []string{actionTypeLabel}, + ) +) + +func init() { + crmetrics.Registry.MustRegister(reconcileDuration, receivedMessages, deletedMessages, actionsTaken) +} diff --git a/pkg/cloudprovider/aws/createfleetbatcher.go b/pkg/cloudprovider/aws/createfleetbatcher.go index 27d98075f53f..67f3dcf73a10 100644 --- a/pkg/cloudprovider/aws/createfleetbatcher.go +++ b/pkg/cloudprovider/aws/createfleetbatcher.go @@ -25,6 +25,8 @@ import ( "github.com/aws/aws-sdk-go/service/ec2/ec2iface" "github.com/mitchellh/hashstructure/v2" "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/utils/functional" ) // CreateFleetBatcher is used to batch CreateFleet calls from the cloud provider with identical parameters into a single @@ -133,7 +135,7 @@ func (b *CreateFleetBatcher) runCalls() { // of instances that we request call := requestBatch[0] // deep copy the input we are about to modify so that we don't modify any caller's input parameter - input, err := deepCopy(call.input) + input, err := functional.DeepCopy(call.input) if err != nil { // shouldn't occur, but if it does we log an error and just modify the caller's input so we // can continue to launch instances diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 838734a6e94c..cf9ab5b36e2a 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -27,7 +27,6 @@ import ( "go.uber.org/multierr" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider/aws/metadata" "github.com/aws/karpenter/pkg/utils/injection" ) @@ -40,7 +39,7 @@ type EventBridgeClient interface { type EventBridgeProvider struct { EventBridgeClient queueName string - metadata *metadata.Info + metadata *Metadata } type EventRule struct { @@ -63,7 +62,7 @@ func (ep *EventPattern) Serialize() []byte { return lo.Must(json.Marshal(ep)) } -func NewEventBridgeProvider(eb EventBridgeClient, metadata *metadata.Info, queueName string) *EventBridgeProvider { +func NewEventBridgeProvider(eb EventBridgeClient, metadata *Metadata, queueName string) *EventBridgeProvider { return &EventBridgeProvider{ EventBridgeClient: eb, metadata: metadata, @@ -179,5 +178,5 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) } func (eb *EventBridgeProvider) getQueueARN() string { - return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", eb.metadata.region, eb.metadata.accountID, eb.queueName) + return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", eb.metadata.Region(), eb.metadata.AccountID(), eb.queueName) } diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index cfc3f0943d73..4d5bd8ea7d79 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -15,10 +15,18 @@ limitations under the License. 
package events import ( + "context" + + "github.com/avast/retry-go" + appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" + "knative.dev/pkg/logging" + "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/events" + "github.com/aws/karpenter/pkg/utils/injection" ) type recorder struct { @@ -34,6 +42,10 @@ type Recorder interface { EC2SpotRebalanceRecommendation(*v1.Node) // EC2HealthWarning is called when EC2 sends a health warning notification for a health issue for the node from the SQS queue EC2HealthWarning(*v1.Node) + // InfrastructureUnhealthy event is called when infrastructure reconciliation errors and the controller enters an unhealthy state + InfrastructureUnhealthy(context.Context, client.Client) + // InfrastructureHealthy event is called when infrastructure reconciliation succeeds and the controller enters a healthy state + InfrastructureHealthy(context.Context, client.Client) } func NewRecorder(r events.Recorder) Recorder { @@ -53,3 +65,27 @@ func (r recorder) EC2SpotRebalanceRecommendation(node *v1.Node) { func (r recorder) EC2HealthWarning(node *v1.Node) { r.Eventf(node, "Normal", "EC2HealthWarning", "Node %s event: EC2 triggered a health warning for the node", node.Name) } + +func (r recorder) InfrastructureHealthy(ctx context.Context, kubeClient client.Client) { + dep := &appsv1.Deployment{} + err := retry.Do(func() error { + return kubeClient.Get(ctx, types.NamespacedName{Namespace: injection.GetOptions(ctx).DeploymentNamespace, Name: injection.GetOptions(ctx).DeploymentName}, dep) + }) + if err != nil { + logging.FromContext(ctx).Errorf("Sending InfrastructureHealthy event, %v", err) + return + } + r.Eventf(dep, "Normal", "InfrastructureHealthy", "Karpenter infrastructure reconciliation is healthy") +} + +func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client.Client) { + dep := &appsv1.Deployment{} + err := retry.Do(func() error { + return kubeClient.Get(ctx, types.NamespacedName{Namespace: injection.GetOptions(ctx).DeploymentNamespace, Name: injection.GetOptions(ctx).DeploymentName}, dep) + }) + if err != nil { + logging.FromContext(ctx).Errorf("Sending InfrastructureUnhealthy event, %v", err) + return + } + r.Eventf(dep, "Normal", "InfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") +} diff --git a/pkg/cloudprovider/aws/metadata/metadata.go b/pkg/cloudprovider/aws/metadata.go similarity index 72% rename from pkg/cloudprovider/aws/metadata/metadata.go rename to pkg/cloudprovider/aws/metadata.go index cd25979e8f2a..dff96a24198a 100644 --- a/pkg/cloudprovider/aws/metadata/metadata.go +++ b/pkg/cloudprovider/aws/metadata.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package metadata +package aws import ( "context" @@ -25,20 +25,40 @@ import ( "github.com/aws/aws-sdk-go/service/sts/stsiface" ) -type Provider struct { +type Metadata struct { + region string + accountID string +} + +func NewMetadata(region, accountID string) *Metadata { + return &Metadata{ + region: region, + accountID: accountID, + } +} + +func (i *Metadata) Region() string { + return i.region +} + +func (i *Metadata) AccountID() string { + return i.accountID +} + +type MetadataProvider struct { imdsClient *ec2metadata.EC2Metadata stsClient stsiface.STSAPI } -func NewMetadataProvider(sess *session.Session) *Provider { - return &Provider{ +func NewMetadataProvider(sess *session.Session) *MetadataProvider { + return &MetadataProvider{ imdsClient: ec2metadata.New(sess), stsClient: sts.New(sess), } } // Region gets the current region from EC2 IMDS -func (i *Provider) Region(ctx context.Context) string { +func (i *MetadataProvider) Region(ctx context.Context) string { region, err := i.imdsClient.RegionWithContext(ctx) if err != nil { panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) @@ -46,7 +66,7 @@ func (i *Provider) Region(ctx context.Context) string { return region } -func (i *Provider) AccountID(ctx context.Context) string { +func (i *MetadataProvider) AccountID(ctx context.Context) string { doc, err := i.imdsClient.GetInstanceIdentityDocumentWithContext(ctx) if err != nil { // Fallback to using the STS provider if IMDS fails diff --git a/pkg/cloudprovider/aws/metadata/types.go b/pkg/cloudprovider/aws/metadata/types.go deleted file mode 100644 index 4ac30a2f4a0c..000000000000 --- a/pkg/cloudprovider/aws/metadata/types.go +++ /dev/null @@ -1,21 +0,0 @@ -package metadata - -type Info struct { - region string - accountID string -} - -func NewInfo(region, accountID string) *Info { - return &Info{ - region: region, - accountID: accountID, - } -} - -func (i *Info) Region() string { - return i.region -} - -func (i *Info) AccountID() string { - return i.accountID -} diff --git a/pkg/cloudprovider/aws/sqs/sqs.go b/pkg/cloudprovider/aws/sqs.go similarity index 68% rename from pkg/cloudprovider/aws/sqs/sqs.go rename to pkg/cloudprovider/aws/sqs.go index 193a096dacbb..fe5f338b5e4c 100644 --- a/pkg/cloudprovider/aws/sqs/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package sqs +package aws import ( "context" @@ -21,17 +21,55 @@ import ( "sync" "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/sqs" "github.com/samber/lo" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider/aws/metadata" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" ) -func NewProvider(ctx context.Context, client Client, metadata *metadata.Info) *Provider { - provider := &Provider{ +type SQSClient interface { + CreateQueueWithContext(context.Context, *sqs.CreateQueueInput, ...request.Option) (*sqs.CreateQueueOutput, error) + GetQueueUrlWithContext(context.Context, *sqs.GetQueueUrlInput, ...request.Option) (*sqs.GetQueueUrlOutput, error) + SetQueueAttributesWithContext(context.Context, *sqs.SetQueueAttributesInput, ...request.Option) (*sqs.SetQueueAttributesOutput, error) + ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) + DeleteMessageWithContext(context.Context, *sqs.DeleteMessageInput, ...request.Option) (*sqs.DeleteMessageOutput, error) + DeleteQueueWithContext(context.Context, *sqs.DeleteQueueInput, ...request.Option) (*sqs.DeleteQueueOutput, error) +} + +type QueuePolicy struct { + Version string `json:"Version"` + ID string `json:"Id"` + Statement []QueuePolicyStatement `json:"Statement"` +} + +type QueuePolicyStatement struct { + Effect string `json:"Effect"` + Principal Principal `json:"Principal"` + Action []string `json:"Action"` + Resource string `json:"Resource"` +} + +type Principal struct { + Service []string `json:"Service"` +} + +type SQSProvider struct { + client SQSClient + + createQueueInput *sqs.CreateQueueInput + getQueueURLInput *sqs.GetQueueUrlInput + receiveMessageInput *sqs.ReceiveMessageInput + mutex *sync.RWMutex + queueURL string + queueName string + metadata *Metadata +} + +func NewProvider(ctx context.Context, client SQSClient, metadata *Metadata) *SQSProvider { + provider := &SQSProvider{ client: client, mutex: &sync.RWMutex{}, metadata: metadata, @@ -61,11 +99,11 @@ func NewProvider(ctx context.Context, client Client, metadata *metadata.Info) *P return provider } -func (s *Provider) QueueName() string { +func (s *SQSProvider) QueueName() string { return s.queueName } -func (s *Provider) CreateQueue(ctx context.Context) error { +func (s *SQSProvider) CreateQueue(ctx context.Context) error { result, err := s.client.CreateQueueWithContext(ctx, s.createQueueInput) if err != nil { return fmt.Errorf("failed creating sqs queue, %w", err) @@ -76,7 +114,7 @@ func (s *Provider) CreateQueue(ctx context.Context) error { return nil } -func (s *Provider) SetQueueAttributes(ctx context.Context) error { +func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("failed fetching queue url, %w", err) @@ -93,7 +131,7 @@ func (s *Provider) SetQueueAttributes(ctx context.Context) error { return nil } -func (s *Provider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (string, error) { +func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (string, error) { s.mutex.RLock() queueURL := s.queueURL s.mutex.RUnlock() @@ -114,7 +152,7 @@ func (s *Provider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (stri return aws.StringValue(result.QueueUrl), nil } -func (s *Provider) GetSQSMessages(ctx context.Context) 
([]*sqs.Message, error) { +func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return nil, fmt.Errorf("failed fetching queue url, %w", err) @@ -135,7 +173,7 @@ func (s *Provider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { return result.Messages, nil } -func (s *Provider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { +func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("failed fetching queue url, %w", err) @@ -153,7 +191,7 @@ func (s *Provider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error return nil } -func (s *Provider) DeleteQueue(ctx context.Context) error { +func (s *SQSProvider) DeleteQueue(ctx context.Context) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("failed fetching queue url, %w", err) @@ -169,7 +207,7 @@ func (s *Provider) DeleteQueue(ctx context.Context) error { return nil } -func (s *Provider) getQueueAttributes() map[string]*string { +func (s *SQSProvider) getQueueAttributes() map[string]*string { policy := lo.Must(json.Marshal(s.getQueuePolicy())) return map[string]*string{ sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("300"), @@ -177,7 +215,7 @@ func (s *Provider) getQueueAttributes() map[string]*string { } } -func (s *Provider) getQueuePolicy() *QueuePolicy { +func (s *SQSProvider) getQueuePolicy() *QueuePolicy { return &QueuePolicy{ Version: "2008-10-17", ID: "EC2NotificationPolicy", @@ -197,7 +235,7 @@ func (s *Provider) getQueuePolicy() *QueuePolicy { } } -func (s *Provider) getQueueARN() string { +func (s *SQSProvider) getQueueARN() string { return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", s.metadata.Region(), s.metadata.AccountID(), s.queueName) } diff --git a/pkg/cloudprovider/aws/sqs/metrics.go b/pkg/cloudprovider/aws/sqs/metrics.go deleted file mode 100644 index abbc3dd44bf7..000000000000 --- a/pkg/cloudprovider/aws/sqs/metrics.go +++ /dev/null @@ -1,82 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package sqs - -import ( - "context" - - "github.com/prometheus/client_golang/prometheus" - v1 "k8s.io/api/core/v1" - crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" - - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider" - "github.com/aws/karpenter/pkg/metrics" - "github.com/aws/karpenter/pkg/utils/injection" -) - -const ( - metricLabelController = "controller" - metricLabelMethod = "method" - metricLabelProvider = "provider" -) - -var methodDurationHistogramVec = prometheus.NewHistogramVec( - prometheus.HistogramOpts{ - Namespace: metrics.Namespace, - Subsystem: "cloudprovider.sqs", - Name: "duration_seconds", - Help: "Duration of cloud provider method calls. 
Labeled by the controller, method name and provider.", - }, - []string{ - metricLabelController, - metricLabelMethod, - metricLabelProvider, - }, -) - -func init() { - crmetrics.Registry.MustRegister(methodDurationHistogramVec) -} - -type decorator struct { - Provider -} - -// Decorate returns a new `CloudProvider` instance that will delegate all method -// calls to the argument, `cloudProvider`, and publish aggregated latency metrics. The -// value used for the metric label, "controller", is taken from the `Context` object -// passed to the methods of `CloudProvider`. -// -// Do not decorate a `CloudProvider` multiple times or published metrics will contain -// duplicated method call counts and latencies. -func Decorate(provider Provider) Provider { - return &decorator{provider} -} - -func (d *decorator) Create(ctx context.Context, nodeRequest *cloudprovider.NodeRequest) (*v1.Node, error) { - defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "Create", d.Name()))() - return d.CloudProvider.Create(ctx, nodeRequest) -} - -func (d *decorator) Delete(ctx context.Context, node *v1.Node) error { - defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "Delete", d.Name()))() - return d.CloudProvider.Delete(ctx, node) -} - -func (d *decorator) GetInstanceTypes(ctx context.Context, provisioner *v1alpha5.Provisioner) ([]cloudprovider.InstanceType, error) { - defer metrics.Measure(methodDurationHistogramVec.WithLabelValues(injection.GetControllerName(ctx), "GetInstanceTypes", d.Name()))() - return d.CloudProvider.GetInstanceTypes(ctx, provisioner) -} diff --git a/pkg/cloudprovider/aws/sqs/types.go b/pkg/cloudprovider/aws/sqs/types.go deleted file mode 100644 index 9ed1c3aed5bc..000000000000 --- a/pkg/cloudprovider/aws/sqs/types.go +++ /dev/null @@ -1,52 +0,0 @@ -package sqs - -import ( - "context" - "sync" - - "github.com/aws/aws-sdk-go/aws/request" - "github.com/aws/aws-sdk-go/service/sqs" - - "github.com/aws/karpenter/pkg/cloudprovider/aws/metadata" -) - -type Client interface { - CreateQueueWithContext(context.Context, *sqs.CreateQueueInput, ...request.Option) (*sqs.CreateQueueOutput, error) - GetQueueUrlWithContext(context.Context, *sqs.GetQueueUrlInput, ...request.Option) (*sqs.GetQueueUrlOutput, error) - SetQueueAttributesWithContext(context.Context, *sqs.SetQueueAttributesInput, ...request.Option) (*sqs.SetQueueAttributesOutput, error) - ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) - DeleteMessageWithContext(context.Context, *sqs.DeleteMessageInput, ...request.Option) (*sqs.DeleteMessageOutput, error) - DeleteQueueWithContext(context.Context, *sqs.DeleteQueueInput, ...request.Option) (*sqs.DeleteQueueOutput, error) -} - -type Interface interface { -} - -type Provider struct { - client Client - - createQueueInput *sqs.CreateQueueInput - getQueueURLInput *sqs.GetQueueUrlInput - receiveMessageInput *sqs.ReceiveMessageInput - mutex *sync.RWMutex - queueURL string - queueName string - metadata *metadata.Info -} - -type QueuePolicy struct { - Version string `json:"Version"` - ID string `json:"Id"` - Statement []QueuePolicyStatement `json:"Statement"` -} - -type QueuePolicyStatement struct { - Effect string `json:"Effect"` - Principal Principal `json:"Principal"` - Action []string `json:"Action"` - Resource string `json:"Resource"` -} - -type Principal struct { - Service []string `json:"Service"` -} From 
4d19abe73d31c4c6d7b7bca63d95a28031626d7e Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 20 Sep 2022 16:31:53 -0700 Subject: [PATCH 16/55] Improve logging --- .../controllers/infrastructure/controller.go | 4 +++- .../controllers/notification/controller.go | 22 ++++++++++++++----- pkg/cloudprovider/aws/eventbridge.go | 2 +- pkg/cloudprovider/aws/events/recorder.go | 8 ++++++- 4 files changed, 28 insertions(+), 8 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index f45d0924c203..3fb41846b04a 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -134,8 +134,10 @@ func (c *Controller) cleanup(ctx context.Context) { logging.FromContext(ctx).Errorf("Getting the deployment %s for cleanup, %v", nn, err) } - // Deployment is deleting so we should cleanup the infrastructure + // Deployment is deleting so we should clean-up the infrastructure + logging.FromContext(ctx).Infof("Checking on the state of the Karpenter deployment") if !dep.DeletionTimestamp.IsZero() { + logging.FromContext(ctx).Infof("Karpenter deployment is deleted") err = c.deleteInfrastructure(ctx) if err != nil { logging.FromContext(ctx).Errorf("Deleting the infrastructure, %v", err) diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 210adced1da0..8dd90085ff37 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -18,9 +18,11 @@ import ( "context" "fmt" "regexp" + "strings" "time" sqsapi "github.com/aws/aws-sdk-go/service/sqs" + "github.com/samber/lo" "go.uber.org/multierr" v1 "k8s.io/api/core/v1" "k8s.io/utils/clock" @@ -95,6 +97,7 @@ func (c *Controller) run(ctx context.Context) { ctx = logging.WithLogger(ctx, logger) for { <-c.infraReady() // block until the infrastructure is up and ready + logging.FromContext(ctx).Infof("Infrastructure is healthy so proceeding with polling") err := c.pollSQS(ctx) if err != nil { logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) @@ -112,6 +115,7 @@ func (c *Controller) run(ctx context.Context) { func (c *Controller) pollSQS(ctx context.Context) error { defer metrics.Measure(reconcileDuration.WithLabelValues())() + logging.FromContext(ctx).Infof("polling SQS queue for messages") sqsMessages, err := c.provider.GetSQSMessages(ctx) if err != nil { return err @@ -141,14 +145,19 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string receivedMessages.WithLabelValues(evt.Kind(), "false").Inc() return } + receivedMessages.WithLabelValues(evt.Kind(), "true").Inc() + action := actionForEvent(evt) + nodeNames := lo.Map(nodes, func(n *v1.Node, _ int) string { return n.Name }) + logging.FromContext(ctx).Infof("Received actionable event from SQS queue for nodes [%s%s]", + strings.Join(lo.Slice(nodeNames, 0, 3), ","), + lo.Ternary(len(nodeNames) > 3, "...", "")) for i := range nodes { node := nodes[i] - // Record metrics and events for this action + // Record metric and event for this action c.notifyForEvent(evt, node) - receivedMessages.WithLabelValues(evt.Kind(), "true").Inc() actionsTaken.WithLabelValues(action).Inc() if action != Actions.NoAction { @@ -157,7 +166,9 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string } } if err 
!= nil { - return fmt.Errorf("failed to act on nodes [%s], %w", nodes[:3], err) + return fmt.Errorf("failed to act on nodes [%s%s], %w", + strings.Join(lo.Slice(nodeNames, 0, 3), ","), + lo.Ternary(len(nodeNames) > 3, "...", ""), err) } err = c.provider.DeleteSQSMessage(ctx, msg) if err != nil { @@ -169,8 +180,7 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) - logging.FromContext(ctx).Infof("Queue notification triggered ") - + c.recorder.TerminatingNodeOnNotification(node) if err := c.kubeClient.Delete(ctx, node); err != nil { return fmt.Errorf("deleting the spot interrupted node, %w", err) } @@ -244,6 +254,8 @@ func (c *Controller) makeInstanceIDMap() map[string]*v1.Node { return m } +// parseProviderID parses the provider ID stored on the node to get the instance ID +// associated with a node func parseProviderID(pid string) string { r := regexp.MustCompile(`aws:///(?P.*)/(?P.*)`) matches := r.FindStringSubmatch(pid) diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index cf9ab5b36e2a..e19034e04a9d 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -55,7 +55,7 @@ type EventTarget struct { type EventPattern struct { Source []string `json:"source,omitempty"` - DetailType []string `json:"detailType,omitempty"` + DetailType []string `json:"detail-type,omitempty"` } func (ep *EventPattern) Serialize() []byte { diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index 4d5bd8ea7d79..127d63c5d6cf 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -42,6 +42,8 @@ type Recorder interface { EC2SpotRebalanceRecommendation(*v1.Node) // EC2HealthWarning is called when EC2 sends a health warning notification for a health issue for the node from the SQS queue EC2HealthWarning(*v1.Node) + // TerminatingNodeOnNotification is called when a notification that is sent to the notification controller triggers node deletion + TerminatingNodeOnNotification(*v1.Node) // InfrastructureUnhealthy event is called when infrastructure reconciliation errors and the controller enters an unhealthy state InfrastructureUnhealthy(context.Context, client.Client) // InfrastructureHealthy event is called when infrastructure reconciliation succeeds and the controller enters a healthy state @@ -66,6 +68,10 @@ func (r recorder) EC2HealthWarning(node *v1.Node) { r.Eventf(node, "Normal", "EC2HealthWarning", "Node %s event: EC2 triggered a health warning for the node", node.Name) } +func (r recorder) TerminatingNodeOnNotification(node *v1.Node) { + r.Eventf(node, "Normal", "NotificationTerminateNode", "Node %s event: Notification triggered termination for the node", node.Name) +} + func (r recorder) InfrastructureHealthy(ctx context.Context, kubeClient client.Client) { dep := &appsv1.Deployment{} err := retry.Do(func() error { @@ -87,5 +93,5 @@ func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client logging.FromContext(ctx).Errorf("Sending InfrastructureUnhealthy event, %v", err) return } - r.Eventf(dep, "Normal", "InfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") + r.Eventf(dep, "Warning", "InfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") } From 
29b3559f806a778532777d264de76f41655bb371 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 20 Sep 2022 16:46:24 -0700 Subject: [PATCH 17/55] Add suite testing to the controllers --- .../controllers/infrastructure/suite_test.go | 70 +++++++++++++++++++ .../controllers/notification/controller.go | 6 +- .../controllers/notification/suite_test.go | 70 +++++++++++++++++++ pkg/cloudprovider/aws/events/recorder.go | 6 +- pkg/cloudprovider/aws/sqs.go | 4 +- 5 files changed, 147 insertions(+), 9 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go create mode 100644 pkg/cloudprovider/aws/controllers/notification/suite_test.go diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go new file mode 100644 index 000000000000..62ac5e9d30a6 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -0,0 +1,70 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package infrastructure_test + +import ( + "context" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/client-go/kubernetes" + clock "k8s.io/utils/clock/testing" + . "knative.dev/pkg/logging/testing" + + "github.com/aws/karpenter/pkg/cloudprovider/fake" + "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/test" +) + +var ctx context.Context +var env *test.Environment +var cluster *state.Cluster +var provisioner *provisioning.Provisioner +var cloudProvider *fake.CloudProvider +var clientSet *kubernetes.Clientset +var recorder *test.EventRecorder +var fakeClock *clock.FakeClock +var cfg *test.Config + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "AWS Infrastructure") +} + +var _ = BeforeSuite(func() { + env = test.NewEnvironment(ctx, func(e *test.Environment) { + cloudProvider = &fake.CloudProvider{} + cfg = test.NewConfig() + fakeClock = clock.NewFakeClock(time.Now()) + cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) + clientSet = kubernetes.NewForConfigOrDie(e.Config) + recorder = test.NewEventRecorder() + provisioner = provisioning.NewProvisioner(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + }) + Expect(env.Start()).To(Succeed(), "Failed to start environment") +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = BeforeEach(func() { +}) +var _ = AfterEach(func() { +}) diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 8dd90085ff37..70a0087fa9a0 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -64,7 +64,7 @@ type Controller struct { } // pollingPeriod that we go to the SQS queue to check if there are any new events -const 
pollingPeriod = 5 * time.Second +const pollingPeriod = 2 * time.Second func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, sqsProvider *aws.SQSProvider, recorder events.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, @@ -97,7 +97,6 @@ func (c *Controller) run(ctx context.Context) { ctx = logging.WithLogger(ctx, logger) for { <-c.infraReady() // block until the infrastructure is up and ready - logging.FromContext(ctx).Infof("Infrastructure is healthy so proceeding with polling") err := c.pollSQS(ctx) if err != nil { logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) @@ -115,7 +114,6 @@ func (c *Controller) run(ctx context.Context) { func (c *Controller) pollSQS(ctx context.Context) error { defer metrics.Measure(reconcileDuration.WithLabelValues())() - logging.FromContext(ctx).Infof("polling SQS queue for messages") sqsMessages, err := c.provider.GetSQSMessages(ctx) if err != nil { return err @@ -149,7 +147,7 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string action := actionForEvent(evt) nodeNames := lo.Map(nodes, func(n *v1.Node, _ int) string { return n.Name }) - logging.FromContext(ctx).Infof("Received actionable event from SQS queue for nodes [%s%s]", + logging.FromContext(ctx).Infof("Received actionable event from SQS queue for node(s) [%s%s]", strings.Join(lo.Slice(nodeNames, 0, 3), ","), lo.Ternary(len(nodeNames) > 3, "...", "")) diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go new file mode 100644 index 000000000000..9ecbd71aec0d --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -0,0 +1,70 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package notification_test + +import ( + "context" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/client-go/kubernetes" + clock "k8s.io/utils/clock/testing" + . 
"knative.dev/pkg/logging/testing" + + "github.com/aws/karpenter/pkg/cloudprovider/fake" + "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/test" +) + +var ctx context.Context +var env *test.Environment +var cluster *state.Cluster +var provisioner *provisioning.Provisioner +var cloudProvider *fake.CloudProvider +var clientSet *kubernetes.Clientset +var recorder *test.EventRecorder +var fakeClock *clock.FakeClock +var cfg *test.Config + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "AWS Notification") +} + +var _ = BeforeSuite(func() { + env = test.NewEnvironment(ctx, func(e *test.Environment) { + cloudProvider = &fake.CloudProvider{} + cfg = test.NewConfig() + fakeClock = clock.NewFakeClock(time.Now()) + cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) + clientSet = kubernetes.NewForConfigOrDie(e.Config) + recorder = test.NewEventRecorder() + provisioner = provisioning.NewProvisioner(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + }) + Expect(env.Start()).To(Succeed(), "Failed to start environment") +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = BeforeEach(func() { +}) +var _ = AfterEach(func() { +}) diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index 127d63c5d6cf..f99501d7544a 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -69,7 +69,7 @@ func (r recorder) EC2HealthWarning(node *v1.Node) { } func (r recorder) TerminatingNodeOnNotification(node *v1.Node) { - r.Eventf(node, "Normal", "NotificationTerminateNode", "Node %s event: Notification triggered termination for the node", node.Name) + r.Eventf(node, "Normal", "AWSNotificationTerminateNode", "Node %s event: Notification triggered termination for the node", node.Name) } func (r recorder) InfrastructureHealthy(ctx context.Context, kubeClient client.Client) { @@ -81,7 +81,7 @@ func (r recorder) InfrastructureHealthy(ctx context.Context, kubeClient client.C logging.FromContext(ctx).Errorf("Sending InfrastructureHealthy event, %v", err) return } - r.Eventf(dep, "Normal", "InfrastructureHealthy", "Karpenter infrastructure reconciliation is healthy") + r.Eventf(dep, "Normal", "AWSInfrastructureHealthy", "Karpenter infrastructure reconciliation is healthy") } func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client.Client) { @@ -93,5 +93,5 @@ func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client logging.FromContext(ctx).Errorf("Sending InfrastructureUnhealthy event, %v", err) return } - r.Eventf(dep, "Warning", "InfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") + r.Eventf(dep, "Warning", "AWSInfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") } diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index fe5f338b5e4c..ac8c3d124487 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -87,8 +87,8 @@ func NewProvider(ctx context.Context, client SQSClient, metadata *Metadata) *SQS } provider.receiveMessageInput = &sqs.ReceiveMessageInput{ MaxNumberOfMessages: aws.Int64(10), - VisibilityTimeout: aws.Int64(20), // Seconds - WaitTimeSeconds: aws.Int64(20), // Seconds, maximum for long polling + VisibilityTimeout: aws.Int64(10), 
// Seconds + WaitTimeSeconds: aws.Int64(10), // Seconds, maximum for long polling AttributeNames: []*string{ aws.String(sqs.MessageSystemAttributeNameSentTimestamp), }, From 88d221b003195e5c7b9a8fde020ac925a0643633 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 20 Sep 2022 17:11:41 -0700 Subject: [PATCH 18/55] mend --- .../controllers/infrastructure/controller.go | 26 ++++++++++++------- pkg/cloudprovider/types.go | 2 ++ pkg/controllers/controllers.go | 18 ++++++++++--- 3 files changed, 34 insertions(+), 12 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 3fb41846b04a..3b666ac58636 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -18,14 +18,15 @@ import ( "context" "errors" "fmt" - "os" "sync" "time" + "github.com/avast/retry-go" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/sqs" "go.uber.org/multierr" appsv1 "k8s.io/api/apps/v1" + errors2 "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/types" "k8s.io/utils/clock" "knative.dev/pkg/logging" @@ -62,7 +63,7 @@ const defaultBackoffPeriod = time.Minute * 10 func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, - startAsync <-chan struct{}, cleanupAsync <-chan os.Signal) *Controller { + startAsync <-chan struct{}, cleanupAsync <-chan struct{}) *Controller { c := &Controller{ kubeClient: kubeClient, recorder: recorder, @@ -75,6 +76,7 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c go func() { <-cleanupAsync + logging.FromContext(cleanupCtx).Infof("Cleanup was triggered for the Karpenter deployment") c.cleanup(cleanupCtx) }() @@ -129,19 +131,26 @@ func (c *Controller) cleanup(ctx context.Context) { Namespace: injection.GetOptions(ctx).DeploymentNamespace, } - err := c.kubeClient.Get(ctx, nn, dep) + notFound := false + err := retry.Do(func() error { + err := c.kubeClient.Get(ctx, nn, dep) + if errors2.IsNotFound(err) { + notFound = true + } + return client.IgnoreNotFound(err) + }) if err != nil { logging.FromContext(ctx).Errorf("Getting the deployment %s for cleanup, %v", nn, err) } - // Deployment is deleting so we should clean-up the infrastructure - logging.FromContext(ctx).Infof("Checking on the state of the Karpenter deployment") - if !dep.DeletionTimestamp.IsZero() { - logging.FromContext(ctx).Infof("Karpenter deployment is deleted") + // Deployment is already deleted or currently deleting, so we should clean-up the infrastructure + if notFound || !dep.DeletionTimestamp.IsZero() { err = c.deleteInfrastructure(ctx) if err != nil { - logging.FromContext(ctx).Errorf("Deleting the infrastructure, %v", err) + logging.FromContext(ctx).Errorf("Deprovisioning the infrastructure, %v", err) + return } + logging.FromContext(ctx).Infof("Successfully deprovisioned the infrastructure, %v", err) } } @@ -222,7 +231,6 @@ func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { m.Unlock() }() wg.Wait() - time.Sleep(time.Minute) return err } diff --git a/pkg/cloudprovider/types.go b/pkg/cloudprovider/types.go index 0dea45be150f..d61cb6cc5b74 100644 --- a/pkg/cloudprovider/types.go +++ b/pkg/cloudprovider/types.go @@ -36,6 +36,8 @@ type Options struct { // StartAsync is a channel that is closed 
when leader election has been won. This is a signal to start any async // processing that should only occur while the cloud provider is the leader. StartAsync <-chan struct{} + // CleanupAsync is a channel that is closed when cleanup is initiated by a SIGINT signal sent to the container + CleanupAsync <-chan struct{} } // CloudProvider interface is implemented by cloud providers to support provisioning. diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index ba00f4a83b29..85a657455dab 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -94,7 +94,7 @@ type ControllerOptions struct { Provisioner *provisioning.Provisioner Recorder events.Recorder StartAsync <-chan struct{} - CleanupAsync <-chan os.Signal + CleanupAsync <-chan struct{} Clock clock.Clock } @@ -110,6 +110,18 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) cmw := informer.NewInformedWatcher(clientSet, system.Namespace()) ctx := injection.LoggingContextOrDie(component, controllerRuntimeConfig, cmw) ctx = newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx))() + ctx, cancel := context.WithCancel(ctx) + + // Setup the cleanup logic for teardown on SIGINT or SIGTERM + cleanup := make(chan struct{}) // This is a channel to broadcast to controllers cleanup can start + go func() { + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + <-sigs + logging.FromContext(context.Background()).Infof("Got a signal to react to") + close(cleanup) + cancel() + }() logging.FromContext(ctx).Infof("Initializing with version %s", project.Version) @@ -133,7 +145,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) if opts.EnableProfiling { utilruntime.Must(registerPprof(manager)) } - cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected()}) + cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected(), CleanupAsync: cleanup}) if hp, ok := cloudProvider.(HealthCheck); ok { utilruntime.Must(manager.AddHealthzCheck("cloud-provider", hp.LivenessProbe)) } @@ -167,7 +179,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) Provisioner: provisioner, Recorder: recorder, StartAsync: manager.Elected(), - CleanupAsync: Cleanup(), + CleanupAsync: cleanup, Clock: realClock, } injectControllers(ctx, controllerOptions) From 057c94aef1063d2d13e4632d200241f0b8a31f37 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 20 Sep 2022 23:41:49 -0700 Subject: [PATCH 19/55] Done channel for cleanup --- Makefile | 2 +- cmd/controller/main.go | 6 +++--- .../controllers/infrastructure/controller.go | 20 ++++++++++++++++++- pkg/cloudprovider/aws/controllers/register.go | 7 ++++--- pkg/cloudprovider/aws/eventbridge.go | 18 +++++++++++++++-- pkg/cloudprovider/aws/sqs.go | 2 ++ pkg/controllers/controllers.go | 7 +++++-- 7 files changed, 50 insertions(+), 12 deletions(-) diff --git a/Makefile b/Makefile index 6e3a7ff6ba22..7ce472225d24 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${K --set clusterName=${CLUSTER_NAME} \ --set clusterEndpoint=${CLUSTER_ENDPOINT} \ --set aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ - --create-namespace + 
--set terminationGracePeriodSeconds=300 --create-namespace TEST_FILTER ?= .* # CR for local builds of Karpenter diff --git a/cmd/controller/main.go b/cmd/controller/main.go index e3924431cbc9..7e8e7e2c9e4c 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -24,10 +24,10 @@ import ( ) func main() { - controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, func(context.Context, *controllers.ControllerOptions)) { + controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, controllers.ControllerInitFunc) { provider := aws.NewCloudProvider(ctx, options) - return provider, func(c context.Context, opts *controllers.ControllerOptions) { - awscontrollers.Register(c, provider, opts) + return provider, func(c context.Context, opts *controllers.ControllerOptions) <-chan struct{} { + return awscontrollers.Register(c, provider, opts) } }) } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 3b666ac58636..e9825ec2e50c 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -51,6 +51,8 @@ type Controller struct { mutex *sync.RWMutex readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state ready bool + + done chan struct{} } // pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned @@ -72,12 +74,14 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c eventBridgeProvider: eventBridgeProvider, mutex: &sync.RWMutex{}, readinessChan: make(chan struct{}), + done: make(chan struct{}), } go func() { <-cleanupAsync logging.FromContext(cleanupCtx).Infof("Cleanup was triggered for the Karpenter deployment") c.cleanup(cleanupCtx) + close(c.done) }() go func() { @@ -123,7 +127,7 @@ func (c *Controller) run(ctx context.Context) { } func (c *Controller) cleanup(ctx context.Context) { - logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure.cleanup")) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure")) dep := &appsv1.Deployment{} nn := types.NamespacedName{ @@ -132,6 +136,7 @@ func (c *Controller) cleanup(ctx context.Context) { } notFound := false + logging.FromContext(ctx).Infof("Getting the deployment from the api server") err := retry.Do(func() error { err := c.kubeClient.Get(ctx, nn, dep) if errors2.IsNotFound(err) { @@ -142,6 +147,7 @@ func (c *Controller) cleanup(ctx context.Context) { if err != nil { logging.FromContext(ctx).Errorf("Getting the deployment %s for cleanup, %v", nn, err) } + logging.FromContext(ctx).Infof("Successfully got the deployment from the api server") // Deployment is already deleted or currently deleting, so we should clean-up the infrastructure if notFound || !dep.DeletionTimestamp.IsZero() { @@ -160,6 +166,14 @@ func (c *Controller) Ready() <-chan struct{} { return c.readinessChan } +func (c *Controller) Done() <-chan struct{} { + return c.done +} + +func (c *Controller) deploymentWatcher() { + +} + func (c *Controller) setReady(ctx context.Context, ready bool) { c.mutex.Lock() defer c.mutex.Unlock() @@ -218,14 +232,18 @@ func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { wg.Add(2) go func() { defer wg.Done() + logging.FromContext(ctx).Infof("Started deleting the queue") e := 
c.sqsProvider.DeleteQueue(ctx) + logging.FromContext(ctx).Infof("Finished deleting the queue") m.Lock() err = multierr.Append(err, e) m.Unlock() }() go func() { defer wg.Done() + logging.FromContext(ctx).Infof("Started deleting the eventbridge rules") e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) + logging.FromContext(ctx).Infof("Finished deleting the eventbridge rules") m.Lock() err = multierr.Append(err, e) m.Unlock() diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 94588deb10a9..81ee7edd746e 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -26,12 +26,13 @@ import ( "github.com/aws/karpenter/pkg/controllers" ) -func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) { +func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) <-chan struct{} { rec := events.NewRecorder(opts.Recorder) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) - cleanupContext := logging.WithLogger(opts.BaseContext(), logging.FromContext(ctx).Named("aws")) + cleanupContext := logging.WithLogger(opts.BaseContext(), logging.FromContext(opts.BaseContext()).Named("aws")) - // Injecting the controllers that will start when opts.StartAsync is triggered + // Injecting the cloudprovider-specific controllers that will start when opts.StartAsync is triggered infraController := infrastructure.NewController(ctx, cleanupContext, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) notification.NewController(ctx, opts.KubeClient, opts.Clock, provider.SQSProvider(), rec, opts.Provisioner, opts.Cluster, opts.StartAsync, infraController.Ready) + return infraController.Done() // This is the only controller that has a done channel so just return it } diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index e19034e04a9d..f14ffb9d7ec1 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -25,6 +25,7 @@ import ( "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/samber/lo" "go.uber.org/multierr" + "knative.dev/pkg/logging" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/utils/injection" @@ -34,6 +35,7 @@ type EventBridgeClient interface { PutRuleWithContext(context.Context, *eventbridge.PutRuleInput, ...request.Option) (*eventbridge.PutRuleOutput, error) PutTargetsWithContext(context.Context, *eventbridge.PutTargetsInput, ...request.Option) (*eventbridge.PutTargetsOutput, error) DeleteRuleWithContext(context.Context, *eventbridge.DeleteRuleInput, ...request.Option) (*eventbridge.DeleteRuleOutput, error) + RemoveTargetsWithContext(context.Context, *eventbridge.RemoveTargetsInput, ...request.Option) (*eventbridge.RemoveTargetsOutput, error) } type EventBridgeProvider struct { @@ -115,10 +117,22 @@ func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) ( wg.Add(1) go func(r EventRule) { defer wg.Done() - input := &eventbridge.DeleteRuleInput{ + targetInput := &eventbridge.RemoveTargetsInput{ + Ids: []*string{aws.String(r.Target.ID)}, + Rule: aws.String(r.Name), + } + _, e := eb.RemoveTargetsWithContext(ctx, targetInput) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + if e != nil { + return + } + ruleInput := &eventbridge.DeleteRuleInput{ Name: aws.String(r.Name), 
} - _, e := eb.DeleteRuleWithContext(ctx, input) + _, e = eb.DeleteRuleWithContext(ctx, ruleInput) + logging.FromContext(ctx).Errorf("Might have got an error here, %v", e) m.Lock() err = multierr.Append(err, e) m.Unlock() diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index ac8c3d124487..ae9babada7a0 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -24,6 +24,7 @@ import ( "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/sqs" "github.com/samber/lo" + "knative.dev/pkg/logging" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/utils/functional" @@ -202,6 +203,7 @@ func (s *SQSProvider) DeleteQueue(ctx context.Context) error { } _, err = s.client.DeleteQueueWithContext(ctx, input) if err != nil { + logging.FromContext(ctx).Errorf("Might have got an error here in the queue, %v", err) return fmt.Errorf("failed deleting sqs queue, %w", err) } return nil diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 85a657455dab..a8e29cc37e2f 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -77,6 +77,8 @@ func init() { metrics.MustRegister() // Registers cross-controller metrics } +type ControllerInitFunc func(context.Context, *ControllerOptions) <-chan struct{} + // Controller is an interface implemented by Karpenter custom resources. type Controller interface { // Reconcile hands a hydrated kubernetes resource to the controller for @@ -98,7 +100,7 @@ type ControllerOptions struct { Clock clock.Clock } -func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, func(context.Context, *ControllerOptions))) { +func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, ControllerInitFunc)) { opts := options.New().MustParse() // Setup Client controllerRuntimeConfig := controllerruntime.GetConfigOrDie() @@ -182,7 +184,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) CleanupAsync: cleanup, Clock: realClock, } - injectControllers(ctx, controllerOptions) + done := injectControllers(ctx, controllerOptions) metricsstate.StartMetricScraper(ctx, cluster) @@ -200,6 +202,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) ).Start(ctx); err != nil { panic(fmt.Sprintf("Unable to start manager, %s", err)) } + <-done // Wait for controller cleanup to also be completed } // NewManagerOrDie instantiates a controller manager or panics From 5a6310c595024e82b65d050c33c63967f6fb98a8 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 21 Sep 2022 00:16:18 -0700 Subject: [PATCH 20/55] Deployment watcher with finalizer for cleanup --- Makefile | 2 +- charts/karpenter/templates/role.yaml | 7 +- cmd/controller/main.go | 6 +- pkg/cloudprovider/aws/cloudprovider.go | 2 +- .../aws/controllers/deployment/controller.go | 143 ++++++++++++++++++ .../aws/controllers/deployment/suite_test.go | 70 +++++++++ .../controllers/infrastructure/controller.go | 97 ++---------- .../controllers/notification/controller.go | 6 +- .../rebalancerecommendation/v0/parser.go | 1 - .../event/scheduledchange/v0/parser.go | 7 +- .../event/spotinterruption/v0/parser.go | 3 +- .../event/statechange/v0/parser.go | 7 +- pkg/cloudprovider/aws/controllers/register.go | 18 ++- pkg/cloudprovider/aws/eventbridge.go | 2 - pkg/cloudprovider/aws/sqs.go | 2 +- pkg/cloudprovider/types.go | 2 - 
pkg/controllers/controllers.go | 50 ++---- 17 files changed, 276 insertions(+), 149 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/deployment/controller.go create mode 100644 pkg/cloudprovider/aws/controllers/deployment/suite_test.go diff --git a/Makefile b/Makefile index 7ce472225d24..6e3a7ff6ba22 100644 --- a/Makefile +++ b/Makefile @@ -20,7 +20,7 @@ HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${K --set clusterName=${CLUSTER_NAME} \ --set clusterEndpoint=${CLUSTER_ENDPOINT} \ --set aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ - --set terminationGracePeriodSeconds=300 --create-namespace + --create-namespace TEST_FILTER ?= .* # CR for local builds of Karpenter diff --git a/charts/karpenter/templates/role.yaml b/charts/karpenter/templates/role.yaml index f4a3cdf3ca83..efd6c57a1577 100644 --- a/charts/karpenter/templates/role.yaml +++ b/charts/karpenter/templates/role.yaml @@ -28,6 +28,11 @@ rules: resourceNames: - karpenter-global-settings - config-logging + - apiGroups: ["apps"] + resources: ["deployments"] + verbs: ["get", "list", "watch", "patch", "update"] + resourceNames: + - {{ include "karpenter.fullname" . }} - apiGroups: ["coordination.k8s.io"] resources: ["leases"] verbs: ["patch", "update"] @@ -44,4 +49,4 @@ rules: verbs: ["create"] - apiGroups: [""] resources: ["configmaps"] - verbs: ["create"] + verbs: ["create"] \ No newline at end of file diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 7e8e7e2c9e4c..1e5ed4af0593 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -17,6 +17,8 @@ package main import ( "context" + "sigs.k8s.io/controller-runtime/pkg/manager" + "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws" awscontrollers "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" @@ -26,8 +28,8 @@ import ( func main() { controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, controllers.ControllerInitFunc) { provider := aws.NewCloudProvider(ctx, options) - return provider, func(c context.Context, opts *controllers.ControllerOptions) <-chan struct{} { - return awscontrollers.Register(c, provider, opts) + return provider, func(c context.Context, manager manager.Manager, opts *controllers.ControllerOptions) { + awscontrollers.Register(c, provider, manager, opts) } }) } diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 028846e71b14..6444aad3c990 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -113,7 +113,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) m := NewMetadata(*sess.Config.Region, metadataProvider.AccountID(ctx)) - sqsProvider := NewProvider(ctx, sqs.New(sess), m) + sqsProvider := NewSQSProvider(ctx, sqs.New(sess), m) eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), m, sqsProvider.QueueName()) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, diff --git a/pkg/cloudprovider/aws/controllers/deployment/controller.go b/pkg/cloudprovider/aws/controllers/deployment/controller.go new file mode 100644 index 000000000000..3ad1434a472d --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/deployment/controller.go @@ -0,0 +1,143 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); 
+you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package deployment + +import ( + "context" + "sync" + + "go.uber.org/multierr" + appsv1 "k8s.io/api/apps/v1" + "k8s.io/apimachinery/pkg/api/errors" + "knative.dev/pkg/logging" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/utils/injection" +) + +const controllerName = "deployment" + +// Controller is responsible for watching the Karpenter deployment +// It is responsible for patching the termination finalizer on when the leader pod comes up +// and reacting to the deletion of the deployment so that we can perform some cleanup actions +type Controller struct { + kubeClient client.Client + cancel context.CancelFunc + + sqsProvider *aws.SQSProvider + eventBridgeProvider *aws.EventBridgeProvider +} + +func NewController(kubeClient client.Client, cancel context.CancelFunc, + sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider) *Controller { + return &Controller{ + kubeClient: kubeClient, + cancel: cancel, + sqsProvider: sqsProvider, + eventBridgeProvider: eventBridgeProvider, + } +} + +func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(controllerName)) + + deployment := &appsv1.Deployment{} + if err := c.kubeClient.Get(ctx, req.NamespacedName, deployment); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + return reconcile.Result{}, err + } + // If the deletion timestamp is set, that means the deployment is attempting to be deleted + // and we should perform the cleanup actions associated with the Karpenter deployment + if !deployment.DeletionTimestamp.IsZero() { + if err := c.deleteInfrastructure(ctx); err != nil { + return reconcile.Result{}, err + } + patch := client.MergeFrom(deployment.DeepCopy()) + controllerutil.RemoveFinalizer(deployment, v1alpha5.TerminationFinalizer) + if err := c.kubeClient.Patch(ctx, deployment, patch); err != nil { + return reconcile.Result{}, err + } + c.cancel() // Call cancel to stop the other controllers relying on the infrastructure + return reconcile.Result{}, nil + } + // Otherwise, this is a create/update, so we should just ensure that the finalizer exists + if !controllerutil.ContainsFinalizer(deployment, v1alpha5.TerminationFinalizer) { + patch := client.MergeFrom(deployment.DeepCopy()) + controllerutil.AddFinalizer(deployment, v1alpha5.TerminationFinalizer) + if err := c.kubeClient.Patch(ctx, deployment, patch); err != nil { + return reconcile.Result{}, err + } + } + return reconcile.Result{}, nil +} + +// Register the controller to the manager +func (c *Controller) Register(ctx context.Context, m manager.Manager) error { + 
return controllerruntime. + NewControllerManagedBy(m). + Named(controllerName). + For(&appsv1.Deployment{}). + WithEventFilter(predicate.NewPredicateFuncs(func(object client.Object) bool { + // This function ensures that we are filtering out every event that isn't related to the + // karpenter controller deployment + if object.GetNamespace() != injection.GetOptions(ctx).DeploymentNamespace { + return false + } + if object.GetName() != injection.GetOptions(ctx).DeploymentName { + return false + } + return true + })). + Complete(c) +} + +// Delete infrastructure removes the infrastructure that was stood up and reconciled +// by the infrastructure controller for SQS message polling +func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { + logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") + wg := &sync.WaitGroup{} + m := &sync.Mutex{} + + wg.Add(2) + go func() { + defer wg.Done() + e := c.sqsProvider.DeleteQueue(ctx) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }() + go func() { + defer wg.Done() + e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + }() + wg.Wait() + if err != nil { + return err + } + logging.FromContext(ctx).Infof("Successfully deprovisioned the infrastructure") + return nil +} diff --git a/pkg/cloudprovider/aws/controllers/deployment/suite_test.go b/pkg/cloudprovider/aws/controllers/deployment/suite_test.go new file mode 100644 index 000000000000..9540978524a6 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/deployment/suite_test.go @@ -0,0 +1,70 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package deployment_test + +import ( + "context" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "k8s.io/client-go/kubernetes" + clock "k8s.io/utils/clock/testing" + . 
"knative.dev/pkg/logging/testing" + + "github.com/aws/karpenter/pkg/cloudprovider/fake" + "github.com/aws/karpenter/pkg/controllers/provisioning" + "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/test" +) + +var ctx context.Context +var env *test.Environment +var cluster *state.Cluster +var provisioner *provisioning.Provisioner +var cloudProvider *fake.CloudProvider +var clientSet *kubernetes.Clientset +var recorder *test.EventRecorder +var fakeClock *clock.FakeClock +var cfg *test.Config + +func TestAPIs(t *testing.T) { + ctx = TestContextWithLogger(t) + RegisterFailHandler(Fail) + RunSpecs(t, "AWS Infrastructure") +} + +var _ = BeforeSuite(func() { + env = test.NewEnvironment(ctx, func(e *test.Environment) { + cloudProvider = &fake.CloudProvider{} + cfg = test.NewConfig() + fakeClock = clock.NewFakeClock(time.Now()) + cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) + clientSet = kubernetes.NewForConfigOrDie(e.Config) + recorder = test.NewEventRecorder() + provisioner = provisioning.NewProvisioner(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + }) + Expect(env.Start()).To(Succeed(), "Failed to start environment") +}) + +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = BeforeEach(func() { +}) +var _ = AfterEach(func() { +}) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index e9825ec2e50c..23a55241edb7 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -21,13 +21,9 @@ import ( "sync" "time" - "github.com/avast/retry-go" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/sqs" "go.uber.org/multierr" - appsv1 "k8s.io/api/apps/v1" - errors2 "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/types" "k8s.io/utils/clock" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" @@ -35,7 +31,6 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/metrics" - "github.com/aws/karpenter/pkg/utils/injection" ) // Controller is the AWS infrastructure controller. 
It is not a standard controller-runtime controller in that it doesn't @@ -51,8 +46,6 @@ type Controller struct { mutex *sync.RWMutex readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state ready bool - - done chan struct{} } // pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned @@ -63,9 +56,9 @@ const pollingPeriod = time.Hour // is provisioned if there is an error in the reconciliation loop const defaultBackoffPeriod = time.Minute * 10 -func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient client.Client, clk clock.Clock, +func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, - startAsync <-chan struct{}, cleanupAsync <-chan struct{}) *Controller { + startAsync <-chan struct{}) *Controller { c := &Controller{ kubeClient: kubeClient, recorder: recorder, @@ -74,16 +67,8 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c eventBridgeProvider: eventBridgeProvider, mutex: &sync.RWMutex{}, readinessChan: make(chan struct{}), - done: make(chan struct{}), } - go func() { - <-cleanupAsync - logging.FromContext(cleanupCtx).Infof("Cleanup was triggered for the Karpenter deployment") - c.cleanup(cleanupCtx) - close(c.done) - }() - go func() { select { case <-ctx.Done(): @@ -92,7 +77,6 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c c.run(ctx) } }() - return c } @@ -126,54 +110,15 @@ func (c *Controller) run(ctx context.Context) { } } -func (c *Controller) cleanup(ctx context.Context) { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure")) - - dep := &appsv1.Deployment{} - nn := types.NamespacedName{ - Name: injection.GetOptions(ctx).DeploymentName, - Namespace: injection.GetOptions(ctx).DeploymentNamespace, - } - - notFound := false - logging.FromContext(ctx).Infof("Getting the deployment from the api server") - err := retry.Do(func() error { - err := c.kubeClient.Get(ctx, nn, dep) - if errors2.IsNotFound(err) { - notFound = true - } - return client.IgnoreNotFound(err) - }) - if err != nil { - logging.FromContext(ctx).Errorf("Getting the deployment %s for cleanup, %v", nn, err) - } - logging.FromContext(ctx).Infof("Successfully got the deployment from the api server") - - // Deployment is already deleted or currently deleting, so we should clean-up the infrastructure - if notFound || !dep.DeletionTimestamp.IsZero() { - err = c.deleteInfrastructure(ctx) - if err != nil { - logging.FromContext(ctx).Errorf("Deprovisioning the infrastructure, %v", err) - return - } - logging.FromContext(ctx).Infof("Successfully deprovisioned the infrastructure, %v", err) - } -} - +// Ready returns a channel that serves as a gate for other controllers +// to wait on the infrastructure to be in a good state. 
When the infrastructure is ready, +// this channel is closed so other controllers can proceed with their operations func (c *Controller) Ready() <-chan struct{} { c.mutex.RLock() defer c.mutex.RUnlock() return c.readinessChan } -func (c *Controller) Done() <-chan struct{} { - return c.done -} - -func (c *Controller) deploymentWatcher() { - -} - func (c *Controller) setReady(ctx context.Context, ready bool) { c.mutex.Lock() defer c.mutex.Unlock() @@ -199,6 +144,8 @@ func (c *Controller) setReady(ctx context.Context, ready bool) { c.ready = ready } +// ensureInfrastructure reconciles the SQS queue and the EventBridge rules with the expected +// configuration prescribed by Karpenter func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { defer metrics.Measure(reconcileDuration)() @@ -224,34 +171,7 @@ func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { return err } -func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { - logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") - wg := &sync.WaitGroup{} - m := &sync.Mutex{} - - wg.Add(2) - go func() { - defer wg.Done() - logging.FromContext(ctx).Infof("Started deleting the queue") - e := c.sqsProvider.DeleteQueue(ctx) - logging.FromContext(ctx).Infof("Finished deleting the queue") - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - }() - go func() { - defer wg.Done() - logging.FromContext(ctx).Infof("Started deleting the eventbridge rules") - e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) - logging.FromContext(ctx).Infof("Finished deleting the eventbridge rules") - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - }() - wg.Wait() - return err -} - +// ensureQueue reconciles the SQS queue with the configuration prescribed by Karpenter func (c *Controller) ensureQueue(ctx context.Context) error { // Attempt to find the queue. 
If we can't find it, assume it isn't created and try to create it // If we did find it, then just set the queue attributes on the existing queue @@ -280,6 +200,7 @@ func (c *Controller) ensureQueue(ctx context.Context) error { return nil } +// ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter func (c *Controller) ensureEventBridge(ctx context.Context) error { if err := c.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { var awsErr awserr.Error diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 70a0087fa9a0..c6d2444d0dd8 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -129,6 +129,8 @@ func (c *Controller) pollSQS(ctx context.Context) error { return nil } +// handleMessage gets the node names of the instances involved in the queue message and takes the +// assigned action on the instances based on the message event func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg *sqsapi.Message) (err error) { // No message to parse in this case if msg == nil || msg.Body == nil { @@ -153,13 +155,14 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string for i := range nodes { node := nodes[i] + nodeCtx := logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) // Record metric and event for this action c.notifyForEvent(evt, node) actionsTaken.WithLabelValues(action).Inc() if action != Actions.NoAction { - e := c.deleteInstance(ctx, node) + e := c.deleteInstance(nodeCtx, node) err = multierr.Append(err, e) } } @@ -177,7 +180,6 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string } func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) c.recorder.TerminatingNodeOnNotification(node) if err := c.kubeClient.Delete(ctx, node); err != nil { return fmt.Errorf("deleting the spot interrupted node, %w", err) diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go index 1b96574713bf..0cade3105fbb 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go @@ -45,6 +45,5 @@ func (Parser) Parse(ctx context.Context, str string) event.Interface { if evt.Source != source || evt.DetailType != detailType || evt.Version != version { return nil } - return evt } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go index a08164cdf122..414c0a4cbe47 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go @@ -34,7 +34,7 @@ const ( type Parser struct{} func (Parser) Parse(ctx context.Context, str string) event.Interface { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("scheduledChange.v1")) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("scheduledChange.v0")) evt := AWSHealthEvent{} if err := 
json.Unmarshal([]byte(str), &evt); err != nil { @@ -52,7 +52,7 @@ func (Parser) Parse(ctx context.Context, str string) event.Interface { logging.FromContext(ctx). With("eventDetails", evt). With("acceptedService", acceptedService). - Warn("ignoring AWS health event") + Debug("ignoring AWS health event") return nil } @@ -60,9 +60,8 @@ func (Parser) Parse(ctx context.Context, str string) event.Interface { logging.FromContext(ctx). With("eventDetails", evt). With("acceptedEventTypeCategory", acceptedEventTypeCategory). - Warn("ignoring AWS health event") + Debug("ignoring AWS health event") return nil } - return evt } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go index 3bcc1fc523d3..ea283ab52c87 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go @@ -32,7 +32,7 @@ const ( type Parser struct{} func (Parser) Parse(ctx context.Context, str string) event.Interface { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("spotInterruption.v1")) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("spotInterruption.v0")) evt := EC2SpotInstanceInterruptionWarning{} if err := json.Unmarshal([]byte(str), &evt); err != nil { @@ -45,6 +45,5 @@ func (Parser) Parse(ctx context.Context, str string) event.Interface { if evt.Source != source || evt.DetailType != detailType || evt.Version != version { return nil } - return evt } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go index 113c86e97079..09d97658f049 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go @@ -36,7 +36,7 @@ const ( type Parser struct{} func (Parser) Parse(ctx context.Context, str string) event.Interface { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("stateChange.v1")) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("stateChange.v0")) evt := EC2InstanceStateChangeNotification{} if err := json.Unmarshal([]byte(str), &evt); err != nil { @@ -52,8 +52,11 @@ func (Parser) Parse(ctx context.Context, str string) event.Interface { // Do not log the information on instance state change if it isn't in accepted states if !strings.Contains(acceptedStates, strings.ToLower(evt.Detail.State)) { + logging.FromContext(ctx). + With("eventDetails", evt). + With("acceptedStates", acceptedStates). 
+ Debug("ignoring AWS state change event") return nil } - return evt } diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 81ee7edd746e..98d3f628d4de 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -18,21 +18,29 @@ import ( "context" "knative.dev/pkg/logging" + "sigs.k8s.io/controller-runtime/pkg/manager" "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/deployment" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/controllers" ) -func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) <-chan struct{} { +func Register(ctx context.Context, provider *aws.CloudProvider, manager manager.Manager, opts *controllers.ControllerOptions) { rec := events.NewRecorder(opts.Recorder) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) - cleanupContext := logging.WithLogger(opts.BaseContext(), logging.FromContext(opts.BaseContext()).Named("aws")) // Injecting the cloudprovider-specific controllers that will start when opts.StartAsync is triggered - infraController := infrastructure.NewController(ctx, cleanupContext, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) - notification.NewController(ctx, opts.KubeClient, opts.Clock, provider.SQSProvider(), rec, opts.Provisioner, opts.Cluster, opts.StartAsync, infraController.Ready) - return infraController.Done() // This is the only controller that has a done channel so just return it + // All these controllers should run with the same context since they rely on each other + infraCtx, cancel := context.WithCancel(ctx) + deploymentController := deployment.NewController(opts.KubeClient, cancel, provider.SQSProvider(), provider.EventBridgeProvider()) + infraController := infrastructure.NewController(infraCtx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) + notification.NewController(infraCtx, opts.KubeClient, opts.Clock, provider.SQSProvider(), rec, opts.Provisioner, opts.Cluster, opts.StartAsync, infraController.Ready) + + // Register the controller-runtime controller with the global manager + if err := deploymentController.Register(infraCtx, manager); err != nil { + panic(err) + } } diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index f14ffb9d7ec1..10cf75cf6346 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -25,7 +25,6 @@ import ( "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/samber/lo" "go.uber.org/multierr" - "knative.dev/pkg/logging" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/utils/injection" @@ -132,7 +131,6 @@ func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) ( Name: aws.String(r.Name), } _, e = eb.DeleteRuleWithContext(ctx, ruleInput) - logging.FromContext(ctx).Errorf("Might have got an error here, %v", e) m.Lock() err = multierr.Append(err, e) m.Unlock() diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index ae9babada7a0..701f9b0e7a17 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ 
b/pkg/cloudprovider/aws/sqs.go @@ -69,7 +69,7 @@ type SQSProvider struct { metadata *Metadata } -func NewProvider(ctx context.Context, client SQSClient, metadata *Metadata) *SQSProvider { +func NewSQSProvider(ctx context.Context, client SQSClient, metadata *Metadata) *SQSProvider { provider := &SQSProvider{ client: client, mutex: &sync.RWMutex{}, diff --git a/pkg/cloudprovider/types.go b/pkg/cloudprovider/types.go index d61cb6cc5b74..0dea45be150f 100644 --- a/pkg/cloudprovider/types.go +++ b/pkg/cloudprovider/types.go @@ -36,8 +36,6 @@ type Options struct { // StartAsync is a channel that is closed when leader election has been won. This is a signal to start any async // processing that should only occur while the cloud provider is the leader. StartAsync <-chan struct{} - // CleanupAsync is a channel that is closed when cleanup is initiated by a SIGINT signal sent to the container - CleanupAsync <-chan struct{} } // CloudProvider interface is implemented by cloud providers to support provisioning. diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index a8e29cc37e2f..861939aece0e 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -19,10 +19,7 @@ import ( "fmt" "net/http" "net/http/pprof" - "os" - "os/signal" "runtime/debug" - "syscall" "github.com/go-logr/logr" "github.com/go-logr/zapr" @@ -77,7 +74,7 @@ func init() { metrics.MustRegister() // Registers cross-controller metrics } -type ControllerInitFunc func(context.Context, *ControllerOptions) <-chan struct{} +type ControllerInitFunc func(context.Context, manager.Manager, *ControllerOptions) // Controller is an interface implemented by Karpenter custom resources. type Controller interface { @@ -90,14 +87,12 @@ type Controller interface { } type ControllerOptions struct { - BaseContext func() context.Context - Cluster *state.Cluster - KubeClient client.Client - Provisioner *provisioning.Provisioner - Recorder events.Recorder - StartAsync <-chan struct{} - CleanupAsync <-chan struct{} - Clock clock.Clock + Cluster *state.Cluster + KubeClient client.Client + Provisioner *provisioning.Provisioner + Recorder events.Recorder + StartAsync <-chan struct{} + Clock clock.Clock } func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, ControllerInitFunc)) { @@ -112,18 +107,6 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) cmw := informer.NewInformedWatcher(clientSet, system.Namespace()) ctx := injection.LoggingContextOrDie(component, controllerRuntimeConfig, cmw) ctx = newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx))() - ctx, cancel := context.WithCancel(ctx) - - // Setup the cleanup logic for teardown on SIGINT or SIGTERM - cleanup := make(chan struct{}) // This is a channel to broadcast to controllers cleanup can start - go func() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - <-sigs - logging.FromContext(context.Background()).Infof("Got a signal to react to") - close(cleanup) - cancel() - }() logging.FromContext(ctx).Infof("Initializing with version %s", project.Version) @@ -147,7 +130,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) if opts.EnableProfiling { utilruntime.Must(registerPprof(manager)) } - cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected(), CleanupAsync: 
cleanup}) + cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected()}) if hp, ok := cloudProvider.(HealthCheck); ok { utilruntime.Must(manager.AddHealthzCheck("cloud-provider", hp.LivenessProbe)) } @@ -175,16 +158,14 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) // Inject cloudprovider-specific controllers into the controller-set using the injectControllers function // Inject the base cloud provider into the injection function rather than the decorated interface controllerOptions := &ControllerOptions{ - BaseContext: newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx)), - Cluster: cluster, - KubeClient: manager.GetClient(), - Provisioner: provisioner, - Recorder: recorder, - StartAsync: manager.Elected(), - CleanupAsync: cleanup, - Clock: realClock, + Cluster: cluster, + KubeClient: manager.GetClient(), + Provisioner: provisioner, + Recorder: recorder, + StartAsync: manager.Elected(), + Clock: realClock, } - done := injectControllers(ctx, controllerOptions) + injectControllers(ctx, manager, controllerOptions) metricsstate.StartMetricScraper(ctx, cluster) @@ -202,7 +183,6 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) ).Start(ctx); err != nil { panic(fmt.Sprintf("Unable to start manager, %s", err)) } - <-done // Wait for controller cleanup to also be completed } // NewManagerOrDie instantiates a controller manager or panics From c53921a23d457dc5a4caad799841e6992daafd42 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 21 Sep 2022 12:08:39 -0700 Subject: [PATCH 21/55] Add setup for suite_tests for controllers --- .../aws/controllers/deployment/controller.go | 37 +++-- .../aws/controllers/deployment/suite_test.go | 44 +++--- .../controllers/infrastructure/controller.go | 20 +-- .../controllers/infrastructure/suite_test.go | 40 ++--- .../controllers/notification/controller.go | 37 +++-- .../controllers/notification/suite_test.go | 29 ++-- pkg/cloudprovider/aws/controllers/register.go | 4 +- pkg/cloudprovider/aws/errors.go | 28 +++- pkg/cloudprovider/aws/eventbridge.go | 44 +++--- pkg/cloudprovider/aws/events/recorder.go | 34 +++++ pkg/cloudprovider/aws/fake/eventbridgeapi.go | 101 +++++++++++++ pkg/cloudprovider/aws/fake/eventrecorder.go | 32 ++++ pkg/cloudprovider/aws/fake/sqsapi.go | 139 ++++++++++++++++++ pkg/cloudprovider/aws/instance.go | 6 +- pkg/cloudprovider/aws/launchtemplate.go | 2 +- pkg/cloudprovider/aws/sqs.go | 22 +-- pkg/controllers/controllers.go | 23 ++- .../cloudformation.yaml | 3 - 18 files changed, 489 insertions(+), 156 deletions(-) create mode 100644 pkg/cloudprovider/aws/fake/eventbridgeapi.go create mode 100644 pkg/cloudprovider/aws/fake/eventrecorder.go create mode 100644 pkg/cloudprovider/aws/fake/sqsapi.go diff --git a/pkg/cloudprovider/aws/controllers/deployment/controller.go b/pkg/cloudprovider/aws/controllers/deployment/controller.go index 3ad1434a472d..14759cb44941 100644 --- a/pkg/cloudprovider/aws/controllers/deployment/controller.go +++ b/pkg/cloudprovider/aws/controllers/deployment/controller.go @@ -20,7 +20,7 @@ import ( "go.uber.org/multierr" appsv1 "k8s.io/api/apps/v1" - "k8s.io/apimachinery/pkg/api/errors" + apierrors "k8s.io/apimachinery/pkg/api/errors" "knative.dev/pkg/logging" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -31,6 +31,7 @@ import ( 
"github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/utils/injection" ) @@ -42,16 +43,18 @@ const controllerName = "deployment" type Controller struct { kubeClient client.Client cancel context.CancelFunc + recorder events.Recorder sqsProvider *aws.SQSProvider eventBridgeProvider *aws.EventBridgeProvider } -func NewController(kubeClient client.Client, cancel context.CancelFunc, +func NewController(kubeClient client.Client, cancel context.CancelFunc, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider) *Controller { return &Controller{ kubeClient: kubeClient, cancel: cancel, + recorder: recorder, sqsProvider: sqsProvider, eventBridgeProvider: eventBridgeProvider, } @@ -62,7 +65,7 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco deployment := &appsv1.Deployment{} if err := c.kubeClient.Get(ctx, req.NamespacedName, deployment); err != nil { - if errors.IsNotFound(err) { + if apierrors.IsNotFound(err) { return reconcile.Result{}, nil } return reconcile.Result{}, err @@ -71,8 +74,10 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco // and we should perform the cleanup actions associated with the Karpenter deployment if !deployment.DeletionTimestamp.IsZero() { if err := c.deleteInfrastructure(ctx); err != nil { + c.recorder.InfrastructureDeletionFailed(ctx, c.kubeClient) return reconcile.Result{}, err } + c.recorder.InfrastructureDeletionSucceeded(ctx, c.kubeClient) patch := client.MergeFrom(deployment.DeepCopy()) controllerutil.RemoveFinalizer(deployment, v1alpha5.TerminationFinalizer) if err := c.kubeClient.Patch(ctx, deployment, patch); err != nil { @@ -123,21 +128,33 @@ func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { go func() { defer wg.Done() e := c.sqsProvider.DeleteQueue(ctx) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() + + // If we get access denied, nothing we can do so just log and don't return the error + if aws.IsAccessDenied(e) { + logging.FromContext(ctx).Errorf("Access denied while trying to delete SQS queue, %v", err) + } else if err != nil { + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + } }() go func() { defer wg.Done() e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() + + // If we get access denied, nothing we can do so just log and don't return the error + if aws.IsAccessDenied(e) { + logging.FromContext(ctx).Errorf("Access denied while trying to delete notification rules, %v", err) + } else if err != nil { + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + } }() wg.Wait() if err != nil { return err } - logging.FromContext(ctx).Infof("Successfully deprovisioned the infrastructure") + logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") return nil } diff --git a/pkg/cloudprovider/aws/controllers/deployment/suite_test.go b/pkg/cloudprovider/aws/controllers/deployment/suite_test.go index 9540978524a6..6a054014bd9d 100644 --- a/pkg/cloudprovider/aws/controllers/deployment/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/deployment/suite_test.go @@ -17,45 +17,43 @@ package deployment_test import ( "context" "testing" - "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "k8s.io/client-go/kubernetes" - clock "k8s.io/utils/clock/testing" . 
"knative.dev/pkg/logging/testing" - "github.com/aws/karpenter/pkg/cloudprovider/fake" - "github.com/aws/karpenter/pkg/controllers/provisioning" - "github.com/aws/karpenter/pkg/controllers/state" + . "github.com/aws/karpenter/pkg/test/expectations" + + "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/deployment" + awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" "github.com/aws/karpenter/pkg/test" ) var ctx context.Context var env *test.Environment -var cluster *state.Cluster -var provisioner *provisioning.Provisioner -var cloudProvider *fake.CloudProvider -var clientSet *kubernetes.Clientset -var recorder *test.EventRecorder -var fakeClock *clock.FakeClock -var cfg *test.Config +var sqsapi *awsfake.SQSAPI +var sqsProvider *aws.SQSProvider +var eventbridgeapi *awsfake.EventBridgeAPI +var eventBridgeProvider *aws.EventBridgeProvider +var recorder *awsfake.EventRecorder +var controller *deployment.Controller func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) - RunSpecs(t, "AWS Infrastructure") + RunSpecs(t, "AWS Karpenter Deployment") } var _ = BeforeSuite(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { - cloudProvider = &fake.CloudProvider{} - cfg = test.NewConfig() - fakeClock = clock.NewFakeClock(time.Now()) - cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) - clientSet = kubernetes.NewForConfigOrDie(e.Config) - recorder = test.NewEventRecorder() - provisioner = provisioning.NewProvisioner(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + recorder = awsfake.NewEventRecorder() + metadata := aws.NewMetadata("us-east-1", "000000000000") + + sqsapi = &awsfake.SQSAPI{} + eventbridgeapi = &awsfake.EventBridgeAPI{} + sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadata) + eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -65,6 +63,10 @@ var _ = AfterSuite(func() { }) var _ = BeforeEach(func() { + sqsapi.Reset() + eventbridgeapi.Reset() + controller = deployment.NewController(env.Client, nil, recorder, sqsProvider, eventBridgeProvider) }) var _ = AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) }) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 23a55241edb7..b742f045d44a 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -176,19 +176,14 @@ func (c *Controller) ensureQueue(ctx context.Context) error { // Attempt to find the queue. 
If we can't find it, assume it isn't created and try to create it // If we did find it, then just set the queue attributes on the existing queue if _, err := c.sqsProvider.DiscoverQueueURL(ctx, true); err != nil { - var awsErr awserr.Error - if !errors.As(err, &awsErr) { - // This shouldn't happen, but if it does, we should capture it - return fmt.Errorf("failed conversion to AWS error, %w", err) - } - switch awsErr.Code() { - case sqs.ErrCodeQueueDoesNotExist: + switch { + case aws.IsNotFound(err): logging.FromContext(ctx).Infof("Creating the SQS queue for EC2 notifications...") if err := c.sqsProvider.CreateQueue(ctx); err != nil { return fmt.Errorf("creating sqs queue with policy, %w", err) } return nil - case aws.AccessDeniedCode: + case aws.IsAccessDenied(err): return fmt.Errorf("failed obtaining permission to discover sqs queue url, %w", err) default: return fmt.Errorf("failed discovering sqs queue url, %w", err) @@ -203,13 +198,8 @@ func (c *Controller) ensureQueue(ctx context.Context) error { // ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter func (c *Controller) ensureEventBridge(ctx context.Context) error { if err := c.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { - var awsErr awserr.Error - if !errors.As(err, &awsErr) { - // This shouldn't happen, but if it does, we should capture it - return fmt.Errorf("failed conversion to AWS error, %w", err) - } - switch awsErr.Code() { - case aws.AccessDeniedException: + switch { + case aws.IsAccessDenied(err): return fmt.Errorf("obtaining permission to eventbridge, %w", err) default: return fmt.Errorf("creating event bridge notification rules, %w", err) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index 62ac5e9d30a6..bea5ffa328f5 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -21,41 +21,43 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "k8s.io/client-go/kubernetes" clock "k8s.io/utils/clock/testing" . "knative.dev/pkg/logging/testing" - "github.com/aws/karpenter/pkg/cloudprovider/fake" - "github.com/aws/karpenter/pkg/controllers/provisioning" - "github.com/aws/karpenter/pkg/controllers/state" + . 
"github.com/aws/karpenter/pkg/test/expectations" + + "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" + awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" "github.com/aws/karpenter/pkg/test" ) var ctx context.Context var env *test.Environment -var cluster *state.Cluster -var provisioner *provisioning.Provisioner -var cloudProvider *fake.CloudProvider -var clientSet *kubernetes.Clientset -var recorder *test.EventRecorder +var sqsapi *awsfake.SQSAPI +var sqsProvider *aws.SQSProvider +var eventbridgeapi *awsfake.EventBridgeAPI +var eventBridgeProvider *aws.EventBridgeProvider +var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock -var cfg *test.Config +var controller *infrastructure.Controller func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) - RunSpecs(t, "AWS Infrastructure") + RunSpecs(t, "AWS Notification") } var _ = BeforeSuite(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { - cloudProvider = &fake.CloudProvider{} - cfg = test.NewConfig() fakeClock = clock.NewFakeClock(time.Now()) - cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) - clientSet = kubernetes.NewForConfigOrDie(e.Config) - recorder = test.NewEventRecorder() - provisioner = provisioning.NewProvisioner(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + recorder = awsfake.NewEventRecorder() + metadata := aws.NewMetadata("us-east-1", "000000000000") + + sqsapi = &awsfake.SQSAPI{} + eventbridgeapi = &awsfake.EventBridgeAPI{} + sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadata) + eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -65,6 +67,10 @@ var _ = AfterSuite(func() { }) var _ = BeforeEach(func() { + sqsapi.Reset() + eventbridgeapi.Reset() + controller = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, nil) }) var _ = AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) }) diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index c6d2444d0dd8..9ce17be8662d 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -34,7 +34,6 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" - "github.com/aws/karpenter/pkg/controllers/provisioning" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/metrics" ) @@ -52,13 +51,12 @@ var Actions = struct { // Controller is the notification controller. It is not a standard controller-runtime controller in that it doesn't // have a reconcile method. 
type Controller struct { - kubeClient client.Client - provisioner *provisioning.Provisioner - cluster *state.Cluster - recorder events.Recorder - clock clock.Clock - provider *aws.SQSProvider - parser event.Parser + kubeClient client.Client + cluster *state.Cluster + recorder events.Recorder + clock clock.Clock + provider *aws.SQSProvider + parser event.Parser infraReady func() <-chan struct{} } @@ -66,18 +64,17 @@ type Controller struct { // pollingPeriod that we go to the SQS queue to check if there are any new events const pollingPeriod = 2 * time.Second -func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, sqsProvider *aws.SQSProvider, - recorder events.Recorder, provisioner *provisioning.Provisioner, cluster *state.Cluster, +func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, + recorder events.Recorder, cluster *state.Cluster, sqsProvider *aws.SQSProvider, startAsync <-chan struct{}, infraReady func() <-chan struct{}) *Controller { c := &Controller{ - kubeClient: kubeClient, - provisioner: provisioner, - cluster: cluster, - recorder: recorder, - clock: clk, - provider: sqsProvider, - parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), - infraReady: infraReady, + kubeClient: kubeClient, + cluster: cluster, + recorder: recorder, + clock: clk, + provider: sqsProvider, + parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + infraReady: infraReady, } go func() { @@ -198,8 +195,8 @@ func (c *Controller) notifyForEvent(evt event.Interface, n *v1.Node) { case event.Kinds.SpotInterruption: c.recorder.EC2SpotInterruptionWarning(n) - // For now, we won't do anything with the state change action case event.Kinds.StateChange: + c.recorder.EC2StateChange(n) default: } } @@ -216,7 +213,7 @@ func actionForEvent(evt event.Interface) Action { return Actions.CordonAndDrain case event.Kinds.StateChange: - return Actions.NoAction + return Actions.CordonAndDrain default: return Actions.NoAction diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index 9ecbd71aec0d..f73ed4712a51 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -21,12 +21,15 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - "k8s.io/client-go/kubernetes" clock "k8s.io/utils/clock/testing" . "knative.dev/pkg/logging/testing" + . 
"github.com/aws/karpenter/pkg/test/expectations" + + "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" + awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" "github.com/aws/karpenter/pkg/cloudprovider/fake" - "github.com/aws/karpenter/pkg/controllers/provisioning" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/test" ) @@ -34,12 +37,14 @@ import ( var ctx context.Context var env *test.Environment var cluster *state.Cluster -var provisioner *provisioning.Provisioner +var sqsapi *awsfake.SQSAPI var cloudProvider *fake.CloudProvider -var clientSet *kubernetes.Clientset -var recorder *test.EventRecorder +var sqsProvider *aws.SQSProvider +var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock var cfg *test.Config +var controller *notification.Controller +var ready func() <-chan struct{} func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -49,13 +54,15 @@ func TestAPIs(t *testing.T) { var _ = BeforeSuite(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { - cloudProvider = &fake.CloudProvider{} cfg = test.NewConfig() fakeClock = clock.NewFakeClock(time.Now()) + cloudProvider = &fake.CloudProvider{} cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) - clientSet = kubernetes.NewForConfigOrDie(e.Config) - recorder = test.NewEventRecorder() - provisioner = provisioning.NewProvisioner(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) + recorder = awsfake.NewEventRecorder() + metadata := aws.NewMetadata("us-east-1", "000000000000") + + sqsapi = &awsfake.SQSAPI{} + sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadata) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -65,6 +72,10 @@ var _ = AfterSuite(func() { }) var _ = BeforeEach(func() { + sqsapi.Reset() + ready = func() <-chan struct{} { return make(chan struct{}) } + controller = notification.NewController(env.Ctx, env.Client, fakeClock, recorder, cluster, sqsProvider, nil, ready) }) var _ = AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) }) diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 98d3f628d4de..875a42f268b2 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -35,9 +35,9 @@ func Register(ctx context.Context, provider *aws.CloudProvider, manager manager. 
// Injecting the cloudprovider-specific controllers that will start when opts.StartAsync is triggered // All these controllers should run with the same context since they rely on each other infraCtx, cancel := context.WithCancel(ctx) - deploymentController := deployment.NewController(opts.KubeClient, cancel, provider.SQSProvider(), provider.EventBridgeProvider()) + deploymentController := deployment.NewController(opts.KubeClient, cancel, rec, provider.SQSProvider(), provider.EventBridgeProvider()) infraController := infrastructure.NewController(infraCtx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) - notification.NewController(infraCtx, opts.KubeClient, opts.Clock, provider.SQSProvider(), rec, opts.Provisioner, opts.Cluster, opts.StartAsync, infraController.Ready) + notification.NewController(infraCtx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), opts.StartAsync, infraController.Ready) // Register the controller-runtime controller with the global manager if err := deploymentController.Register(infraCtx, manager); err != nil { diff --git a/pkg/cloudprovider/aws/errors.go b/pkg/cloudprovider/aws/errors.go index 0e13d68430d5..fb1f6a8d5098 100644 --- a/pkg/cloudprovider/aws/errors.go +++ b/pkg/cloudprovider/aws/errors.go @@ -19,13 +19,13 @@ import ( "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/ec2" + "github.com/aws/aws-sdk-go/service/eventbridge" + "github.com/aws/aws-sdk-go/service/sqs" "github.com/samber/lo" ) const ( launchTemplateNotFoundCode = "InvalidLaunchTemplateName.NotFoundException" - AccessDeniedCode = "AccessDenied" - AccessDeniedException = "AccessDeniedException" ) var ( @@ -33,6 +33,8 @@ var ( notFoundErrorCodes = []string{ "InvalidInstanceID.NotFound", launchTemplateNotFoundCode, + sqs.ErrCodeQueueDoesNotExist, + (&eventbridge.ResourceNotFoundException{}).Code(), } // unfulfillableCapacityErrorCodes signify that capacity is temporarily unable to be launched unfulfillableCapacityErrorCodes = []string{ @@ -42,6 +44,10 @@ var ( "UnfulfillableCapacity", "Unsupported", } + accessDeniedErrorCodes = []string{ + "AccessDenied", + "AccessDeniedException", + } ) type InstanceTerminatedError struct { @@ -56,10 +62,10 @@ func isInstanceTerminated(err error) bool { return errors.As(err, &itErr) } -// isNotFound returns true if the err is an AWS error (even if it's +// IsNotFound returns true if the err is an AWS error (even if it's // wrapped) and is a known to mean "not found" (as opposed to a more // serious or unexpected error) -func isNotFound(err error) bool { +func IsNotFound(err error) bool { if err == nil { return false } @@ -70,6 +76,20 @@ func isNotFound(err error) bool { return false } +// IsAccessDenied returns true if the err is an AWS error (even if it's +// wrapped) and is a known to mean "access denied" (as opposed to a more +// serious or unexpected error) +func IsAccessDenied(err error) bool { + if err == nil { + return false + } + var awsError awserr.Error + if errors.As(err, &awsError) { + return lo.Contains(accessDeniedErrorCodes, awsError.Code()) + } + return false +} + // isUnfulfillableCapacity returns true if the Fleet err means // capacity is temporarily unavailable for launching. // This could be due to account limits, insufficient ec2 capacity, etc. 
diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 10cf75cf6346..e41914799fb3 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -21,8 +21,8 @@ import ( "sync" "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/eventbridge" + "github.com/aws/aws-sdk-go/service/eventbridge/eventbridgeiface" "github.com/samber/lo" "go.uber.org/multierr" @@ -30,15 +30,8 @@ import ( "github.com/aws/karpenter/pkg/utils/injection" ) -type EventBridgeClient interface { - PutRuleWithContext(context.Context, *eventbridge.PutRuleInput, ...request.Option) (*eventbridge.PutRuleOutput, error) - PutTargetsWithContext(context.Context, *eventbridge.PutTargetsInput, ...request.Option) (*eventbridge.PutTargetsOutput, error) - DeleteRuleWithContext(context.Context, *eventbridge.DeleteRuleInput, ...request.Option) (*eventbridge.DeleteRuleOutput, error) - RemoveTargetsWithContext(context.Context, *eventbridge.RemoveTargetsInput, ...request.Option) (*eventbridge.RemoveTargetsOutput, error) -} - type EventBridgeProvider struct { - EventBridgeClient + client eventbridgeiface.EventBridgeAPI queueName string metadata *Metadata } @@ -63,11 +56,11 @@ func (ep *EventPattern) Serialize() []byte { return lo.Must(json.Marshal(ep)) } -func NewEventBridgeProvider(eb EventBridgeClient, metadata *Metadata, queueName string) *EventBridgeProvider { +func NewEventBridgeProvider(eb eventbridgeiface.EventBridgeAPI, metadata *Metadata, queueName string) *EventBridgeProvider { return &EventBridgeProvider{ - EventBridgeClient: eb, - metadata: metadata, - queueName: queueName, + client: eb, + metadata: metadata, + queueName: queueName, } } @@ -78,7 +71,7 @@ func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) ( wg.Add(1) go func(r EventRule) { defer wg.Done() - _, e := eb.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ + _, e := eb.client.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ Name: aws.String(r.Name), EventPattern: aws.String(string(r.Pattern.Serialize())), Tags: []*eventbridge.Tag{ @@ -91,7 +84,7 @@ func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) ( m.Lock() err = multierr.Append(err, e) m.Unlock() - _, e = eb.PutTargetsWithContext(ctx, &eventbridge.PutTargetsInput{ + _, e = eb.client.PutTargetsWithContext(ctx, &eventbridge.PutTargetsInput{ Rule: aws.String(r.Name), Targets: []*eventbridge.Target{ { @@ -120,20 +113,23 @@ func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) ( Ids: []*string{aws.String(r.Target.ID)}, Rule: aws.String(r.Name), } - _, e := eb.RemoveTargetsWithContext(ctx, targetInput) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - if e != nil { + _, e := eb.client.RemoveTargetsWithContext(ctx, targetInput) + if err != nil && !IsNotFound(e) { + m.Lock() + err = multierr.Append(err, e) + m.Unlock() return } ruleInput := &eventbridge.DeleteRuleInput{ Name: aws.String(r.Name), } - _, e = eb.DeleteRuleWithContext(ctx, ruleInput) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() + _, e = eb.client.DeleteRuleWithContext(ctx, ruleInput) + if err != nil && !IsNotFound(e) { + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + return + } }(rule) } wg.Wait() diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index f99501d7544a..4a80ba959ebc 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go 
@@ -42,12 +42,18 @@ type Recorder interface {
 	EC2SpotRebalanceRecommendation(*v1.Node)
 	// EC2HealthWarning is called when EC2 sends a health warning notification for a health issue for the node from the SQS queue
 	EC2HealthWarning(*v1.Node)
+	// EC2StateChange is called when EC2 sends a state change notification for a node that is changing to a stopping/terminating state
+	EC2StateChange(*v1.Node)
 	// TerminatingNodeOnNotification is called when a notification that is sent to the notification controller triggers node deletion
 	TerminatingNodeOnNotification(*v1.Node)
 	// InfrastructureUnhealthy event is called when infrastructure reconciliation errors and the controller enters an unhealthy state
 	InfrastructureUnhealthy(context.Context, client.Client)
 	// InfrastructureHealthy event is called when infrastructure reconciliation succeeds and the controller enters a healthy state
 	InfrastructureHealthy(context.Context, client.Client)
+	// InfrastructureDeletionSucceeded event is called when infrastructure deletion succeeds
+	InfrastructureDeletionSucceeded(context.Context, client.Client)
+	// InfrastructureDeletionFailed event is called when infrastructure deletion fails
+	InfrastructureDeletionFailed(context.Context, client.Client)
 }
 
 func NewRecorder(r events.Recorder) Recorder {
@@ -68,6 +74,10 @@ func (r recorder) EC2HealthWarning(node *v1.Node) {
 	r.Eventf(node, "Normal", "EC2HealthWarning", "Node %s event: EC2 triggered a health warning for the node", node.Name)
 }
 
+func (r recorder) EC2StateChange(node *v1.Node) {
+	r.Eventf(node, "Normal", "EC2StateTerminating", `Node %s event: EC2 node is stopping or terminating`, node.Name)
+}
+
 func (r recorder) TerminatingNodeOnNotification(node *v1.Node) {
 	r.Eventf(node, "Normal", "AWSNotificationTerminateNode", "Node %s event: Notification triggered termination for the node", node.Name)
 }
@@ -95,3 +105,27 @@ func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client
 	}
 	r.Eventf(dep, "Warning", "AWSInfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy")
 }
+
+func (r recorder) InfrastructureDeletionSucceeded(ctx context.Context, kubeClient client.Client) {
+	dep := &appsv1.Deployment{}
+	err := retry.Do(func() error {
+		return kubeClient.Get(ctx, types.NamespacedName{Namespace: injection.GetOptions(ctx).DeploymentNamespace, Name: injection.GetOptions(ctx).DeploymentName}, dep)
+	})
+	if err != nil {
+		logging.FromContext(ctx).Errorf("Sending InfrastructureDeletionSucceeded event, %v", err)
+		return
+	}
+	r.Eventf(dep, "Normal", "InfrastructureDeletionSucceeded", "Karpenter infrastructure deletion succeeded")
+}
+
+func (r recorder) InfrastructureDeletionFailed(ctx context.Context, kubeClient client.Client) {
+	dep := &appsv1.Deployment{}
+	err := retry.Do(func() error {
+		return kubeClient.Get(ctx, types.NamespacedName{Namespace: injection.GetOptions(ctx).DeploymentNamespace, Name: injection.GetOptions(ctx).DeploymentName}, dep)
+	})
+	if err != nil {
+		logging.FromContext(ctx).Errorf("Sending InfrastructureDeletionFailed event, %v", err)
+		return
+	}
+	r.Eventf(dep, "Warning", "InfrastructureDeletionFailed", "Karpenter infrastructure deletion failed")
+}
diff --git a/pkg/cloudprovider/aws/fake/eventbridgeapi.go b/pkg/cloudprovider/aws/fake/eventbridgeapi.go
new file mode 100644
index 000000000000..254099f32827
--- /dev/null
+++ b/pkg/cloudprovider/aws/fake/eventbridgeapi.go
@@ -0,0 +1,101 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the
License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fake + +import ( + "context" + + "github.com/aws/aws-sdk-go/aws/request" + "github.com/aws/aws-sdk-go/service/eventbridge" + "github.com/aws/aws-sdk-go/service/eventbridge/eventbridgeiface" +) + +// EventBridgeBehavior must be reset between tests otherwise tests will +// pollute each other. +type EventBridgeBehavior struct { + PutRuleOutput AtomicPtr[eventbridge.PutRuleOutput] + PutTargetsOutput AtomicPtr[eventbridge.PutTargetsOutput] + + CalledWithPutRuleInput AtomicPtrSlice[eventbridge.PutRuleInput] + CalledWithPutTargetsInput AtomicPtrSlice[eventbridge.PutTargetsInput] + CalledWithDeleteRuleInput AtomicPtrSlice[eventbridge.DeleteRuleInput] + CalledWithRemoveTargetsInput AtomicPtrSlice[eventbridge.RemoveTargetsInput] + NextError AtomicError +} + +type EventBridgeAPI struct { + eventbridgeiface.EventBridgeAPI + EventBridgeBehavior +} + +// Reset must be called between tests otherwise tests will pollute +// each other. +func (eb *EventBridgeAPI) Reset() { + eb.PutTargetsOutput.Reset() + eb.PutTargetsOutput.Reset() + eb.CalledWithPutRuleInput.Reset() + eb.CalledWithPutTargetsInput.Reset() + eb.CalledWithDeleteRuleInput.Reset() + eb.CalledWithRemoveTargetsInput.Reset() + eb.NextError.Reset() +} + +// TODO: Create a dummy rule ARN for the default that is returned from this function +func (eb *EventBridgeAPI) PutRuleWithContext(_ context.Context, input *eventbridge.PutRuleInput, _ ...request.Option) (*eventbridge.PutRuleOutput, error) { + if !eb.NextError.IsNil() { + defer eb.NextError.Reset() + return nil, eb.NextError.Get() + } + eb.CalledWithPutRuleInput.Add(input) + + if !eb.PutRuleOutput.IsNil() { + return eb.PutRuleOutput.Clone(), nil + } + return &eventbridge.PutRuleOutput{}, nil +} + +// TODO: Create a default response that returns failed entries +func (eb *EventBridgeAPI) PutTargetsWithContext(_ context.Context, input *eventbridge.PutTargetsInput, _ ...request.Option) (*eventbridge.PutTargetsOutput, error) { + if !eb.NextError.IsNil() { + defer eb.NextError.Reset() + return nil, eb.NextError.Get() + } + eb.CalledWithPutTargetsInput.Add(input) + + if !eb.PutTargetsOutput.IsNil() { + return eb.PutTargetsOutput.Clone(), nil + } + return &eventbridge.PutTargetsOutput{}, nil +} + +func (eb *EventBridgeAPI) DeleteRuleWithContext(_ context.Context, input *eventbridge.DeleteRuleInput, _ ...request.Option) (*eventbridge.DeleteRuleOutput, error) { + if !eb.NextError.IsNil() { + defer eb.NextError.Reset() + return nil, eb.NextError.Get() + } + eb.CalledWithDeleteRuleInput.Add(input) + + return &eventbridge.DeleteRuleOutput{}, nil +} + +func (eb *EventBridgeAPI) RemoveTargetsWithContext(_ context.Context, input *eventbridge.RemoveTargetsInput, _ ...request.Option) (*eventbridge.RemoveTargetsOutput, error) { + if !eb.NextError.IsNil() { + defer eb.NextError.Reset() + return nil, eb.NextError.Get() + } + eb.CalledWithRemoveTargetsInput.Add(input) + + return &eventbridge.RemoveTargetsOutput{}, nil +} diff --git a/pkg/cloudprovider/aws/fake/eventrecorder.go b/pkg/cloudprovider/aws/fake/eventrecorder.go new file mode 100644 index 000000000000..95299220d701 --- /dev/null +++ 
b/pkg/cloudprovider/aws/fake/eventrecorder.go @@ -0,0 +1,32 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fake + +import ( + "github.com/aws/karpenter/pkg/cloudprovider/aws/events" + "github.com/aws/karpenter/pkg/test" +) + +// EventRecorder is a mock event recorder that is used to facilitate testing. +type EventRecorder struct { + events.Recorder + test.EventRecorder +} + +func NewEventRecorder() *EventRecorder { + return &EventRecorder{ + EventRecorder: *test.NewEventRecorder(), + } +} diff --git a/pkg/cloudprovider/aws/fake/sqsapi.go b/pkg/cloudprovider/aws/fake/sqsapi.go new file mode 100644 index 000000000000..005e070f2153 --- /dev/null +++ b/pkg/cloudprovider/aws/fake/sqsapi.go @@ -0,0 +1,139 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fake + +import ( + "context" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/request" + "github.com/aws/aws-sdk-go/service/sqs" + "github.com/aws/aws-sdk-go/service/sqs/sqsiface" +) + +const ( + dummyQueueURL = "https://sqs.us-west-2.amazonaws.com/000000000000/Karpenter-cluster-Queue" +) + +// SQSBehavior must be reset between tests otherwise tests will +// pollute each other. +type SQSBehavior struct { + CreateQueueOutput AtomicPtr[sqs.CreateQueueOutput] + GetQueueURLOutput AtomicPtr[sqs.GetQueueUrlOutput] + ReceiveMessageOutput AtomicPtr[sqs.ReceiveMessageOutput] + CalledWithCreateQueueInput AtomicPtrSlice[sqs.CreateQueueInput] + CalledWithGetQueueURLInput AtomicPtrSlice[sqs.GetQueueUrlInput] + CalledWithSetQueueAttributesInput AtomicPtrSlice[sqs.SetQueueAttributesInput] + CalledWithReceiveMessageInput AtomicPtrSlice[sqs.ReceiveMessageInput] + CalledWithDeleteMessageInput AtomicPtrSlice[sqs.DeleteMessageInput] + CalledWithDeleteQueueInput AtomicPtrSlice[sqs.DeleteQueueInput] + NextError AtomicError +} + +type SQSAPI struct { + sqsiface.SQSAPI + SQSBehavior +} + +// Reset must be called between tests otherwise tests will pollute +// each other. 
+func (s *SQSAPI) Reset() { + s.CreateQueueOutput.Reset() + s.GetQueueURLOutput.Reset() + s.ReceiveMessageOutput.Reset() + s.CalledWithCreateQueueInput.Reset() + s.CalledWithGetQueueURLInput.Reset() + s.CalledWithSetQueueAttributesInput.Reset() + s.CalledWithReceiveMessageInput.Reset() + s.CalledWithDeleteMessageInput.Reset() + s.CalledWithDeleteQueueInput.Reset() + s.NextError.Reset() +} + +func (s *SQSAPI) CreateQueueWithContext(_ context.Context, input *sqs.CreateQueueInput, _ ...request.Option) (*sqs.CreateQueueOutput, error) { + if !s.NextError.IsNil() { + defer s.NextError.Reset() + return nil, s.NextError.Get() + } + s.CalledWithCreateQueueInput.Add(input) + + if !s.CreateQueueOutput.IsNil() { + return s.CreateQueueOutput.Clone(), nil + } + return &sqs.CreateQueueOutput{ + QueueUrl: aws.String(dummyQueueURL), + }, nil +} + +//nolint:revive,stylecheck +func (s *SQSAPI) GetQueueUrlWithContext(_ context.Context, input *sqs.GetQueueUrlInput, _ ...request.Option) (*sqs.GetQueueUrlOutput, error) { + if !s.NextError.IsNil() { + defer s.NextError.Reset() + return nil, s.NextError.Get() + } + s.CalledWithGetQueueURLInput.Add(input) + + if !s.GetQueueURLOutput.IsNil() { + return s.GetQueueURLOutput.Clone(), nil + } + return &sqs.GetQueueUrlOutput{ + QueueUrl: aws.String(dummyQueueURL), + }, nil +} + +func (s *SQSAPI) SetQueueAttributesWithContext(_ context.Context, input *sqs.SetQueueAttributesInput, _ ...request.Option) (*sqs.SetQueueAttributesOutput, error) { + if !s.NextError.IsNil() { + defer s.NextError.Reset() + return nil, s.NextError.Get() + } + s.CalledWithSetQueueAttributesInput.Add(input) + + return &sqs.SetQueueAttributesOutput{}, nil +} + +func (s *SQSAPI) ReceiveMessageWithContext(_ context.Context, input *sqs.ReceiveMessageInput, _ ...request.Option) (*sqs.ReceiveMessageOutput, error) { + if !s.NextError.IsNil() { + defer s.NextError.Reset() + return nil, s.NextError.Get() + } + s.CalledWithReceiveMessageInput.Add(input) + + if !s.ReceiveMessageOutput.IsNil() { + return s.ReceiveMessageOutput.Clone(), nil + } + return &sqs.ReceiveMessageOutput{ + Messages: []*sqs.Message{}, + }, nil +} + +func (s *SQSAPI) DeleteMessageWithContext(_ context.Context, input *sqs.DeleteMessageInput, _ ...request.Option) (*sqs.DeleteMessageOutput, error) { + if !s.NextError.IsNil() { + defer s.NextError.Reset() + return nil, s.NextError.Get() + } + s.CalledWithDeleteMessageInput.Add(input) + + return &sqs.DeleteMessageOutput{}, nil +} + +func (s *SQSAPI) DeleteQueueWithContext(_ context.Context, input *sqs.DeleteQueueInput, _ ...request.Option) (*sqs.DeleteQueueOutput, error) { + if !s.NextError.IsNil() { + defer s.NextError.Reset() + return nil, s.NextError.Get() + } + s.CalledWithDeleteQueueInput.Add(input) + + return &sqs.DeleteQueueOutput{}, nil +} diff --git a/pkg/cloudprovider/aws/instance.go b/pkg/cloudprovider/aws/instance.go index 8e3d97b1d5e3..f5ab945f5590 100644 --- a/pkg/cloudprovider/aws/instance.go +++ b/pkg/cloudprovider/aws/instance.go @@ -116,11 +116,11 @@ func (p *InstanceProvider) Terminate(ctx context.Context, node *v1.Node) error { if _, err = p.ec2api.TerminateInstancesWithContext(ctx, &ec2.TerminateInstancesInput{ InstanceIds: []*string{id}, }); err != nil { - if isNotFound(err) { + if IsNotFound(err) { return nil } if _, errMsg := p.getInstance(ctx, aws.StringValue(id)); err != nil { - if isInstanceTerminated(errMsg) || isNotFound(errMsg) { + if isInstanceTerminated(errMsg) || IsNotFound(errMsg) { logging.FromContext(ctx).Debugf("Instance already terminated, %s", node.Name) 
return nil } @@ -302,7 +302,7 @@ func (p *InstanceProvider) getOverrides(instanceTypeOptions []cloudprovider.Inst func (p *InstanceProvider) getInstance(ctx context.Context, id string) (*ec2.Instance, error) { describeInstancesOutput, err := p.ec2api.DescribeInstancesWithContext(ctx, &ec2.DescribeInstancesInput{InstanceIds: aws.StringSlice([]string{id})}) - if isNotFound(err) { + if IsNotFound(err) { return nil, err } if err != nil { diff --git a/pkg/cloudprovider/aws/launchtemplate.go b/pkg/cloudprovider/aws/launchtemplate.go index e74747a80ce5..ca039663fa0c 100644 --- a/pkg/cloudprovider/aws/launchtemplate.go +++ b/pkg/cloudprovider/aws/launchtemplate.go @@ -151,7 +151,7 @@ func (p *LaunchTemplateProvider) ensureLaunchTemplate(ctx context.Context, optio LaunchTemplateNames: []*string{aws.String(name)}, }) // Create LT if one doesn't exist - if isNotFound(err) { + if IsNotFound(err) { launchTemplate, err = p.createLaunchTemplate(ctx, options) if err != nil { return nil, fmt.Errorf("creating launch template, %w", err) diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index 701f9b0e7a17..ff9e0209a9dd 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -21,25 +21,15 @@ import ( "sync" "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/service/sqs" + "github.com/aws/aws-sdk-go/service/sqs/sqsiface" "github.com/samber/lo" - "knative.dev/pkg/logging" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" ) -type SQSClient interface { - CreateQueueWithContext(context.Context, *sqs.CreateQueueInput, ...request.Option) (*sqs.CreateQueueOutput, error) - GetQueueUrlWithContext(context.Context, *sqs.GetQueueUrlInput, ...request.Option) (*sqs.GetQueueUrlOutput, error) - SetQueueAttributesWithContext(context.Context, *sqs.SetQueueAttributesInput, ...request.Option) (*sqs.SetQueueAttributesOutput, error) - ReceiveMessageWithContext(context.Context, *sqs.ReceiveMessageInput, ...request.Option) (*sqs.ReceiveMessageOutput, error) - DeleteMessageWithContext(context.Context, *sqs.DeleteMessageInput, ...request.Option) (*sqs.DeleteMessageOutput, error) - DeleteQueueWithContext(context.Context, *sqs.DeleteQueueInput, ...request.Option) (*sqs.DeleteQueueOutput, error) -} - type QueuePolicy struct { Version string `json:"Version"` ID string `json:"Id"` @@ -58,7 +48,7 @@ type Principal struct { } type SQSProvider struct { - client SQSClient + client sqsiface.SQSAPI createQueueInput *sqs.CreateQueueInput getQueueURLInput *sqs.GetQueueUrlInput @@ -69,7 +59,7 @@ type SQSProvider struct { metadata *Metadata } -func NewSQSProvider(ctx context.Context, client SQSClient, metadata *Metadata) *SQSProvider { +func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadata *Metadata) *SQSProvider { provider := &SQSProvider{ client: client, mutex: &sync.RWMutex{}, @@ -195,6 +185,9 @@ func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) er func (s *SQSProvider) DeleteQueue(ctx context.Context) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { + if IsNotFound(err) { + return nil + } return fmt.Errorf("failed fetching queue url, %w", err) } @@ -202,8 +195,7 @@ func (s *SQSProvider) DeleteQueue(ctx context.Context) error { QueueUrl: aws.String(queueURL), } _, err = s.client.DeleteQueueWithContext(ctx, input) - if err != nil { - logging.FromContext(ctx).Errorf("Might have got an 
error here in the queue, %v", err) + if err != nil && !IsNotFound(err) { return fmt.Errorf("failed deleting sqs queue, %w", err) } return nil diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 861939aece0e..5630c6fc3ac8 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -87,12 +87,12 @@ type Controller interface { } type ControllerOptions struct { - Cluster *state.Cluster - KubeClient client.Client - Provisioner *provisioning.Provisioner - Recorder events.Recorder - StartAsync <-chan struct{} - Clock clock.Clock + Cluster *state.Cluster + KubeClient client.Client + Recorder events.Recorder + Clock clock.Clock + + StartAsync <-chan struct{} } func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, ControllerInitFunc)) { @@ -158,12 +158,11 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) // Inject cloudprovider-specific controllers into the controller-set using the injectControllers function // Inject the base cloud provider into the injection function rather than the decorated interface controllerOptions := &ControllerOptions{ - Cluster: cluster, - KubeClient: manager.GetClient(), - Provisioner: provisioner, - Recorder: recorder, - StartAsync: manager.Elected(), - Clock: realClock, + Cluster: cluster, + KubeClient: manager.GetClient(), + Recorder: recorder, + StartAsync: manager.Elected(), + Clock: realClock, } injectControllers(ctx, manager, controllerOptions) diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml index 481e01c0f716..e281e0207ff6 100644 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml @@ -90,6 +90,3 @@ Resources: - events:TagResource - events:DeleteRule - events:RemoveTargets - # Read Operations - - events:ListRules - - events:DescribeRule From 25e0d655d434f508e6fe3998b9427b8bcd28cbb2 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 21 Sep 2022 17:50:25 -0700 Subject: [PATCH 22/55] Revert back to termination-based cleanup --- cmd/controller/main.go | 6 +- .../aws/controllers/deployment/controller.go | 160 ------------------ .../aws/controllers/deployment/suite_test.go | 72 -------- .../controllers/infrastructure/controller.go | 109 +++++++++++- .../controllers/infrastructure/suite_test.go | 5 +- .../controllers/notification/controller.go | 25 +-- pkg/cloudprovider/aws/controllers/register.go | 17 +- pkg/controllers/controllers.go | 46 +++-- pkg/controllers/provisioning/provisioner.go | 4 +- 9 files changed, 166 insertions(+), 278 deletions(-) delete mode 100644 pkg/cloudprovider/aws/controllers/deployment/controller.go delete mode 100644 pkg/cloudprovider/aws/controllers/deployment/suite_test.go diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 1e5ed4af0593..7e8e7e2c9e4c 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -17,8 +17,6 @@ package main import ( "context" - "sigs.k8s.io/controller-runtime/pkg/manager" - "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws" awscontrollers "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers" @@ -28,8 +26,8 @@ import ( func main() { controllers.Initialize(func(ctx context.Context, options 
cloudprovider.Options) (cloudprovider.CloudProvider, controllers.ControllerInitFunc) { provider := aws.NewCloudProvider(ctx, options) - return provider, func(c context.Context, manager manager.Manager, opts *controllers.ControllerOptions) { - awscontrollers.Register(c, provider, manager, opts) + return provider, func(c context.Context, opts *controllers.ControllerOptions) <-chan struct{} { + return awscontrollers.Register(c, provider, opts) } }) } diff --git a/pkg/cloudprovider/aws/controllers/deployment/controller.go b/pkg/cloudprovider/aws/controllers/deployment/controller.go deleted file mode 100644 index 14759cb44941..000000000000 --- a/pkg/cloudprovider/aws/controllers/deployment/controller.go +++ /dev/null @@ -1,160 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package deployment - -import ( - "context" - "sync" - - "go.uber.org/multierr" - appsv1 "k8s.io/api/apps/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "knative.dev/pkg/logging" - controllerruntime "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/predicate" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider/aws" - "github.com/aws/karpenter/pkg/cloudprovider/aws/events" - "github.com/aws/karpenter/pkg/utils/injection" -) - -const controllerName = "deployment" - -// Controller is responsible for watching the Karpenter deployment -// It is responsible for patching the termination finalizer on when the leader pod comes up -// and reacting to the deletion of the deployment so that we can perform some cleanup actions -type Controller struct { - kubeClient client.Client - cancel context.CancelFunc - recorder events.Recorder - - sqsProvider *aws.SQSProvider - eventBridgeProvider *aws.EventBridgeProvider -} - -func NewController(kubeClient client.Client, cancel context.CancelFunc, recorder events.Recorder, - sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider) *Controller { - return &Controller{ - kubeClient: kubeClient, - cancel: cancel, - recorder: recorder, - sqsProvider: sqsProvider, - eventBridgeProvider: eventBridgeProvider, - } -} - -func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(controllerName)) - - deployment := &appsv1.Deployment{} - if err := c.kubeClient.Get(ctx, req.NamespacedName, deployment); err != nil { - if apierrors.IsNotFound(err) { - return reconcile.Result{}, nil - } - return reconcile.Result{}, err - } - // If the deletion timestamp is set, that means the deployment is attempting to be deleted - // and we should perform the cleanup actions associated with the Karpenter deployment - if !deployment.DeletionTimestamp.IsZero() { - if err := c.deleteInfrastructure(ctx); err != nil { - 
c.recorder.InfrastructureDeletionFailed(ctx, c.kubeClient) - return reconcile.Result{}, err - } - c.recorder.InfrastructureDeletionSucceeded(ctx, c.kubeClient) - patch := client.MergeFrom(deployment.DeepCopy()) - controllerutil.RemoveFinalizer(deployment, v1alpha5.TerminationFinalizer) - if err := c.kubeClient.Patch(ctx, deployment, patch); err != nil { - return reconcile.Result{}, err - } - c.cancel() // Call cancel to stop the other controllers relying on the infrastructure - return reconcile.Result{}, nil - } - // Otherwise, this is a create/update, so we should just ensure that the finalizer exists - if !controllerutil.ContainsFinalizer(deployment, v1alpha5.TerminationFinalizer) { - patch := client.MergeFrom(deployment.DeepCopy()) - controllerutil.AddFinalizer(deployment, v1alpha5.TerminationFinalizer) - if err := c.kubeClient.Patch(ctx, deployment, patch); err != nil { - return reconcile.Result{}, err - } - } - return reconcile.Result{}, nil -} - -// Register the controller to the manager -func (c *Controller) Register(ctx context.Context, m manager.Manager) error { - return controllerruntime. - NewControllerManagedBy(m). - Named(controllerName). - For(&appsv1.Deployment{}). - WithEventFilter(predicate.NewPredicateFuncs(func(object client.Object) bool { - // This function ensures that we are filtering out every event that isn't related to the - // karpenter controller deployment - if object.GetNamespace() != injection.GetOptions(ctx).DeploymentNamespace { - return false - } - if object.GetName() != injection.GetOptions(ctx).DeploymentName { - return false - } - return true - })). - Complete(c) -} - -// Delete infrastructure removes the infrastructure that was stood up and reconciled -// by the infrastructure controller for SQS message polling -func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { - logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") - wg := &sync.WaitGroup{} - m := &sync.Mutex{} - - wg.Add(2) - go func() { - defer wg.Done() - e := c.sqsProvider.DeleteQueue(ctx) - - // If we get access denied, nothing we can do so just log and don't return the error - if aws.IsAccessDenied(e) { - logging.FromContext(ctx).Errorf("Access denied while trying to delete SQS queue, %v", err) - } else if err != nil { - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - } - }() - go func() { - defer wg.Done() - e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) - - // If we get access denied, nothing we can do so just log and don't return the error - if aws.IsAccessDenied(e) { - logging.FromContext(ctx).Errorf("Access denied while trying to delete notification rules, %v", err) - } else if err != nil { - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - } - }() - wg.Wait() - if err != nil { - return err - } - logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") - return nil -} diff --git a/pkg/cloudprovider/aws/controllers/deployment/suite_test.go b/pkg/cloudprovider/aws/controllers/deployment/suite_test.go deleted file mode 100644 index 6a054014bd9d..000000000000 --- a/pkg/cloudprovider/aws/controllers/deployment/suite_test.go +++ /dev/null @@ -1,72 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package deployment_test - -import ( - "context" - "testing" - - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" - . "knative.dev/pkg/logging/testing" - - . "github.com/aws/karpenter/pkg/test/expectations" - - "github.com/aws/karpenter/pkg/cloudprovider/aws" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/deployment" - awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" - "github.com/aws/karpenter/pkg/test" -) - -var ctx context.Context -var env *test.Environment -var sqsapi *awsfake.SQSAPI -var sqsProvider *aws.SQSProvider -var eventbridgeapi *awsfake.EventBridgeAPI -var eventBridgeProvider *aws.EventBridgeProvider -var recorder *awsfake.EventRecorder -var controller *deployment.Controller - -func TestAPIs(t *testing.T) { - ctx = TestContextWithLogger(t) - RegisterFailHandler(Fail) - RunSpecs(t, "AWS Karpenter Deployment") -} - -var _ = BeforeSuite(func() { - env = test.NewEnvironment(ctx, func(e *test.Environment) { - recorder = awsfake.NewEventRecorder() - metadata := aws.NewMetadata("us-east-1", "000000000000") - - sqsapi = &awsfake.SQSAPI{} - eventbridgeapi = &awsfake.EventBridgeAPI{} - sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadata) - eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) - }) - Expect(env.Start()).To(Succeed(), "Failed to start environment") -}) - -var _ = AfterSuite(func() { - Expect(env.Stop()).To(Succeed(), "Failed to stop environment") -}) - -var _ = BeforeEach(func() { - sqsapi.Reset() - eventbridgeapi.Reset() - controller = deployment.NewController(env.Client, nil, recorder, sqsProvider, eventBridgeProvider) -}) -var _ = AfterEach(func() { - ExpectCleanedUp(ctx, env.Client) -}) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index b742f045d44a..a68c74e5325d 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -21,9 +21,13 @@ import ( "sync" "time" + "github.com/avast/retry-go" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/sqs" "go.uber.org/multierr" + appsv1 "k8s.io/api/apps/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/types" "k8s.io/utils/clock" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" @@ -31,6 +35,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/metrics" + "github.com/aws/karpenter/pkg/utils/injection" ) // Controller is the AWS infrastructure controller. 
It is not a standard controller-runtime controller in that it doesn't @@ -46,6 +51,8 @@ type Controller struct { mutex *sync.RWMutex readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state ready bool + trigger chan struct{} + done chan struct{} } // pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned @@ -56,9 +63,9 @@ const pollingPeriod = time.Hour // is provisioned if there is an error in the reconciliation loop const defaultBackoffPeriod = time.Minute * 10 -func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, +func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, - startAsync <-chan struct{}) *Controller { + startAsync <-chan struct{}, cleanupAsync <-chan struct{}) *Controller { c := &Controller{ kubeClient: kubeClient, recorder: recorder, @@ -67,8 +74,16 @@ func NewController(ctx context.Context, kubeClient client.Client, clk clock.Cloc eventBridgeProvider: eventBridgeProvider, mutex: &sync.RWMutex{}, readinessChan: make(chan struct{}), + trigger: make(chan struct{}, 1), + done: make(chan struct{}), } + go func() { + <-cleanupAsync + c.cleanup(cleanupCtx) + close(c.done) + }() + go func() { select { case <-ctx.Done(): @@ -97,6 +112,8 @@ func (c *Controller) run(ctx context.Context) { select { case <-ctx.Done(): return + case <-c.trigger: + continue case <-c.clock.After(backoffPeriod): continue } @@ -105,11 +122,40 @@ func (c *Controller) run(ctx context.Context) { select { case <-ctx.Done(): return + case <-c.trigger: case <-c.clock.After(pollingPeriod): } } } +func (c *Controller) cleanup(ctx context.Context) { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure.cleanup")) + + dep := &appsv1.Deployment{} + nn := types.NamespacedName{ + Name: injection.GetOptions(ctx).DeploymentName, + Namespace: injection.GetOptions(ctx).DeploymentNamespace, + } + + notFound := false + if err := retry.Do(func() error { + err := c.kubeClient.Get(ctx, nn, dep) + if apierrors.IsNotFound(err) { + notFound = true + } + return client.IgnoreNotFound(err) + }); err != nil { + logging.FromContext(ctx).Errorf("Getting the deployment %s for cleanup, %v", nn, err) + } + + // Deployment is already deleted or currently deleting, so we should cleanup the infrastructure + if notFound || !dep.DeletionTimestamp.IsZero() { + if err := retry.Do(func() error { return c.deleteInfrastructure(ctx) }); err != nil { + logging.FromContext(ctx).Errorf("Deprovisioning the infrastructure, %v", err) + } + } +} + // Ready returns a channel that serves as a gate for other controllers // to wait on the infrastructure to be in a good state. 
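The cleanup path added above only deprovisions AWS resources once the Karpenter deployment is gone or carries a deletion timestamp, retrying transient API errors along the way. A compilable sketch of that gate as a hypothetical shouldCleanup helper, built from the same retry.Do and client.IgnoreNotFound calls used here:

package sketch

import (
	"context"

	"github.com/avast/retry-go"
	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// shouldCleanup reports whether the infrastructure may be deprovisioned:
// either the deployment no longer exists, or it is already terminating.
// Transient API errors are retried; NotFound is treated as a terminal answer.
func shouldCleanup(ctx context.Context, kubeClient client.Client, nn types.NamespacedName) (bool, error) {
	dep := &appsv1.Deployment{}
	notFound := false
	if err := retry.Do(func() error {
		err := kubeClient.Get(ctx, nn, dep)
		if apierrors.IsNotFound(err) {
			notFound = true
		}
		return client.IgnoreNotFound(err) // only retry real errors
	}); err != nil {
		return false, err
	}
	return notFound || !dep.DeletionTimestamp.IsZero(), nil
}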
When the infrastructure is ready, // this channel is closed so other controllers can proceed with their operations @@ -119,6 +165,14 @@ func (c *Controller) Ready() <-chan struct{} { return c.readinessChan } +func (c *Controller) Trigger() { + c.trigger <- struct{}{} +} + +func (c *Controller) Done() <-chan struct{} { + return c.done +} + func (c *Controller) setReady(ctx context.Context, ready bool) { c.mutex.Lock() defer c.mutex.Unlock() @@ -171,17 +225,63 @@ func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { return err } +// Delete infrastructure removes the infrastructure that was stood up and reconciled +// by the infrastructure controller for SQS message polling +func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { + logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") + wg := &sync.WaitGroup{} + m := &sync.Mutex{} + + wg.Add(2) + go func() { + defer wg.Done() + logging.FromContext(ctx).Debugf("Deleting the SQS notification queue...") + e := c.sqsProvider.DeleteQueue(ctx) + + // If we get access denied, nothing we can do so just log and don't return the error + if aws.IsAccessDenied(e) { + logging.FromContext(ctx).Errorf("Access denied while trying to delete SQS queue, %v", err) + } else if err != nil { + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + } + }() + go func() { + defer wg.Done() + logging.FromContext(ctx).Debugf("Deleting the EventBridge notification rules...") + e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) + + // If we get access denied, nothing we can do so just log and don't return the error + if aws.IsAccessDenied(e) { + logging.FromContext(ctx).Errorf("Access denied while trying to delete notification rules, %v", err) + } else if err != nil { + m.Lock() + err = multierr.Append(err, e) + m.Unlock() + } + }() + wg.Wait() + if err != nil { + return err + } + logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") + return nil +} + // ensureQueue reconciles the SQS queue with the configuration prescribed by Karpenter func (c *Controller) ensureQueue(ctx context.Context) error { // Attempt to find the queue. 
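Ready, Trigger, and Done coordinate the notification and infrastructure controllers with plain channels: Ready is closed once a reconcile succeeds, Trigger (a buffered channel of size one) requests an immediate re-reconcile, and Done reports that cleanup has finished. A stripped-down sketch of that shape with the AWS calls stubbed out; unlike the original, this version drops duplicate triggers instead of blocking on the buffered send:

package sketch

import (
	"context"
	"sync"
	"time"
)

// coordinator is a toy version of the infrastructure controller's channel
// plumbing: a close-once readiness gate plus a non-blocking trigger.
type coordinator struct {
	mu      sync.Mutex
	ready   bool
	readyCh chan struct{}
	trigger chan struct{}
}

func newCoordinator() *coordinator {
	return &coordinator{
		readyCh: make(chan struct{}),
		trigger: make(chan struct{}, 1), // buffered so a pending trigger is remembered
	}
}

func (c *coordinator) Ready() <-chan struct{} { return c.readyCh }

// Trigger requests an immediate reconcile; requests made while one is
// already pending are dropped.
func (c *coordinator) Trigger() {
	select {
	case c.trigger <- struct{}{}:
	default:
	}
}

func (c *coordinator) setReady() {
	c.mu.Lock()
	defer c.mu.Unlock()
	if !c.ready {
		c.ready = true
		close(c.readyCh) // closing broadcasts readiness to every waiter
	}
}

// run reconciles on start, on Trigger, and on a polling interval.
func (c *coordinator) run(ctx context.Context, reconcile func(context.Context) error) {
	for {
		if err := reconcile(ctx); err == nil {
			c.setReady()
		}
		select {
		case <-ctx.Done():
			return
		case <-c.trigger:
		case <-time.After(time.Hour):
		}
	}
}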
If we can't find it, assume it isn't created and try to create it // If we did find it, then just set the queue attributes on the existing queue + logging.FromContext(ctx).Debugf("Reconciling the SQS notification queue") if _, err := c.sqsProvider.DiscoverQueueURL(ctx, true); err != nil { switch { case aws.IsNotFound(err): - logging.FromContext(ctx).Infof("Creating the SQS queue for EC2 notifications...") + logging.FromContext(ctx).Debugf("Queue not found, creating the SQS notification queue...") if err := c.sqsProvider.CreateQueue(ctx); err != nil { return fmt.Errorf("creating sqs queue with policy, %w", err) } + logging.FromContext(ctx).Debugf("Successfully created the SQS notification queue") return nil case aws.IsAccessDenied(err): return fmt.Errorf("failed obtaining permission to discover sqs queue url, %w", err) @@ -197,6 +297,7 @@ func (c *Controller) ensureQueue(ctx context.Context) error { // ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter func (c *Controller) ensureEventBridge(ctx context.Context) error { + logging.FromContext(ctx).Debugf("Reconciling the EventBridge notification rules") if err := c.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { switch { case aws.IsAccessDenied(err): @@ -217,7 +318,7 @@ func (c *Controller) getBackoff(err error) time.Duration { } switch awsErr.Code() { case sqs.ErrCodeQueueDeletedRecently: - return time.Minute * 2 + return time.Minute default: return defaultBackoffPeriod } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index bea5ffa328f5..11aa123db21c 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -41,6 +41,7 @@ var eventBridgeProvider *aws.EventBridgeProvider var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock var controller *infrastructure.Controller +var cleanupChan chan struct{} func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -58,18 +59,20 @@ var _ = BeforeSuite(func() { eventbridgeapi = &awsfake.EventBridgeAPI{} sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadata) eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) + cleanupChan = make(chan struct{}) + infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, nil, cleanupChan) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) var _ = AfterSuite(func() { + close(cleanupChan) Expect(env.Stop()).To(Succeed(), "Failed to stop environment") }) var _ = BeforeEach(func() { sqsapi.Reset() eventbridgeapi.Reset() - controller = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, nil) }) var _ = AfterEach(func() { ExpectCleanedUp(ctx, env.Client) diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 9ce17be8662d..ab15e87db709 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -31,6 +31,7 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" 
"github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" @@ -58,7 +59,7 @@ type Controller struct { provider *aws.SQSProvider parser event.Parser - infraReady func() <-chan struct{} + infraController *infrastructure.Controller } // pollingPeriod that we go to the SQS queue to check if there are any new events @@ -66,15 +67,15 @@ const pollingPeriod = 2 * time.Second func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, cluster *state.Cluster, sqsProvider *aws.SQSProvider, - startAsync <-chan struct{}, infraReady func() <-chan struct{}) *Controller { + infraController *infrastructure.Controller, startAsync <-chan struct{}) *Controller { c := &Controller{ - kubeClient: kubeClient, - cluster: cluster, - recorder: recorder, - clock: clk, - provider: sqsProvider, - parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), - infraReady: infraReady, + kubeClient: kubeClient, + cluster: cluster, + recorder: recorder, + clock: clk, + provider: sqsProvider, + parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + infraController: infraController, } go func() { @@ -93,7 +94,7 @@ func (c *Controller) run(ctx context.Context) { logger := logging.FromContext(ctx).Named("notification") ctx = logging.WithLogger(ctx, logger) for { - <-c.infraReady() // block until the infrastructure is up and ready + <-c.infraController.Ready() // block until the infrastructure is up and ready err := c.pollSQS(ctx) if err != nil { logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) @@ -113,6 +114,10 @@ func (c *Controller) pollSQS(ctx context.Context) error { sqsMessages, err := c.provider.GetSQSMessages(ctx) if err != nil { + // If the queue isn't found, we should trigger the infrastructure controller to re-reconcile + if aws.IsNotFound(err) { + c.infraController.Trigger() + } return err } if len(sqsMessages) == 0 { diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 875a42f268b2..17dfbd0cf6fe 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -18,29 +18,20 @@ import ( "context" "knative.dev/pkg/logging" - "sigs.k8s.io/controller-runtime/pkg/manager" "github.com/aws/karpenter/pkg/cloudprovider/aws" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/deployment" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/controllers" ) -func Register(ctx context.Context, provider *aws.CloudProvider, manager manager.Manager, opts *controllers.ControllerOptions) { +func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) <-chan struct{} { rec := events.NewRecorder(opts.Recorder) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) // Injecting the cloudprovider-specific controllers that will start when opts.StartAsync is triggered - // All these controllers should run with the same context since they rely on each other - infraCtx, cancel := context.WithCancel(ctx) - deploymentController := deployment.NewController(opts.KubeClient, cancel, rec, 
provider.SQSProvider(), provider.EventBridgeProvider()) - infraController := infrastructure.NewController(infraCtx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) - notification.NewController(infraCtx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), opts.StartAsync, infraController.Ready) - - // Register the controller-runtime controller with the global manager - if err := deploymentController.Register(infraCtx, manager); err != nil { - panic(err) - } + infraController := infrastructure.NewController(ctx, opts.BaseContext(), opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) + notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), infraController, opts.StartAsync) + return infraController.Done() } diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 5630c6fc3ac8..a5a29f0ab4e2 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -19,7 +19,10 @@ import ( "fmt" "net/http" "net/http/pprof" + "os" + "os/signal" "runtime/debug" + "syscall" "github.com/go-logr/logr" "github.com/go-logr/zapr" @@ -74,7 +77,7 @@ func init() { metrics.MustRegister() // Registers cross-controller metrics } -type ControllerInitFunc func(context.Context, manager.Manager, *ControllerOptions) +type ControllerInitFunc func(context.Context, *ControllerOptions) <-chan struct{} // Controller is an interface implemented by Karpenter custom resources. type Controller interface { @@ -87,12 +90,14 @@ type Controller interface { } type ControllerOptions struct { - Cluster *state.Cluster - KubeClient client.Client - Recorder events.Recorder - Clock clock.Clock - - StartAsync <-chan struct{} + BaseContext func() context.Context + Cluster *state.Cluster + KubeClient client.Client + Recorder events.Recorder + Clock clock.Clock + + StartAsync <-chan struct{} + CleanupAsync <-chan struct{} } func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, ControllerInitFunc)) { @@ -107,6 +112,18 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) cmw := informer.NewInformedWatcher(clientSet, system.Namespace()) ctx := injection.LoggingContextOrDie(component, controllerRuntimeConfig, cmw) ctx = newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx))() + ctx, cancel := context.WithCancel(ctx) + + // Setup the cleanup logic for teardown on SIGINT or SIGTERM + cleanup := make(chan struct{}) // This is a channel to broadcast to controllers cleanup can start + go func() { + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + <-sigs + logging.FromContext(context.Background()).Infof("Got a signal to react to") + close(cleanup) + cancel() + }() logging.FromContext(ctx).Infof("Initializing with version %s", project.Version) @@ -158,13 +175,15 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) // Inject cloudprovider-specific controllers into the controller-set using the injectControllers function // Inject the base cloud provider into the injection function rather than the decorated interface controllerOptions := &ControllerOptions{ - Cluster: cluster, - KubeClient: manager.GetClient(), - Recorder: recorder, - StartAsync: manager.Elected(), - Clock: realClock, + BaseContext: newRunnableContext(controllerRuntimeConfig, 
opts, logging.FromContext(ctx)), + Cluster: cluster, + KubeClient: manager.GetClient(), + Recorder: recorder, + StartAsync: manager.Elected(), + CleanupAsync: cleanup, + Clock: realClock, } - injectControllers(ctx, manager, controllerOptions) + done := injectControllers(ctx, controllerOptions) metricsstate.StartMetricScraper(ctx, cluster) @@ -182,6 +201,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) ).Start(ctx); err != nil { panic(fmt.Sprintf("Unable to start manager, %s", err)) } + <-done } // NewManagerOrDie instantiates a controller manager or panics diff --git a/pkg/controllers/provisioning/provisioner.go b/pkg/controllers/provisioning/provisioner.go index 0a985a522237..a7e7ac6ce734 100644 --- a/pkg/controllers/provisioning/provisioner.go +++ b/pkg/controllers/provisioning/provisioner.go @@ -128,7 +128,9 @@ func (p *Provisioner) Provision(ctx context.Context) error { // wait to ensure that our cluster state is synced with the current known nodes to prevent over-provisioning for WaitForClusterSync { - if err := p.cluster.Synchronized(ctx); err != nil { + if ctx.Err() != nil { + return ctx.Err() + } else if err := p.cluster.Synchronized(ctx); err != nil { logging.FromContext(ctx).Infof("waiting for cluster state to catch up, %s", err) time.Sleep(1 * time.Second) } else { From ed8d04ecd4194cb9f9d0536e403ac8ff0b6d1630 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 22 Sep 2022 12:45:21 -0700 Subject: [PATCH 23/55] Add baseline unit tests for infrastructure controller --- charts/karpenter/templates/role.yaml | 7 +- go.mod | 4 +- go.sum | 7 +- .../controllers/infrastructure/controller.go | 41 ++- .../controllers/infrastructure/suite_test.go | 196 +++++++++- .../controllers/notification/controller.go | 38 +- .../controllers/notification/event/types.go | 2 - .../controllers/notification/suite_test.go | 348 +++++++++++++++++- pkg/cloudprovider/aws/controllers/register.go | 2 +- pkg/cloudprovider/aws/errors.go | 6 +- pkg/cloudprovider/aws/fake/atomic.go | 30 +- pkg/cloudprovider/aws/fake/eventbridgeapi.go | 61 +-- pkg/cloudprovider/aws/fake/eventrecorder.go | 25 +- pkg/cloudprovider/aws/fake/sqsapi.go | 95 +---- pkg/cloudprovider/aws/fake/types.go | 70 ++++ pkg/cloudprovider/types.go | 3 + pkg/controllers/consolidation/controller.go | 20 +- pkg/controllers/consolidation/types.go | 19 +- pkg/controllers/controllers.go | 7 +- pkg/test/deployment.go | 20 + pkg/test/expectations/expectations.go | 46 ++- pkg/test/nodes.go | 2 + test/go.mod | 4 +- test/go.sum | 8 +- 24 files changed, 845 insertions(+), 216 deletions(-) create mode 100644 pkg/cloudprovider/aws/fake/types.go diff --git a/charts/karpenter/templates/role.yaml b/charts/karpenter/templates/role.yaml index efd6c57a1577..f4a3cdf3ca83 100644 --- a/charts/karpenter/templates/role.yaml +++ b/charts/karpenter/templates/role.yaml @@ -28,11 +28,6 @@ rules: resourceNames: - karpenter-global-settings - config-logging - - apiGroups: ["apps"] - resources: ["deployments"] - verbs: ["get", "list", "watch", "patch", "update"] - resourceNames: - - {{ include "karpenter.fullname" . 
}} - apiGroups: ["coordination.k8s.io"] resources: ["leases"] verbs: ["patch", "update"] @@ -49,4 +44,4 @@ rules: verbs: ["create"] - apiGroups: [""] resources: ["configmaps"] - verbs: ["create"] \ No newline at end of file + verbs: ["create"] diff --git a/go.mod b/go.mod index 7875fb692d32..21733768a02f 100644 --- a/go.mod +++ b/go.mod @@ -6,9 +6,11 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.44.114 + github.com/cenkalti/backoff/v4 v4.1.3 github.com/deckarep/golang-set v1.8.0 github.com/go-logr/logr v1.2.3 github.com/go-logr/zapr v1.2.3 + github.com/google/uuid v1.3.0 github.com/imdario/mergo v0.3.13 github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.2.0 @@ -56,7 +58,6 @@ require ( github.com/google/gnostic v0.5.7-v3refs // indirect github.com/google/go-cmp v0.5.8 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/uuid v1.3.0 // indirect github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect @@ -77,7 +78,6 @@ require ( go.uber.org/atomic v1.9.0 // indirect go.uber.org/automaxprocs v1.4.0 // indirect golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 // indirect - golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 // indirect golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 // indirect diff --git a/go.sum b/go.sum index 0f3efe6b3834..d7d313f0fc46 100644 --- a/go.sum +++ b/go.sum @@ -76,6 +76,8 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/blendle/zapdriver v1.3.1 h1:C3dydBOWYRiOk+B8X9IVZ5IOe+7cl+tGOexN4QqHfpE= github.com/blendle/zapdriver v1.3.1/go.mod h1:mdXfREi6u5MArG4j9fewC+FGnXaBR+T4Ox4J2u4eHCc= +github.com/cenkalti/backoff/v4 v4.1.3 h1:cFAlzYUlVYDysBEH2T5hyJZMh3+5+WCBvSnK6Q8UtC4= +github.com/cenkalti/backoff/v4 v4.1.3/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.3.0 h1:t/LhUZLVitR1Ow2YOnduCsavhwFUklBMoGVYUCqmCqk= github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= @@ -441,9 +443,7 @@ golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.1/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.4.2/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= -golang.org/x/mod v0.6.0-dev.0.20220106191415-9b9b3d81d5e3/go.mod h1:3p9vT2HGsQu2K1YbXdKPJLVgG5VJdoTa1poYQBtP1AY= golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4 h1:6zppjxzCulZykYSLyVDYbneBfbaBIQPYMevg0bEwv2s= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20181114220301-adae6a3d119a/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ 
-519,7 +519,6 @@ golang.org/x/sync v0.0.0-20201207232520-09787c993a3a/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20210220032951-036812b2e83c/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 h1:uVc8UZUe6tr40fFVnUP5Oj+veunVezqYl9z7DYw9xzw= golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sys v0.0.0-20180823144017-11551d06cbcc/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20180905080454-ebe1bf3edb33/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20181116152217-5ac8a444bdc5/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= @@ -577,7 +576,7 @@ golang.org/x/sys v0.0.0-20210908233432-aa78b53d3365/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211124211545-fe61309f8881/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220209214540-3681064d5158/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab h1:2QkjZIsXupsJbJIdSjjUOgWK3aEtzyuh2mPt3l/CkeU= golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index a68c74e5325d..ce598990bce9 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -24,6 +24,7 @@ import ( "github.com/avast/retry-go" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/sqs" + "github.com/cenkalti/backoff/v4" "go.uber.org/multierr" appsv1 "k8s.io/api/apps/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" @@ -49,6 +50,7 @@ type Controller struct { eventBridgeProvider *aws.EventBridgeProvider mutex *sync.RWMutex + backoff *backoff.ExponentialBackOff readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state ready bool trigger chan struct{} @@ -59,10 +61,6 @@ type Controller struct { // This period can be reduced to a backoffPeriod if there is an error in ensuring the infrastructure const pollingPeriod = time.Hour -// defaultBackoffPeriod is the default period that we go to AWS APIs to ensure that the appropriate AWS infrastructure -// is provisioned if there is an error in the reconciliation loop -const defaultBackoffPeriod = time.Minute * 10 - func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}, cleanupAsync <-chan struct{}) *Controller { @@ -73,14 +71,18 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c sqsProvider: sqsProvider, eventBridgeProvider: eventBridgeProvider, mutex: &sync.RWMutex{}, + 
backoff: newBackoff(), readinessChan: make(chan struct{}), trigger: make(chan struct{}, 1), done: make(chan struct{}), } go func() { - <-cleanupAsync - c.cleanup(cleanupCtx) + select { + case <-cleanupAsync: + c.cleanup(cleanupCtx) + case <-cleanupCtx.Done(): + } close(c.done) }() @@ -95,6 +97,13 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c return c } +func newBackoff() *backoff.ExponentialBackOff { + b := backoff.NewExponentialBackOff() + b.InitialInterval = time.Minute + b.MaxElapsedTime = time.Minute * 20 + return b +} + func (c *Controller) run(ctx context.Context) { logger := logging.FromContext(ctx).Named("infrastructure") ctx = logging.WithLogger(ctx, logger) @@ -103,7 +112,7 @@ func (c *Controller) run(ctx context.Context) { logger.Infof("Shutting down") }() for { - if err := c.ensureInfrastructure(ctx); err != nil { + if err := c.EnsureInfrastructure(ctx); err != nil { logging.FromContext(ctx).Errorf("ensuring infrastructure established, %v", err) c.setReady(ctx, false) backoffPeriod := c.getBackoff(err) @@ -119,6 +128,7 @@ func (c *Controller) run(ctx context.Context) { } } c.setReady(ctx, true) + c.backoff.Reset() select { case <-ctx.Done(): return @@ -150,7 +160,7 @@ func (c *Controller) cleanup(ctx context.Context) { // Deployment is already deleted or currently deleting, so we should cleanup the infrastructure if notFound || !dep.DeletionTimestamp.IsZero() { - if err := retry.Do(func() error { return c.deleteInfrastructure(ctx) }); err != nil { + if err := retry.Do(func() error { return c.DeleteInfrastructure(ctx) }); err != nil { logging.FromContext(ctx).Errorf("Deprovisioning the infrastructure, %v", err) } } @@ -185,8 +195,8 @@ func (c *Controller) setReady(ctx context.Context, ready bool) { if c.ready != ready { logging.FromContext(ctx).Infof("Infrastructure is healthy") c.recorder.InfrastructureHealthy(ctx, c.kubeClient) + close(c.readinessChan) } - close(c.readinessChan) } else { healthy.Set(0) if c.ready != ready { @@ -198,9 +208,9 @@ func (c *Controller) setReady(ctx context.Context, ready bool) { c.ready = ready } -// ensureInfrastructure reconciles the SQS queue and the EventBridge rules with the expected +// EnsureInfrastructure reconciles the SQS queue and the EventBridge rules with the expected // configuration prescribed by Karpenter -func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { +func (c *Controller) EnsureInfrastructure(ctx context.Context) (err error) { defer metrics.Measure(reconcileDuration)() wg := &sync.WaitGroup{} @@ -225,9 +235,9 @@ func (c *Controller) ensureInfrastructure(ctx context.Context) (err error) { return err } -// Delete infrastructure removes the infrastructure that was stood up and reconciled +// DeleteInfrastructure removes the infrastructure that was stood up and reconciled // by the infrastructure controller for SQS message polling -func (c *Controller) deleteInfrastructure(ctx context.Context) (err error) { +func (c *Controller) DeleteInfrastructure(ctx context.Context) (err error) { logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") wg := &sync.WaitGroup{} m := &sync.Mutex{} @@ -314,12 +324,13 @@ func (c *Controller) ensureEventBridge(ctx context.Context) error { func (c *Controller) getBackoff(err error) time.Duration { var awsErr awserr.Error if !errors.As(err, &awsErr) { - return defaultBackoffPeriod + return c.backoff.NextBackOff() } switch awsErr.Code() { case sqs.ErrCodeQueueDeletedRecently: + // We special-case this error since the queue 
can be created here much quicker return time.Minute default: - return defaultBackoffPeriod + return c.backoff.NextBackOff() } } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index 11aa123db21c..4f843beaefd6 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -16,15 +16,21 @@ package infrastructure_test import ( "context" + "fmt" "testing" "time" + "github.com/aws/aws-sdk-go/aws/awserr" + "github.com/aws/aws-sdk-go/service/eventbridge" + "github.com/aws/aws-sdk-go/service/sqs" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" clock "k8s.io/utils/clock/testing" . "knative.dev/pkg/logging/testing" . "github.com/aws/karpenter/pkg/test/expectations" + "github.com/aws/karpenter/pkg/utils/injection" + "github.com/aws/karpenter/pkg/utils/options" "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" @@ -41,39 +47,207 @@ var eventBridgeProvider *aws.EventBridgeProvider var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock var controller *infrastructure.Controller +var startChan chan struct{} var cleanupChan chan struct{} +var opts options.Options + +var defaultOpts = options.Options{ + ClusterName: "test-cluster", + ClusterEndpoint: "https://test-cluster", + AWSNodeNameConvention: string(options.IPName), + AWSENILimitedPodDensity: true, + AWSEnablePodENI: true, + AWSDefaultInstanceProfile: "test-instance-profile", + DeploymentName: test.KarpenterDeployment().Name, + DeploymentNamespace: test.KarpenterDeployment().Namespace, +} func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) + SetDefaultEventuallyTimeout(time.Second * 5) RunSpecs(t, "AWS Notification") } -var _ = BeforeSuite(func() { +var _ = BeforeEach(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { + opts = defaultOpts + Expect(opts.Validate()).To(Succeed(), "Failed to validate options") + e.Ctx = injection.WithOptions(e.Ctx, opts) + fakeClock = clock.NewFakeClock(time.Now()) recorder = awsfake.NewEventRecorder() metadata := aws.NewMetadata("us-east-1", "000000000000") sqsapi = &awsfake.SQSAPI{} eventbridgeapi = &awsfake.EventBridgeAPI{} - sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadata) + sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi, metadata) eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) - cleanupChan = make(chan struct{}) - infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, nil, cleanupChan) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") + ExpectApplied(env.Ctx, env.Client, test.KarpenterDeployment()) + cleanupChan = make(chan struct{}, 1) + startChan = make(chan struct{}) + sqsapi.Reset() + eventbridgeapi.Reset() + controller = infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, startChan, cleanupChan) }) -var _ = AfterSuite(func() { - close(cleanupChan) +var _ = AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) Expect(env.Stop()).To(Succeed(), "Failed to stop environment") + ExpectClosed(cleanupChan) + ExpectClosed(startChan) }) -var _ = BeforeEach(func() { - sqsapi.Reset() - eventbridgeapi.Reset() +var _ = Describe("Reconciliation", func() { + It("should reconcile the queue and the eventbridge 
rules on start", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(1)) // This mocks the queue not existing + Expect(controller.EnsureInfrastructure(env.Ctx)).To(Succeed()) + + Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + }) + It("should reconcile the queue and the eventbridge rules on trigger", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist)) // This mocks the queue not existing + + // Trigger the channel that has been waiting + ExpectClosed(startChan) + + // Reconciliation loop has completed + Eventually(func(g Gomega) { + g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) + g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + g.Expect(IsClosed(controller.Ready())).To(BeTrue()) + }).Should(Succeed()) + + controller.Trigger() // Trigger another reconciliation loop + + // Reconciliation loop has completed + Eventually(func(g Gomega) { + g.Expect(sqsapi.SetQueueAttributesBehavior.SuccessfulCalls()).To(Equal(1)) + g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(8)) + g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(8)) + + g.Expect(IsClosed(controller.Ready())).To(BeTrue()) + }).Should(Succeed()) + }) + It("should throw an error but wait with backoff if we get AccessDenied", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing + sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedCode), awsfake.MaxCalls(0)) + eventbridgeapi.PutRuleBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedExceptionCode), awsfake.MaxCalls(0)) + eventbridgeapi.PutTargetsBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedExceptionCode), awsfake.MaxCalls(0)) + + // Trigger the channel that has been waiting + ExpectClosed(startChan) + Eventually(func(g Gomega) { + g.Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) + g.Expect(eventbridgeapi.PutRuleBehavior.FailedCalls()).To(Equal(4)) + g.Expect(eventbridgeapi.PutTargetsBehavior.FailedCalls()).To(Equal(4)) + + g.Expect(IsClosed(controller.Ready())).To(BeFalse()) + }).Should(Succeed()) + + // Backoff is 10 minutes, so we set the fake clock forward 11 minutes + // Access denied has now been resolved + sqsapi.CreateQueueBehavior.Reset() + eventbridgeapi.PutRuleBehavior.Reset() + eventbridgeapi.PutTargetsBehavior.Reset() + fakeClock.Step(time.Minute * 11) + + // Should reconcile again after failed access denied calls + Eventually(func(g Gomega) { + g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) + g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + + g.Expect(IsClosed(controller.Ready())).To(BeTrue()) + }).Should(Succeed()) + }) + It("should have a shorter backoff if the queue was recently deleted", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing + sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDeletedRecently), awsfake.MaxCalls(0)) + + // Trigger the channel that has been waiting + 
ExpectClosed(startChan) + Eventually(func(g Gomega) { + g.Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) + g.Expect(IsClosed(controller.Ready())).To(BeFalse()) + }).Should(Succeed()) + + // Backoff is 2 minutes, so we set the fake clock forward 3 minutes + // Access denied has now been resolved + sqsapi.CreateQueueBehavior.Reset() + fakeClock.Step(time.Minute * 3) + + // Should reconcile again after failed access denied calls + Eventually(func(g Gomega) { + g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) + g.Expect(IsClosed(controller.Ready())).To(BeTrue()) + }).Should(Succeed()) + }) }) -var _ = AfterEach(func() { - ExpectCleanedUp(ctx, env.Client) + +var _ = Describe("Cleanup", func() { + It("should cleanup the infrastructure when the cleanup channel is triggered", func() { + ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) + ExpectClosed(cleanupChan) + ExpectDone[struct{}](controller) + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) + }) + It("should cleanup when queue is already deleted", func() { + ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) + sqsapi.DeleteQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist)) + ExpectClosed(cleanupChan) + + // Test that we cleanup in a reasonable amount of time with a DoesNotExist error + select { + case <-time.After(time.Second * 2): + Fail("controller should have completed cleanup in time") + case <-controller.Done(): + } + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(0)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) + }) + It("should cleanup when a single rule is already deleted", func() { + ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) + eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code())) + eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code())) + close(cleanupChan) + + // Test that we cleanup in a reasonable amount of time with a DoesNotExist error + select { + case <-time.After(time.Second * 5): + Fail("controller should have completed cleanup in time") + case <-controller.Done(): + } + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(3)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(3)) + }) + It("should cleanup when all rule targets and rules are already deleted", func() { + ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) + eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) + eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) + close(cleanupChan) + + // Test that we cleanup in a reasonable amount of time with a DoesNotExist error + select { + case <-time.After(time.Second * 2): + Fail("controller should have completed cleanup in time") + case <-controller.Done(): + } + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(0)) + 
Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(0)) + }) }) + +func awsErrWithCode(code string) awserr.Error { + return awserr.New(code, "", fmt.Errorf("")) +} diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index ab15e87db709..8fe9c4341867 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -29,6 +29,8 @@ import ( "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/cenkalti/backoff/v4" + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" @@ -43,9 +45,11 @@ type Action = string var Actions = struct { CordonAndDrain, + Cordon, NoAction Action }{ CordonAndDrain: "CordonAndDrain", + Cordon: "Cordon", NoAction: "NoAction", } @@ -60,6 +64,7 @@ type Controller struct { parser event.Parser infraController *infrastructure.Controller + backoff *backoff.ExponentialBackOff } // pollingPeriod that we go to the SQS queue to check if there are any new events @@ -68,6 +73,7 @@ const pollingPeriod = 2 * time.Second func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, cluster *state.Cluster, sqsProvider *aws.SQSProvider, infraController *infrastructure.Controller, startAsync <-chan struct{}) *Controller { + c := &Controller{ kubeClient: kubeClient, cluster: cluster, @@ -76,6 +82,7 @@ func NewController(ctx context.Context, kubeClient client.Client, clk clock.Cloc provider: sqsProvider, parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), infraController: infraController, + backoff: newBackoff(), } go func() { @@ -90,26 +97,42 @@ func NewController(ctx context.Context, kubeClient client.Client, clk clock.Cloc return c } +func newBackoff() *backoff.ExponentialBackOff { + b := backoff.NewExponentialBackOff() + b.InitialInterval = time.Second * 2 + b.MaxElapsedTime = time.Minute * 30 + return b +} + func (c *Controller) run(ctx context.Context) { logger := logging.FromContext(ctx).Named("notification") ctx = logging.WithLogger(ctx, logger) + + defer func() { + logger.Infof("Shutting down") + }() for { <-c.infraController.Ready() // block until the infrastructure is up and ready - err := c.pollSQS(ctx) + err := c.PollSQS(ctx) if err != nil { logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) + select { + case <-ctx.Done(): + return + case <-c.clock.After(c.backoff.NextBackOff()): + continue + } } - + c.backoff.Reset() // We succeeded so reset the backoff period select { case <-ctx.Done(): - logger.Infof("Shutting down") return case <-c.clock.After(pollingPeriod): } } } -func (c *Controller) pollSQS(ctx context.Context) error { +func (c *Controller) PollSQS(ctx context.Context) error { defer metrics.Measure(reconcileDuration.WithLabelValues())() sqsMessages, err := c.provider.GetSQSMessages(ctx) @@ -145,6 +168,13 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string // There's no action to take here since the event doesn't pertain to any of our instances if len(nodes) == 0 { receivedMessages.WithLabelValues(evt.Kind(), "false").Inc() + + // Since there's no action, just delete the message + err = c.provider.DeleteSQSMessage(ctx, msg) + if err != nil { + return fmt.Errorf("failed to delete message from 
queue, %w", err) + } + deletedMessages.WithLabelValues().Inc() return } receivedMessages.WithLabelValues(evt.Kind(), "true").Inc() diff --git a/pkg/cloudprovider/aws/controllers/notification/event/types.go b/pkg/cloudprovider/aws/controllers/notification/event/types.go index 3abe899f9d1e..7c6916478caf 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/types.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/types.go @@ -34,14 +34,12 @@ type Interface interface { type Kind = string var Kinds = struct { - AutoScalingTermination, RebalanceRecommendation, ScheduledChange, SpotInterruption, StateChange, Noop Kind }{ - AutoScalingTermination: "autoScalingTermination", RebalanceRecommendation: "rebalanceRecommendation", ScheduledChange: "scheduledChange", SpotInterruption: "spotInterruption", diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index f73ed4712a51..588886149c5b 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -16,16 +16,35 @@ package notification_test import ( "context" + "encoding/json" + "fmt" + "math/rand" "testing" "time" + awssdk "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/awserr" + "github.com/aws/aws-sdk-go/service/sqs" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clock "k8s.io/utils/clock/testing" . "knative.dev/pkg/logging/testing" + "github.com/google/uuid" + + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" + scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" + spotinterruptionv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0" + statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" . 
"github.com/aws/karpenter/pkg/test/expectations" + "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" @@ -34,48 +53,359 @@ import ( "github.com/aws/karpenter/pkg/test" ) +const ( + defaultAccountID = "000000000000" + defaultInstanceID = "i-08c6fdb11e28c8c90" + defaultRegion = "us-west-2" + ec2Source = "aws.ec2" + healthSource = "aws.health" +) + var ctx context.Context var env *test.Environment var cluster *state.Cluster var sqsapi *awsfake.SQSAPI +var eventbridgeapi *awsfake.EventBridgeAPI var cloudProvider *fake.CloudProvider var sqsProvider *aws.SQSProvider +var eventBridgeProvider *aws.EventBridgeProvider var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock var cfg *test.Config var controller *notification.Controller -var ready func() <-chan struct{} +var infraController *infrastructure.Controller +var nodeStateController *state.NodeController +var infraStartChan chan struct{} +var notificationStartChan chan struct{} func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) + SetDefaultEventuallyTimeout(time.Second * 5) RunSpecs(t, "AWS Notification") } -var _ = BeforeSuite(func() { +var _ = BeforeEach(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { cfg = test.NewConfig() fakeClock = clock.NewFakeClock(time.Now()) cloudProvider = &fake.CloudProvider{} cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) + nodeStateController = state.NewNodeController(env.Client, cluster) recorder = awsfake.NewEventRecorder() metadata := aws.NewMetadata("us-east-1", "000000000000") sqsapi = &awsfake.SQSAPI{} sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadata) + eventbridgeapi = &awsfake.EventBridgeAPI{} + eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) + }) Expect(env.Start()).To(Succeed(), "Failed to start environment") + sqsapi.Reset() + infraStartChan = make(chan struct{}) + notificationStartChan = make(chan struct{}) + infraController = infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan, env.Ctx.Done()) + controller = notification.NewController(env.Ctx, env.Client, fakeClock, recorder, cluster, sqsProvider, infraController, notificationStartChan) }) -var _ = AfterSuite(func() { +var _ = AfterEach(func() { + ExpectCleanedUp(ctx, env.Client) Expect(env.Stop()).To(Succeed(), "Failed to stop environment") }) -var _ = BeforeEach(func() { - sqsapi.Reset() - ready = func() <-chan struct{} { return make(chan struct{}) } - controller = notification.NewController(env.Ctx, env.Client, fakeClock, recorder, cluster, sqsProvider, nil, ready) +var _ = Describe("Processing Messages", func() { + It("should delete the node when receiving a spot interruption warning", func() { + node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(defaultInstanceID), + }) + ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) + ExpectApplied(env.Ctx, env.Client, node) + ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + + Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + ExpectNotFound(env.Ctx, env.Client, node) + 
Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + }) + It("should delete the node when receiving a scheduled change message", func() { + node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(defaultInstanceID), + }) + ExpectMessagesCreated(scheduledChangeMessage(defaultInstanceID)) + ExpectApplied(env.Ctx, env.Client, node) + ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + + Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + ExpectNotFound(env.Ctx, env.Client, node) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + }) + It("should delete the node when receiving a state change message", func() { + var nodes []*v1.Node + var messages []*sqs.Message + for _, state := range []string{"terminated", "stopped", "stopping", "shutting-down"} { + instanceID := makeInstanceID() + nodes = append(nodes, test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(instanceID), + })) + messages = append(messages, stateChangeMessage(instanceID, state)) + } + ExpectMessagesCreated(messages...) + ExpectApplied(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) + + // Wait for the nodes to reconcile with the cluster state + ExpectReconcileSucceeded(env.Ctx, nodeStateController, lo.Map(nodes, func(n *v1.Node, _ int) client.ObjectKey { return client.ObjectKeyFromObject(n) })...) + Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(4)) + }) + It("should handle multiple messages that cause node deletion", func() { + var nodes []*v1.Node + var instanceIDs []string + for i := 0; i < 100; i++ { + instanceIDs = append(instanceIDs, makeInstanceID()) + nodes = append(nodes, test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(instanceIDs[len(instanceIDs)-1]), + })) + + } + + var messages []*sqs.Message + for _, id := range instanceIDs { + messages = append(messages, spotInterruptionMessage(id)) + } + ExpectMessagesCreated(messages...) + ExpectApplied(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) + + // Wait for the nodes to reconcile with the cluster state + ExpectReconcileSucceeded(env.Ctx, nodeStateController, lo.Map(nodes, func(n *v1.Node, _ int) client.ObjectKey { return client.ObjectKeyFromObject(n) })...) + Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) 
+ Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(100)) + }) + It("should not delete a node when not owned by provisioner", func() { + node := test.Node(test.NodeOptions{ + ProviderID: makeProviderID(uuid.NewString()), + }) + ExpectMessagesCreated(spotInterruptionMessage(node.Spec.ProviderID)) + ExpectApplied(env.Ctx, env.Client, node) + ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + + Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + ExpectNodeExists(env.Ctx, env.Client, node.Name) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + }) + It("should delete a message when the message can't be parsed", func() { + badMessage := &sqs.Message{ + Body: awssdk.String(string(lo.Must(json.Marshal(map[string]string{ + "field1": "value1", + "field2": "value2", + })))), + MessageId: awssdk.String(uuid.NewString()), + } + + ExpectMessagesCreated(badMessage) + Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + }) + It("should delete a state change message when the state isn't in accepted states", func() { + node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(defaultInstanceID), + }) + ExpectMessagesCreated(stateChangeMessage(defaultInstanceID, "creating")) + ExpectApplied(env.Ctx, env.Client, node) + ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + + Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + ExpectNodeExists(env.Ctx, env.Client, node.Name) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + }) }) -var _ = AfterEach(func() { - ExpectCleanedUp(ctx, env.Client) + +var _ = Describe("Error Handling", func() { + BeforeEach(func() { + // This ensures that the readiness gate is set to ready when we start the test + ExpectClosed(infraStartChan) + }) + + It("should send an error on polling when AccessDenied", func() { + sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedCode), awsfake.MaxCalls(0)) + Expect(controller.PollSQS(env.Ctx)).ToNot(Succeed()) + }) + It("should trigger a infrastructure reconciliation on SQS queue doesn't exist", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing + + // Infrastructure reconciliation loop has completed + Eventually(func(g Gomega) { + g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) + g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + g.Expect(IsClosed(infraController.Ready())).To(BeTrue()) + }).Should(Succeed()) + + sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist)) // This mocks the queue being deleted manually after infra reconciliation + + // This should fail with an error since the queue doesn't exist + Expect(controller.PollSQS(env.Ctx)).ToNot(Succeed()) + + Eventually(func(g Gomega) { + g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(2)) + g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(8)) + g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(8)) + g.Expect(IsClosed(infraController.Ready())).To(BeTrue()) + }).Should(Succeed()) + }) }) + +var _ = Describe("Infrastructure Coordination", func() { 
+ It("should wait for the infrastructure to be ready before polling SQS", func() { + ExpectClosed(notificationStartChan) + Expect(IsClosed(infraController.Ready())).To(BeFalse()) + Consistently(func(g Gomega) { + g.Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(Equal(0)) + g.Expect(sqsapi.ReceiveMessageBehavior.FailedCalls()).To(Equal(0)) + }, time.Second*10).Should(Succeed()) + + ExpectClosed(infraStartChan) + + Eventually(func(g Gomega) { + g.Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(BeNumerically(">", 0)) + }, time.Second*10).Should(Succeed()) + }) +}) + +func ExpectMessagesCreated(messages ...*sqs.Message) { + sqsapi.ReceiveMessageBehavior.Output.Set( + &sqs.ReceiveMessageOutput{ + Messages: messages, + }, + ) +} + +func awsErrWithCode(code string) awserr.Error { + return awserr.New(code, "", fmt.Errorf("")) +} + +func spotInterruptionMessage(involvedInstanceID string) *sqs.Message { + evt := spotinterruptionv0.AWSEvent{ + AWSMetadata: event.AWSMetadata{ + Version: "0", + Account: defaultAccountID, + DetailType: "EC2 Spot Instance Interruption Warning", + ID: uuid.NewString(), + Region: defaultRegion, + Resources: []string{ + fmt.Sprintf("arn:aws:ec2:%s:instance/%s", defaultRegion, involvedInstanceID), + }, + Source: ec2Source, + Time: time.Now(), + }, + Detail: spotinterruptionv0.EC2SpotInstanceInterruptionWarningDetail{ + InstanceID: involvedInstanceID, + InstanceAction: "terminate", + }, + } + return &sqs.Message{ + Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), + MessageId: awssdk.String(uuid.NewString()), + } +} + +func stateChangeMessage(involvedInstanceID, state string) *sqs.Message { + evt := statechangev0.AWSEvent{ + AWSMetadata: event.AWSMetadata{ + Version: "0", + Account: defaultAccountID, + DetailType: "EC2 Instance State-change Notification", + ID: uuid.NewString(), + Region: defaultRegion, + Resources: []string{ + fmt.Sprintf("arn:aws:ec2:%s:instance/%s", defaultRegion, involvedInstanceID), + }, + Source: ec2Source, + Time: time.Now(), + }, + Detail: statechangev0.EC2InstanceStateChangeNotificationDetail{ + InstanceID: involvedInstanceID, + State: state, + }, + } + return &sqs.Message{ + Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), + MessageId: awssdk.String(uuid.NewString()), + } +} + +// TODO: Update the scheudled change message to accurately reflect a real health event +func scheduledChangeMessage(involvedInstanceID string) *sqs.Message { + evt := scheduledchangev0.AWSEvent{ + AWSMetadata: event.AWSMetadata{ + Version: "0", + Account: defaultAccountID, + DetailType: "AWS Health Event", + ID: uuid.NewString(), + Region: defaultRegion, + Resources: []string{ + fmt.Sprintf("arn:aws:ec2:%s:instance/%s", defaultRegion, involvedInstanceID), + }, + Source: healthSource, + Time: time.Now(), + }, + Detail: scheduledchangev0.AWSHealthEventDetail{ + Service: "EC2", + EventTypeCategory: "scheduledChange", + AffectedEntities: []scheduledchangev0.AffectedEntity{ + { + EntityValue: involvedInstanceID, + }, + }, + }, + } + return &sqs.Message{ + Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), + MessageId: awssdk.String(uuid.NewString()), + } +} + +func makeProviderID(instanceID string) string { + return fmt.Sprintf("aws:///%s/%s", defaultRegion, instanceID) +} + +var runes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") + +// nolint:gosec +func randStringRunes(n int) string { + b := make([]rune, n) + for i := range b { + b[i] = runes[rand.Intn(len(runes))] + } + return string(b) +} + +func 
makeInstanceID() string { + return fmt.Sprintf("i-%s", randStringRunes(17)) +} diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 17dfbd0cf6fe..decb7050d26c 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -30,7 +30,7 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller rec := events.NewRecorder(opts.Recorder) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) - // Injecting the cloudprovider-specific controllers that will start when opts.StartAsync is triggered + // Injecting the AWS-specific controllers that will start when opts.StartAsync is triggered infraController := infrastructure.NewController(ctx, opts.BaseContext(), opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), infraController, opts.StartAsync) return infraController.Done() diff --git a/pkg/cloudprovider/aws/errors.go b/pkg/cloudprovider/aws/errors.go index fb1f6a8d5098..aceea11abc16 100644 --- a/pkg/cloudprovider/aws/errors.go +++ b/pkg/cloudprovider/aws/errors.go @@ -26,6 +26,8 @@ import ( const ( launchTemplateNotFoundCode = "InvalidLaunchTemplateName.NotFoundException" + AccessDeniedCode = "AccessDenied" + AccessDeniedExceptionCode = "AccessDeniedException" ) var ( @@ -45,8 +47,8 @@ var ( "Unsupported", } accessDeniedErrorCodes = []string{ - "AccessDenied", - "AccessDeniedException", + AccessDeniedCode, + AccessDeniedExceptionCode, } ) diff --git a/pkg/cloudprovider/aws/fake/atomic.go b/pkg/cloudprovider/aws/fake/atomic.go index e5958a3e46ad..9cd9170d69e1 100644 --- a/pkg/cloudprovider/aws/fake/atomic.go +++ b/pkg/cloudprovider/aws/fake/atomic.go @@ -18,6 +18,7 @@ import ( "bytes" "encoding/json" "log" + "math" "sync" ) @@ -71,6 +72,9 @@ func (a *AtomicPtr[T]) Reset() { type AtomicError struct { mu sync.Mutex err error + + calls int + maxCalls int } func (e *AtomicError) Reset() { @@ -85,16 +89,40 @@ func (e *AtomicError) IsNil() bool { return e.err == nil } +// Get is equivalent to the error being called, so we increase +// number of calls in this function func (e *AtomicError) Get() error { e.mu.Lock() defer e.mu.Unlock() + if e.calls >= e.maxCalls { + return nil + } + e.calls++ return e.err } -func (e *AtomicError) Set(err error) { +func (e *AtomicError) Set(err error, opts ...AtomicErrorOption) { e.mu.Lock() defer e.mu.Unlock() e.err = err + for _, opt := range opts { + opt(e) + } + if e.maxCalls == 0 { + e.maxCalls = 1 + } +} + +type AtomicErrorOption func(atomicError *AtomicError) + +func MaxCalls(maxCalls int) AtomicErrorOption { + // Setting to 0 is equivalent to allowing infinite errors to API + if maxCalls <= 0 { + maxCalls = math.MaxInt + } + return func(e *AtomicError) { + e.maxCalls = maxCalls + } } // AtomicPtrSlice exposes a slice of a pointer type in a race-free manner. The interface is just enough to replace the diff --git a/pkg/cloudprovider/aws/fake/eventbridgeapi.go b/pkg/cloudprovider/aws/fake/eventbridgeapi.go index 254099f32827..e0a4c981771b 100644 --- a/pkg/cloudprovider/aws/fake/eventbridgeapi.go +++ b/pkg/cloudprovider/aws/fake/eventbridgeapi.go @@ -25,14 +25,10 @@ import ( // EventBridgeBehavior must be reset between tests otherwise tests will // pollute each other. 
type EventBridgeBehavior struct { - PutRuleOutput AtomicPtr[eventbridge.PutRuleOutput] - PutTargetsOutput AtomicPtr[eventbridge.PutTargetsOutput] - - CalledWithPutRuleInput AtomicPtrSlice[eventbridge.PutRuleInput] - CalledWithPutTargetsInput AtomicPtrSlice[eventbridge.PutTargetsInput] - CalledWithDeleteRuleInput AtomicPtrSlice[eventbridge.DeleteRuleInput] - CalledWithRemoveTargetsInput AtomicPtrSlice[eventbridge.RemoveTargetsInput] - NextError AtomicError + PutRuleBehavior MockedFunction[eventbridge.PutRuleInput, eventbridge.PutRuleOutput] + PutTargetsBehavior MockedFunction[eventbridge.PutTargetsInput, eventbridge.PutTargetsOutput] + DeleteRuleBehavior MockedFunction[eventbridge.DeleteRuleInput, eventbridge.DeleteRuleOutput] + RemoveTargetsBehavior MockedFunction[eventbridge.RemoveTargetsInput, eventbridge.RemoveTargetsOutput] } type EventBridgeAPI struct { @@ -43,59 +39,26 @@ type EventBridgeAPI struct { // Reset must be called between tests otherwise tests will pollute // each other. func (eb *EventBridgeAPI) Reset() { - eb.PutTargetsOutput.Reset() - eb.PutTargetsOutput.Reset() - eb.CalledWithPutRuleInput.Reset() - eb.CalledWithPutTargetsInput.Reset() - eb.CalledWithDeleteRuleInput.Reset() - eb.CalledWithRemoveTargetsInput.Reset() - eb.NextError.Reset() + eb.PutRuleBehavior.Reset() + eb.PutTargetsBehavior.Reset() + eb.DeleteRuleBehavior.Reset() + eb.RemoveTargetsBehavior.Reset() } // TODO: Create a dummy rule ARN for the default that is returned from this function func (eb *EventBridgeAPI) PutRuleWithContext(_ context.Context, input *eventbridge.PutRuleInput, _ ...request.Option) (*eventbridge.PutRuleOutput, error) { - if !eb.NextError.IsNil() { - defer eb.NextError.Reset() - return nil, eb.NextError.Get() - } - eb.CalledWithPutRuleInput.Add(input) - - if !eb.PutRuleOutput.IsNil() { - return eb.PutRuleOutput.Clone(), nil - } - return &eventbridge.PutRuleOutput{}, nil + return eb.PutRuleBehavior.Invoke(input) } // TODO: Create a default response that returns failed entries func (eb *EventBridgeAPI) PutTargetsWithContext(_ context.Context, input *eventbridge.PutTargetsInput, _ ...request.Option) (*eventbridge.PutTargetsOutput, error) { - if !eb.NextError.IsNil() { - defer eb.NextError.Reset() - return nil, eb.NextError.Get() - } - eb.CalledWithPutTargetsInput.Add(input) - - if !eb.PutTargetsOutput.IsNil() { - return eb.PutTargetsOutput.Clone(), nil - } - return &eventbridge.PutTargetsOutput{}, nil + return eb.PutTargetsBehavior.Invoke(input) } func (eb *EventBridgeAPI) DeleteRuleWithContext(_ context.Context, input *eventbridge.DeleteRuleInput, _ ...request.Option) (*eventbridge.DeleteRuleOutput, error) { - if !eb.NextError.IsNil() { - defer eb.NextError.Reset() - return nil, eb.NextError.Get() - } - eb.CalledWithDeleteRuleInput.Add(input) - - return &eventbridge.DeleteRuleOutput{}, nil + return eb.DeleteRuleBehavior.Invoke(input) } func (eb *EventBridgeAPI) RemoveTargetsWithContext(_ context.Context, input *eventbridge.RemoveTargetsInput, _ ...request.Option) (*eventbridge.RemoveTargetsOutput, error) { - if !eb.NextError.IsNil() { - defer eb.NextError.Reset() - return nil, eb.NextError.Get() - } - eb.CalledWithRemoveTargetsInput.Add(input) - - return &eventbridge.RemoveTargetsOutput{}, nil + return eb.RemoveTargetsBehavior.Invoke(input) } diff --git a/pkg/cloudprovider/aws/fake/eventrecorder.go b/pkg/cloudprovider/aws/fake/eventrecorder.go index 95299220d701..2b4b2f41f155 100644 --- a/pkg/cloudprovider/aws/fake/eventrecorder.go +++ b/pkg/cloudprovider/aws/fake/eventrecorder.go @@ -15,16 
+15,37 @@ limitations under the License. package fake import ( - "github.com/aws/karpenter/pkg/cloudprovider/aws/events" + "context" + + v1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/aws/karpenter/pkg/test" ) // EventRecorder is a mock event recorder that is used to facilitate testing. type EventRecorder struct { - events.Recorder test.EventRecorder } +func (e *EventRecorder) EC2SpotInterruptionWarning(_ *v1.Node) {} + +func (e *EventRecorder) EC2SpotRebalanceRecommendation(_ *v1.Node) {} + +func (e *EventRecorder) EC2HealthWarning(_ *v1.Node) {} + +func (e *EventRecorder) EC2StateChange(_ *v1.Node) {} + +func (e *EventRecorder) TerminatingNodeOnNotification(_ *v1.Node) {} + +func (e *EventRecorder) InfrastructureUnhealthy(_ context.Context, _ client.Client) {} + +func (e *EventRecorder) InfrastructureHealthy(_ context.Context, _ client.Client) {} + +func (e *EventRecorder) InfrastructureDeletionSucceeded(_ context.Context, _ client.Client) {} + +func (e *EventRecorder) InfrastructureDeletionFailed(_ context.Context, _ client.Client) {} + func NewEventRecorder() *EventRecorder { return &EventRecorder{ EventRecorder: *test.NewEventRecorder(), diff --git a/pkg/cloudprovider/aws/fake/sqsapi.go b/pkg/cloudprovider/aws/fake/sqsapi.go index 005e070f2153..57223d8f2afa 100644 --- a/pkg/cloudprovider/aws/fake/sqsapi.go +++ b/pkg/cloudprovider/aws/fake/sqsapi.go @@ -30,16 +30,12 @@ const ( // SQSBehavior must be reset between tests otherwise tests will // pollute each other. type SQSBehavior struct { - CreateQueueOutput AtomicPtr[sqs.CreateQueueOutput] - GetQueueURLOutput AtomicPtr[sqs.GetQueueUrlOutput] - ReceiveMessageOutput AtomicPtr[sqs.ReceiveMessageOutput] - CalledWithCreateQueueInput AtomicPtrSlice[sqs.CreateQueueInput] - CalledWithGetQueueURLInput AtomicPtrSlice[sqs.GetQueueUrlInput] - CalledWithSetQueueAttributesInput AtomicPtrSlice[sqs.SetQueueAttributesInput] - CalledWithReceiveMessageInput AtomicPtrSlice[sqs.ReceiveMessageInput] - CalledWithDeleteMessageInput AtomicPtrSlice[sqs.DeleteMessageInput] - CalledWithDeleteQueueInput AtomicPtrSlice[sqs.DeleteQueueInput] - NextError AtomicError + CreateQueueBehavior MockedFunction[sqs.CreateQueueInput, sqs.CreateQueueOutput] + GetQueueURLBehavior MockedFunction[sqs.GetQueueUrlInput, sqs.GetQueueUrlOutput] + SetQueueAttributesBehavior MockedFunction[sqs.SetQueueAttributesInput, sqs.SetQueueAttributesOutput] + ReceiveMessageBehavior MockedFunction[sqs.ReceiveMessageInput, sqs.ReceiveMessageOutput] + DeleteMessageBehavior MockedFunction[sqs.DeleteMessageInput, sqs.DeleteMessageOutput] + DeleteQueueBehavior MockedFunction[sqs.DeleteQueueInput, sqs.DeleteQueueOutput] } type SQSAPI struct { @@ -50,90 +46,39 @@ type SQSAPI struct { // Reset must be called between tests otherwise tests will pollute // each other. 
func (s *SQSAPI) Reset() { - s.CreateQueueOutput.Reset() - s.GetQueueURLOutput.Reset() - s.ReceiveMessageOutput.Reset() - s.CalledWithCreateQueueInput.Reset() - s.CalledWithGetQueueURLInput.Reset() - s.CalledWithSetQueueAttributesInput.Reset() - s.CalledWithReceiveMessageInput.Reset() - s.CalledWithDeleteMessageInput.Reset() - s.CalledWithDeleteQueueInput.Reset() - s.NextError.Reset() + s.CreateQueueBehavior.Reset() + s.GetQueueURLBehavior.Reset() + s.SetQueueAttributesBehavior.Reset() + s.ReceiveMessageBehavior.Reset() + s.DeleteMessageBehavior.Reset() + s.DeleteQueueBehavior.Reset() } func (s *SQSAPI) CreateQueueWithContext(_ context.Context, input *sqs.CreateQueueInput, _ ...request.Option) (*sqs.CreateQueueOutput, error) { - if !s.NextError.IsNil() { - defer s.NextError.Reset() - return nil, s.NextError.Get() - } - s.CalledWithCreateQueueInput.Add(input) - - if !s.CreateQueueOutput.IsNil() { - return s.CreateQueueOutput.Clone(), nil - } - return &sqs.CreateQueueOutput{ + return s.CreateQueueBehavior.WithDefault(&sqs.CreateQueueOutput{ QueueUrl: aws.String(dummyQueueURL), - }, nil + }).Invoke(input) } //nolint:revive,stylecheck func (s *SQSAPI) GetQueueUrlWithContext(_ context.Context, input *sqs.GetQueueUrlInput, _ ...request.Option) (*sqs.GetQueueUrlOutput, error) { - if !s.NextError.IsNil() { - defer s.NextError.Reset() - return nil, s.NextError.Get() - } - s.CalledWithGetQueueURLInput.Add(input) - - if !s.GetQueueURLOutput.IsNil() { - return s.GetQueueURLOutput.Clone(), nil - } - return &sqs.GetQueueUrlOutput{ + return s.GetQueueURLBehavior.WithDefault(&sqs.GetQueueUrlOutput{ QueueUrl: aws.String(dummyQueueURL), - }, nil + }).Invoke(input) } func (s *SQSAPI) SetQueueAttributesWithContext(_ context.Context, input *sqs.SetQueueAttributesInput, _ ...request.Option) (*sqs.SetQueueAttributesOutput, error) { - if !s.NextError.IsNil() { - defer s.NextError.Reset() - return nil, s.NextError.Get() - } - s.CalledWithSetQueueAttributesInput.Add(input) - - return &sqs.SetQueueAttributesOutput{}, nil + return s.SetQueueAttributesBehavior.Invoke(input) } func (s *SQSAPI) ReceiveMessageWithContext(_ context.Context, input *sqs.ReceiveMessageInput, _ ...request.Option) (*sqs.ReceiveMessageOutput, error) { - if !s.NextError.IsNil() { - defer s.NextError.Reset() - return nil, s.NextError.Get() - } - s.CalledWithReceiveMessageInput.Add(input) - - if !s.ReceiveMessageOutput.IsNil() { - return s.ReceiveMessageOutput.Clone(), nil - } - return &sqs.ReceiveMessageOutput{ - Messages: []*sqs.Message{}, - }, nil + return s.ReceiveMessageBehavior.Invoke(input) } func (s *SQSAPI) DeleteMessageWithContext(_ context.Context, input *sqs.DeleteMessageInput, _ ...request.Option) (*sqs.DeleteMessageOutput, error) { - if !s.NextError.IsNil() { - defer s.NextError.Reset() - return nil, s.NextError.Get() - } - s.CalledWithDeleteMessageInput.Add(input) - - return &sqs.DeleteMessageOutput{}, nil + return s.DeleteMessageBehavior.Invoke(input) } func (s *SQSAPI) DeleteQueueWithContext(_ context.Context, input *sqs.DeleteQueueInput, _ ...request.Option) (*sqs.DeleteQueueOutput, error) { - if !s.NextError.IsNil() { - defer s.NextError.Reset() - return nil, s.NextError.Get() - } - s.CalledWithDeleteQueueInput.Add(input) - - return &sqs.DeleteQueueOutput{}, nil + return s.DeleteQueueBehavior.Invoke(input) } diff --git a/pkg/cloudprovider/aws/fake/types.go b/pkg/cloudprovider/aws/fake/types.go new file mode 100644 index 000000000000..2a5c0a707315 --- /dev/null +++ b/pkg/cloudprovider/aws/fake/types.go @@ -0,0 +1,70 @@ +/* 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fake + +import ( + "sync/atomic" +) + +type MockedFunction[I any, O any] struct { + Output AtomicPtr[O] // Output to return on call to this function + CalledWithInput AtomicPtrSlice[I] // Slice used to keep track of passed input to this function + Error AtomicError // Error to return a certain number of times defined by custom error options + + defaultOutput AtomicPtr[O] // Default output stores the default output if Output isn't set + successfulCalls atomic.Int32 // Internal construct to keep track of the number of times this function has successfully been called + failedCalls atomic.Int32 // Internal construct to keep track of the number of times this function has failed (with error) +} + +// Reset must be called between tests otherwise tests will pollute +// each other. +func (m *MockedFunction[I, O]) Reset() { + m.Output.Reset() + m.CalledWithInput.Reset() + m.Error.Reset() + m.successfulCalls.Store(0) + m.failedCalls.Store(0) +} + +func (m *MockedFunction[I, O]) WithDefault(output *O) *MockedFunction[I, O] { + m.defaultOutput.Set(output) + return m +} + +func (m *MockedFunction[I, O]) Invoke(input *I) (*O, error) { + err := m.Error.Get() + if err != nil { + m.failedCalls.Add(1) + return nil, err + } + m.CalledWithInput.Add(input) + m.successfulCalls.Add(1) + + if !m.Output.IsNil() { + return m.Output.Clone(), nil + } + if !m.defaultOutput.IsNil() { + return m.defaultOutput.Clone(), nil + } + return new(O), nil +} + +func (m *MockedFunction[I, O]) SuccessfulCalls() int { + return int(m.successfulCalls.Load()) +} + +func (m *MockedFunction[I, O]) FailedCalls() int { + return int(m.failedCalls.Load()) +} diff --git a/pkg/cloudprovider/types.go b/pkg/cloudprovider/types.go index 0dea45be150f..16f4bfff3f58 100644 --- a/pkg/cloudprovider/types.go +++ b/pkg/cloudprovider/types.go @@ -36,6 +36,9 @@ type Options struct { // StartAsync is a channel that is closed when leader election has been won. This is a signal to start any async // processing that should only occur while the cloud provider is the leader. StartAsync <-chan struct{} + // CleanupAsync is a channel that is closed when pod termination is triggered. This is a signal to start any async + // processing that should occur on cleanup + CleanupAsync <-chan struct{} } // CloudProvider interface is implemented by cloud providers to support provisioning. 
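A note on the MockedFunction fake added above: it replaces the per-call Output/CalledWith/NextError plumbing with a single generic type per API call, so tests inject outputs or errors and assert on call counts through the same object. Below is a minimal sketch of how a test might drive it, assuming the fake SQSAPI from this patch and the awsfake import alias used in the notification suite; the test name and message ID are illustrative only.

package fake_test

import (
	"context"
	"testing"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/awserr"
	"github.com/aws/aws-sdk-go/service/sqs"

	awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake"
)

func TestMockedReceiveMessage(t *testing.T) {
	sqsapi := &awsfake.SQSAPI{}
	// Fail the first ReceiveMessage call only; MaxCalls(1) limits how many times the injected error is returned.
	sqsapi.ReceiveMessageBehavior.Error.Set(awserr.New(sqs.ErrCodeQueueDoesNotExist, "", nil), awsfake.MaxCalls(1))
	// Subsequent calls fall back to this canned output.
	sqsapi.ReceiveMessageBehavior.Output.Set(&sqs.ReceiveMessageOutput{
		Messages: []*sqs.Message{{MessageId: aws.String("illustrative-message-id")}},
	})

	if _, err := sqsapi.ReceiveMessageWithContext(context.Background(), &sqs.ReceiveMessageInput{}); err == nil {
		t.Fatal("expected the first call to return the injected error")
	}
	out, err := sqsapi.ReceiveMessageWithContext(context.Background(), &sqs.ReceiveMessageInput{})
	if err != nil || len(out.Messages) != 1 {
		t.Fatalf("expected the canned output on the second call, got %v", err)
	}
	// The internal counters back assertions such as DeleteMessageBehavior.SuccessfulCalls() in the notification suite.
	if sqsapi.ReceiveMessageBehavior.FailedCalls() != 1 || sqsapi.ReceiveMessageBehavior.SuccessfulCalls() != 1 {
		t.Fatal("unexpected call counts")
	}
}

The EventBridgeAPI fake is exercised the same way, which is what lets the suites above assert directly on PutRuleBehavior.SuccessfulCalls() and PutTargetsBehavior.SuccessfulCalls().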
diff --git a/pkg/controllers/consolidation/controller.go b/pkg/controllers/consolidation/controller.go index 2a78a81a4a7d..83d18fdbe2c8 100644 --- a/pkg/controllers/consolidation/controller.go +++ b/pkg/controllers/consolidation/controller.go @@ -381,14 +381,14 @@ func (c *Controller) launchReplacementNode(ctx context.Context, action consolida oldNode := action.oldNodes[0] // cordon the node before we launch the replacement to prevent new pods from scheduling to the node - if err := c.setNodeUnschedulable(ctx, action.oldNodes[0].Name, true); err != nil { - return fmt.Errorf("cordoning node %s, %w", action.oldNodes[0].Name, err) + if err := c.setNodeUnschedulable(ctx, oldNode.Name, true); err != nil { + return fmt.Errorf("cordoning node %s, %w", oldNode.Name, err) } - nodeNames, err := c.provisioner.LaunchNodes(ctx, provisioning.LaunchOptions{RecordPodNomination: false}, action.replacementNodes...) + nodeNames, err := c.provisioner.LaunchNodes(ctx, provisioning.LaunchOptions{RecordPodNomination: false}, action.replacementNode) if err != nil { // uncordon the node as the launch may fail (e.g. ICE or incompatible AMI) - err = multierr.Append(err, c.setNodeUnschedulable(ctx, action.oldNodes[0].Name, false)) + err = multierr.Append(err, c.setNodeUnschedulable(ctx, oldNode.Name, false)) return err } if len(nodeNames) != 1 { @@ -420,7 +420,8 @@ func (c *Controller) launchReplacementNode(ctx context.Context, action consolida return nil }, waitRetryOptions...); err != nil { // node never become ready, so uncordon the node we were trying to delete and report the error - return multierr.Combine(c.setNodeUnschedulable(ctx, action.oldNodes[0].Name, false), + c.cluster.UnmarkForDeletion(oldNode.Name) + return multierr.Combine(c.setNodeUnschedulable(ctx, oldNode.Name, false), fmt.Errorf("timed out checking node readiness, %w", err)) } return nil @@ -555,12 +556,11 @@ func (c *Controller) nodeConsolidationOptionReplaceOrDelete(ctx context.Context, return consolidationAction{result: consolidateResultNotPossible}, nil } - // We know the length of newNodes is 1 from above so this should only launch a single node return consolidationAction{ - oldNodes: []*v1.Node{node.Node}, - disruptionCost: disruptionCost(ctx, node.pods), - result: consolidateResultReplace, - replacementNodes: newNodes, + oldNodes: []*v1.Node{node.Node}, + disruptionCost: disruptionCost(ctx, node.pods), + result: consolidateResultReplace, + replacementNode: newNodes[0], }, nil } diff --git a/pkg/controllers/consolidation/types.go b/pkg/controllers/consolidation/types.go index 25fb9f4587d7..5f73dec2060f 100644 --- a/pkg/controllers/consolidation/types.go +++ b/pkg/controllers/consolidation/types.go @@ -64,10 +64,10 @@ func (r consolidateResult) String() string { } type consolidationAction struct { - oldNodes []*v1.Node - disruptionCost float64 - result consolidateResult - replacementNodes []*scheduling.Node + oldNodes []*v1.Node + disruptionCost float64 + result consolidateResult + replacementNode *scheduling.Node } func (o consolidationAction) String() string { @@ -82,14 +82,9 @@ func (o consolidationAction) String() string { fmt.Fprintf(&buf, "/%s", instanceType) } } - // TODO: Improve the stringify method here for getting all the nodes - if o.replacementNodes != nil { - if len(o.replacementNodes) == 1 { - fmt.Fprintf(&buf, " and replacing with a node from types %s", - scheduling.InstanceTypeList(o.replacementNodes[0].InstanceTypeOptions)) - } else { - fmt.Fprintf(&buf, " and replacing with multiple nodes") - } + if o.replacementNode != 
nil { + fmt.Fprintf(&buf, " and replacing with a node from types %s", + scheduling.InstanceTypeList(o.replacementNode.InstanceTypeOptions)) } return buf.String() } diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index a5a29f0ab4e2..40a4376b7378 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -147,7 +147,12 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) if opts.EnableProfiling { utilruntime.Must(registerPprof(manager)) } - cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected()}) + cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ + ClientSet: clientSet, + KubeClient: manager.GetClient(), + StartAsync: manager.Elected(), + CleanupAsync: cleanup, + }) if hp, ok := cloudProvider.(HealthCheck); ok { utilruntime.Must(manager.AddHealthzCheck("cloud-provider", hp.LivenessProbe)) } diff --git a/pkg/test/deployment.go b/pkg/test/deployment.go index 6619acc6b87a..a64307bf5a36 100644 --- a/pkg/test/deployment.go +++ b/pkg/test/deployment.go @@ -65,3 +65,23 @@ func Deployment(overrides ...DeploymentOptions) *appsv1.Deployment { } return dep } + +func KarpenterDeployment(overrides ...DeploymentOptions) *appsv1.Deployment { + options := DeploymentOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: "karpenter", + Namespace: "default", + }, + Labels: map[string]string{ + "app.kubernetes.io/name": "karpenter", + "app.kubernetes.io/instance": "karpenter", + }, + Replicas: 2, + } + for _, opts := range overrides { + if err := mergo.Merge(&options, opts, mergo.WithOverride); err != nil { + panic(fmt.Sprintf("Failed to merge deployment options: %s", err)) + } + } + return Deployment(options) +} diff --git a/pkg/test/expectations/expectations.go b/pkg/test/expectations/expectations.go index c538c89b4651..cf633b5d9059 100644 --- a/pkg/test/expectations/expectations.go +++ b/pkg/test/expectations/expectations.go @@ -20,6 +20,7 @@ import ( "fmt" "math/rand" "sync" + "sync/atomic" "time" "github.com/onsi/ginkgo/v2" @@ -145,6 +146,7 @@ func ExpectCleanedUp(ctx context.Context, c client.Client) { } for _, object := range []client.Object{ &v1.Pod{}, + &appsv1.Deployment{}, &v1.Node{}, &appsv1.DaemonSet{}, &v1beta1.PodDisruptionBudget{}, @@ -210,10 +212,21 @@ func ExpectProvisionedNoBindingWithOffset(offset int, ctx context.Context, c cli return } -func ExpectReconcileSucceeded(ctx context.Context, reconciler reconcile.Reconciler, key client.ObjectKey) reconcile.Result { - result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: key}) - ExpectWithOffset(1, err).ToNot(HaveOccurred()) - return result +func ExpectReconcileSucceeded(ctx context.Context, reconciler reconcile.Reconciler, keys ...client.ObjectKey) reconcile.Result { + // Return the result of the last key for backwards-compatibility + var lastResult atomic.Pointer[reconcile.Result] + wg := &sync.WaitGroup{} + for _, key := range keys { + wg.Add(1) + go func(k client.ObjectKey) { + defer wg.Done() + result, err := reconciler.Reconcile(ctx, reconcile.Request{NamespacedName: k}) + ExpectWithOffset(1, err).ToNot(HaveOccurred()) + lastResult.Store(&result) + }(key) + } + wg.Wait() + return *lastResult.Load() } func ExpectMetric(prefix string) *prometheus.MetricFamily { @@ -272,3 +285,28 @@ func ExpectSkew(ctx context.Context, c client.Client, namespace string, constrai } return ExpectWithOffset(1, skew) } 
+ +type Completable[T any] interface { + Done() <-chan T +} + +// ExpectDone waits on a done channel until the Completable is done +func ExpectDone[T any](c Completable[T]) { + <-c.Done() +} + +// ExpectClosed closes a channel if it isn't already closed +func ExpectClosed[T any](ch chan T) { + if !IsClosed(ch) { + close(ch) + } +} + +func IsClosed[T any](ch <-chan T) bool { + select { + case <-ch: + return true + default: + } + return false +} diff --git a/pkg/test/nodes.go b/pkg/test/nodes.go index 88463876ccdb..b68647cfb2d0 100644 --- a/pkg/test/nodes.go +++ b/pkg/test/nodes.go @@ -28,6 +28,7 @@ type NodeOptions struct { ReadyReason string Conditions []v1.NodeCondition Unschedulable bool + ProviderID string Taints []v1.Taint Allocatable v1.ResourceList } @@ -47,6 +48,7 @@ func Node(overrides ...NodeOptions) *v1.Node { Spec: v1.NodeSpec{ Unschedulable: options.Unschedulable, Taints: options.Taints, + ProviderID: options.ProviderID, }, Status: v1.NodeStatus{ Allocatable: options.Allocatable, diff --git a/test/go.mod b/test/go.mod index 86100440a34d..f635adb8d342 100644 --- a/test/go.mod +++ b/test/go.mod @@ -68,13 +68,13 @@ require ( go.opencensus.io v0.23.0 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/automaxprocs v1.4.0 // indirect - go.uber.org/multierr v1.7.0 // indirect + go.uber.org/multierr v1.8.0 // indirect go.uber.org/zap v1.23.0 // indirect golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 // indirect golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect golang.org/x/oauth2 v0.0.0-20220223155221-ee480838109b // indirect golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4 // indirect - golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f // indirect + golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab // indirect golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 // indirect golang.org/x/text v0.3.7 // indirect golang.org/x/time v0.0.0-20220609170525-579cf78fd858 // indirect diff --git a/test/go.sum b/test/go.sum index 10f192db4295..ddab94baa6cf 100644 --- a/test/go.sum +++ b/test/go.sum @@ -379,8 +379,8 @@ go.uber.org/automaxprocs v1.4.0 h1:CpDZl6aOlLhReez+8S3eEotD7Jx0Os++lemPlMULQP0= go.uber.org/automaxprocs v1.4.0/go.mod h1:/mTEdr7LvHhs0v7mjdxDreTz1OG5zdZGqgOnhWiR/+Q= go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= -go.uber.org/multierr v1.7.0 h1:zaiO/rmgFjbmCXdSYJWQcdvOCsthmdaHfr3Gm2Kx4Ec= -go.uber.org/multierr v1.7.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak= +go.uber.org/multierr v1.8.0 h1:dg6GjLku4EH+249NNmoIciG9N/jURbDG+pFlTkhzIC8= +go.uber.org/multierr v1.8.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.23.0 h1:OjGQ5KQDEUawVHxNwQgPpiypGHOxo2mNZsOqTak4fFY= go.uber.org/zap v1.23.0/go.mod h1:D+nX8jyLsMHMYrln8A0rJjFt/T/9/bGgIhAqxv5URuY= @@ -561,8 +561,8 @@ golang.org/x/sys v0.0.0-20211124211545-fe61309f8881/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211216021012-1d35b9e2eb4e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220114195835-da31bd327af9/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220412211240-33da011f77ad/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f h1:v4INt8xihDGvnrfjMDVXGxw9wrfxYyCjk0KbXjhR55s= -golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod 
h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab h1:2QkjZIsXupsJbJIdSjjUOgWK3aEtzyuh2mPt3l/CkeU= +golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211 h1:JGgROgKl9N8DuW20oFS5gxc+lE67/N3FcwmBPMe7ArY= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= From 1a4a484daed85ebb5a1ca8ed0808491768d95e9c Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 23 Sep 2022 12:15:51 -0700 Subject: [PATCH 24/55] Add E2E test for spot interruption monitoring --- Makefile | 5 +- go.mod | 2 +- go.sum | 4 +- .../controllers/notification/suite_test.go | 1 + pkg/cloudprovider/aws/fake/types.go | 2 + test/go.mod | 16 ++ test/go.sum | 41 +++++ test/pkg/environment/environment.go | 21 ++- test/suites/notification/suite_test.go | 144 ++++++++++++++++++ 9 files changed, 223 insertions(+), 13 deletions(-) create mode 100644 test/suites/notification/suite_test.go diff --git a/Makefile b/Makefile index 6e3a7ff6ba22..ae04c20b701a 100644 --- a/Makefile +++ b/Makefile @@ -45,14 +45,15 @@ run: ## Run Karpenter controller binary against your local cluster --leader-elect=false test: ## Run tests - go test -run=${TEST_FILTER} ./pkg/... + go test -run=${TEST_FILTER} ./pkg/... -timeout 15m battletest: ## Run randomized, racing, code coveraged, tests go test -run=${TEST_FILTER} ./pkg/... \ -race \ -cover -coverprofile=coverage.out -outputdir=. -coverpkg=./pkg/... \ -ginkgo.randomizeAllSpecs \ - -tags random_test_delay + -tags random_test_delay \ + -timeout 15m e2etests: ## Run the e2e suite against your local cluster go clean -testcache diff --git a/go.mod b/go.mod index 21733768a02f..b4399e2985c4 100644 --- a/go.mod +++ b/go.mod @@ -20,7 +20,7 @@ require ( github.com/prometheus/client_golang v1.13.0 github.com/prometheus/client_model v0.2.0 github.com/samber/lo v1.31.0 - go.uber.org/multierr v1.7.0 + go.uber.org/multierr v1.8.0 go.uber.org/zap v1.23.0 golang.org/x/time v0.0.0-20220609170525-579cf78fd858 k8s.io/api v0.25.2 diff --git a/go.sum b/go.sum index d7d313f0fc46..45522395c9a6 100644 --- a/go.sum +++ b/go.sum @@ -394,8 +394,8 @@ go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A go.uber.org/goleak v1.1.12 h1:gZAh5/EyT/HQwlpkCy6wTpqfH9H8Lz8zbm3dZh+OyzA= go.uber.org/multierr v1.1.0/go.mod h1:wR5kodmAFQ0UK8QlbwjlSNy0Z68gJhDJUG5sjR94q/0= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= -go.uber.org/multierr v1.7.0 h1:zaiO/rmgFjbmCXdSYJWQcdvOCsthmdaHfr3Gm2Kx4Ec= -go.uber.org/multierr v1.7.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak= +go.uber.org/multierr v1.8.0 h1:dg6GjLku4EH+249NNmoIciG9N/jURbDG+pFlTkhzIC8= +go.uber.org/multierr v1.8.0/go.mod h1:7EAYxJLBy9rStEaz58O2t4Uvip6FSURkq8/ppBp95ak= go.uber.org/zap v1.10.0/go.mod h1:vwi/ZaCAaUcBkycHslxD9B2zi4UTXhF60s6SWpuDF0Q= go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI= go.uber.org/zap v1.23.0 h1:OjGQ5KQDEUawVHxNwQgPpiypGHOxo2mNZsOqTak4fFY= diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index 588886149c5b..a4108e9b85b2 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -103,6 
+103,7 @@ var _ = BeforeEach(func() { }) Expect(env.Start()).To(Succeed(), "Failed to start environment") sqsapi.Reset() + eventbridgeapi.Reset() infraStartChan = make(chan struct{}) notificationStartChan = make(chan struct{}) infraController = infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan, env.Ctx.Done()) diff --git a/pkg/cloudprovider/aws/fake/types.go b/pkg/cloudprovider/aws/fake/types.go index 2a5c0a707315..54a87f734ecb 100644 --- a/pkg/cloudprovider/aws/fake/types.go +++ b/pkg/cloudprovider/aws/fake/types.go @@ -34,6 +34,8 @@ func (m *MockedFunction[I, O]) Reset() { m.Output.Reset() m.CalledWithInput.Reset() m.Error.Reset() + + m.defaultOutput.Reset() m.successfulCalls.Store(0) m.failedCalls.Store(0) } diff --git a/test/go.mod b/test/go.mod index f635adb8d342..5cfd398033ec 100644 --- a/test/go.mod +++ b/test/go.mod @@ -3,7 +3,9 @@ module github.com/aws/karpenter/test go 1.19 require ( + github.com/aws/amazon-ec2-spot-interrupter v0.0.9 github.com/aws/aws-sdk-go v1.44.114 + github.com/aws/aws-sdk-go-v2/config v1.17.8 github.com/aws/karpenter v0.17.0 github.com/onsi/ginkgo/v2 v2.2.0 github.com/onsi/gomega v1.21.1 @@ -21,6 +23,20 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 // indirect github.com/PuerkitoBio/purell v1.1.1 // indirect github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect + github.com/aws/aws-sdk-go-v2 v1.16.16 // indirect + github.com/aws/aws-sdk-go-v2/credentials v1.12.21 // indirect + github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.12.17 // indirect + github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.23 // indirect + github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.17 // indirect + github.com/aws/aws-sdk-go-v2/internal/ini v1.3.24 // indirect + github.com/aws/aws-sdk-go-v2/service/ec2 v1.37.0 // indirect + github.com/aws/aws-sdk-go-v2/service/fis v1.12.3 // indirect + github.com/aws/aws-sdk-go-v2/service/iam v1.18.3 // indirect + github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.17 // indirect + github.com/aws/aws-sdk-go-v2/service/sso v1.11.23 // indirect + github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.6 // indirect + github.com/aws/aws-sdk-go-v2/service/sts v1.16.19 // indirect + github.com/aws/smithy-go v1.13.3 // indirect github.com/benbjohnson/clock v1.1.0 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect diff --git a/test/go.sum b/test/go.sum index ddab94baa6cf..e0fd11f9b10e 100644 --- a/test/go.sum +++ b/test/go.sum @@ -62,10 +62,50 @@ github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRF github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= +github.com/aws/amazon-ec2-spot-interrupter v0.0.9 h1:2yRK7f29tPhrkFBn1lg6QZNjY3iE1ovsjom99OnBCDw= +github.com/aws/amazon-ec2-spot-interrupter v0.0.9/go.mod h1:TwqwmD9RUPwjjcyklxGlzxIxbA6oRfDn6lQf0Muu8/A= github.com/aws/aws-sdk-go v1.44.114 h1:plIkWc/RsHr3DXBj4MEw9sEW4CcL/e2ryokc+CKyq1I= github.com/aws/aws-sdk-go v1.44.114/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= +github.com/aws/aws-sdk-go-v2 v1.16.2/go.mod h1:ytwTPBG6fXTZLxxeeCCWj2/EMYp/xDUgX+OET6TLNNU= +github.com/aws/aws-sdk-go-v2 v1.16.3/go.mod 
h1:ytwTPBG6fXTZLxxeeCCWj2/EMYp/xDUgX+OET6TLNNU= +github.com/aws/aws-sdk-go-v2 v1.16.16 h1:M1fj4FE2lB4NzRb9Y0xdWsn2P0+2UHVxwKyOa4YJNjk= +github.com/aws/aws-sdk-go-v2 v1.16.16/go.mod h1:SwiyXi/1zTUZ6KIAmLK5V5ll8SiURNUYOqTerZPaF9k= +github.com/aws/aws-sdk-go-v2/config v1.17.8 h1:b9LGqNnOdg9vR4Q43tBTVWk4J6F+W774MSchvKJsqnE= +github.com/aws/aws-sdk-go-v2/config v1.17.8/go.mod h1:UkCI3kb0sCdvtjiXYiU4Zx5h07BOpgBTtkPu/49r+kA= +github.com/aws/aws-sdk-go-v2/credentials v1.12.21 h1:4tjlyCD0hRGNQivh5dN8hbP30qQhMLBE/FgQR1vHHWM= +github.com/aws/aws-sdk-go-v2/credentials v1.12.21/go.mod h1:O+4XyAt4e+oBAoIwNUYkRg3CVMscaIJdmZBOcPgJ8D8= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.12.17 h1:r08j4sbZu/RVi+BNxkBJwPMUYY3P8mgSDuKkZ/ZN1lE= +github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.12.17/go.mod h1:yIkQcCDYNsZfXpd5UX2Cy+sWA1jPgIhGTw9cOBzfVnQ= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.9/go.mod h1:AnVH5pvai0pAF4lXRq0bmhbes1u9R8wTE+g+183bZNM= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.10/go.mod h1:F+EZtuIwjlv35kRJPyBGcsA4f7bnSoz15zOQ2lJq1Z4= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.23 h1:s4g/wnzMf+qepSNgTvaQQHNxyMLKSawNhKCPNy++2xY= +github.com/aws/aws-sdk-go-v2/internal/configsources v1.1.23/go.mod h1:2DFxAQ9pfIRy0imBCJv+vZ2X6RKxves6fbnEuSry6b4= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.3/go.mod h1:ssOhaLpRlh88H3UmEcsBoVKq309quMvm3Ds8e9d4eJM= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.4/go.mod h1:8glyUqVIM4AmeenIsPo0oVh3+NUwnsQml2OFupfQW+0= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.17 h1:/K482T5A3623WJgWT8w1yRAFK4RzGzEl7y39yhtn9eA= +github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.4.17/go.mod h1:pRwaTYCJemADaqCbUAxltMoHKata7hmB5PjEXeu0kfg= +github.com/aws/aws-sdk-go-v2/internal/ini v1.3.24 h1:wj5Rwc05hvUSvKuOF29IYb9QrCLjU+rHAy/x/o0DK2c= +github.com/aws/aws-sdk-go-v2/internal/ini v1.3.24/go.mod h1:jULHjqqjDlbyTa7pfM7WICATnOv+iOhjletM3N0Xbu8= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.37.0 h1:zvVR76AXaNElDx6BwOjcxrk4cffFVxx0shQe8yRg2V8= +github.com/aws/aws-sdk-go-v2/service/ec2 v1.37.0/go.mod h1:KOy1O7Fc2+GRgsbn/Kjr15vYDVXMEQALBaPRia3twSY= +github.com/aws/aws-sdk-go-v2/service/fis v1.12.3 h1:jOr6HpAfzh8Pk/ji0QstROuOIk4vFagtm3eDLW2Dkm4= +github.com/aws/aws-sdk-go-v2/service/fis v1.12.3/go.mod h1:qOT644wBvlD/dOHzCUJqvgAqR3UAqV3Almli1PrUrCg= +github.com/aws/aws-sdk-go-v2/service/iam v1.18.3 h1:wllKL2fLtvfaNAVbXKMRmM/mD1oDNw0hXmDn8mE/6Us= +github.com/aws/aws-sdk-go-v2/service/iam v1.18.3/go.mod h1:51xGfEjd1HXnTzw2mAp++qkRo+NyGYblZkuGTsb49yw= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.4/go.mod h1:uKkN7qmSIsNJVyMtxNQoCEYMvFEXbOg9fwCJPdfp2u8= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.17 h1:Jrd/oMh0PKQc6+BowB+pLEwLIgaQF29eYbe7E1Av9Ug= +github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.17/go.mod h1:4nYOrY41Lrbk2170/BGkcJKBhws9Pfn8MG3aGqjjeFI= +github.com/aws/aws-sdk-go-v2/service/sso v1.11.23 h1:pwvCchFUEnlceKIgPUouBJwK81aCkQ8UDMORfeFtW10= +github.com/aws/aws-sdk-go-v2/service/sso v1.11.23/go.mod h1:/w0eg9IhFGjGyyncHIQrXtU8wvNsTJOP0R6PPj0wf80= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.6 h1:OwhhKc1P9ElfWbMKPIbMMZBV6hzJlL2JKD76wNNVzgQ= +github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.6/go.mod h1:csZuQY65DAdFBt1oIjO5hhBR49kQqop4+lcuCjf2arA= +github.com/aws/aws-sdk-go-v2/service/sts v1.16.19 h1:9pPi0PsFNAGILFfPCk8Y0iyEBGc6lu6OQ97U7hmdesg= +github.com/aws/aws-sdk-go-v2/service/sts v1.16.19/go.mod h1:h4J3oPZQbxLhzGnk+j9dfYHi5qIOVJ5kczZd658/ydM= 
github.com/aws/karpenter v0.17.0 h1:R9rJmSChEfLaCYQeBxHfyRuJXBLk1Rzj6UR9Fw3+n2w= github.com/aws/karpenter v0.17.0/go.mod h1:acChPsZRb5mvfuMibZ3ZV9UkNNDHHw2VcAzlwOAAfo0= +github.com/aws/smithy-go v1.11.2/go.mod h1:3xHYmszWVx2c0kIwQeEVf9uSm4fYZt67FBJnwub1bgM= +github.com/aws/smithy-go v1.13.3 h1:l7LYxGuzK6/K+NzJ2mC+VvLUbae0sL3bXU//04MkmnA= +github.com/aws/smithy-go v1.13.3/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= github.com/beorn7/perks v0.0.0-20180321164747-3a771d992973/go.mod h1:Dwedo/Wpr24TaqPxmxbtue+5NUziq4I4S80YR8gNf3Q= @@ -198,6 +238,7 @@ github.com/google/go-cmp v0.5.3/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/ github.com/google/go-cmp v0.5.4/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.5/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= github.com/google/go-cmp v0.5.6/go.mod h1:v8dTdLbMG2kIc/vJvl+f65V22dbkXbowE6jgT/gNBxE= +github.com/google/go-cmp v0.5.7/go.mod h1:n+brtR0CgQNWTVd5ZUFpTBC8YFBDLK/h/bpaJ8/DtOE= github.com/google/go-cmp v0.5.8 h1:e6P7q2lk1O+qJJb4BtCQXlK8vWEO8V1ZeuEdJNOqZyg= github.com/google/go-cmp v0.5.8/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= diff --git a/test/pkg/environment/environment.go b/test/pkg/environment/environment.go index e0e44c08632d..d9ef84e77e85 100644 --- a/test/pkg/environment/environment.go +++ b/test/pkg/environment/environment.go @@ -20,10 +20,13 @@ import ( "testing" "time" + "github.com/aws/amazon-ec2-spot-interrupter/pkg/itn" + cfg "github.com/aws/aws-sdk-go-v2/config" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/iam" "github.com/aws/aws-sdk-go/service/ssm" + "github.com/samber/lo" // . "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" @@ -49,6 +52,7 @@ type Environment struct { EC2API ec2.EC2 SSMAPI ssm.SSM IAMAPI iam.IAM + InterruptionAPI *itn.ITN Monitor *Monitor StartingNodeCount int } @@ -69,14 +73,15 @@ func NewEnvironment(t *testing.T) (*Environment, error) { session := session.Must(session.NewSessionWithOptions(session.Options{SharedConfigState: session.SharedConfigEnable})) return &Environment{Context: ctx, - ClusterName: clusterName, - Client: client, - KubeClient: kubernetes.NewForConfigOrDie(config), - EC2API: *ec2.New(session), - SSMAPI: *ssm.New(session), - IAMAPI: *iam.New(session), - Region: *session.Config.Region, - Monitor: NewMonitor(ctx, client), + ClusterName: clusterName, + Client: client, + KubeClient: kubernetes.NewForConfigOrDie(config), + EC2API: *ec2.New(session), + SSMAPI: *ssm.New(session), + IAMAPI: *iam.New(session), + InterruptionAPI: itn.New(lo.Must(cfg.LoadDefaultConfig(ctx))), + Region: *session.Config.Region, + Monitor: NewMonitor(ctx, client), }, nil } diff --git a/test/suites/notification/suite_test.go b/test/suites/notification/suite_test.go new file mode 100644 index 000000000000..16156d39519b --- /dev/null +++ b/test/suites/notification/suite_test.go @@ -0,0 +1,144 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package consolidation + +import ( + "context" + "fmt" + "regexp" + "strings" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + + "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/test/pkg/environment" +) + +var env *environment.Environment + +func TestNotification(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + var err error + env, err = environment.NewEnvironment(t) + Expect(err).ToNot(HaveOccurred()) + }) + RunSpecs(t, "Notification") +} + +var _ = BeforeEach(func() { + env.BeforeEach() +}) + +var _ = AfterEach(func() { + env.AfterEach() +}) + +var _ = Describe("Notification", func() { + FIt("should terminate the spot instance and spin-up a new node on spot interruption warning", func() { + ctx, cancel := context.WithCancel(env.Context) + defer cancel() // In case the test fails, we need this so that the goroutine monitoring the events is closed + + provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ + SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }}) + provisioner := test.Provisioner(test.ProvisionerOptions{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1alpha5.LabelCapacityType, + Operator: v1.NodeSelectorOpIn, + Values: []string{"spot"}, + }, + }, + ProviderRef: &v1alpha5.ProviderRef{Name: provider.Name}, + }) + + numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "my-app", + }, + }, + }, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + env.ExpectCreated(provider, provisioner, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + + node := env.Monitor.GetCreatedNodes()[0] + instanceID := parseProviderID(node.Spec.ProviderID) + + _, events, _ := env.InterruptionAPI.Interrupt(env.Context, []string{instanceID}, 0, true) + + // Monitor the events channel + done := make(chan struct{}) + go func() { + defer fmt.Println("Closing event goroutine monitoring") + select { + case event := <-events: + if strings.Contains(event.Message, "Spot Instance Shutdown sent") { + Fail("Node didn't terminate before spot instance shutdown was sent") + } + fmt.Printf("[SPOT INTERRUPTION EVENT] %s\n", event.Message) + case <-done: + return + case <-ctx.Done(): + return + } + }() + + env.EventuallyExpectNotFound(&node) + 
close(done) // Once the node is gone, we can close the event channel because the test has effectively succeeded + env.EventuallyExpectHealthyPodCount(selector, numPods) + }) +}) + +func parseProviderID(pid string) string { + r := regexp.MustCompile(`aws:///(?P<AZ>.*)/(?P<InstanceID>.*)`) + matches := r.FindStringSubmatch(pid) + if matches == nil { + return "" + } + for i, name := range r.SubexpNames() { + if name == "InstanceID" { + return matches[i] + } + } + return "" +} From ced969cab41c53f9654c0ad92b7a8f165141feb1 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 23 Sep 2022 15:13:09 -0700 Subject: [PATCH 25/55] Decorate environment with cloud provider for testing --- .../controllers/infrastructure/suite_test.go | 9 +- .../controllers/notification/controller.go | 9 +- .../rebalancerecommendation/v0/parser.go | 7 - .../event/scheduledchange/v0/parser.go | 17 -- .../event/spotinterruption/v0/parser.go | 7 - .../event/statechange/v0/handler.go | 4 + .../event/statechange/v0/parser.go | 13 -- .../controllers/notification/suite_test.go | 12 +- pkg/cloudprovider/aws/events/recorder.go | 14 +- pkg/cloudprovider/aws/fake/eventrecorder.go | 4 +- pkg/cloudprovider/aws/metadata.go | 7 + pkg/cloudprovider/aws/sqs.go | 40 +++- pkg/controllers/termination/suite_test.go | 12 +- pkg/test/pods.go | 48 ++--- test/pkg/environment/environment.go | 62 ++++-- test/pkg/environment/expectations.go | 119 ++++++++--- test/suites/integration/scheduling_test.go | 8 +- test/suites/integration/suite_test.go | 4 +- test/suites/notification/suite_test.go | 189 +++++++++++++++--- 19 files changed, 410 insertions(+), 175 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index 4f843beaefd6..a1ba859fa8eb 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -65,7 +65,7 @@ var defaultOpts = options.Options{ func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) - SetDefaultEventuallyTimeout(time.Second * 5) + SetDefaultEventuallyTimeout(time.Minute) RunSpecs(t, "AWS Notification") } @@ -174,13 +174,16 @@ var _ = Describe("Reconciliation", func() { ExpectClosed(startChan) Eventually(func(g Gomega) { g.Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) + g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + g.Expect(IsClosed(controller.Ready())).To(BeFalse()) }).Should(Succeed()) - // Backoff is 2 minutes, so we set the fake clock forward 3 minutes + // Backoff is 1 minute, so we set the fake clock forward 2 minutes // Access denied has now been resolved sqsapi.CreateQueueBehavior.Reset() - fakeClock.Step(time.Minute * 3) + fakeClock.Step(time.Minute * 2) // Should reconcile again after failed access denied calls Eventually(func(g Gomega) { diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 8fe9c4341867..2ee4c90ed8d2 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -36,6 +36,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event"
"github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" + statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/metrics" @@ -231,7 +232,13 @@ func (c *Controller) notifyForEvent(evt event.Interface, n *v1.Node) { c.recorder.EC2SpotInterruptionWarning(n) case event.Kinds.StateChange: - c.recorder.EC2StateChange(n) + typed := evt.(statechangev0.EC2InstanceStateChangeNotification) + if lo.Contains([]string{"stopping", "stopped"}, typed.State()) { + c.recorder.EC2StateStopping(n) + } else { + c.recorder.EC2StateTerminating(n) + } + default: } } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go index 0cade3105fbb..f1004786bfb1 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go @@ -18,8 +18,6 @@ import ( "context" "encoding/json" - "knative.dev/pkg/logging" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -32,13 +30,8 @@ const ( type Parser struct{} func (Parser) Parse(ctx context.Context, str string) event.Interface { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("rebalanceRecommendation.v0")) - evt := EC2InstanceRebalanceRecommendation{} if err := json.Unmarshal([]byte(str), &evt); err != nil { - logging.FromContext(ctx). - With("error", err). - Error("failed to unmarshal EC2 instance rebalance recommendation event") return nil } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go index 414c0a4cbe47..dc1223c82e17 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go @@ -18,8 +18,6 @@ import ( "context" "encoding/json" - "knative.dev/pkg/logging" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -34,33 +32,18 @@ const ( type Parser struct{} func (Parser) Parse(ctx context.Context, str string) event.Interface { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("scheduledChange.v0")) - evt := AWSHealthEvent{} if err := json.Unmarshal([]byte(str), &evt); err != nil { - logging.FromContext(ctx). - With("error", err). - Error("failed to unmarshal AWS health event") return nil } if evt.Source != source || evt.DetailType != detailType || evt.Version != version { return nil } - if evt.Detail.Service != acceptedService { - logging.FromContext(ctx). - With("eventDetails", evt). - With("acceptedService", acceptedService). - Debug("ignoring AWS health event") return nil } - if evt.Detail.EventTypeCategory != acceptedEventTypeCategory { - logging.FromContext(ctx). - With("eventDetails", evt). - With("acceptedEventTypeCategory", acceptedEventTypeCategory). 
- Debug("ignoring AWS health event") return nil } return evt diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go index ea283ab52c87..ade58b6d708f 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go @@ -18,8 +18,6 @@ import ( "context" "encoding/json" - "knative.dev/pkg/logging" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -32,13 +30,8 @@ const ( type Parser struct{} func (Parser) Parse(ctx context.Context, str string) event.Interface { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("spotInterruption.v0")) - evt := EC2SpotInstanceInterruptionWarning{} if err := json.Unmarshal([]byte(str), &evt); err != nil { - logging.FromContext(ctx). - With("error", err). - Error("failed to unmarshal EC2 spot instance interruption event") return nil } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go index d0eb84382b19..03c6f6a01abb 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go @@ -33,6 +33,10 @@ func (e EC2InstanceStateChangeNotification) EC2InstanceIDs() []string { return []string{e.Detail.InstanceID} } +func (e EC2InstanceStateChangeNotification) State() string { + return e.Detail.State +} + func (EC2InstanceStateChangeNotification) Kind() event.Kind { return event.Kinds.StateChange } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go index 09d97658f049..b248fe0a1720 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go @@ -19,8 +19,6 @@ import ( "encoding/json" "strings" - "knative.dev/pkg/logging" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -36,26 +34,15 @@ const ( type Parser struct{} func (Parser) Parse(ctx context.Context, str string) event.Interface { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("stateChange.v0")) - evt := EC2InstanceStateChangeNotification{} if err := json.Unmarshal([]byte(str), &evt); err != nil { - logging.FromContext(ctx). - With("error", err). - Error("failed to unmarshal EC2 state-change event") return nil } if evt.Source != source || evt.DetailType != detailType || evt.Version != version { return nil } - - // Do not log the information on instance state change if it isn't in accepted states if !strings.Contains(acceptedStates, strings.ToLower(evt.Detail.State)) { - logging.FromContext(ctx). - With("eventDetails", evt). - With("acceptedStates", acceptedStates). 
- Debug("ignoring AWS state change event") return nil } return evt diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index a4108e9b85b2..2c6e240f573b 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -81,7 +81,6 @@ var notificationStartChan chan struct{} func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) - SetDefaultEventuallyTimeout(time.Second * 5) RunSpecs(t, "AWS Notification") } @@ -248,17 +247,14 @@ var _ = Describe("Processing Messages", func() { }) var _ = Describe("Error Handling", func() { - BeforeEach(func() { - // This ensures that the readiness gate is set to ready when we start the test - ExpectClosed(infraStartChan) - }) - It("should send an error on polling when AccessDenied", func() { + ExpectClosed(infraStartChan) sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedCode), awsfake.MaxCalls(0)) Expect(controller.PollSQS(env.Ctx)).ToNot(Succeed()) }) - It("should trigger a infrastructure reconciliation on SQS queue doesn't exist", func() { + It("should trigger an infrastructure reconciliation on an SQS queue when it doesn't exist", func() { sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing + ExpectClosed(infraStartChan) // Infrastructure reconciliation loop has completed Eventually(func(g Gomega) { @@ -361,7 +357,7 @@ func stateChangeMessage(involvedInstanceID, state string) *sqs.Message { } } -// TODO: Update the scheudled change message to accurately reflect a real health event +// TODO: Update the scheduled change message to accurately reflect a real health event func scheduledChangeMessage(involvedInstanceID string) *sqs.Message { evt := scheduledchangev0.AWSEvent{ AWSMetadata: event.AWSMetadata{ diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index 4a80ba959ebc..3292c367def1 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -42,8 +42,10 @@ type Recorder interface { EC2SpotRebalanceRecommendation(*v1.Node) // EC2HealthWarning is called when EC2 sends a health warning notification for a health issue for the node from the SQS queue EC2HealthWarning(*v1.Node) - // EC2StateChange is called when EC2 sends a state change notification for a node that is changing to a stopping/terminating state - EC2StateChange(*v1.Node) + // EC2StateTerminating is called when EC2 sends a state change notification for a node that is changing to a terminating/shutting-down state + EC2StateTerminating(*v1.Node) + // EC2StateStopping is called when EC2 sends a state change notification for a node that is changing to a stopping/stopped state + EC2StateStopping(*v1.Node) // TerminatingNodeOnNotification is called when a notification that is sent to the notification controller triggers node deletion TerminatingNodeOnNotification(*v1.Node) // InfrastructureUnhealthy event is called when infrastructure reconciliation errors and the controller enters an unhealthy state @@ -74,8 +76,12 @@ func (r recorder) EC2HealthWarning(node *v1.Node) { r.Eventf(node, "Normal", "EC2HealthWarning", "Node %s event: EC2 triggered a health warning for the node", node.Name) } -func (r recorder) EC2StateChange(node *v1.Node) { - r.Eventf(node, "Normal", "EC2StateTerminating", `Node %s event: EC2 node is 
stopping or terminating"`, node.Name) +func (r recorder) EC2StateTerminating(node *v1.Node) { + r.Eventf(node, "Normal", "EC2StateTerminating", `Node %s event: EC2 node is terminating"`, node.Name) +} + +func (r recorder) EC2StateStopping(node *v1.Node) { + r.Eventf(node, "Normal", "EC2StateStopping", `Node %s event: EC2 node is stopping"`, node.Name) } func (r recorder) TerminatingNodeOnNotification(node *v1.Node) { diff --git a/pkg/cloudprovider/aws/fake/eventrecorder.go b/pkg/cloudprovider/aws/fake/eventrecorder.go index 2b4b2f41f155..dad2321eb32f 100644 --- a/pkg/cloudprovider/aws/fake/eventrecorder.go +++ b/pkg/cloudprovider/aws/fake/eventrecorder.go @@ -34,7 +34,9 @@ func (e *EventRecorder) EC2SpotRebalanceRecommendation(_ *v1.Node) {} func (e *EventRecorder) EC2HealthWarning(_ *v1.Node) {} -func (e *EventRecorder) EC2StateChange(_ *v1.Node) {} +func (e *EventRecorder) EC2StateTerminating(_ *v1.Node) {} + +func (e *EventRecorder) EC2StateStopping(_ *v1.Node) {} func (e *EventRecorder) TerminatingNodeOnNotification(_ *v1.Node) {} diff --git a/pkg/cloudprovider/aws/metadata.go b/pkg/cloudprovider/aws/metadata.go index dff96a24198a..f9454623c403 100644 --- a/pkg/cloudprovider/aws/metadata.go +++ b/pkg/cloudprovider/aws/metadata.go @@ -57,6 +57,13 @@ func NewMetadataProvider(sess *session.Session) *MetadataProvider { } } +func (i *MetadataProvider) Metadata(ctx context.Context) *Metadata { + return &Metadata{ + region: i.Region(ctx), + accountID: i.AccountID(ctx), + } +} + // Region gets the current region from EC2 IMDS func (i *MetadataProvider) Region(ctx context.Context) string { region, err := i.imdsClient.RegionWithContext(ctx) diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index ff9e0209a9dd..fff0bec8ad7f 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -97,7 +97,7 @@ func (s *SQSProvider) QueueName() string { func (s *SQSProvider) CreateQueue(ctx context.Context) error { result, err := s.client.CreateQueueWithContext(ctx, s.createQueueInput) if err != nil { - return fmt.Errorf("failed creating sqs queue, %w", err) + return fmt.Errorf("creating sqs queue, %w", err) } s.mutex.Lock() defer s.mutex.Unlock() @@ -108,7 +108,7 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { - return fmt.Errorf("failed fetching queue url, %w", err) + return fmt.Errorf("fetching queue url, %w", err) } setQueueAttributesInput := &sqs.SetQueueAttributesInput{ @@ -117,7 +117,7 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { } _, err = s.client.SetQueueAttributesWithContext(ctx, setQueueAttributesInput) if err != nil { - return fmt.Errorf("failed setting queue attributes, %w", err) + return fmt.Errorf("setting queue attributes, %w", err) } return nil } @@ -137,7 +137,7 @@ func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (s } result, err := s.client.GetQueueUrlWithContext(ctx, s.getQueueURLInput) if err != nil { - return "", fmt.Errorf("failed fetching queue url, %w", err) + return "", fmt.Errorf("fetching queue url, %w", err) } s.queueURL = aws.StringValue(result.QueueUrl) return aws.StringValue(result.QueueUrl), nil @@ -146,24 +146,44 @@ func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (s func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { queueURL, err := s.DiscoverQueueURL(ctx, false) if 
err != nil { - return nil, fmt.Errorf("failed fetching queue url, %w", err) + return nil, fmt.Errorf("fetching queue url, %w", err) } // Copy the input template and add the discovered queue url input, err := functional.DeepCopy(s.receiveMessageInput) if err != nil { - return nil, fmt.Errorf("failed copying input, %w", err) + return nil, fmt.Errorf("copying input, %w", err) } input.QueueUrl = aws.String(queueURL) result, err := s.client.ReceiveMessageWithContext(ctx, input) if err != nil { - return nil, fmt.Errorf("failed receiving sqs messages, %w", err) + return nil, fmt.Errorf("receiving sqs messages, %w", err) } return result.Messages, nil } +func (s *SQSProvider) SendMessage(ctx context.Context, body interface{}) (string, error) { + raw, err := json.Marshal(body) + if err != nil { + return "", fmt.Errorf("marshaling the passed body as json, %w", err) + } + queueURL, err := s.DiscoverQueueURL(ctx, false) + if err != nil { + return "", fmt.Errorf("fetching queue url, %w", err) + } + input := &sqs.SendMessageInput{ + MessageBody: aws.String(string(raw)), + QueueUrl: aws.String(queueURL), + } + result, err := s.client.SendMessage(input) + if err != nil { + return "", fmt.Errorf("sending messages to sqs queue, %w", err) + } + return aws.StringValue(result.MessageId), nil +} + func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { @@ -177,7 +197,7 @@ func (s *SQSProvider) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) er _, err = s.client.DeleteMessageWithContext(ctx, input) if err != nil { - return fmt.Errorf("failed deleting messages from sqs queue, %w", err) + return fmt.Errorf("deleting messages from sqs queue, %w", err) } return nil } @@ -188,7 +208,7 @@ func (s *SQSProvider) DeleteQueue(ctx context.Context) error { if IsNotFound(err) { return nil } - return fmt.Errorf("failed fetching queue url, %w", err) + return fmt.Errorf("fetching queue url, %w", err) } input := &sqs.DeleteQueueInput{ @@ -196,7 +216,7 @@ func (s *SQSProvider) DeleteQueue(ctx context.Context) error { } _, err = s.client.DeleteQueueWithContext(ctx, input) if err != nil && !IsNotFound(err) { - return fmt.Errorf("failed deleting sqs queue, %w", err) + return fmt.Errorf("deleting sqs queue, %w", err) } return nil } diff --git a/pkg/controllers/termination/suite_test.go b/pkg/controllers/termination/suite_test.go index 511871495d85..4c222ef82c37 100644 --- a/pkg/controllers/termination/suite_test.go +++ b/pkg/controllers/termination/suite_test.go @@ -574,13 +574,13 @@ var _ = Describe("Termination", func() { func ExpectNotEnqueuedForEviction(e *termination.EvictionQueue, pods ...*v1.Pod) { for _, pod := range pods { - Expect(e.Contains(client.ObjectKeyFromObject(pod))).To(BeFalse()) + ExpectWithOffset(1, e.Contains(client.ObjectKeyFromObject(pod))).To(BeFalse()) } } func ExpectEvicted(c client.Client, pods ...*v1.Pod) { for _, pod := range pods { - Eventually(func() bool { + EventuallyWithOffset(1, func() bool { return ExpectPodExists(ctx, c, pod.Name, pod.Namespace).GetDeletionTimestamp().IsZero() }, ReconcilerPropagationTime, RequestInterval).Should(BeFalse(), func() string { return fmt.Sprintf("expected %s/%s to be evicting, but it isn't", pod.Namespace, pod.Name) @@ -589,9 +589,9 @@ func ExpectEvicted(c client.Client, pods ...*v1.Pod) { } func ExpectNodeDraining(c client.Client, nodeName string) *v1.Node { - node := ExpectNodeExists(ctx, c, nodeName) - Expect(node.Spec.Unschedulable).To(BeTrue()) - 
Expect(lo.Contains(node.Finalizers, v1alpha5.TerminationFinalizer)).To(BeTrue()) - Expect(node.DeletionTimestamp.IsZero()).To(BeFalse()) + node := ExpectNodeExistsWithOffset(1, ctx, c, nodeName) + ExpectWithOffset(1, node.Spec.Unschedulable).To(BeTrue()) + ExpectWithOffset(1, lo.Contains(node.Finalizers, v1alpha5.TerminationFinalizer)).To(BeTrue()) + ExpectWithOffset(1, node.DeletionTimestamp.IsZero()).To(BeFalse()) return node } diff --git a/pkg/test/pods.go b/pkg/test/pods.go index cce52b460d2b..34ed538335db 100644 --- a/pkg/test/pods.go +++ b/pkg/test/pods.go @@ -27,25 +27,26 @@ import ( // PodOptions customizes a Pod. type PodOptions struct { metav1.ObjectMeta - Image string - InitImage string - NodeName string - PriorityClassName string - InitResourceRequirements v1.ResourceRequirements - ResourceRequirements v1.ResourceRequirements - NodeSelector map[string]string - NodeRequirements []v1.NodeSelectorRequirement - NodePreferences []v1.NodeSelectorRequirement - PodRequirements []v1.PodAffinityTerm - PodPreferences []v1.WeightedPodAffinityTerm - PodAntiRequirements []v1.PodAffinityTerm - PodAntiPreferences []v1.WeightedPodAffinityTerm - TopologySpreadConstraints []v1.TopologySpreadConstraint - Tolerations []v1.Toleration - PersistentVolumeClaims []string - Conditions []v1.PodCondition - Phase v1.PodPhase - RestartPolicy v1.RestartPolicy + Image string + InitImage string + NodeName string + PriorityClassName string + InitResourceRequirements v1.ResourceRequirements + ResourceRequirements v1.ResourceRequirements + NodeSelector map[string]string + NodeRequirements []v1.NodeSelectorRequirement + NodePreferences []v1.NodeSelectorRequirement + PodRequirements []v1.PodAffinityTerm + PodPreferences []v1.WeightedPodAffinityTerm + PodAntiRequirements []v1.PodAffinityTerm + PodAntiPreferences []v1.WeightedPodAffinityTerm + TopologySpreadConstraints []v1.TopologySpreadConstraint + Tolerations []v1.Toleration + PersistentVolumeClaims []string + Conditions []v1.PodCondition + Phase v1.PodPhase + RestartPolicy v1.RestartPolicy + TerminationGracePeriodSeconds *int64 } type PDBOptions struct { @@ -87,10 +88,11 @@ func Pod(overrides ...PodOptions) *v1.Pod { Image: options.Image, Resources: options.ResourceRequirements, }}, - NodeName: options.NodeName, - Volumes: volumes, - PriorityClassName: options.PriorityClassName, - RestartPolicy: options.RestartPolicy, + NodeName: options.NodeName, + Volumes: volumes, + PriorityClassName: options.PriorityClassName, + RestartPolicy: options.RestartPolicy, + TerminationGracePeriodSeconds: options.TerminationGracePeriodSeconds, }, Status: v1.PodStatus{ Conditions: options.Conditions, diff --git a/test/pkg/environment/environment.go b/test/pkg/environment/environment.go index d9ef84e77e85..c774cc9f698a 100644 --- a/test/pkg/environment/environment.go +++ b/test/pkg/environment/environment.go @@ -25,7 +25,9 @@ import ( "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/iam" + "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/ssm" + "github.com/aws/aws-sdk-go/service/sts" "github.com/samber/lo" // . 
"github.com/onsi/ginkgo/v2" @@ -39,24 +41,53 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/apis" + "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/utils/env" + "github.com/aws/karpenter/pkg/utils/injection" + "github.com/aws/karpenter/pkg/utils/options" "github.com/aws/karpenter/pkg/utils/project" ) +type AWSEnvironment struct { + *Environment + + Metadata *aws.Metadata + EC2API ec2.EC2 + SSMAPI ssm.SSM + STSAPI sts.STS + IAMAPI iam.IAM + + SQSProvider *aws.SQSProvider + InterruptionAPI *itn.ITN +} + type Environment struct { context.Context ClusterName string - Region string Client client.Client KubeClient kubernetes.Interface - EC2API ec2.EC2 - SSMAPI ssm.SSM - IAMAPI iam.IAM - InterruptionAPI *itn.ITN Monitor *Monitor StartingNodeCount int } +func NewAWSEnvironment(env *Environment, err error) (*AWSEnvironment, error) { + if err != nil { + return nil, err + } + session := session.Must(session.NewSessionWithOptions(session.Options{SharedConfigState: session.SharedConfigEnable})) + metadata := aws.NewMetadata(*session.Config.Region, aws.NewMetadataProvider(session).AccountID(env.Context)) + + return &AWSEnvironment{ + Environment: env, + Metadata: metadata, + EC2API: *ec2.New(session), + SSMAPI: *ssm.New(session), + IAMAPI: *iam.New(session), + InterruptionAPI: itn.New(lo.Must(cfg.LoadDefaultConfig(env.Context))), + SQSProvider: aws.NewSQSProvider(env.Context, sqs.New(session), metadata), + }, nil +} + func NewEnvironment(t *testing.T) (*Environment, error) { ctx := loggingtesting.TestContextWithLogger(t) config := NewConfig() @@ -70,18 +101,17 @@ func NewEnvironment(t *testing.T) (*Environment, error) { } gomega.SetDefaultEventuallyTimeout(5 * time.Minute) gomega.SetDefaultEventuallyPollingInterval(1 * time.Second) - session := session.Must(session.NewSessionWithOptions(session.Options{SharedConfigState: session.SharedConfigEnable})) - return &Environment{Context: ctx, - ClusterName: clusterName, - Client: client, - KubeClient: kubernetes.NewForConfigOrDie(config), - EC2API: *ec2.New(session), - SSMAPI: *ssm.New(session), - IAMAPI: *iam.New(session), - InterruptionAPI: itn.New(lo.Must(cfg.LoadDefaultConfig(ctx))), - Region: *session.Config.Region, - Monitor: NewMonitor(ctx, client), + opts := options.Options{ + ClusterName: clusterName, + } + ctx = injection.WithOptions(ctx, opts) + return &Environment{ + Context: ctx, + ClusterName: clusterName, + Client: client, + KubeClient: kubernetes.NewForConfigOrDie(config), + Monitor: NewMonitor(ctx, client), }, nil } diff --git a/test/pkg/environment/expectations.go b/test/pkg/environment/expectations.go index 9cc1675ea2e6..33117e434857 100644 --- a/test/pkg/environment/expectations.go +++ b/test/pkg/environment/expectations.go @@ -19,10 +19,12 @@ import ( "fmt" "io" "strings" + "sync" "time" . 
"github.com/onsi/gomega" //nolint:revive,stylecheck "github.com/samber/lo" + "go.uber.org/multierr" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -126,12 +128,24 @@ func (env *Environment) eventuallyExpectScaleDown() { } func (env *Environment) EventuallyExpectNotFound(objects ...client.Object) { - for _, object := range objects { - EventuallyWithOffset(1, func(g Gomega) { + env.EventuallyExpectNotFoundAssertionWithOffset(1, objects...).Should(Succeed()) +} + +func (env *Environment) EventuallyExpectNotFoundAssertion(objects ...client.Object) AsyncAssertion { + return env.EventuallyExpectNotFoundAssertionWithOffset(1, objects...) +} + +func (env *Environment) EventuallyExpectNotFoundAssertionWithOffset(offset int, objects ...client.Object) AsyncAssertion { + return EventuallyWithOffset(offset+1, func(g Gomega) { + for _, object := range objects { err := env.Client.Get(env, client.ObjectKeyFromObject(object), object) g.Expect(errors.IsNotFound(err)).To(BeTrue()) - }).Should(Succeed(), fmt.Sprintf("expcted %s to be deleted", client.ObjectKeyFromObject(object))) - } + } + }) +} + +func (env *Environment) ExpectDeploymentCreatedAndHealthy(numPods int) { + } func (env *Environment) ExpectCreatedNodeCount(comparator string, nodeCount int) { @@ -152,31 +166,6 @@ func (env *Environment) GetNode(nodeName string) v1.Node { return node } -func (env *Environment) ExpectInstance(nodeName string) Assertion { - return Expect(env.GetInstance(nodeName)) -} - -func (env *Environment) GetInstance(nodeName string) ec2.Instance { - node := env.GetNode(nodeName) - providerIDSplit := strings.Split(node.Spec.ProviderID, "/") - ExpectWithOffset(1, len(providerIDSplit)).ToNot(Equal(0)) - instanceID := providerIDSplit[len(providerIDSplit)-1] - instance, err := env.EC2API.DescribeInstances(&ec2.DescribeInstancesInput{ - InstanceIds: aws.StringSlice([]string{instanceID}), - }) - ExpectWithOffset(1, err).ToNot(HaveOccurred()) - ExpectWithOffset(1, instance.Reservations).To(HaveLen(1)) - ExpectWithOffset(1, instance.Reservations[0].Instances).To(HaveLen(1)) - return *instance.Reservations[0].Instances[0] -} - -func (env *Environment) GetVolume(volumeID *string) ec2.Volume { - dvo, err := env.EC2API.DescribeVolumes(&ec2.DescribeVolumesInput{VolumeIds: []*string{volumeID}}) - ExpectWithOffset(1, err).ToNot(HaveOccurred()) - ExpectWithOffset(1, len(dvo.Volumes)).To(Equal(1)) - return *dvo.Volumes[0] -} - func (env *Environment) expectNoCrashes() { crashed := false var crashInfo strings.Builder @@ -243,3 +232,75 @@ func (env *Environment) EventuallyExpectAvgUtilization(resource v1.ResourceName, g.Expect(env.Monitor.AvgUtilization(resource)).To(BeNumerically(comparator, value)) }, 10*time.Minute).Should(Succeed()) } + +// ------ START AWS ENVIRONMENT EXPECTATIONS ------ + +func (env *AWSEnvironment) ExpectInstance(nodeName string) Assertion { + return Expect(env.GetInstance(nodeName)) +} + +func (env *AWSEnvironment) GetInstance(nodeName string) ec2.Instance { + node := env.GetNode(nodeName) + providerIDSplit := strings.Split(node.Spec.ProviderID, "/") + ExpectWithOffset(1, len(providerIDSplit)).ToNot(Equal(0)) + instanceID := providerIDSplit[len(providerIDSplit)-1] + instance, err := env.EC2API.DescribeInstances(&ec2.DescribeInstancesInput{ + InstanceIds: aws.StringSlice([]string{instanceID}), + }) + ExpectWithOffset(1, err).ToNot(HaveOccurred()) + ExpectWithOffset(1, instance.Reservations).To(HaveLen(1)) + ExpectWithOffset(1, 
instance.Reservations[0].Instances).To(HaveLen(1)) + return *instance.Reservations[0].Instances[0] +} + +func (env *AWSEnvironment) ExpectInstanceStopped(nodeName string) { + node := env.GetNode(nodeName) + providerIDSplit := strings.Split(node.Spec.ProviderID, "/") + ExpectWithOffset(1, len(providerIDSplit)).ToNot(Equal(0)) + instanceID := providerIDSplit[len(providerIDSplit)-1] + _, err := env.EC2API.StopInstances(&ec2.StopInstancesInput{ + Force: aws.Bool(true), + InstanceIds: aws.StringSlice([]string{instanceID}), + }) + ExpectWithOffset(1, err).To(Succeed()) +} + +func (env *AWSEnvironment) ExpectInstanceTerminated(nodeName string) { + node := env.GetNode(nodeName) + providerIDSplit := strings.Split(node.Spec.ProviderID, "/") + ExpectWithOffset(1, len(providerIDSplit)).ToNot(Equal(0)) + instanceID := providerIDSplit[len(providerIDSplit)-1] + _, err := env.EC2API.TerminateInstances(&ec2.TerminateInstancesInput{ + InstanceIds: aws.StringSlice([]string{instanceID}), + }) + ExpectWithOffset(1, err).To(Succeed()) +} + +func (env *AWSEnvironment) GetVolume(volumeID *string) ec2.Volume { + dvo, err := env.EC2API.DescribeVolumes(&ec2.DescribeVolumesInput{VolumeIds: []*string{volumeID}}) + ExpectWithOffset(1, err).ToNot(HaveOccurred()) + ExpectWithOffset(1, len(dvo.Volumes)).To(Equal(1)) + return *dvo.Volumes[0] +} + +func (env *AWSEnvironment) ExpectMessagesCreated(msgs ...interface{}) { + wg := &sync.WaitGroup{} + mu := &sync.Mutex{} + + var err error + for _, msg := range msgs { + wg.Add(1) + go func(m interface{}) { + defer wg.Done() + defer GinkgoRecover() + _, e := env.SQSProvider.SendMessage(env.Context, m) + if e != nil { + mu.Lock() + err = multierr.Append(err, e) + mu.Unlock() + } + }(msg) + } + wg.Wait() + ExpectWithOffset(1, err).To(Succeed()) +} diff --git a/test/suites/integration/scheduling_test.go b/test/suites/integration/scheduling_test.go index b07d60dbd97b..f34f4cbddd2a 100644 --- a/test/suites/integration/scheduling_test.go +++ b/test/suites/integration/scheduling_test.go @@ -42,8 +42,8 @@ var _ = Describe("Scheduling", func() { nodeSelector := map[string]string{ // Well Known v1alpha5.ProvisionerNameLabelKey: provisioner.Name, - v1.LabelTopologyRegion: env.Region, - v1.LabelTopologyZone: fmt.Sprintf("%sa", env.Region), + v1.LabelTopologyRegion: env.Metadata.Region(), + v1.LabelTopologyZone: fmt.Sprintf("%sa", env.Metadata.Region()), v1.LabelInstanceTypeStable: "g4dn.8xlarge", v1.LabelOSStable: "linux", v1.LabelArchStable: "amd64", @@ -63,8 +63,8 @@ var _ = Describe("Scheduling", func() { awsv1alpha1.LabelInstanceGPUMemory: "16384", awsv1alpha1.LabelInstanceLocalNVME: "900", // Deprecated Labels - v1.LabelFailureDomainBetaZone: fmt.Sprintf("%sa", env.Region), - v1.LabelFailureDomainBetaRegion: env.Region, + v1.LabelFailureDomainBetaZone: fmt.Sprintf("%sa", env.Metadata.Region()), + v1.LabelFailureDomainBetaRegion: env.Metadata.Region(), "beta.kubernetes.io/arch": "amd64", "beta.kubernetes.io/os": "linux", v1.LabelInstanceType: "g4dn.8xlarge", diff --git a/test/suites/integration/suite_test.go b/test/suites/integration/suite_test.go index b9d47c3151ac..8fe70604660b 100644 --- a/test/suites/integration/suite_test.go +++ b/test/suites/integration/suite_test.go @@ -23,13 +23,13 @@ import ( "github.com/aws/karpenter/test/pkg/environment" ) -var env *environment.Environment +var env *environment.AWSEnvironment func TestIntegration(t *testing.T) { RegisterFailHandler(Fail) BeforeSuite(func() { var err error - env, err = environment.NewEnvironment(t) + env, err = 
environment.NewAWSEnvironment(environment.NewEnvironment(t)) Expect(err).ToNot(HaveOccurred()) }) RunSpecs(t, "Integration") diff --git a/test/suites/notification/suite_test.go b/test/suites/notification/suite_test.go index 16156d39519b..e782337920f8 100644 --- a/test/suites/notification/suite_test.go +++ b/test/suites/notification/suite_test.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package consolidation +package notification import ( "context" @@ -20,27 +20,32 @@ import ( "regexp" "strings" "testing" + "time" + "github.com/google/uuid" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "knative.dev/pkg/ptr" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" + scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" "github.com/aws/karpenter/pkg/test" "github.com/aws/karpenter/test/pkg/environment" ) -var env *environment.Environment +var env *environment.AWSEnvironment func TestNotification(t *testing.T) { RegisterFailHandler(Fail) BeforeSuite(func() { var err error - env, err = environment.NewEnvironment(t) + env, err = environment.NewAWSEnvironment(environment.NewEnvironment(t)) Expect(err).ToNot(HaveOccurred()) }) RunSpecs(t, "Notification") @@ -54,11 +59,9 @@ var _ = AfterEach(func() { env.AfterEach() }) -var _ = Describe("Notification", func() { - FIt("should terminate the spot instance and spin-up a new node on spot interruption warning", func() { - ctx, cancel := context.WithCancel(env.Context) - defer cancel() // In case the test fails, we need this so that the goroutine monitoring the events is closed - +var _ = Describe("Notification", Label("AWS"), func() { + It("should terminate the spot instance and spin-up a new node on spot interruption warning", func() { + By("Creating a single healthy node with a healthy deployment") provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, @@ -68,12 +71,11 @@ var _ = Describe("Notification", func() { { Key: v1alpha5.LabelCapacityType, Operator: v1.NodeSelectorOpIn, - Values: []string{"spot"}, + Values: []string{awsv1alpha1.CapacityTypeSpot}, }, }, ProviderRef: &v1alpha5.ProviderRef{Name: provider.Name}, }) - numPods := 1 dep := test.Deployment(test.DeploymentOptions{ Replicas: int32(numPods), @@ -81,18 +83,7 @@ var _ = Describe("Notification", func() { ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{"app": "my-app"}, }, - TopologySpreadConstraints: []v1.TopologySpreadConstraint{ - { - MaxSkew: 1, - TopologyKey: v1.LabelHostname, - WhenUnsatisfiable: v1.DoNotSchedule, - LabelSelector: &metav1.LabelSelector{ - MatchLabels: map[string]string{ - "app": "my-app", - }, - }, - }, - }, + TerminationGracePeriodSeconds: ptr.Int64(0), }, }) selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) @@ -101,21 +92,25 @@ var _ = Describe("Notification", func() { env.EventuallyExpectHealthyPodCount(selector, numPods) env.ExpectCreatedNodeCount("==", 1) + ctx, cancel := 
context.WithCancel(env.Context) + defer cancel() // In case the test fails, we need this so that the goroutine monitoring the events is closed + node := env.Monitor.GetCreatedNodes()[0] instanceID := parseProviderID(node.Spec.ProviderID) + By("Interrupting the spot instance") _, events, _ := env.InterruptionAPI.Interrupt(env.Context, []string{instanceID}, 0, true) // Monitor the events channel done := make(chan struct{}) go func() { - defer fmt.Println("Closing event goroutine monitoring") + defer fmt.Println("[FIS EVENT MONITOR] Closing event goroutine monitoring") select { case event := <-events: if strings.Contains(event.Message, "Spot Instance Shutdown sent") { Fail("Node didn't terminate before spot instance shutdown was sent") } - fmt.Printf("[SPOT INTERRUPTION EVENT] %s\n", event.Message) + fmt.Printf("[FIS EVENT MONITOR] %s\n", event.Message) case <-done: return case <-ctx.Done(): @@ -125,10 +120,156 @@ var _ = Describe("Notification", func() { env.EventuallyExpectNotFound(&node) close(done) // Once the node is gone, we can close the event channel because the test has effectively succeeded + env.EventuallyExpectHealthyPodCount(selector, 1) + }) + It("should terminate the node at the API server when the EC2 instance is stopped", func() { + By("Creating a single healthy node with a healthy deployment") + provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ + SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }}) + provisioner := test.Provisioner(test.ProvisionerOptions{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1alpha5.LabelCapacityType, + Operator: v1.NodeSelectorOpIn, + Values: []string{awsv1alpha1.CapacityTypeOnDemand}, + }, + }, + ProviderRef: &v1alpha5.ProviderRef{Name: provider.Name}, + }) + numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TerminationGracePeriodSeconds: ptr.Int64(0), + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + env.ExpectCreated(provider, provisioner, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + + node := env.Monitor.GetCreatedNodes()[0] + + By("Stopping the EC2 instance without the EKS cluster's knowledge") + env.ExpectInstanceStopped(node.Name) // Make a call to the EC2 api to stop the instance + env.EventuallyExpectNotFoundAssertion(&node).WithTimeout(time.Minute) // shorten the timeout since we should react faster + env.EventuallyExpectHealthyPodCount(selector, 1) + }) + It("should terminate the node at the API server when the EC2 instance is terminated", func() { + By("Creating a single healthy node with a healthy deployment") + provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ + SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }}) + provisioner := test.Provisioner(test.ProvisionerOptions{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1alpha5.LabelCapacityType, + Operator: v1.NodeSelectorOpIn, + Values: []string{awsv1alpha1.CapacityTypeOnDemand}, + }, + }, + ProviderRef: &v1alpha5.ProviderRef{Name: provider.Name}, + }) + numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + 
Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TerminationGracePeriodSeconds: ptr.Int64(0), + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + env.ExpectCreated(provider, provisioner, dep) env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + + node := env.Monitor.GetCreatedNodes()[0] + + By("Terminating the EC2 instance without the EKS cluster's knowledge") + env.ExpectInstanceTerminated(node.Name) // Make a call to the EC2 api to stop the instance + env.EventuallyExpectNotFoundAssertion(&node).WithTimeout(time.Minute) // shorten the timeout since we should react faster + env.EventuallyExpectHealthyPodCount(selector, 1) + }) + It("should terminate the node when receiving a scheduled change health event", func() { + By("Creating a single healthy node with a healthy deployment") + provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ + SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }}) + provisioner := test.Provisioner(test.ProvisionerOptions{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1alpha5.LabelCapacityType, + Operator: v1.NodeSelectorOpIn, + Values: []string{awsv1alpha1.CapacityTypeOnDemand}, + }, + }, + ProviderRef: &v1alpha5.ProviderRef{Name: provider.Name}, + }) + numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TerminationGracePeriodSeconds: ptr.Int64(0), + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + env.ExpectCreated(provider, provisioner, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + + node := env.Monitor.GetCreatedNodes()[0] + instanceID := parseProviderID(node.Spec.ProviderID) + + By("Creating a scheduled change health event in the SQS message queue") + env.ExpectMessagesCreated(scheduledChangeMessage(env.Metadata.Region(), env.Metadata.AccountID(), instanceID)) + env.EventuallyExpectNotFound(&node) + + env.EventuallyExpectHealthyPodCount(selector, 1) }) }) +// TODO: Update the scheduled change message to accurately reflect a real health event +func scheduledChangeMessage(region, accountID, involvedInstanceID string) scheduledchangev0.AWSEvent { + return scheduledchangev0.AWSEvent{ + AWSMetadata: event.AWSMetadata{ + Version: "0", + Account: accountID, + DetailType: "AWS Health Event", + ID: uuid.NewString(), + Region: region, + Resources: []string{ + fmt.Sprintf("arn:aws:ec2:%s:instance/%s", region, involvedInstanceID), + }, + Source: "aws.health", + Time: time.Now(), + }, + Detail: scheduledchangev0.AWSHealthEventDetail{ + Service: "EC2", + EventTypeCategory: "scheduledChange", + AffectedEntities: []scheduledchangev0.AffectedEntity{ + { + EntityValue: involvedInstanceID, + }, + }, + }, + } +} + func parseProviderID(pid string) string { r := regexp.MustCompile(`aws:///(?P.*)/(?P.*)`) matches := r.FindStringSubmatch(pid) From 5a2b4d20c9821c565f3dd2e0609468c5baf76621 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 27 Sep 2022 22:01:50 -0700 Subject: [PATCH 26/55] Add instance to ICE cache on spot interruption warning --- charts/karpenter/templates/deployment.yaml | 10 ++- 
pkg/cloudprovider/aws/cloudprovider.go | 4 + .../controllers/infrastructure/controller.go | 15 +++- .../controllers/infrastructure/suite_test.go | 21 +++-- .../controllers/notification/controller.go | 79 +++++++++++-------- .../controllers/notification/event/noop.go | 2 +- .../controllers/notification/event/types.go | 12 +-- .../aws/controllers/notification/metrics.go | 14 ++-- .../controllers/notification/suite_test.go | 71 ++++++++++++++--- pkg/cloudprovider/aws/controllers/register.go | 2 +- pkg/cloudprovider/aws/events/recorder.go | 30 +++---- pkg/cloudprovider/aws/instancetypes.go | 5 +- pkg/controllers/controllers.go | 42 ++++++---- pkg/utils/options/options.go | 9 ++- .../pipeline-trigger-cron.yaml | 2 +- .../cloudformation.yaml | 1 + 16 files changed, 209 insertions(+), 110 deletions(-) diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml index ab688d428b2d..575dadb565fe 100644 --- a/charts/karpenter/templates/deployment.yaml +++ b/charts/karpenter/templates/deployment.yaml @@ -75,6 +75,12 @@ spec: {{- end }} - name: KARPENTER_SERVICE value: {{ include "karpenter.fullname" . }} + - name: DEPLOYMENT_NAME + value: {{ include "karpenter.fullname" . }} + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name - name: SYSTEM_NAMESPACE valueFrom: fieldRef: @@ -138,10 +144,6 @@ spec: {{- end }} - name: KARPENTER_SERVICE value: {{ include "karpenter.fullname" . }} - - name: DEPLOYMENT_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - name: SYSTEM_NAMESPACE valueFrom: fieldRef: diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 6444aad3c990..66fca8615389 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -231,6 +231,10 @@ func (c *CloudProvider) EventBridgeProvider() *EventBridgeProvider { return c.eventBridgeProvider } +func (c *CloudProvider) InstanceTypeProvider() *InstanceTypeProvider { + return c.instanceTypeProvider +} + // Default the provisioner func (*CloudProvider) Default(ctx context.Context, provisioner *v1alpha5.Provisioner) { defaultLabels(provisioner) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index ce598990bce9..9653d9d686b3 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -31,6 +31,7 @@ import ( "k8s.io/apimachinery/pkg/types" "k8s.io/utils/clock" "knative.dev/pkg/logging" + "knative.dev/pkg/system" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/cloudprovider/aws" @@ -71,15 +72,17 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c sqsProvider: sqsProvider, eventBridgeProvider: eventBridgeProvider, mutex: &sync.RWMutex{}, - backoff: newBackoff(), + backoff: newBackoff(clk), readinessChan: make(chan struct{}), trigger: make(chan struct{}, 1), done: make(chan struct{}), } + ctx, cancel := context.WithCancel(ctx) // Cancel so we don't re-provision the infra on cleanup go func() { select { case <-cleanupAsync: + cancel() c.cleanup(cleanupCtx) case <-cleanupCtx.Done(): } @@ -97,10 +100,11 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c return c } -func newBackoff() *backoff.ExponentialBackOff { +func newBackoff(clk clock.Clock) *backoff.ExponentialBackOff { b := backoff.NewExponentialBackOff() b.InitialInterval = time.Minute - 
b.MaxElapsedTime = time.Minute * 20 + b.MaxElapsedTime = time.Minute * 30 + b.Clock = clk return b } @@ -144,7 +148,7 @@ func (c *Controller) cleanup(ctx context.Context) { dep := &appsv1.Deployment{} nn := types.NamespacedName{ Name: injection.GetOptions(ctx).DeploymentName, - Namespace: injection.GetOptions(ctx).DeploymentNamespace, + Namespace: system.Namespace(), } notFound := false @@ -206,6 +210,7 @@ func (c *Controller) setReady(ctx context.Context, ready bool) { c.readinessChan = make(chan struct{}) } c.ready = ready + c.ready = ready } // EnsureInfrastructure reconciles the SQS queue and the EventBridge rules with the expected @@ -273,9 +278,11 @@ func (c *Controller) DeleteInfrastructure(ctx context.Context) (err error) { }() wg.Wait() if err != nil { + c.recorder.InfrastructureDeletionFailed(ctx, c.kubeClient) return err } logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") + c.recorder.InfrastructureDeletionSucceeded(ctx, c.kubeClient) return nil } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index a1ba859fa8eb..bcde5833bfd9 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -27,6 +27,7 @@ import ( . "github.com/onsi/gomega" clock "k8s.io/utils/clock/testing" . "knative.dev/pkg/logging/testing" + _ "knative.dev/pkg/system/testing" . "github.com/aws/karpenter/pkg/test/expectations" "github.com/aws/karpenter/pkg/utils/injection" @@ -59,13 +60,11 @@ var defaultOpts = options.Options{ AWSEnablePodENI: true, AWSDefaultInstanceProfile: "test-instance-profile", DeploymentName: test.KarpenterDeployment().Name, - DeploymentNamespace: test.KarpenterDeployment().Namespace, } func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) - SetDefaultEventuallyTimeout(time.Minute) RunSpecs(t, "AWS Notification") } @@ -83,14 +82,14 @@ var _ = BeforeEach(func() { eventbridgeapi = &awsfake.EventBridgeAPI{} sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi, metadata) eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) + + cleanupChan = make(chan struct{}, 1) + startChan = make(chan struct{}) + + controller = infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, startChan, cleanupChan) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") ExpectApplied(env.Ctx, env.Client, test.KarpenterDeployment()) - cleanupChan = make(chan struct{}, 1) - startChan = make(chan struct{}) - sqsapi.Reset() - eventbridgeapi.Reset() - controller = infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, startChan, cleanupChan) }) var _ = AfterEach(func() { @@ -155,6 +154,10 @@ var _ = Describe("Reconciliation", func() { sqsapi.CreateQueueBehavior.Reset() eventbridgeapi.PutRuleBehavior.Reset() eventbridgeapi.PutTargetsBehavior.Reset() + + // Give the loop a second to stabilize + time.Sleep(time.Second) + fakeClock.Step(time.Minute * 11) // Should reconcile again after failed access denied calls @@ -183,6 +186,10 @@ var _ = Describe("Reconciliation", func() { // Backoff is 1 minute, so we set the fake clock forward 2 minutes // Access denied has now been resolved sqsapi.CreateQueueBehavior.Reset() + + // Give the loop a second to stabilize + time.Sleep(time.Second) + fakeClock.Step(time.Minute * 2) // 
Should reconcile again after failed access denied calls diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 2ee4c90ed8d2..46d0812825a0 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -33,6 +33,7 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" + awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" @@ -57,12 +58,13 @@ var Actions = struct { // Controller is the notification controller. It is not a standard controller-runtime controller in that it doesn't // have a reconcile method. type Controller struct { - kubeClient client.Client - cluster *state.Cluster - recorder events.Recorder - clock clock.Clock - provider *aws.SQSProvider - parser event.Parser + kubeClient client.Client + cluster *state.Cluster + recorder events.Recorder + clock clock.Clock + provider *aws.SQSProvider + instanceTypeProvider *aws.InstanceTypeProvider + parser event.Parser infraController *infrastructure.Controller backoff *backoff.ExponentialBackOff @@ -73,17 +75,19 @@ const pollingPeriod = 2 * time.Second func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, cluster *state.Cluster, sqsProvider *aws.SQSProvider, - infraController *infrastructure.Controller, startAsync <-chan struct{}) *Controller { + instanceTypeProvider *aws.InstanceTypeProvider, infraController *infrastructure.Controller, + startAsync <-chan struct{}) *Controller { c := &Controller{ - kubeClient: kubeClient, - cluster: cluster, - recorder: recorder, - clock: clk, - provider: sqsProvider, - parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), - infraController: infraController, - backoff: newBackoff(), + kubeClient: kubeClient, + cluster: cluster, + recorder: recorder, + clock: clk, + provider: sqsProvider, + instanceTypeProvider: instanceTypeProvider, + parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), + infraController: infraController, + backoff: newBackoff(clk), } go func() { @@ -98,10 +102,11 @@ func NewController(ctx context.Context, kubeClient client.Client, clk clock.Cloc return c } -func newBackoff() *backoff.ExponentialBackOff { +func newBackoff(clk clock.Clock) *backoff.ExponentialBackOff { b := backoff.NewExponentialBackOff() b.InitialInterval = time.Second * 2 b.MaxElapsedTime = time.Minute * 30 + b.Clock = clk return b } @@ -134,7 +139,7 @@ func (c *Controller) run(ctx context.Context) { } func (c *Controller) PollSQS(ctx context.Context) error { - defer metrics.Measure(reconcileDuration.WithLabelValues())() + defer metrics.Measure(reconcileDuration)() sqsMessages, err := c.provider.GetSQSMessages(ctx) if err != nil { @@ -175,12 +180,11 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string if err != nil { return fmt.Errorf("failed to delete message from queue, %w", err) } - deletedMessages.WithLabelValues().Inc() + deletedMessages.Inc() return } receivedMessages.WithLabelValues(evt.Kind(), "true").Inc() - action := actionForEvent(evt) nodeNames := lo.Map(nodes, func(n 
*v1.Node, _ int) string { return n.Name }) logging.FromContext(ctx).Infof("Received actionable event from SQS queue for node(s) [%s%s]", strings.Join(lo.Slice(nodeNames, 0, 3), ","), @@ -188,16 +192,7 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string for i := range nodes { node := nodes[i] - nodeCtx := logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) - - // Record metric and event for this action - c.notifyForEvent(evt, node) - actionsTaken.WithLabelValues(action).Inc() - - if action != Actions.NoAction { - e := c.deleteInstance(nodeCtx, node) - err = multierr.Append(err, e) - } + err = multierr.Append(err, c.handleNode(ctx, evt, node)) } if err != nil { return fmt.Errorf("failed to act on nodes [%s%s], %w", @@ -208,14 +203,36 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string if err != nil { return fmt.Errorf("failed to delete message from queue, %w", err) } - deletedMessages.WithLabelValues().Inc() + deletedMessages.Inc() + return nil +} + +func (c *Controller) handleNode(ctx context.Context, evt event.Interface, node *v1.Node) error { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) + action := actionForEvent(evt) + + // Record metric and event for this action + c.notifyForEvent(evt, node) + actionsPerformed.WithLabelValues(action).Inc() + + // Mark the offering as unavailable in the ICE cache since we got a spot interruption warning + if evt.Kind() == event.Kinds.SpotInterruption { + zone := node.Labels[v1.LabelTopologyZone] + instanceType := node.Labels[v1.LabelInstanceTypeStable] + if zone != "" && instanceType != "" { + c.instanceTypeProvider.MarkOfferingUnavailable(instanceType, zone, awsv1alpha1.CapacityTypeSpot) + } + } + if action != Actions.NoAction { + return c.deleteInstance(ctx, node) + } return nil } func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { c.recorder.TerminatingNodeOnNotification(node) if err := c.kubeClient.Delete(ctx, node); err != nil { - return fmt.Errorf("deleting the spot interrupted node, %w", err) + return fmt.Errorf("deleting the node on notification, %w", err) } return nil } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/noop.go b/pkg/cloudprovider/aws/controllers/notification/event/noop.go index c2709c59353d..0e3c5267ae92 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/noop.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/noop.go @@ -32,7 +32,7 @@ func (NoOp) EC2InstanceIDs() []string { } func (NoOp) Kind() Kind { - return Kinds.Noop + return Kinds.NoOp } func (n NoOp) MarshalLogObject(enc zapcore.ObjectEncoder) error { diff --git a/pkg/cloudprovider/aws/controllers/notification/event/types.go b/pkg/cloudprovider/aws/controllers/notification/event/types.go index 7c6916478caf..fb2f19b2dd85 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/types.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/types.go @@ -38,11 +38,11 @@ var Kinds = struct { ScheduledChange, SpotInterruption, StateChange, - Noop Kind + NoOp Kind }{ - RebalanceRecommendation: "rebalanceRecommendation", - ScheduledChange: "scheduledChange", - SpotInterruption: "spotInterruption", - StateChange: "stateChange", - Noop: "noop", + RebalanceRecommendation: "RebalanceRecommendation", + ScheduledChange: "ScheduledChange", + SpotInterruption: "SpotInterruption", + StateChange: "StateChange", + NoOp: "NoOp", } diff --git 
a/pkg/cloudprovider/aws/controllers/notification/metrics.go b/pkg/cloudprovider/aws/controllers/notification/metrics.go index 41ce9d1fdb6a..08b445f279a1 100644 --- a/pkg/cloudprovider/aws/controllers/notification/metrics.go +++ b/pkg/cloudprovider/aws/controllers/notification/metrics.go @@ -29,7 +29,7 @@ const ( ) var ( - reconcileDuration = prometheus.NewHistogramVec( + reconcileDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: metrics.Namespace, Subsystem: subSystem, @@ -37,7 +37,6 @@ var ( Help: "Duration of notification reconciliation process in seconds.", Buckets: metrics.DurationBuckets(), }, - []string{}, ) receivedMessages = prometheus.NewCounterVec( prometheus.CounterOpts{ @@ -48,26 +47,25 @@ var ( }, []string{messageTypeLabel, actionableTypeLabel}, ) - deletedMessages = prometheus.NewCounterVec( + deletedMessages = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: subSystem, Name: "deleted_messages", Help: "Count of messages deleted from the SQS queue.", }, - []string{}, ) - actionsTaken = prometheus.NewCounterVec( + actionsPerformed = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.Namespace, Subsystem: subSystem, - Name: "actions_taken", - Help: "Count of actions taken based on notifications from the SQS queue. Broken down by action type", + Name: "actions_performed", + Help: "Number of notification actions performed. Labeled by action", }, []string{actionTypeLabel}, ) ) func init() { - crmetrics.Registry.MustRegister(reconcileDuration, receivedMessages, deletedMessages, actionsTaken) + crmetrics.Registry.MustRegister(reconcileDuration, receivedMessages, deletedMessages, actionsPerformed) } diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index 2c6e240f573b..d2d6b3742477 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -24,6 +24,7 @@ import ( awssdk "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" + "github.com/aws/aws-sdk-go/awstesting/mock" "github.com/aws/aws-sdk-go/service/sqs" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -32,25 +33,29 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" clock "k8s.io/utils/clock/testing" . "knative.dev/pkg/logging/testing" + _ "knative.dev/pkg/system/testing" "github.com/google/uuid" + "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider" + "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" spotinterruptionv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0" statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" - . 
"github.com/aws/karpenter/pkg/test/expectations" - - "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/aws/karpenter/pkg/cloudprovider/aws" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" "github.com/aws/karpenter/pkg/cloudprovider/fake" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/test" + . "github.com/aws/karpenter/pkg/test/expectations" + "github.com/aws/karpenter/pkg/utils/injection" + "github.com/aws/karpenter/pkg/utils/options" ) const ( @@ -64,10 +69,12 @@ const ( var ctx context.Context var env *test.Environment var cluster *state.Cluster +var ec2api *awsfake.EC2API var sqsapi *awsfake.SQSAPI var eventbridgeapi *awsfake.EventBridgeAPI var cloudProvider *fake.CloudProvider var sqsProvider *aws.SQSProvider +var instanceTypeProvider *aws.InstanceTypeProvider var eventBridgeProvider *aws.EventBridgeProvider var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock @@ -85,6 +92,10 @@ func TestAPIs(t *testing.T) { } var _ = BeforeEach(func() { + opts := options.Options{ + AWSIsolatedVPC: true, + } + ctx = injection.WithOptions(ctx, opts) env = test.NewEnvironment(ctx, func(e *test.Environment) { cfg = test.NewConfig() fakeClock = clock.NewFakeClock(time.Now()) @@ -99,14 +110,16 @@ var _ = BeforeEach(func() { eventbridgeapi = &awsfake.EventBridgeAPI{} eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) + infraStartChan = make(chan struct{}) + notificationStartChan = make(chan struct{}) + + ec2api = &awsfake.EC2API{} + subnetProvider := aws.NewSubnetProvider(ec2api) + instanceTypeProvider = aws.NewInstanceTypeProvider(env.Ctx, mock.Session, cloudprovider.Options{}, ec2api, subnetProvider) + infraController = infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan, env.Ctx.Done()) + controller = notification.NewController(env.Ctx, env.Client, fakeClock, recorder, cluster, sqsProvider, instanceTypeProvider, infraController, notificationStartChan) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") - sqsapi.Reset() - eventbridgeapi.Reset() - infraStartChan = make(chan struct{}) - notificationStartChan = make(chan struct{}) - infraController = infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan, env.Ctx.Done()) - controller = notification.NewController(env.Ctx, env.Client, fakeClock, recorder, cluster, sqsProvider, infraController, notificationStartChan) }) var _ = AfterEach(func() { @@ -244,6 +257,40 @@ var _ = Describe("Processing Messages", func() { ExpectNodeExists(env.Ctx, env.Client, node.Name) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) + It("should mark the ICE cache for the offering when getting a spot interruption warning", func() { + node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + v1.LabelTopologyZone: "test-zone-1a", + v1.LabelInstanceTypeStable: "t3.large", + v1alpha5.LabelCapacityType: v1alpha1.CapacityTypeSpot, + }, + }, + ProviderID: makeProviderID(defaultInstanceID), + }) + ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) + ExpectApplied(env.Ctx, env.Client, node) + ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + + 
Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + ExpectNotFound(env.Ctx, env.Client, node) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + + // Expect a t3.large in test-zone-1a to not be returned since we should add it to the ICE cache + instanceTypes, err := instanceTypeProvider.Get(env.Ctx, &v1alpha1.AWS{}, &v1alpha5.KubeletConfiguration{}) + Expect(err).To(Succeed()) + + t3Large := lo.Filter(instanceTypes, func(it cloudprovider.InstanceType, _ int) bool { + return it.Name() == "t3.large" + }) + Expect(len(t3Large)).To(BeNumerically("==", 1)) + matchingOfferings := lo.Filter(t3Large[0].Offerings(), func(of cloudprovider.Offering, _ int) bool { + return of.CapacityType == v1alpha1.CapacityTypeSpot && of.Zone == "test-zone-1a" + }) + Expect(len(matchingOfferings)).To(BeNumerically("==", 1)) + Expect(matchingOfferings[0].Available).To(BeFalse()) + }) }) var _ = Describe("Error Handling", func() { diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index decb7050d26c..0282422412ef 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -32,6 +32,6 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller // Injecting the AWS-specific controllers that will start when opts.StartAsync is triggered infraController := infrastructure.NewController(ctx, opts.BaseContext(), opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) - notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), infraController, opts.StartAsync) + notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController, opts.StartAsync) return infraController.Done() } diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index 3292c367def1..480db7965a2b 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -18,11 +18,11 @@ import ( "context" "github.com/avast/retry-go" - appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" "knative.dev/pkg/logging" + "knative.dev/pkg/system" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/events" @@ -89,49 +89,49 @@ func (r recorder) TerminatingNodeOnNotification(node *v1.Node) { } func (r recorder) InfrastructureHealthy(ctx context.Context, kubeClient client.Client) { - dep := &appsv1.Deployment{} + pod := &v1.Pod{} err := retry.Do(func() error { - return kubeClient.Get(ctx, types.NamespacedName{Namespace: injection.GetOptions(ctx).DeploymentNamespace, Name: injection.GetOptions(ctx).DeploymentName}, dep) + return kubeClient.Get(ctx, types.NamespacedName{Namespace: system.Namespace(), Name: injection.GetOptions(ctx).PodName}, pod) }) if err != nil { logging.FromContext(ctx).Errorf("Sending InfrastructureHealthy event, %v", err) return } - r.Eventf(dep, "Normal", "AWSInfrastructureHealthy", "Karpenter infrastructure reconciliation is healthy") + r.Eventf(pod, "Normal", "AWSInfrastructureHealthy", "Karpenter infrastructure reconciliation is healthy") } func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client.Client) { - dep := &appsv1.Deployment{} + pod := &v1.Pod{} err := retry.Do(func() error { - return kubeClient.Get(ctx, 
types.NamespacedName{Namespace: injection.GetOptions(ctx).DeploymentNamespace, Name: injection.GetOptions(ctx).DeploymentName}, dep) + return kubeClient.Get(ctx, types.NamespacedName{Namespace: system.Namespace(), Name: injection.GetOptions(ctx).PodName}, pod) }) if err != nil { logging.FromContext(ctx).Errorf("Sending InfrastructureUnhealthy event, %v", err) return } - r.Eventf(dep, "Warning", "AWSInfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") + r.Eventf(pod, "Warning", "AWSInfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") } func (r recorder) InfrastructureDeletionSucceeded(ctx context.Context, kubeClient client.Client) { - dep := &appsv1.Deployment{} + pod := &v1.Pod{} err := retry.Do(func() error { - return kubeClient.Get(ctx, types.NamespacedName{Namespace: injection.GetOptions(ctx).DeploymentNamespace, Name: injection.GetOptions(ctx).DeploymentName}, dep) + return kubeClient.Get(ctx, types.NamespacedName{Namespace: system.Namespace(), Name: injection.GetOptions(ctx).PodName}, pod) }) if err != nil { - logging.FromContext(ctx).Errorf("Sending InfrastructureUnhealthy event, %v", err) + logging.FromContext(ctx).Errorf("Sending InfrastructureDeletionSucceeded event, %v", err) return } - r.Eventf(dep, "Warning", "InfrastructureDeletionSucceeded", "Karpenter infrastructure deletion succeeded") + r.Eventf(pod, "Normal", "AWSInfrastructureDeletionSucceeded", "Karpenter infrastructure deletion succeeded") } func (r recorder) InfrastructureDeletionFailed(ctx context.Context, kubeClient client.Client) { - dep := &appsv1.Deployment{} + pod := &v1.Pod{} err := retry.Do(func() error { - return kubeClient.Get(ctx, types.NamespacedName{Namespace: injection.GetOptions(ctx).DeploymentNamespace, Name: injection.GetOptions(ctx).DeploymentName}, dep) + return kubeClient.Get(ctx, types.NamespacedName{Namespace: system.Namespace(), Name: injection.GetOptions(ctx).PodName}, pod) }) if err != nil { - logging.FromContext(ctx).Errorf("Sending InfrastructureUnhealthy event, %v", err) + logging.FromContext(ctx).Errorf("Sending InfrastructureDeletionFailed event, %v", err) return } - r.Eventf(dep, "Warning", "InfrastructureDeletionFailed", "Karpenter infrastructure deletion failed") + r.Eventf(pod, "Warning", "AWSInfrastructureDeletionFailed", "Karpenter infrastructure deletion failed") } diff --git a/pkg/cloudprovider/aws/instancetypes.go b/pkg/cloudprovider/aws/instancetypes.go index 148832ea7b97..cff1a7a4d881 100644 --- a/pkg/cloudprovider/aws/instancetypes.go +++ b/pkg/cloudprovider/aws/instancetypes.go @@ -247,7 +247,10 @@ func (p *InstanceTypeProvider) CacheUnavailable(ctx context.Context, fleetErr *e zone, capacityType, UnfulfillableCapacityErrorCacheTTL) - // even if the key is already in the cache, we still need to call Set to extend the cached entry's TTL + p.MarkOfferingUnavailable(instanceType, zone, capacityType) +} + +func (p *InstanceTypeProvider) MarkOfferingUnavailable(instanceType, zone, capacityType string) { p.unavailableOfferings.SetDefault(UnavailableOfferingsCacheKey(instanceType, zone, capacityType), struct{}{}) } diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 40a4376b7378..c4f98ee6efba 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -114,17 +114,6 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) ctx = newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx))() ctx, cancel := context.WithCancel(ctx) - 
// Setup the cleanup logic for teardown on SIGINT or SIGTERM - cleanup := make(chan struct{}) // This is a channel to broadcast to controllers cleanup can start - go func() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - <-sigs - logging.FromContext(context.Background()).Infof("Got a signal to react to") - close(cleanup) - cancel() - }() - logging.FromContext(ctx).Infof("Initializing with version %s", project.Version) if opts.MemoryLimit > 0 { @@ -143,6 +132,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) HealthProbeBindAddress: fmt.Sprintf(":%d", opts.HealthProbePort), BaseContext: newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx)), }) + cleanupAsync := make(chan struct{}) // This is a channel to broadcast to controllers cleanup can start if opts.EnableProfiling { utilruntime.Must(registerPprof(manager)) @@ -151,7 +141,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected(), - CleanupAsync: cleanup, + CleanupAsync: cleanupAsync, }) if hp, ok := cloudProvider.(HealthCheck); ok { utilruntime.Must(manager.AddHealthzCheck("cloud-provider", hp.LivenessProbe)) @@ -185,13 +175,14 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) KubeClient: manager.GetClient(), Recorder: recorder, StartAsync: manager.Elected(), - CleanupAsync: cleanup, + CleanupAsync: cleanupAsync, Clock: realClock, } - done := injectControllers(ctx, controllerOptions) + cleanupDone := injectControllers(ctx, controllerOptions) metricsstate.StartMetricScraper(ctx, cluster) + StartCleanupWatcher(ctx, cancel, manager.Elected(), cleanupAsync, cleanupDone) if err := RegisterControllers(ctx, manager, provisioning.NewController(manager.GetClient(), provisioner, recorder), @@ -206,7 +197,6 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) ).Start(ctx); err != nil { panic(fmt.Sprintf("Unable to start manager, %s", err)) } - <-done } // NewManagerOrDie instantiates a controller manager or panics @@ -247,6 +237,28 @@ func RegisterControllers(ctx context.Context, m manager.Manager, controllers ... 
return m } +// StartCleanupWatcher monitors the signal channel for termination, closes the cleanupAsync channel when +// there is a signal for pod termination, waits for the cleanup operation to complete, and then cancels all contexts +// Only the leader will perform the cleanup operation +func StartCleanupWatcher(ctx context.Context, cancel context.CancelFunc, elected <-chan struct{}, + cleanupAsync chan<- struct{}, cleanupDone ...<-chan struct{}) { + go func() { + sigs := make(chan os.Signal, 1) + signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) + select { + case <-elected: + <-sigs + logging.FromContext(ctx).Infof("Initiating cleanup processes") + close(cleanupAsync) + for _, c := range cleanupDone { + <-c + } + case <-sigs: + } + cancel() + }() +} + func registerPprof(manager manager.Manager) error { for path, handler := range map[string]http.Handler{ "/debug/pprof/": http.HandlerFunc(pprof.Index), diff --git a/pkg/utils/options/options.go b/pkg/utils/options/options.go index 58402aaaa522..a4762c34817e 100644 --- a/pkg/utils/options/options.go +++ b/pkg/utils/options/options.go @@ -22,7 +22,6 @@ import ( "os" "go.uber.org/multierr" - "knative.dev/pkg/system" "github.com/aws/karpenter/pkg/utils/env" ) @@ -46,8 +45,8 @@ type Options struct { EnableLeaderElection bool MemoryLimit int64 // Metadata information - DeploymentName string - DeploymentNamespace string + DeploymentName string + PodName string // AWS Specific ClusterName string ClusterEndpoint string @@ -100,8 +99,10 @@ func (o *Options) MustParse() *Options { if err := o.Validate(); err != nil { panic(err) } + + // Set the metadata fields in the options o.DeploymentName = env.WithDefaultString("DEPLOYMENT_NAME", "karpenter") - o.DeploymentNamespace = system.Namespace() + o.PodName = env.WithDefaultString("POD_NAME", "") return o } diff --git a/test/infrastructure/clusters/test-infra/karpenter-tests/pipeline-trigger-cron.yaml b/test/infrastructure/clusters/test-infra/karpenter-tests/pipeline-trigger-cron.yaml index a31369263b13..4ac5f012f3bf 100644 --- a/test/infrastructure/clusters/test-infra/karpenter-tests/pipeline-trigger-cron.yaml +++ b/test/infrastructure/clusters/test-infra/karpenter-tests/pipeline-trigger-cron.yaml @@ -37,7 +37,7 @@ data: pipelines-trigger.sh: |+ #!/usr/bin/env bash set -euo pipefail - for suite in "Integration" "Consolidation" "Utilization"; do + for suite in "Integration" "Consolidation" "Utilization" "Notification"; do cat < Date: Tue, 4 Oct 2022 11:12:58 -0700 Subject: [PATCH 27/55] go.mod deps update --- test/go.mod | 6 ++++-- test/go.sum | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/test/go.mod b/test/go.mod index 5cfd398033ec..1515fff7a353 100644 --- a/test/go.mod +++ b/test/go.mod @@ -7,9 +7,11 @@ require ( github.com/aws/aws-sdk-go v1.44.114 github.com/aws/aws-sdk-go-v2/config v1.17.8 github.com/aws/karpenter v0.17.0 + github.com/google/uuid v1.3.0 github.com/onsi/ginkgo/v2 v2.2.0 github.com/onsi/gomega v1.21.1 github.com/samber/lo v1.31.0 + go.uber.org/multierr v1.8.0 k8s.io/api v0.25.2 k8s.io/apimachinery v0.25.2 k8s.io/client-go v0.25.2 @@ -23,6 +25,7 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 // indirect github.com/PuerkitoBio/purell v1.1.1 // indirect github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578 // indirect + github.com/avast/retry-go v3.0.0+incompatible // indirect github.com/aws/aws-sdk-go-v2 v1.16.16 // indirect github.com/aws/aws-sdk-go-v2/credentials v1.12.21 // indirect github.com/aws/aws-sdk-go-v2/feature/ec2/imds 
v1.12.17 // indirect @@ -59,7 +62,6 @@ require ( github.com/google/gnostic v0.5.7-v3refs // indirect github.com/google/go-cmp v0.5.8 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/uuid v1.3.0 // indirect github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/imdario/mergo v0.3.13 // indirect @@ -74,6 +76,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/patrickmn/go-cache v2.1.0+incompatible // indirect + github.com/pelletier/go-toml/v2 v2.0.5 // indirect github.com/pkg/errors v0.9.1 // indirect github.com/prometheus/client_golang v1.13.0 // indirect github.com/prometheus/client_model v0.2.0 // indirect @@ -84,7 +87,6 @@ require ( go.opencensus.io v0.23.0 // indirect go.uber.org/atomic v1.9.0 // indirect go.uber.org/automaxprocs v1.4.0 // indirect - go.uber.org/multierr v1.8.0 // indirect go.uber.org/zap v1.23.0 // indirect golang.org/x/exp v0.0.0-20220303212507-bbda1eaf7a17 // indirect golang.org/x/net v0.0.0-20220722155237-a158d28d115b // indirect diff --git a/test/go.sum b/test/go.sum index e0fd11f9b10e..82a388581601 100644 --- a/test/go.sum +++ b/test/go.sum @@ -62,6 +62,8 @@ github.com/alecthomas/units v0.0.0-20151022065526-2efee857e7cf/go.mod h1:ybxpYRF github.com/alecthomas/units v0.0.0-20190717042225-c3de453c63f4/go.mod h1:ybxpYRFXyAe+OPACYpWeL0wqObRcbAqCMya13uyzqw0= github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d/go.mod h1:rBZYJk541a8SKzHPHnH3zbiI+7dagKZ0cgpgrD7Fyho= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= +github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= +github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/amazon-ec2-spot-interrupter v0.0.9 h1:2yRK7f29tPhrkFBn1lg6QZNjY3iE1ovsjom99OnBCDw= github.com/aws/amazon-ec2-spot-interrupter v0.0.9/go.mod h1:TwqwmD9RUPwjjcyklxGlzxIxbA6oRfDn6lQf0Muu8/A= github.com/aws/aws-sdk-go v1.44.114 h1:plIkWc/RsHr3DXBj4MEw9sEW4CcL/e2ryokc+CKyq1I= @@ -342,6 +344,8 @@ github.com/onsi/gomega v1.21.1 h1:OB/euWYIExnPBohllTicTHmGTrMaqJ67nIu80j0/uEM= github.com/onsi/gomega v1.21.1/go.mod h1:iYAIXgPSaDHak0LCMA+AWBpIKBr8WZicMxnE8luStNc= github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= +github.com/pelletier/go-toml/v2 v2.0.5 h1:ipoSadvV8oGUjnUbMub59IDPPwfxF694nG/jwbMiyQg= +github.com/pelletier/go-toml/v2 v2.0.5/go.mod h1:OMHamSCAODeSsVrwwvcJOaoN0LIUIaFVNZzmWyNfXas= github.com/pkg/errors v0.8.0/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -390,13 +394,16 @@ github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An github.com/stoewer/go-strcase v1.2.0/go.mod h1:IBiWB2sKIp3wVVQ3Y035++gc+knqhUQag1KpM8ahLw8= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.1.1/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= 
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= github.com/stretchr/testify v1.5.1/go.mod h1:5W2xD1RspED5o8YsWQXVCued0rvSQ+mT+I5cxcmMvtA= github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0 h1:pSgiaMZlXftHpm5L7V1+rVB+AZJydKsMxsQBIJw4PKk= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/thoas/go-funk v0.9.1 h1:O549iLZqPpTUQ10ykd26sZhzD+rmR5pWhuElrhbC20M= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= From 7631357f957658645eac1d55865acc464b5c3703 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 4 Oct 2022 11:42:44 -0700 Subject: [PATCH 28/55] Remove baseContext from controller options --- .../aws/controllers/infrastructure/controller.go | 10 +++++----- .../aws/controllers/infrastructure/suite_test.go | 2 +- .../aws/controllers/notification/suite_test.go | 2 +- pkg/cloudprovider/aws/controllers/register.go | 2 +- pkg/controllers/controllers.go | 10 ++++------ 5 files changed, 12 insertions(+), 14 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 9653d9d686b3..ebeb150d51fc 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -62,7 +62,7 @@ type Controller struct { // This period can be reduced to a backoffPeriod if there is an error in ensuring the infrastructure const pollingPeriod = time.Hour -func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient client.Client, clk clock.Clock, +func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}, cleanupAsync <-chan struct{}) *Controller { c := &Controller{ @@ -78,13 +78,13 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c done: make(chan struct{}), } - ctx, cancel := context.WithCancel(ctx) // Cancel so we don't re-provision the infra on cleanup + innerCtx, cancel := context.WithCancel(ctx) // Cancel so we don't re-provision the infra on cleanup go func() { select { case <-cleanupAsync: cancel() - c.cleanup(cleanupCtx) - case <-cleanupCtx.Done(): + c.cleanup(ctx) + case <-ctx.Done(): } close(c.done) }() @@ -94,7 +94,7 @@ func NewController(ctx context.Context, cleanupCtx context.Context, kubeClient c case <-ctx.Done(): return case <-startAsync: - c.run(ctx) + c.run(innerCtx) } }() return c diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index bcde5833bfd9..1ecedb00936f 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -86,7 +86,7 @@ var _ = BeforeEach(func() { cleanupChan = make(chan struct{}, 1) startChan = make(chan struct{}) - controller = infrastructure.NewController(env.Ctx, env.Ctx, 
env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, startChan, cleanupChan) + controller = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, startChan, cleanupChan) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") ExpectApplied(env.Ctx, env.Client, test.KarpenterDeployment()) diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index d2d6b3742477..77f191ac0089 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -116,7 +116,7 @@ var _ = BeforeEach(func() { ec2api = &awsfake.EC2API{} subnetProvider := aws.NewSubnetProvider(ec2api) instanceTypeProvider = aws.NewInstanceTypeProvider(env.Ctx, mock.Session, cloudprovider.Options{}, ec2api, subnetProvider) - infraController = infrastructure.NewController(env.Ctx, env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan, env.Ctx.Done()) + infraController = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan, env.Ctx.Done()) controller = notification.NewController(env.Ctx, env.Client, fakeClock, recorder, cluster, sqsProvider, instanceTypeProvider, infraController, notificationStartChan) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 0282422412ef..74f924cd5697 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -31,7 +31,7 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) // Injecting the AWS-specific controllers that will start when opts.StartAsync is triggered - infraController := infrastructure.NewController(ctx, opts.BaseContext(), opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) + infraController := infrastructure.NewController(ctx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController, opts.StartAsync) return infraController.Done() } diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index c4f98ee6efba..e0c820e09595 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -90,11 +90,10 @@ type Controller interface { } type ControllerOptions struct { - BaseContext func() context.Context - Cluster *state.Cluster - KubeClient client.Client - Recorder events.Recorder - Clock clock.Clock + Cluster *state.Cluster + KubeClient client.Client + Recorder events.Recorder + Clock clock.Clock StartAsync <-chan struct{} CleanupAsync <-chan struct{} @@ -170,7 +169,6 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) // Inject cloudprovider-specific controllers into the controller-set using the injectControllers function // Inject the base cloud provider into the injection function rather than the decorated interface controllerOptions := &ControllerOptions{ - BaseContext: newRunnableContext(controllerRuntimeConfig, opts, 
logging.FromContext(ctx)), Cluster: cluster, KubeClient: manager.GetClient(), Recorder: recorder, From 82d11d820130be9136b87b732facf80353670558 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 4 Oct 2022 16:22:24 -0700 Subject: [PATCH 29/55] Add spot interruption handling feature flag --- Makefile | 1 + charts/karpenter/templates/configmap.yaml | 5 ++++ charts/karpenter/values.yaml | 5 ++++ cmd/controller/main.go | 2 +- .../controllers/infrastructure/controller.go | 11 ++++---- .../controllers/notification/controller.go | 7 ++--- pkg/cloudprovider/aws/controllers/register.go | 13 +++++---- pkg/config/config.go | 28 +++++++++++++++++-- pkg/controllers/controllers.go | 6 ++-- pkg/test/config.go | 28 +++++++++++++++---- test/suites/common/setup.yaml | 1 + 11 files changed, 81 insertions(+), 26 deletions(-) diff --git a/Makefile b/Makefile index ae04c20b701a..97dda921cf6c 100644 --- a/Makefile +++ b/Makefile @@ -20,6 +20,7 @@ HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${K --set clusterName=${CLUSTER_NAME} \ --set clusterEndpoint=${CLUSTER_ENDPOINT} \ --set aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ + --set controller.enableInterruptionHandling=true \ --create-namespace TEST_FILTER ?= .* diff --git a/charts/karpenter/templates/configmap.yaml b/charts/karpenter/templates/configmap.yaml index 7d9f64b708b8..94d64420b89a 100644 --- a/charts/karpenter/templates/configmap.yaml +++ b/charts/karpenter/templates/configmap.yaml @@ -12,3 +12,8 @@ metadata: data: "batchMaxDuration": "{{ .Values.controller.batchMaxDuration }}" "batchIdleDuration": "{{ .Values.controller.batchIdleDuration }}" + # FEATURE FLAGS + # + # enableInterruptionHandling is currently in beta and is disabled by default. Enabling interruption handling may + # require additional permissions on the controller service account. Additional permissions are outlined in the docs + "enableInterruptionHandling": "{{ .Values.controller.enableInterruptionHandling }}" diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index b384ef93bec7..e4906de1f49b 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -125,6 +125,11 @@ controller: # faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods # will be batched separately. batchIdleDuration: 1s + + # FEATURE FLAGS + # + # Whether to enable the interruption handling processing logic for the cloud provider + enableInterruptionHandling: false webhook: # -- Webhook image. 
image: "public.ecr.aws/karpenter/webhook:v0.16.3@sha256:96a2d9b06d6bc5127801f358f74b1cf2d289b423a2e9ba40c573c0b14b17dafa" diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 7e8e7e2c9e4c..def04d68613c 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -26,7 +26,7 @@ import ( func main() { controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, controllers.ControllerInitFunc) { provider := aws.NewCloudProvider(ctx, options) - return provider, func(c context.Context, opts *controllers.ControllerOptions) <-chan struct{} { + return provider, func(c context.Context, opts *controllers.ControllerOptions) []<-chan struct{} { return awscontrollers.Register(c, provider, opts) } }) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index ebeb150d51fc..44b596e4aa4b 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -65,6 +65,7 @@ const pollingPeriod = time.Hour func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, startAsync <-chan struct{}, cleanupAsync <-chan struct{}) *Controller { + c := &Controller{ kubeClient: kubeClient, recorder: recorder, @@ -77,6 +78,8 @@ func NewController(ctx context.Context, kubeClient client.Client, clk clock.Cloc trigger: make(chan struct{}, 1), done: make(chan struct{}), } + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure")) + logging.FromContext(ctx).Infof("Starting controller") innerCtx, cancel := context.WithCancel(ctx) // Cancel so we don't re-provision the infra on cleanup go func() { @@ -109,11 +112,8 @@ func newBackoff(clk clock.Clock) *backoff.ExponentialBackOff { } func (c *Controller) run(ctx context.Context) { - logger := logging.FromContext(ctx).Named("infrastructure") - ctx = logging.WithLogger(ctx, logger) - defer func() { - logger.Infof("Shutting down") + logging.FromContext(ctx).Infof("Shutting down") }() for { if err := c.EnsureInfrastructure(ctx); err != nil { @@ -143,7 +143,7 @@ func (c *Controller) run(ctx context.Context) { } func (c *Controller) cleanup(ctx context.Context) { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure.cleanup")) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("cleanup")) dep := &appsv1.Deployment{} nn := types.NamespacedName{ @@ -210,7 +210,6 @@ func (c *Controller) setReady(ctx context.Context, ready bool) { c.readinessChan = make(chan struct{}) } c.ready = ready - c.ready = ready } // EnsureInfrastructure reconciles the SQS queue and the EventBridge rules with the expected diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 46d0812825a0..4b676cd18b1d 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -89,6 +89,8 @@ func NewController(ctx context.Context, kubeClient client.Client, clk clock.Cloc infraController: infraController, backoff: newBackoff(clk), } + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("notification")) + logging.FromContext(ctx).Infof("Starting controller") go func() { select { @@ -111,11 +113,8 @@ func newBackoff(clk clock.Clock) 
*backoff.ExponentialBackOff { } func (c *Controller) run(ctx context.Context) { - logger := logging.FromContext(ctx).Named("notification") - ctx = logging.WithLogger(ctx, logger) - defer func() { - logger.Infof("Shutting down") + logging.FromContext(ctx).Infof("Shutting down") }() for { <-c.infraController.Ready() // block until the infrastructure is up and ready diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 74f924cd5697..75b4dab19eb4 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -26,12 +26,15 @@ import ( "github.com/aws/karpenter/pkg/controllers" ) -func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) <-chan struct{} { +func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) (done []<-chan struct{}) { rec := events.NewRecorder(opts.Recorder) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) - // Injecting the AWS-specific controllers that will start when opts.StartAsync is triggered - infraController := infrastructure.NewController(ctx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) - notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController, opts.StartAsync) - return infraController.Done() + // Only enable spot interruption handling controllers when the feature flag is enabled + if opts.Config.EnableInterruptionHandling() { + infraController := infrastructure.NewController(ctx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) + notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController, opts.StartAsync) + done = append(done, infraController.Done()) + } + return done } diff --git a/pkg/config/config.go b/pkg/config/config.go index 5d70c13faa93..d99c4685cdea 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -19,6 +19,7 @@ import ( "fmt" "hash/fnv" "sort" + "strconv" "sync" "time" @@ -35,13 +36,16 @@ const ( paramBatchMaxDuration = "batchMaxDuration" paramBatchIdleDuration = "batchIdleDuration" + enableInterruptionHandling = "enableInterruptionHandling" + configMapName = "karpenter-global-settings" ) // these values need to be synced with our templates/configmap.yaml var defaultConfigMapData = map[string]string{ - paramBatchMaxDuration: "10s", - paramBatchIdleDuration: "1s", + paramBatchMaxDuration: "10s", + paramBatchIdleDuration: "1s", + enableInterruptionHandling: "false", } type ChangeHandler func(c Config) @@ -54,6 +58,11 @@ type Config interface { BatchMaxDuration() time.Duration // BatchIdleDuration returns the maximum idle period used to extend a batch duration up to BatchMaxDuration BatchIdleDuration() time.Duration + + // START FEATURE FLAGS ----------------------------------- + + // EnableInterruptionHandling returns whether to enable interruption handling controllers + EnableInterruptionHandling() bool } type config struct { ctx context.Context @@ -62,6 +71,8 @@ type config struct { batchMaxDuration time.Duration batchIdleDuration time.Duration + enableInterruptionHandling bool + // hash of the config map so we only notify watches if it has changed configHash uint64 @@ -81,6 +92,12 @@ 
func (c *config) BatchIdleDuration() time.Duration { return c.batchIdleDuration } +func (c *config) EnableInterruptionHandling() bool { + c.dataMu.RLock() + defer c.dataMu.RUnlock() + return c.enableInterruptionHandling +} + func New(ctx context.Context, kubeClient *kubernetes.Clientset, iw *informer.InformedWatcher) (Config, error) { if iw.Namespace != system.Namespace() { return nil, fmt.Errorf("watcher configured for wrong namespace, expected %s found %s", system.Namespace(), iw.Namespace) @@ -143,6 +160,7 @@ func hashCM(cm *v1.ConfigMap) uint64 { return hasher.Sum64() } +//nolint:gocyclo func (c *config) configMapChanged(configMap *v1.ConfigMap) { hash := hashCM(configMap) if hash == c.configHash { @@ -171,6 +189,12 @@ func (c *config) configMapChanged(configMap *v1.ConfigMap) { c.batchMaxDuration = c.parsePositiveDuration(k, v, defaultConfigMapData[k]) case paramBatchIdleDuration: c.batchIdleDuration = c.parsePositiveDuration(k, v, defaultConfigMapData[k]) + case enableInterruptionHandling: + enabled, err := strconv.ParseBool(v) + if err != nil { + panic(fmt.Sprintf("enableInterruptionHandling should be a boolean value, %v", err)) + } + c.enableInterruptionHandling = enabled default: logging.FromContext(c.ctx).Warnf("ignoring unknown config parameter %s", k) } diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index e0c820e09595..d7a0dd9020cc 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -77,7 +77,7 @@ func init() { metrics.MustRegister() // Registers cross-controller metrics } -type ControllerInitFunc func(context.Context, *ControllerOptions) <-chan struct{} +type ControllerInitFunc func(context.Context, *ControllerOptions) []<-chan struct{} // Controller is an interface implemented by Karpenter custom resources. type Controller interface { @@ -90,6 +90,7 @@ type Controller interface { } type ControllerOptions struct { + Config config.Config Cluster *state.Cluster KubeClient client.Client Recorder events.Recorder @@ -169,6 +170,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) // Inject cloudprovider-specific controllers into the controller-set using the injectControllers function // Inject the base cloud provider into the injection function rather than the decorated interface controllerOptions := &ControllerOptions{ + Config: cfg, Cluster: cluster, KubeClient: manager.GetClient(), Recorder: recorder, @@ -180,7 +182,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) metricsstate.StartMetricScraper(ctx, cluster) - StartCleanupWatcher(ctx, cancel, manager.Elected(), cleanupAsync, cleanupDone) + StartCleanupWatcher(ctx, cancel, manager.Elected(), cleanupAsync, cleanupDone...) 
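// A minimal sketch (hypothetical controller and names, not part of this patch series) of the
// contract behind the cleanupAsync channel and the cleanupDone channels handed to
// StartCleanupWatcher above: a cloud-provider controller that owns external infrastructure
// blocks on the cleanup broadcast, tears down its resources while contexts are still live,
// and only then closes the done channel that the watcher drains before calling cancel().
//
//	func newExampleCleanupAwareController(ctx context.Context, cleanupAsync <-chan struct{}) (done <-chan struct{}) {
//		d := make(chan struct{})
//		go func() {
//			defer close(d) // StartCleanupWatcher waits on this before cancelling the remaining contexts
//			select {
//			case <-cleanupAsync:
//				// leader-driven teardown: perform any external-resource cleanup here
//			case <-ctx.Done():
//				// process is exiting without a leader-driven cleanup; nothing to tear down
//			}
//		}()
//		return d
//	}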
if err := RegisterControllers(ctx, manager, provisioning.NewController(manager.GetClient(), provisioner, recorder), diff --git a/pkg/test/config.go b/pkg/test/config.go index 81125706e066..5726204a0556 100644 --- a/pkg/test/config.go +++ b/pkg/test/config.go @@ -22,10 +22,11 @@ import ( ) type Config struct { - Mu sync.Mutex - Handlers []config.ChangeHandler - batchMaxDuration time.Duration - batchIdleDuration time.Duration + Mu sync.Mutex + Handlers []config.ChangeHandler + batchMaxDuration time.Duration + batchIdleDuration time.Duration + enableInterruptionHandling bool } func (c *Config) OnChange(handler config.ChangeHandler) { @@ -39,6 +40,7 @@ func (c *Config) SetBatchMaxDuration(d time.Duration) { defer c.Mu.Unlock() c.batchMaxDuration = d } + func (c *Config) BatchMaxDuration() time.Duration { c.Mu.Lock() defer c.Mu.Unlock() @@ -50,15 +52,29 @@ func (c *Config) SetBatchIdleDuration(d time.Duration) { defer c.Mu.Unlock() c.batchIdleDuration = d } + func (c *Config) BatchIdleDuration() time.Duration { c.Mu.Lock() defer c.Mu.Unlock() return c.batchIdleDuration } +func (c *Config) SetEnableInterruptionHandling(b bool) { + c.Mu.Lock() + defer c.Mu.Unlock() + c.enableInterruptionHandling = b +} + +func (c *Config) EnableInterruptionHandling() bool { + c.Mu.Lock() + defer c.Mu.Unlock() + return c.enableInterruptionHandling +} + func NewConfig() *Config { return &Config{ - batchMaxDuration: 10 * time.Second, - batchIdleDuration: 1 * time.Second, + batchMaxDuration: 10 * time.Second, + batchIdleDuration: 1 * time.Second, + enableInterruptionHandling: true, } } diff --git a/test/suites/common/setup.yaml b/test/suites/common/setup.yaml index 7b049705ae50..658f72248984 100644 --- a/test/suites/common/setup.yaml +++ b/test/suites/common/setup.yaml @@ -104,4 +104,5 @@ spec: --set clusterEndpoint=$(cat /root/.kube/config | grep server | awk '{print $2}') \ --set aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-$(params.cluster-name) \ --set controller.batchIdleDuration=10s \ + --set controller.enableInterruptionHandling=true \ --wait \ No newline at end of file From ebad47ea20bda08e5f804d7701c650c6273e6331 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 6 Oct 2022 09:59:51 -0700 Subject: [PATCH 30/55] Add caching resolution to metadata provider --- pkg/cloudprovider/aws/eventbridge.go | 4 +- pkg/cloudprovider/aws/metadata.go | 72 +++++++++++++-------------- pkg/cloudprovider/aws/sqs.go | 73 ++++++++++++++-------------- pkg/utils/cache/cache.go | 69 ++++++++++++++++++++++++++ 4 files changed, 142 insertions(+), 76 deletions(-) create mode 100644 pkg/utils/cache/cache.go diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index e41914799fb3..37db6959882c 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -114,7 +114,7 @@ func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) ( Rule: aws.String(r.Name), } _, e := eb.client.RemoveTargetsWithContext(ctx, targetInput) - if err != nil && !IsNotFound(e) { + if e != nil && !IsNotFound(e) { m.Lock() err = multierr.Append(err, e) m.Unlock() @@ -124,7 +124,7 @@ func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) ( Name: aws.String(r.Name), } _, e = eb.client.DeleteRuleWithContext(ctx, ruleInput) - if err != nil && !IsNotFound(e) { + if e != nil && !IsNotFound(e) { m.Lock() err = multierr.Append(err, e) m.Unlock() diff --git a/pkg/cloudprovider/aws/metadata.go b/pkg/cloudprovider/aws/metadata.go index 
f9454623c403..5a858fe510d4 100644 --- a/pkg/cloudprovider/aws/metadata.go +++ b/pkg/cloudprovider/aws/metadata.go @@ -17,37 +17,26 @@ package aws import ( "context" "fmt" + "sync" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/ec2metadata" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/sts" "github.com/aws/aws-sdk-go/service/sts/stsiface" -) - -type Metadata struct { - region string - accountID string -} -func NewMetadata(region, accountID string) *Metadata { - return &Metadata{ - region: region, - accountID: accountID, - } -} - -func (i *Metadata) Region() string { - return i.region -} - -func (i *Metadata) AccountID() string { - return i.accountID -} + "github.com/aws/karpenter/pkg/utils/cache" +) type MetadataProvider struct { imdsClient *ec2metadata.EC2Metadata stsClient stsiface.STSAPI + + region *string // cached region if already resolved + regionMu sync.RWMutex + + accountID *string // cached accountID if already resolved + accountIDMu sync.RWMutex } func NewMetadataProvider(sess *session.Session) *MetadataProvider { @@ -57,31 +46,40 @@ func NewMetadataProvider(sess *session.Session) *MetadataProvider { } } -func (i *MetadataProvider) Metadata(ctx context.Context) *Metadata { - return &Metadata{ - region: i.Region(ctx), - accountID: i.AccountID(ctx), - } -} - // Region gets the current region from EC2 IMDS func (i *MetadataProvider) Region(ctx context.Context) string { - region, err := i.imdsClient.RegionWithContext(ctx) + ret, err := cache.TryGetStringWithFallback(&i.regionMu, i.region, + func() (string, error) { + return i.imdsClient.RegionWithContext(ctx) + }) if err != nil { panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) } - return region + return ret } +// AccountID gets the AWS Account ID from EC2 IMDS, then STS if it can't be resolved at IMDS func (i *MetadataProvider) AccountID(ctx context.Context) string { - doc, err := i.imdsClient.GetInstanceIdentityDocumentWithContext(ctx) + ret, err := cache.TryGetStringWithFallback(&i.accountIDMu, i.accountID, + func() (string, error) { + doc, err := i.imdsClient.GetInstanceIdentityDocumentWithContext(ctx) + if err != nil { + // Fallback to using the STS provider if IMDS fails + result, err := i.stsClient.GetCallerIdentity(&sts.GetCallerIdentityInput{}) + if err != nil { + return "", err + } + return aws.StringValue(result.Account), nil + } + return doc.AccountID, nil + }, + ) if err != nil { - // Fallback to using the STS provider if IMDS fails - result, err := i.stsClient.GetCallerIdentity(&sts.GetCallerIdentityInput{}) - if err != nil { - panic(fmt.Sprintf("Failed to get account ID from IMDS or STS, %s", err)) - } - return aws.StringValue(result.Account) + panic(fmt.Sprintf("Failed to get account ID from IMDS or STS, %s", err)) } - return doc.AccountID + return ret +} + +func (i *MetadataProvider) Partition() string { + return i.imdsClient.PartitionID } diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index fff0bec8ad7f..d74f0bb95cce 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -26,6 +26,7 @@ import ( "github.com/samber/lo" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/utils/cache" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" ) @@ -53,21 +54,21 @@ type SQSProvider struct { createQueueInput *sqs.CreateQueueInput getQueueURLInput *sqs.GetQueueUrlInput receiveMessageInput *sqs.ReceiveMessageInput - mutex 
*sync.RWMutex - queueURL string + mu sync.RWMutex + queueURL *string queueName string - metadata *Metadata + metadataProvider *MetadataProvider } -func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadata *Metadata) *SQSProvider { +func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadataProvider *MetadataProvider) *SQSProvider { provider := &SQSProvider{ - client: client, - mutex: &sync.RWMutex{}, - metadata: metadata, - queueName: getQueueName(ctx), + client: client, + mu: sync.RWMutex{}, + metadataProvider: metadataProvider, + queueName: getQueueName(ctx), } provider.createQueueInput = &sqs.CreateQueueInput{ - Attributes: provider.getQueueAttributes(), + Attributes: provider.getQueueAttributes(ctx), QueueName: aws.String(provider.queueName), Tags: map[string]*string{ v1alpha5.DiscoveryLabelKey: aws.String(injection.GetOptions(ctx).ClusterName), @@ -99,9 +100,9 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { if err != nil { return fmt.Errorf("creating sqs queue, %w", err) } - s.mutex.Lock() - defer s.mutex.Unlock() - s.queueURL = aws.StringValue(result.QueueUrl) + s.mu.Lock() + defer s.mu.Unlock() + s.queueURL = result.QueueUrl return nil } @@ -112,7 +113,7 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { } setQueueAttributesInput := &sqs.SetQueueAttributesInput{ - Attributes: s.getQueueAttributes(), + Attributes: s.getQueueAttributes(ctx), QueueUrl: aws.String(queueURL), } _, err = s.client.SetQueueAttributesWithContext(ctx, setQueueAttributesInput) @@ -123,24 +124,17 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { } func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (string, error) { - s.mutex.RLock() - queueURL := s.queueURL - s.mutex.RUnlock() - if queueURL != "" && !ignoreCache { - return queueURL, nil - } - s.mutex.Lock() - defer s.mutex.Unlock() - // We have to check if the queueUrl is set again here in case multiple threads make it past the read-locked section - if s.queueURL != "" && !ignoreCache { - return s.queueURL, nil - } - result, err := s.client.GetQueueUrlWithContext(ctx, s.getQueueURLInput) - if err != nil { - return "", fmt.Errorf("fetching queue url, %w", err) - } - s.queueURL = aws.StringValue(result.QueueUrl) - return aws.StringValue(result.QueueUrl), nil + opts := lo.Ternary(ignoreCache, cache.IgnoreCacheOption, nil) + return cache.TryGetStringWithFallback(&s.mu, s.queueURL, + func() (string, error) { + ret, err := s.client.GetQueueUrlWithContext(ctx, s.getQueueURLInput) + if err != nil { + return "", fmt.Errorf("fetching queue url, %w", err) + } + return aws.StringValue(ret.QueueUrl), nil + }, + opts, + ) } func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { @@ -221,15 +215,15 @@ func (s *SQSProvider) DeleteQueue(ctx context.Context) error { return nil } -func (s *SQSProvider) getQueueAttributes() map[string]*string { - policy := lo.Must(json.Marshal(s.getQueuePolicy())) +func (s *SQSProvider) getQueueAttributes(ctx context.Context) map[string]*string { + policy := lo.Must(json.Marshal(s.getQueuePolicy(ctx))) return map[string]*string{ sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("300"), sqs.QueueAttributeNamePolicy: aws.String(string(policy)), } } -func (s *SQSProvider) getQueuePolicy() *QueuePolicy { +func (s *SQSProvider) getQueuePolicy(ctx context.Context) *QueuePolicy { return &QueuePolicy{ Version: "2008-10-17", ID: "EC2NotificationPolicy", @@ -243,14 +237,19 @@ func (s *SQSProvider) 
getQueuePolicy() *QueuePolicy { }, }, Action: []string{"sqs:SendMessage"}, - Resource: s.getQueueARN(), + Resource: s.getQueueARN(ctx), }, }, } } -func (s *SQSProvider) getQueueARN() string { - return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", s.metadata.Region(), s.metadata.AccountID(), s.queueName) +func (s *SQSProvider) getQueueARN(ctx context.Context) string { + return fmt.Sprintf("arn:%s:sqs:%s:%s:%s", + s.metadataProvider.Partition(), + s.metadataProvider.Region(ctx), + s.metadataProvider.AccountID(ctx), + s.queueName, + ) } func getQueueName(ctx context.Context) string { diff --git a/pkg/utils/cache/cache.go b/pkg/utils/cache/cache.go new file mode 100644 index 000000000000..94a89b11aeab --- /dev/null +++ b/pkg/utils/cache/cache.go @@ -0,0 +1,69 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package cache + +import ( + "fmt" + "sync" +) + +type Option func(Options) Options + +type Options struct { + ignoreCache bool +} + +func IgnoreCacheOption(o Options) Options { + o.ignoreCache = true + return o +} + +// TryGetStringWithFallback attempts to get non-nil string value from field. If field is nil, the function +// will attempt to resolve the value by calling fallback, setting the value stored in field in-place if found. +func TryGetStringWithFallback(mu *sync.RWMutex, field *string, fallback func() (string, error), opts ...Option) (string, error) { + o := resolveOptions(opts) + mu.RLock() + if field != nil && !o.ignoreCache { + ret := *field + mu.RUnlock() + return ret, nil + } + mu.RUnlock() + mu.Lock() + defer mu.Unlock() + // We have to check if the field is set again here in case multiple threads make it past the read-locked section + if field != nil { + return *field, nil + } + ret, err := fallback() + if err != nil { + return "", err + } + if ret == "" { + return "", fmt.Errorf("return value didn't resolve to non-nil value") + } + *field = ret + return ret, nil +} + +func resolveOptions(opts []Option) Options { + o := Options{} + for _, opt := range opts { + if opt != nil { + o = opt(o) + } + } + return o +} From 58e9e41d71b4295e9afc85592930b1e3d4e8cf30 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 6 Oct 2022 12:20:40 -0700 Subject: [PATCH 31/55] Expose EventRecorder through interface --- pkg/cloudprovider/aws/events/recorder.go | 28 +++++++++++++----------- pkg/events/dedupe.go | 27 +++++++++++++---------- pkg/events/loadshedding.go | 25 ++++++++++++--------- pkg/events/recorder.go | 28 ++++++++++++++---------- 4 files changed, 62 insertions(+), 46 deletions(-) diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index 480db7965a2b..bab6c22eb2bd 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -30,11 +30,12 @@ import ( ) type recorder struct { - record.EventRecorder + rec record.EventRecorder + events.Recorder } type Recorder interface { - record.EventRecorder + events.Recorder // EC2SpotInterruptionWarning is called when EC2 sends a spot interruption 2-minute warning for the node from 
the SQS queue EC2SpotInterruptionWarning(*v1.Node) @@ -60,32 +61,33 @@ type Recorder interface { func NewRecorder(r events.Recorder) Recorder { return recorder{ - EventRecorder: r, + rec: r.EventRecorder(), + Recorder: r, } } func (r recorder) EC2SpotInterruptionWarning(node *v1.Node) { - r.Eventf(node, "Normal", "EC2SpotInterruptionWarning", "Node %s event: EC2 triggered a spot interruption warning for the node", node.Name) + r.rec.Eventf(node, "Normal", "EC2SpotInterruptionWarning", "Node %s event: EC2 triggered a spot interruption warning for the node", node.Name) } func (r recorder) EC2SpotRebalanceRecommendation(node *v1.Node) { - r.Eventf(node, "Normal", "EC2RebalanceRecommendation", "Node %s event: EC2 triggered a spot rebalance recommendation for the node", node.Name) + r.rec.Eventf(node, "Normal", "EC2RebalanceRecommendation", "Node %s event: EC2 triggered a spot rebalance recommendation for the node", node.Name) } func (r recorder) EC2HealthWarning(node *v1.Node) { - r.Eventf(node, "Normal", "EC2HealthWarning", "Node %s event: EC2 triggered a health warning for the node", node.Name) + r.rec.Eventf(node, "Normal", "EC2HealthWarning", "Node %s event: EC2 triggered a health warning for the node", node.Name) } func (r recorder) EC2StateTerminating(node *v1.Node) { - r.Eventf(node, "Normal", "EC2StateTerminating", `Node %s event: EC2 node is terminating"`, node.Name) + r.rec.Eventf(node, "Normal", "EC2StateTerminating", `Node %s event: EC2 node is terminating"`, node.Name) } func (r recorder) EC2StateStopping(node *v1.Node) { - r.Eventf(node, "Normal", "EC2StateStopping", `Node %s event: EC2 node is stopping"`, node.Name) + r.rec.Eventf(node, "Normal", "EC2StateStopping", `Node %s event: EC2 node is stopping"`, node.Name) } func (r recorder) TerminatingNodeOnNotification(node *v1.Node) { - r.Eventf(node, "Normal", "AWSNotificationTerminateNode", "Node %s event: Notification triggered termination for the node", node.Name) + r.rec.Eventf(node, "Normal", "AWSNotificationTerminateNode", "Node %s event: Notification triggered termination for the node", node.Name) } func (r recorder) InfrastructureHealthy(ctx context.Context, kubeClient client.Client) { @@ -97,7 +99,7 @@ func (r recorder) InfrastructureHealthy(ctx context.Context, kubeClient client.C logging.FromContext(ctx).Errorf("Sending InfrastructureHealthy event, %v", err) return } - r.Eventf(pod, "Normal", "AWSInfrastructureHealthy", "Karpenter infrastructure reconciliation is healthy") + r.rec.Eventf(pod, "Normal", "AWSInfrastructureHealthy", "Karpenter infrastructure reconciliation is healthy") } func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client.Client) { @@ -109,7 +111,7 @@ func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client logging.FromContext(ctx).Errorf("Sending InfrastructureUnhealthy event, %v", err) return } - r.Eventf(pod, "Warning", "AWSInfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") + r.rec.Eventf(pod, "Warning", "AWSInfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") } func (r recorder) InfrastructureDeletionSucceeded(ctx context.Context, kubeClient client.Client) { @@ -121,7 +123,7 @@ func (r recorder) InfrastructureDeletionSucceeded(ctx context.Context, kubeClien logging.FromContext(ctx).Errorf("Sending InfrastructureDeletionSucceeded event, %v", err) return } - r.Eventf(pod, "Normal", "AWSInfrastructureDeletionSucceeded", "Karpenter infrastructure deletion succeeded") + r.rec.Eventf(pod, 
"Normal", "AWSInfrastructureDeletionSucceeded", "Karpenter infrastructure deletion succeeded") } func (r recorder) InfrastructureDeletionFailed(ctx context.Context, kubeClient client.Client) { @@ -133,5 +135,5 @@ func (r recorder) InfrastructureDeletionFailed(ctx context.Context, kubeClient c logging.FromContext(ctx).Errorf("Sending InfrastructureDeletionFailed event, %v", err) return } - r.Eventf(pod, "Warning", "AWSInfrastructureDeletionFailed", "Karpenter infrastructure deletion failed") + r.rec.Eventf(pod, "Warning", "AWSInfrastructureDeletionFailed", "Karpenter infrastructure deletion failed") } diff --git a/pkg/events/dedupe.go b/pkg/events/dedupe.go index 9c2a60ddca1c..6fcec618f0b1 100644 --- a/pkg/events/dedupe.go +++ b/pkg/events/dedupe.go @@ -20,53 +20,58 @@ import ( "github.com/patrickmn/go-cache" v1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/record" ) func NewDedupeRecorder(r Recorder) Recorder { return &dedupe{ - Recorder: r, - cache: cache.New(120*time.Second, 10*time.Second), + rec: r, + cache: cache.New(120*time.Second, 10*time.Second), } } type dedupe struct { - Recorder + rec Recorder cache *cache.Cache } +func (d *dedupe) EventRecorder() record.EventRecorder { + return d.rec.EventRecorder() +} + func (d *dedupe) WaitingOnDeletionForConsolidation(node *v1.Node) { if !d.shouldCreateEvent(fmt.Sprintf("wait-node-consolidate-delete-%s", node.UID)) { return } - d.Recorder.WaitingOnDeletionForConsolidation(node) + d.rec.WaitingOnDeletionForConsolidation(node) } func (d *dedupe) WaitingOnReadinessForConsolidation(node *v1.Node) { if !d.shouldCreateEvent(fmt.Sprintf("wait-node-consolidate-ready-%s", node.UID)) { return } - d.Recorder.WaitingOnReadinessForConsolidation(node) + d.rec.WaitingOnReadinessForConsolidation(node) } func (d *dedupe) TerminatingNodeForConsolidation(node *v1.Node, reason string) { if !d.shouldCreateEvent(fmt.Sprintf("terminate-node-consolidate-%s-%s", node.UID, reason)) { return } - d.Recorder.TerminatingNodeForConsolidation(node, reason) + d.rec.TerminatingNodeForConsolidation(node, reason) } func (d *dedupe) LaunchingNodeForConsolidation(node *v1.Node, reason string) { if !d.shouldCreateEvent(fmt.Sprintf("launch-node-consolidate-%s-%s", node.UID, reason)) { return } - d.Recorder.LaunchingNodeForConsolidation(node, reason) + d.rec.LaunchingNodeForConsolidation(node, reason) } func (d *dedupe) NominatePod(pod *v1.Pod, node *v1.Node) { if !d.shouldCreateEvent(fmt.Sprintf("nominate-node-%s-%s", pod.UID, node.UID)) { return } - d.Recorder.NominatePod(pod, node) + d.rec.NominatePod(pod, node) } func (d *dedupe) EvictPod(pod *v1.Pod) { @@ -75,21 +80,21 @@ func (d *dedupe) EvictPod(pod *v1.Pod) { return } d.cache.SetDefault(key, nil) - d.Recorder.EvictPod(pod) + d.rec.EvictPod(pod) } func (d *dedupe) PodFailedToSchedule(pod *v1.Pod, err error) { if !d.shouldCreateEvent(fmt.Sprintf("failed-to-schedule-%s-%s", pod.UID, err)) { return } - d.Recorder.PodFailedToSchedule(pod, err) + d.rec.PodFailedToSchedule(pod, err) } func (d *dedupe) NodeFailedToDrain(node *v1.Node, err error) { if !d.shouldCreateEvent(fmt.Sprintf("failed-to-drain-%s", node.Name)) { return } - d.Recorder.NodeFailedToDrain(node, err) + d.rec.NodeFailedToDrain(node, err) } func (d *dedupe) shouldCreateEvent(key string) bool { diff --git a/pkg/events/loadshedding.go b/pkg/events/loadshedding.go index 04d08f13a60c..f56d6ee0edb2 100644 --- a/pkg/events/loadshedding.go +++ b/pkg/events/loadshedding.go @@ -16,21 +16,26 @@ package events import ( v1 "k8s.io/api/core/v1" + 
"k8s.io/client-go/tools/record" "k8s.io/client-go/util/flowcontrol" ) func NewLoadSheddingRecorder(r Recorder) Recorder { return &loadshedding{ - Recorder: r, + rec: r, nominationBucket: flowcontrol.NewTokenBucketRateLimiter(5, 10), } } type loadshedding struct { - Recorder + rec Recorder nominationBucket flowcontrol.RateLimiter } +func (l *loadshedding) EventRecorder() record.EventRecorder { + return l.rec.EventRecorder() +} + func (l *loadshedding) NominatePod(pod *v1.Pod, node *v1.Node) { // Pod nominations occur very often, especially in large scale-ups. They normally aren't particularly useful // during a scaleup, but are useful when at a steady state where we have a bug and think a pod will schedule @@ -39,33 +44,33 @@ func (l *loadshedding) NominatePod(pod *v1.Pod, node *v1.Node) { if !l.nominationBucket.TryAccept() { return } - l.Recorder.NominatePod(pod, node) + l.rec.NominatePod(pod, node) } func (l *loadshedding) EvictPod(pod *v1.Pod) { - l.Recorder.EvictPod(pod) + l.rec.EvictPod(pod) } func (l *loadshedding) PodFailedToSchedule(pod *v1.Pod, err error) { - l.Recorder.PodFailedToSchedule(pod, err) + l.rec.PodFailedToSchedule(pod, err) } func (l *loadshedding) NodeFailedToDrain(node *v1.Node, err error) { - l.Recorder.NodeFailedToDrain(node, err) + l.rec.NodeFailedToDrain(node, err) } func (l *loadshedding) TerminatingNodeForConsolidation(node *v1.Node, reason string) { - l.Recorder.TerminatingNodeForConsolidation(node, reason) + l.rec.TerminatingNodeForConsolidation(node, reason) } func (l *loadshedding) LaunchingNodeForConsolidation(node *v1.Node, reason string) { - l.Recorder.LaunchingNodeForConsolidation(node, reason) + l.rec.LaunchingNodeForConsolidation(node, reason) } func (l *loadshedding) WaitingOnReadinessForConsolidation(node *v1.Node) { - l.Recorder.WaitingOnReadinessForConsolidation(node) + l.rec.WaitingOnReadinessForConsolidation(node) } func (l *loadshedding) WaitingOnDeletionForConsolidation(node *v1.Node) { - l.Recorder.WaitingOnDeletionForConsolidation(node) + l.rec.WaitingOnDeletionForConsolidation(node) } diff --git a/pkg/events/recorder.go b/pkg/events/recorder.go index d329132e61c3..6fda1ba32e77 100644 --- a/pkg/events/recorder.go +++ b/pkg/events/recorder.go @@ -22,8 +22,8 @@ import ( // Recorder is used to record events that occur about pods so they can be viewed by looking at the pod's events so our // actions are more observable without requiring log inspection type Recorder interface { - record.EventRecorder - + // EventRecorder returns the internal event recorder used by this recorder + EventRecorder() record.EventRecorder // NominatePod is called when we have determined that a pod should schedule against an existing node and don't // currently need to provision new capacity for the pod. 
NominatePod(*v1.Pod, *v1.Node) @@ -47,40 +47,44 @@ type Recorder interface { } type recorder struct { - record.EventRecorder + rec record.EventRecorder } func NewRecorder(rec record.EventRecorder) Recorder { - return &recorder{EventRecorder: rec} + return &recorder{rec: rec} +} + +func (r recorder) EventRecorder() record.EventRecorder { + return r.rec } func (r recorder) WaitingOnDeletionForConsolidation(node *v1.Node) { - r.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on deletion to continue consolidation") + r.rec.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on deletion to continue consolidation") } func (r recorder) WaitingOnReadinessForConsolidation(node *v1.Node) { - r.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on readiness to continue consolidation") + r.rec.Eventf(node, "Normal", "ConsolidateWaiting", "Waiting on readiness to continue consolidation") } func (r recorder) TerminatingNodeForConsolidation(node *v1.Node, reason string) { - r.Eventf(node, "Normal", "ConsolidateTerminateNode", "Consolidating node via %s", reason) + r.rec.Eventf(node, "Normal", "ConsolidateTerminateNode", "Consolidating node via %s", reason) } func (r recorder) LaunchingNodeForConsolidation(node *v1.Node, reason string) { - r.Eventf(node, "Normal", "ConsolidateLaunchNode", "Launching node for %s", reason) + r.rec.Eventf(node, "Normal", "ConsolidateLaunchNode", "Launching node for %s", reason) } func (r recorder) NominatePod(pod *v1.Pod, node *v1.Node) { - r.Eventf(pod, "Normal", "Nominate", "Pod should schedule on %s", node.Name) + r.rec.Eventf(pod, "Normal", "Nominate", "Pod should schedule on %s", node.Name) } func (r recorder) EvictPod(pod *v1.Pod) { - r.Eventf(pod, "Normal", "Evict", "Evicted pod") + r.rec.Eventf(pod, "Normal", "Evict", "Evicted pod") } func (r recorder) PodFailedToSchedule(pod *v1.Pod, err error) { - r.Eventf(pod, "Warning", "FailedProvisioning", "Failed to provision new node, %s", err) + r.rec.Eventf(pod, "Warning", "FailedProvisioning", "Failed to provision new node, %s", err) } func (r recorder) NodeFailedToDrain(node *v1.Node, err error) { - r.Eventf(node, "Warning", "FailedDraining", "Failed to drain node, %s", err) + r.rec.Eventf(node, "Warning", "FailedDraining", "Failed to drain node, %s", err) } From 23fab2d7cd06aa27a15cda9ca8aef84e940df3f8 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 6 Oct 2022 16:24:15 -0700 Subject: [PATCH 32/55] Resolve compile errors after metadata updates --- go.mod | 2 +- pkg/apis/provisioning/v1alpha5/labels.go | 1 - .../aws/apis/v1alpha1/register.go | 2 + pkg/cloudprovider/aws/cloudprovider.go | 8 ++-- .../controllers/infrastructure/controller.go | 10 ++--- .../controllers/infrastructure/suite_test.go | 9 ++-- .../controllers/notification/controller.go | 8 ++-- .../controllers/notification/suite_test.go | 44 +++++++++---------- pkg/cloudprovider/aws/eventbridge.go | 30 ++++++------- pkg/cloudprovider/aws/events/recorder.go | 4 ++ pkg/cloudprovider/aws/fake/ec2metadataapi.go | 37 ++++++++++++++++ pkg/cloudprovider/aws/fake/eventrecorder.go | 7 ++- pkg/cloudprovider/aws/fake/stsapi.go | 21 +++++++++ pkg/cloudprovider/aws/metadata.go | 36 +++++++++++---- pkg/cloudprovider/aws/sqs.go | 4 +- pkg/cloudprovider/aws/suite_test.go | 4 +- pkg/controllers/consolidation/suite_test.go | 4 +- .../scheduling/scheduling_benchmark_test.go | 2 +- .../provisioning/scheduling/suite_test.go | 4 +- pkg/controllers/provisioning/suite_test.go | 4 +- pkg/controllers/termination/suite_test.go | 2 +- pkg/test/eventrecorder.go | 35 
++++++++------- pkg/test/expectations/expectations.go | 2 +- test/pkg/environment/environment.go | 26 +++++------ test/suites/notification/suite_test.go | 26 +++++------ 25 files changed, 207 insertions(+), 125 deletions(-) create mode 100644 pkg/cloudprovider/aws/fake/ec2metadataapi.go create mode 100644 pkg/cloudprovider/aws/fake/stsapi.go diff --git a/go.mod b/go.mod index b4399e2985c4..bd0ce9a44f1c 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,6 @@ require ( github.com/deckarep/golang-set v1.8.0 github.com/go-logr/logr v1.2.3 github.com/go-logr/zapr v1.2.3 - github.com/google/uuid v1.3.0 github.com/imdario/mergo v0.3.13 github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.2.0 @@ -58,6 +57,7 @@ require ( github.com/google/gnostic v0.5.7-v3refs // indirect github.com/google/go-cmp v0.5.8 // indirect github.com/google/gofuzz v1.2.0 // indirect + github.com/google/uuid v1.3.0 // indirect github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect diff --git a/pkg/apis/provisioning/v1alpha5/labels.go b/pkg/apis/provisioning/v1alpha5/labels.go index 712145e2f4d1..b3ddf0a51189 100644 --- a/pkg/apis/provisioning/v1alpha5/labels.go +++ b/pkg/apis/provisioning/v1alpha5/labels.go @@ -31,7 +31,6 @@ var ( KarpenterLabelDomain = "karpenter.sh" ProvisionerNameLabelKey = Group + "/provisioner-name" - DiscoveryLabelKey = Group + "/discovery" DoNotEvictPodAnnotationKey = Group + "/do-not-evict" DoNotConsolidateNodeAnnotationKey = KarpenterLabelDomain + "/do-not-consolidate" EmptinessTimestampAnnotationKey = Group + "/emptiness-timestamp" diff --git a/pkg/cloudprovider/aws/apis/v1alpha1/register.go b/pkg/cloudprovider/aws/apis/v1alpha1/register.go index d1229f6206e1..d4b8cb52c7b3 100644 --- a/pkg/cloudprovider/aws/apis/v1alpha1/register.go +++ b/pkg/cloudprovider/aws/apis/v1alpha1/register.go @@ -71,6 +71,8 @@ var ( LabelInstanceGPUManufacturer = LabelDomain + "/instance-gpu-manufacturer" LabelInstanceGPUCount = LabelDomain + "/instance-gpu-count" LabelInstanceGPUMemory = LabelDomain + "/instance-gpu-memory" + + DiscoveryTagKey = v1alpha5.Group + "/discovery" ) var ( diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 66fca8615389..8d9ae483ee03 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -32,6 +32,7 @@ import ( "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/ssm" + "github.com/aws/aws-sdk-go/service/sts" "github.com/patrickmn/go-cache" "github.com/samber/lo" v1 "k8s.io/api/core/v1" @@ -98,7 +99,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, ), ))) - metadataProvider := NewMetadataProvider(sess) + metadataProvider := NewMetadataProvider(NewEC2MetadataClient(sess), sts.New(sess)) if *sess.Config.Region == "" { logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") *sess.Config.Region = metadataProvider.Region(ctx) @@ -112,9 +113,8 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud subnetProvider := NewSubnetProvider(ec2api) instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) - m := NewMetadata(*sess.Config.Region, metadataProvider.AccountID(ctx)) - sqsProvider := NewSQSProvider(ctx, 
sqs.New(sess), m) - eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), m, sqsProvider.QueueName()) + sqsProvider := NewSQSProvider(ctx, sqs.New(sess), metadataProvider) + eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), metadataProvider, sqsProvider.QueueName()) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 44b596e4aa4b..ef018fbe3394 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -112,11 +112,9 @@ func newBackoff(clk clock.Clock) *backoff.ExponentialBackOff { } func (c *Controller) run(ctx context.Context) { - defer func() { - logging.FromContext(ctx).Infof("Shutting down") - }() + defer logging.FromContext(ctx).Infof("Shutting down") for { - if err := c.EnsureInfrastructure(ctx); err != nil { + if err := c.Reconcile(ctx); err != nil { logging.FromContext(ctx).Errorf("ensuring infrastructure established, %v", err) c.setReady(ctx, false) backoffPeriod := c.getBackoff(err) @@ -212,9 +210,9 @@ func (c *Controller) setReady(ctx context.Context, ready bool) { c.ready = ready } -// EnsureInfrastructure reconciles the SQS queue and the EventBridge rules with the expected +// Reconcile reconciles the SQS queue and the EventBridge rules with the expected // configuration prescribed by Karpenter -func (c *Controller) EnsureInfrastructure(ctx context.Context) (err error) { +func (c *Controller) Reconcile(ctx context.Context) (err error) { defer metrics.Measure(reconcileDuration)() wg := &sync.WaitGroup{} diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index 1ecedb00936f..57c1d234fd57 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -76,12 +76,11 @@ var _ = BeforeEach(func() { fakeClock = clock.NewFakeClock(time.Now()) recorder = awsfake.NewEventRecorder() - metadata := aws.NewMetadata("us-east-1", "000000000000") - + metadataProvider := aws.NewMetadataProvider(&awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) sqsapi = &awsfake.SQSAPI{} eventbridgeapi = &awsfake.EventBridgeAPI{} - sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi, metadata) - eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) + sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi, metadataProvider) + eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadataProvider, sqsProvider.QueueName()) cleanupChan = make(chan struct{}, 1) startChan = make(chan struct{}) @@ -102,7 +101,7 @@ var _ = AfterEach(func() { var _ = Describe("Reconciliation", func() { It("should reconcile the queue and the eventbridge rules on start", func() { sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(1)) // This mocks the queue not existing - Expect(controller.EnsureInfrastructure(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go 
b/pkg/cloudprovider/aws/controllers/notification/controller.go index 4b676cd18b1d..cedb573e6689 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -113,12 +113,10 @@ func newBackoff(clk clock.Clock) *backoff.ExponentialBackOff { } func (c *Controller) run(ctx context.Context) { - defer func() { - logging.FromContext(ctx).Infof("Shutting down") - }() + defer logging.FromContext(ctx).Infof("Shutting down") for { <-c.infraController.Ready() // block until the infrastructure is up and ready - err := c.PollSQS(ctx) + err := c.Reconcile(ctx) if err != nil { logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) select { @@ -137,7 +135,7 @@ func (c *Controller) run(ctx context.Context) { } } -func (c *Controller) PollSQS(ctx context.Context) error { +func (c *Controller) Reconcile(ctx context.Context) error { defer metrics.Measure(reconcileDuration)() sqsMessages, err := c.provider.GetSQSMessages(ctx) diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index 77f191ac0089..e837e9930c0c 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -35,7 +35,7 @@ import ( . "knative.dev/pkg/logging/testing" _ "knative.dev/pkg/system/testing" - "github.com/google/uuid" + "k8s.io/apimachinery/pkg/util/uuid" "sigs.k8s.io/controller-runtime/pkg/client" @@ -103,12 +103,12 @@ var _ = BeforeEach(func() { cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) nodeStateController = state.NewNodeController(env.Client, cluster) recorder = awsfake.NewEventRecorder() - metadata := aws.NewMetadata("us-east-1", "000000000000") + metadataProvider := aws.NewMetadataProvider(&awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) sqsapi = &awsfake.SQSAPI{} - sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadata) + sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadataProvider) eventbridgeapi = &awsfake.EventBridgeAPI{} - eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadata, sqsProvider.QueueName()) + eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadataProvider, sqsProvider.QueueName()) infraStartChan = make(chan struct{}) notificationStartChan = make(chan struct{}) @@ -141,7 +141,7 @@ var _ = Describe("Processing Messages", func() { ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) ExpectNotFound(env.Ctx, env.Client, node) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) @@ -158,7 +158,7 @@ var _ = Describe("Processing Messages", func() { ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) ExpectNotFound(env.Ctx, env.Client, node) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) @@ -182,7 +182,7 @@ var _ = Describe("Processing Messages", func() { // Wait for the nodes to reconcile with the cluster state ExpectReconcileSucceeded(env.Ctx, nodeStateController, lo.Map(nodes, func(n *v1.Node, _ int) client.ObjectKey { return client.ObjectKeyFromObject(n) })...) 
- Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(4)) }) @@ -211,19 +211,19 @@ var _ = Describe("Processing Messages", func() { // Wait for the nodes to reconcile with the cluster state ExpectReconcileSucceeded(env.Ctx, nodeStateController, lo.Map(nodes, func(n *v1.Node, _ int) client.ObjectKey { return client.ObjectKeyFromObject(n) })...) - Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(100)) }) It("should not delete a node when not owned by provisioner", func() { node := test.Node(test.NodeOptions{ - ProviderID: makeProviderID(uuid.NewString()), + ProviderID: makeProviderID(string(uuid.NewUUID())), }) ExpectMessagesCreated(spotInterruptionMessage(node.Spec.ProviderID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) ExpectNodeExists(env.Ctx, env.Client, node.Name) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) @@ -233,11 +233,11 @@ var _ = Describe("Processing Messages", func() { "field1": "value1", "field2": "value2", })))), - MessageId: awssdk.String(uuid.NewString()), + MessageId: awssdk.String(string(uuid.NewUUID())), } ExpectMessagesCreated(badMessage) - Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) It("should delete a state change message when the state isn't in accepted states", func() { @@ -253,7 +253,7 @@ var _ = Describe("Processing Messages", func() { ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) ExpectNodeExists(env.Ctx, env.Client, node.Name) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) @@ -273,7 +273,7 @@ var _ = Describe("Processing Messages", func() { ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - Expect(controller.PollSQS(env.Ctx)).To(Succeed()) + Expect(controller.Reconcile(env.Ctx)).To(Succeed()) ExpectNotFound(env.Ctx, env.Client, node) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) @@ -297,7 +297,7 @@ var _ = Describe("Error Handling", func() { It("should send an error on polling when AccessDenied", func() { ExpectClosed(infraStartChan) sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedCode), awsfake.MaxCalls(0)) - Expect(controller.PollSQS(env.Ctx)).ToNot(Succeed()) + Expect(controller.Reconcile(env.Ctx)).ToNot(Succeed()) }) It("should trigger an infrastructure reconciliation on an SQS queue when it doesn't exist", func() { sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing @@ -314,7 +314,7 @@ var _ = Describe("Error Handling", func() { 
sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist)) // This mocks the queue being deleted manually after infra reconciliation // This should fail with an error since the queue doesn't exist - Expect(controller.PollSQS(env.Ctx)).ToNot(Succeed()) + Expect(controller.Reconcile(env.Ctx)).ToNot(Succeed()) Eventually(func(g Gomega) { g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(2)) @@ -360,7 +360,7 @@ func spotInterruptionMessage(involvedInstanceID string) *sqs.Message { Version: "0", Account: defaultAccountID, DetailType: "EC2 Spot Instance Interruption Warning", - ID: uuid.NewString(), + ID: string(uuid.NewUUID()), Region: defaultRegion, Resources: []string{ fmt.Sprintf("arn:aws:ec2:%s:instance/%s", defaultRegion, involvedInstanceID), @@ -375,7 +375,7 @@ func spotInterruptionMessage(involvedInstanceID string) *sqs.Message { } return &sqs.Message{ Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), - MessageId: awssdk.String(uuid.NewString()), + MessageId: awssdk.String(string(uuid.NewUUID())), } } @@ -385,7 +385,7 @@ func stateChangeMessage(involvedInstanceID, state string) *sqs.Message { Version: "0", Account: defaultAccountID, DetailType: "EC2 Instance State-change Notification", - ID: uuid.NewString(), + ID: string(uuid.NewUUID()), Region: defaultRegion, Resources: []string{ fmt.Sprintf("arn:aws:ec2:%s:instance/%s", defaultRegion, involvedInstanceID), @@ -400,7 +400,7 @@ func stateChangeMessage(involvedInstanceID, state string) *sqs.Message { } return &sqs.Message{ Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), - MessageId: awssdk.String(uuid.NewString()), + MessageId: awssdk.String(string(uuid.NewUUID())), } } @@ -411,7 +411,7 @@ func scheduledChangeMessage(involvedInstanceID string) *sqs.Message { Version: "0", Account: defaultAccountID, DetailType: "AWS Health Event", - ID: uuid.NewString(), + ID: string(uuid.NewUUID()), Region: defaultRegion, Resources: []string{ fmt.Sprintf("arn:aws:ec2:%s:instance/%s", defaultRegion, involvedInstanceID), @@ -431,7 +431,7 @@ func scheduledChangeMessage(involvedInstanceID string) *sqs.Message { } return &sqs.Message{ Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), - MessageId: awssdk.String(uuid.NewString()), + MessageId: awssdk.String(string(uuid.NewUUID())), } } diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 37db6959882c..2f459f163fd1 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -26,14 +26,14 @@ import ( "github.com/samber/lo" "go.uber.org/multierr" - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/utils/injection" ) type EventBridgeProvider struct { - client eventbridgeiface.EventBridgeAPI - queueName string - metadata *Metadata + client eventbridgeiface.EventBridgeAPI + queueName string + metadataProvider *MetadataProvider } type EventRule struct { @@ -56,11 +56,11 @@ func (ep *EventPattern) Serialize() []byte { return lo.Must(json.Marshal(ep)) } -func NewEventBridgeProvider(eb eventbridgeiface.EventBridgeAPI, metadata *Metadata, queueName string) *EventBridgeProvider { +func NewEventBridgeProvider(eb eventbridgeiface.EventBridgeAPI, metadataProvider *MetadataProvider, queueName string) *EventBridgeProvider { return &EventBridgeProvider{ - client: eb, - metadata: metadata, - queueName: queueName, + client: eb, + metadataProvider: metadataProvider, + queueName: 
queueName, } } @@ -76,7 +76,7 @@ func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) ( EventPattern: aws.String(string(r.Pattern.Serialize())), Tags: []*eventbridge.Tag{ { - Key: aws.String(v1alpha5.DiscoveryLabelKey), + Key: aws.String(awsv1alpha1.DiscoveryTagKey), Value: aws.String(injection.GetOptions(ctx).ClusterName), }, }, @@ -146,7 +146,7 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) }, Target: &EventTarget{ ID: "1", - ARN: eb.getQueueARN(), + ARN: eb.getQueueARN(ctx), }, }, { @@ -157,7 +157,7 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) }, Target: &EventTarget{ ID: "1", - ARN: eb.getQueueARN(), + ARN: eb.getQueueARN(ctx), }, }, { @@ -168,7 +168,7 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) }, Target: &EventTarget{ ID: "1", - ARN: eb.getQueueARN(), + ARN: eb.getQueueARN(ctx), }, }, { @@ -179,12 +179,12 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) }, Target: &EventTarget{ ID: "1", - ARN: eb.getQueueARN(), + ARN: eb.getQueueARN(ctx), }, }, } } -func (eb *EventBridgeProvider) getQueueARN() string { - return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", eb.metadata.Region(), eb.metadata.AccountID(), eb.queueName) +func (eb *EventBridgeProvider) getQueueARN(ctx context.Context) string { + return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", eb.metadataProvider.Region(ctx), eb.metadataProvider.AccountID(ctx), eb.queueName) } diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index bab6c22eb2bd..cb2a182779f3 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -66,6 +66,10 @@ func NewRecorder(r events.Recorder) Recorder { } } +func (r recorder) EventRecorder() record.EventRecorder { + return r.rec +} + func (r recorder) EC2SpotInterruptionWarning(node *v1.Node) { r.rec.Eventf(node, "Normal", "EC2SpotInterruptionWarning", "Node %s event: EC2 triggered a spot interruption warning for the node", node.Name) } diff --git a/pkg/cloudprovider/aws/fake/ec2metadataapi.go b/pkg/cloudprovider/aws/fake/ec2metadataapi.go new file mode 100644 index 000000000000..59e63bac2c61 --- /dev/null +++ b/pkg/cloudprovider/aws/fake/ec2metadataapi.go @@ -0,0 +1,37 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package fake + +import ( + "context" + + "github.com/aws/aws-sdk-go/aws/ec2metadata" +) + +type EC2MetadataAPI struct{} + +func (e *EC2MetadataAPI) RegionWithContext(ctx context.Context) (string, error) { + return "us-west-2", nil +} + +func (e *EC2MetadataAPI) GetInstanceIdentityDocumentWithContext(context.Context) (ec2metadata.EC2InstanceIdentityDocument, error) { + return ec2metadata.EC2InstanceIdentityDocument{ + AccountID: "000000000000", + }, nil +} + +func (e *EC2MetadataAPI) PartitionID() string { + return "aws" +} diff --git a/pkg/cloudprovider/aws/fake/eventrecorder.go b/pkg/cloudprovider/aws/fake/eventrecorder.go index dad2321eb32f..a80e4a604720 100644 --- a/pkg/cloudprovider/aws/fake/eventrecorder.go +++ b/pkg/cloudprovider/aws/fake/eventrecorder.go @@ -18,6 +18,7 @@ import ( "context" v1 "k8s.io/api/core/v1" + "k8s.io/client-go/tools/record" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/test" @@ -25,9 +26,11 @@ import ( // EventRecorder is a mock event recorder that is used to facilitate testing. type EventRecorder struct { - test.EventRecorder + test.Recorder } +func (e *EventRecorder) EventRecorder() record.EventRecorder { return e.Recorder.EventRecorder() } + func (e *EventRecorder) EC2SpotInterruptionWarning(_ *v1.Node) {} func (e *EventRecorder) EC2SpotRebalanceRecommendation(_ *v1.Node) {} @@ -50,6 +53,6 @@ func (e *EventRecorder) InfrastructureDeletionFailed(_ context.Context, _ client func NewEventRecorder() *EventRecorder { return &EventRecorder{ - EventRecorder: *test.NewEventRecorder(), + Recorder: *test.NewRecorder(), } } diff --git a/pkg/cloudprovider/aws/fake/stsapi.go b/pkg/cloudprovider/aws/fake/stsapi.go new file mode 100644 index 000000000000..405a30ed70bb --- /dev/null +++ b/pkg/cloudprovider/aws/fake/stsapi.go @@ -0,0 +1,21 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package fake + +import "github.com/aws/aws-sdk-go/service/sts/stsiface" + +type STSAPI struct { + stsiface.STSAPI +} diff --git a/pkg/cloudprovider/aws/metadata.go b/pkg/cloudprovider/aws/metadata.go index 5a858fe510d4..8fc534aba144 100644 --- a/pkg/cloudprovider/aws/metadata.go +++ b/pkg/cloudprovider/aws/metadata.go @@ -28,9 +28,29 @@ import ( "github.com/aws/karpenter/pkg/utils/cache" ) +type EC2MetadataInterface interface { + RegionWithContext(context.Context) (string, error) + GetInstanceIdentityDocumentWithContext(context.Context) (ec2metadata.EC2InstanceIdentityDocument, error) + PartitionID() string +} + +type EC2MetadataClient struct { + *ec2metadata.EC2Metadata +} + +func NewEC2MetadataClient(sess *session.Session) *EC2MetadataClient { + return &EC2MetadataClient{ + EC2Metadata: ec2metadata.New(sess), + } +} + +func (e *EC2MetadataClient) PartitionID() string { + return e.EC2Metadata.PartitionID +} + type MetadataProvider struct { - imdsClient *ec2metadata.EC2Metadata - stsClient stsiface.STSAPI + ec2MetadataClient EC2MetadataInterface + stsClient stsiface.STSAPI region *string // cached region if already resolved regionMu sync.RWMutex @@ -39,10 +59,10 @@ type MetadataProvider struct { accountIDMu sync.RWMutex } -func NewMetadataProvider(sess *session.Session) *MetadataProvider { +func NewMetadataProvider(ec2metadataapi EC2MetadataInterface, stsapi stsiface.STSAPI) *MetadataProvider { return &MetadataProvider{ - imdsClient: ec2metadata.New(sess), - stsClient: sts.New(sess), + ec2MetadataClient: ec2metadataapi, + stsClient: stsapi, } } @@ -50,7 +70,7 @@ func NewMetadataProvider(sess *session.Session) *MetadataProvider { func (i *MetadataProvider) Region(ctx context.Context) string { ret, err := cache.TryGetStringWithFallback(&i.regionMu, i.region, func() (string, error) { - return i.imdsClient.RegionWithContext(ctx) + return i.ec2MetadataClient.RegionWithContext(ctx) }) if err != nil { panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) @@ -62,7 +82,7 @@ func (i *MetadataProvider) Region(ctx context.Context) string { func (i *MetadataProvider) AccountID(ctx context.Context) string { ret, err := cache.TryGetStringWithFallback(&i.accountIDMu, i.accountID, func() (string, error) { - doc, err := i.imdsClient.GetInstanceIdentityDocumentWithContext(ctx) + doc, err := i.ec2MetadataClient.GetInstanceIdentityDocumentWithContext(ctx) if err != nil { // Fallback to using the STS provider if IMDS fails result, err := i.stsClient.GetCallerIdentity(&sts.GetCallerIdentityInput{}) @@ -81,5 +101,5 @@ func (i *MetadataProvider) AccountID(ctx context.Context) string { } func (i *MetadataProvider) Partition() string { - return i.imdsClient.PartitionID + return i.ec2MetadataClient.PartitionID() } diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index d74f0bb95cce..0d72d4bbfc8f 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -25,7 +25,7 @@ import ( "github.com/aws/aws-sdk-go/service/sqs/sqsiface" "github.com/samber/lo" - "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/utils/cache" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" @@ -71,7 +71,7 @@ func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadataProvide Attributes: provider.getQueueAttributes(ctx), QueueName: aws.String(provider.queueName), Tags: map[string]*string{ - 
v1alpha5.DiscoveryLabelKey: aws.String(injection.GetOptions(ctx).ClusterName), + awsv1alpha1.DiscoveryTagKey: aws.String(injection.GetOptions(ctx).ClusterName), }, } provider.getQueueURLInput = &sqs.GetQueueUrlInput{ diff --git a/pkg/cloudprovider/aws/suite_test.go b/pkg/cloudprovider/aws/suite_test.go index 5208f99ca067..b116e090c3af 100644 --- a/pkg/cloudprovider/aws/suite_test.go +++ b/pkg/cloudprovider/aws/suite_test.go @@ -68,7 +68,7 @@ var controller *provisioning.Controller var cloudProvider *CloudProvider var clientSet *kubernetes.Clientset var cluster *state.Cluster -var recorder *test.EventRecorder +var recorder *test.Recorder var cfg *test.Config var fakeClock *clock.FakeClock var provisioner *v1alpha5.Provisioner @@ -143,7 +143,7 @@ var _ = BeforeSuite(func() { cfg = test.NewConfig() fakeClock = clock.NewFakeClock(time.Now()) cluster = state.NewCluster(fakeClock, cfg, e.Client, cloudProvider) - recorder = test.NewEventRecorder() + recorder = test.NewRecorder() prov := provisioning.NewProvisioner(ctx, cfg, e.Client, corev1.NewForConfigOrDie(e.Config), recorder, cloudProvider, cluster) controller = provisioning.NewController(e.Client, prov, recorder) }) diff --git a/pkg/controllers/consolidation/suite_test.go b/pkg/controllers/consolidation/suite_test.go index 454b2ef24005..e1674aca69d3 100644 --- a/pkg/controllers/consolidation/suite_test.go +++ b/pkg/controllers/consolidation/suite_test.go @@ -57,7 +57,7 @@ var provisioningController *provisioning.Controller var provisioner *provisioning.Provisioner var cloudProvider *fake.CloudProvider var clientSet *kubernetes.Clientset -var recorder *test.EventRecorder +var recorder *test.Recorder var nodeStateController *state.NodeController var fakeClock *clock.FakeClock var cfg *test.Config @@ -81,7 +81,7 @@ var _ = BeforeSuite(func() { cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) nodeStateController = state.NewNodeController(env.Client, cluster) clientSet = kubernetes.NewForConfigOrDie(e.Config) - recorder = test.NewEventRecorder() + recorder = test.NewRecorder() provisioner = provisioning.NewProvisioner(ctx, cfg, env.Client, clientSet.CoreV1(), recorder, cloudProvider, cluster) provisioningController = provisioning.NewController(env.Client, provisioner, recorder) }) diff --git a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go index d29c3f7782f6..7bf46460272e 100644 --- a/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go +++ b/pkg/controllers/provisioning/scheduling/scheduling_benchmark_test.go @@ -120,7 +120,7 @@ func benchmarkScheduler(b *testing.B, instanceCount, podCount int) { scheduler := pscheduling.NewScheduler(ctx, nil, []*scheduling.NodeTemplate{scheduling.NewNodeTemplate(provisioner)}, nil, state.NewCluster(&clock.RealClock{}, test.NewConfig(), nil, cloudProv), nil, &pscheduling.Topology{}, map[string][]cloudprovider.InstanceType{provisioner.Name: instanceTypes}, map[*scheduling.NodeTemplate]v1.ResourceList{}, - test.NewEventRecorder(), + test.NewRecorder(), pscheduling.SchedulerOptions{}) pods := makeDiversePods(podCount) diff --git a/pkg/controllers/provisioning/scheduling/suite_test.go b/pkg/controllers/provisioning/scheduling/suite_test.go index 7770bcb82138..cc30c75fc4e0 100644 --- a/pkg/controllers/provisioning/scheduling/suite_test.go +++ b/pkg/controllers/provisioning/scheduling/suite_test.go @@ -59,7 +59,7 @@ var cloudProv *fake.CloudProvider var cluster *state.Cluster var 
nodeStateController *state.NodeController var podStateController *state.PodController -var recorder *test.EventRecorder +var recorder *test.Recorder var cfg *test.Config func TestScheduling(t *testing.T) { @@ -79,7 +79,7 @@ var _ = BeforeSuite(func() { cluster = state.NewCluster(fakeClock, cfg, e.Client, cloudProv) nodeStateController = state.NewNodeController(e.Client, cluster) podStateController = state.NewPodController(e.Client, cluster) - recorder = test.NewEventRecorder() + recorder = test.NewRecorder() cfg = test.NewConfig() prov := provisioning.NewProvisioner(ctx, cfg, e.Client, corev1.NewForConfigOrDie(e.Config), recorder, cloudProv, cluster) controller = provisioning.NewController(e.Client, prov, recorder) diff --git a/pkg/controllers/provisioning/suite_test.go b/pkg/controllers/provisioning/suite_test.go index 34e233a3ad34..addcffac7251 100644 --- a/pkg/controllers/provisioning/suite_test.go +++ b/pkg/controllers/provisioning/suite_test.go @@ -52,7 +52,7 @@ var nodeController *state.NodeController var cloudProvider cloudprovider.CloudProvider var controller *provisioning.Controller var env *test.Environment -var recorder *test.EventRecorder +var recorder *test.Recorder var cfg *test.Config var instanceTypeMap map[string]cloudprovider.InstanceType @@ -66,7 +66,7 @@ var _ = BeforeSuite(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { cloudProvider = &fake.CloudProvider{} cfg = test.NewConfig() - recorder = test.NewEventRecorder() + recorder = test.NewRecorder() fakeClock = clock.NewFakeClock(time.Now()) cluster = state.NewCluster(fakeClock, cfg, e.Client, cloudProvider) nodeController = state.NewNodeController(e.Client, cluster) diff --git a/pkg/controllers/termination/suite_test.go b/pkg/controllers/termination/suite_test.go index 4c222ef82c37..94c096c28dcf 100644 --- a/pkg/controllers/termination/suite_test.go +++ b/pkg/controllers/termination/suite_test.go @@ -62,7 +62,7 @@ var _ = BeforeSuite(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { cloudProvider := &fake.CloudProvider{} coreV1Client := corev1.NewForConfigOrDie(e.Config) - recorder := test.NewEventRecorder() + recorder := test.NewRecorder() evictionQueue = termination.NewEvictionQueue(ctx, coreV1Client, recorder) controller = &termination.Controller{ KubeClient: e.Client, diff --git a/pkg/test/eventrecorder.go b/pkg/test/eventrecorder.go index 250b48cb886e..5ee8c0433cd5 100644 --- a/pkg/test/eventrecorder.go +++ b/pkg/test/eventrecorder.go @@ -29,46 +29,47 @@ type Binding struct { Node *v1.Node } -// EventRecorder is a mock event recorder that is used to facilitate testing. -type EventRecorder struct { - record.EventRecorder +// Recorder is a mock event recorder that is used to facilitate testing. 
+type Recorder struct { + rec record.EventRecorder mu sync.Mutex bindings []Binding } -var _ events.Recorder = (*EventRecorder)(nil) +var _ events.Recorder = (*Recorder)(nil) -func NewEventRecorder() *EventRecorder { - return &EventRecorder{} +func NewRecorder() *Recorder { + return &Recorder{} } -func (e *EventRecorder) WaitingOnReadinessForConsolidation(v *v1.Node) {} -func (e *EventRecorder) TerminatingNodeForConsolidation(node *v1.Node, reason string) {} -func (e *EventRecorder) LaunchingNodeForConsolidation(node *v1.Node, reason string) {} -func (e *EventRecorder) WaitingOnDeletionForConsolidation(node *v1.Node) {} +func (e *Recorder) EventRecorder() record.EventRecorder { return e.rec } +func (e *Recorder) WaitingOnReadinessForConsolidation(v *v1.Node) {} +func (e *Recorder) TerminatingNodeForConsolidation(node *v1.Node, reason string) {} +func (e *Recorder) LaunchingNodeForConsolidation(node *v1.Node, reason string) {} +func (e *Recorder) WaitingOnDeletionForConsolidation(node *v1.Node) {} -func (e *EventRecorder) NominatePod(pod *v1.Pod, node *v1.Node) { +func (e *Recorder) NominatePod(pod *v1.Pod, node *v1.Node) { e.mu.Lock() defer e.mu.Unlock() e.bindings = append(e.bindings, Binding{pod, node}) } -func (e *EventRecorder) EvictPod(pod *v1.Pod) {} +func (e *Recorder) EvictPod(pod *v1.Pod) {} -func (e *EventRecorder) PodFailedToSchedule(pod *v1.Pod, err error) {} +func (e *Recorder) PodFailedToSchedule(pod *v1.Pod, err error) {} -func (e *EventRecorder) NodeFailedToDrain(node *v1.Node, err error) {} +func (e *Recorder) NodeFailedToDrain(node *v1.Node, err error) {} -func (e *EventRecorder) Reset() { +func (e *Recorder) Reset() { e.ResetBindings() } -func (e *EventRecorder) ResetBindings() { +func (e *Recorder) ResetBindings() { e.mu.Lock() defer e.mu.Unlock() e.bindings = nil } -func (e *EventRecorder) ForEachBinding(f func(pod *v1.Pod, node *v1.Node)) { +func (e *Recorder) ForEachBinding(f func(pod *v1.Pod, node *v1.Node)) { e.mu.Lock() defer e.mu.Unlock() for _, b := range e.bindings { diff --git a/pkg/test/expectations/expectations.go b/pkg/test/expectations/expectations.go index cf633b5d9059..bfeb2aec01d5 100644 --- a/pkg/test/expectations/expectations.go +++ b/pkg/test/expectations/expectations.go @@ -170,7 +170,7 @@ func ExpectCleanedUp(ctx context.Context, c client.Client) { func ExpectProvisioned(ctx context.Context, c client.Client, controller *provisioning.Controller, pods ...*v1.Pod) (result []*v1.Pod) { ExpectProvisionedNoBindingWithOffset(1, ctx, c, controller, pods...) 
- recorder := controller.Recorder().(*test.EventRecorder) + recorder := controller.Recorder().(*test.Recorder) recorder.ForEachBinding(func(pod *v1.Pod, node *v1.Node) { ExpectManualBindingWithOffset(1, ctx, c, pod, node) }) diff --git a/test/pkg/environment/environment.go b/test/pkg/environment/environment.go index c774cc9f698a..4771df022736 100644 --- a/test/pkg/environment/environment.go +++ b/test/pkg/environment/environment.go @@ -51,11 +51,11 @@ import ( type AWSEnvironment struct { *Environment - Metadata *aws.Metadata - EC2API ec2.EC2 - SSMAPI ssm.SSM - STSAPI sts.STS - IAMAPI iam.IAM + MetadataProvider *aws.MetadataProvider + EC2API ec2.EC2 + SSMAPI ssm.SSM + STSAPI sts.STS + IAMAPI iam.IAM SQSProvider *aws.SQSProvider InterruptionAPI *itn.ITN @@ -75,16 +75,16 @@ func NewAWSEnvironment(env *Environment, err error) (*AWSEnvironment, error) { return nil, err } session := session.Must(session.NewSessionWithOptions(session.Options{SharedConfigState: session.SharedConfigEnable})) - metadata := aws.NewMetadata(*session.Config.Region, aws.NewMetadataProvider(session).AccountID(env.Context)) + metadataProvider := aws.NewMetadataProvider(aws.NewEC2MetadataClient(session), sts.New(session)) return &AWSEnvironment{ - Environment: env, - Metadata: metadata, - EC2API: *ec2.New(session), - SSMAPI: *ssm.New(session), - IAMAPI: *iam.New(session), - InterruptionAPI: itn.New(lo.Must(cfg.LoadDefaultConfig(env.Context))), - SQSProvider: aws.NewSQSProvider(env.Context, sqs.New(session), metadata), + Environment: env, + MetadataProvider: metadataProvider, + EC2API: *ec2.New(session), + SSMAPI: *ssm.New(session), + IAMAPI: *iam.New(session), + InterruptionAPI: itn.New(lo.Must(cfg.LoadDefaultConfig(env.Context))), + SQSProvider: aws.NewSQSProvider(env.Context, sqs.New(session), metadataProvider), }, nil } diff --git a/test/suites/notification/suite_test.go b/test/suites/notification/suite_test.go index e782337920f8..8d6854f63450 100644 --- a/test/suites/notification/suite_test.go +++ b/test/suites/notification/suite_test.go @@ -22,12 +22,12 @@ import ( "testing" "time" - "github.com/google/uuid" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/uuid" "knative.dev/pkg/ptr" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" @@ -95,7 +95,7 @@ var _ = Describe("Notification", Label("AWS"), func() { ctx, cancel := context.WithCancel(env.Context) defer cancel() // In case the test fails, we need this so that the goroutine monitoring the events is closed - node := env.Monitor.GetCreatedNodes()[0] + node := env.Monitor.CreatedNodes()[0] instanceID := parseProviderID(node.Spec.ProviderID) By("Interrupting the spot instance") @@ -118,7 +118,7 @@ var _ = Describe("Notification", Label("AWS"), func() { } }() - env.EventuallyExpectNotFound(&node) + env.EventuallyExpectNotFound(node) close(done) // Once the node is gone, we can close the event channel because the test has effectively succeeded env.EventuallyExpectHealthyPodCount(selector, 1) }) @@ -154,11 +154,11 @@ var _ = Describe("Notification", Label("AWS"), func() { env.EventuallyExpectHealthyPodCount(selector, numPods) env.ExpectCreatedNodeCount("==", 1) - node := env.Monitor.GetCreatedNodes()[0] + node := env.Monitor.CreatedNodes()[0] By("Stopping the EC2 instance without the EKS cluster's knowledge") - env.ExpectInstanceStopped(node.Name) // Make a call to the EC2 api to stop the instance - env.EventuallyExpectNotFoundAssertion(&node).WithTimeout(time.Minute) // shorten the timeout since we should react faster + env.ExpectInstanceStopped(node.Name) // Make a call to the EC2 api to stop the instance + env.EventuallyExpectNotFoundAssertion(node).WithTimeout(time.Minute) // shorten the timeout since we should react faster env.EventuallyExpectHealthyPodCount(selector, 1) }) It("should terminate the node at the API server when the EC2 instance is terminated", func() { @@ -193,11 +193,11 @@ var _ = Describe("Notification", Label("AWS"), func() { env.EventuallyExpectHealthyPodCount(selector, numPods) env.ExpectCreatedNodeCount("==", 1) - node := env.Monitor.GetCreatedNodes()[0] + node := env.Monitor.CreatedNodes()[0] By("Terminating the EC2 instance without the EKS cluster's knowledge") - env.ExpectInstanceTerminated(node.Name) // Make a call to the EC2 api to stop the instance - env.EventuallyExpectNotFoundAssertion(&node).WithTimeout(time.Minute) // shorten the timeout since we should react faster + env.ExpectInstanceTerminated(node.Name) // Make a call to the EC2 api to stop the instance + env.EventuallyExpectNotFoundAssertion(node).WithTimeout(time.Minute) // shorten the timeout since we should react faster env.EventuallyExpectHealthyPodCount(selector, 1) }) It("should terminate the node when receiving a scheduled change health event", func() { @@ -232,12 +232,12 @@ var _ = Describe("Notification", Label("AWS"), func() { env.EventuallyExpectHealthyPodCount(selector, numPods) env.ExpectCreatedNodeCount("==", 1) - node := env.Monitor.GetCreatedNodes()[0] + node := env.Monitor.CreatedNodes()[0] instanceID := parseProviderID(node.Spec.ProviderID) By("Creating a scheduled change health event in the SQS message queue") - env.ExpectMessagesCreated(scheduledChangeMessage(env.Metadata.Region(), env.Metadata.AccountID(), instanceID)) - env.EventuallyExpectNotFound(&node) + env.ExpectMessagesCreated(scheduledChangeMessage(env.MetadataProvider.Region(env.Context), env.MetadataProvider.AccountID(env.Context), instanceID)) + env.EventuallyExpectNotFound(node) env.EventuallyExpectHealthyPodCount(selector, 1) }) @@ -250,7 +250,7 @@ func 
scheduledChangeMessage(region, accountID, involvedInstanceID string) schedu Version: "0", Account: accountID, DetailType: "AWS Health Event", - ID: uuid.NewString(), + ID: string(uuid.NewUUID()), Region: region, Resources: []string{ fmt.Sprintf("arn:aws:ec2:%s:instance/%s", region, involvedInstanceID), From e250433624c8d7d28bb0fd6b2ff58ff19b618816 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 6 Oct 2022 16:32:55 -0700 Subject: [PATCH 33/55] Remove cleanup logic on shutdown --- cmd/controller/main.go | 4 +- .../controllers/infrastructure/controller.go | 55 +------------------ .../controllers/infrastructure/suite_test.go | 3 +- .../controllers/notification/suite_test.go | 2 +- pkg/cloudprovider/aws/controllers/register.go | 6 +- pkg/cloudprovider/types.go | 3 - pkg/controllers/controllers.go | 52 ++++-------------- pkg/test/expectations/expectations.go | 9 --- 8 files changed, 20 insertions(+), 114 deletions(-) diff --git a/cmd/controller/main.go b/cmd/controller/main.go index def04d68613c..24cf8f07c860 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -26,8 +26,8 @@ import ( func main() { controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, controllers.ControllerInitFunc) { provider := aws.NewCloudProvider(ctx, options) - return provider, func(c context.Context, opts *controllers.ControllerOptions) []<-chan struct{} { - return awscontrollers.Register(c, provider, opts) + return provider, func(c context.Context, opts *controllers.ControllerOptions) { + awscontrollers.Register(c, provider, opts) } }) } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index ef018fbe3394..2cad54872b12 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -21,23 +21,17 @@ import ( "sync" "time" - "github.com/avast/retry-go" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/sqs" "github.com/cenkalti/backoff/v4" "go.uber.org/multierr" - appsv1 "k8s.io/api/apps/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" - "k8s.io/apimachinery/pkg/types" "k8s.io/utils/clock" "knative.dev/pkg/logging" - "knative.dev/pkg/system" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/metrics" - "github.com/aws/karpenter/pkg/utils/injection" ) // Controller is the AWS infrastructure controller. 
It is not a standard controller-runtime controller in that it doesn't @@ -55,7 +49,6 @@ type Controller struct { readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state ready bool trigger chan struct{} - done chan struct{} } // pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned @@ -64,7 +57,7 @@ const pollingPeriod = time.Hour func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, - startAsync <-chan struct{}, cleanupAsync <-chan struct{}) *Controller { + startAsync <-chan struct{}) *Controller { c := &Controller{ kubeClient: kubeClient, @@ -76,28 +69,16 @@ func NewController(ctx context.Context, kubeClient client.Client, clk clock.Cloc backoff: newBackoff(clk), readinessChan: make(chan struct{}), trigger: make(chan struct{}, 1), - done: make(chan struct{}), } ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure")) logging.FromContext(ctx).Infof("Starting controller") - innerCtx, cancel := context.WithCancel(ctx) // Cancel so we don't re-provision the infra on cleanup - go func() { - select { - case <-cleanupAsync: - cancel() - c.cleanup(ctx) - case <-ctx.Done(): - } - close(c.done) - }() - go func() { select { case <-ctx.Done(): return case <-startAsync: - c.run(innerCtx) + c.run(ctx) } }() return c @@ -140,34 +121,6 @@ func (c *Controller) run(ctx context.Context) { } } -func (c *Controller) cleanup(ctx context.Context) { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("cleanup")) - - dep := &appsv1.Deployment{} - nn := types.NamespacedName{ - Name: injection.GetOptions(ctx).DeploymentName, - Namespace: system.Namespace(), - } - - notFound := false - if err := retry.Do(func() error { - err := c.kubeClient.Get(ctx, nn, dep) - if apierrors.IsNotFound(err) { - notFound = true - } - return client.IgnoreNotFound(err) - }); err != nil { - logging.FromContext(ctx).Errorf("Getting the deployment %s for cleanup, %v", nn, err) - } - - // Deployment is already deleted or currently deleting, so we should cleanup the infrastructure - if notFound || !dep.DeletionTimestamp.IsZero() { - if err := retry.Do(func() error { return c.DeleteInfrastructure(ctx) }); err != nil { - logging.FromContext(ctx).Errorf("Deprovisioning the infrastructure, %v", err) - } - } -} - // Ready returns a channel that serves as a gate for other controllers // to wait on the infrastructure to be in a good state. 
When the infrastructure is ready, // this channel is closed so other controllers can proceed with their operations @@ -181,10 +134,6 @@ func (c *Controller) Trigger() { c.trigger <- struct{}{} } -func (c *Controller) Done() <-chan struct{} { - return c.done -} - func (c *Controller) setReady(ctx context.Context, ready bool) { c.mutex.Lock() defer c.mutex.Unlock() diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index 57c1d234fd57..4859be48e803 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -85,7 +85,7 @@ var _ = BeforeEach(func() { cleanupChan = make(chan struct{}, 1) startChan = make(chan struct{}) - controller = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, startChan, cleanupChan) + controller = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, startChan) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") ExpectApplied(env.Ctx, env.Client, test.KarpenterDeployment()) @@ -203,7 +203,6 @@ var _ = Describe("Cleanup", func() { It("should cleanup the infrastructure when the cleanup channel is triggered", func() { ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) ExpectClosed(cleanupChan) - ExpectDone[struct{}](controller) Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index e837e9930c0c..082295af05fe 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -116,7 +116,7 @@ var _ = BeforeEach(func() { ec2api = &awsfake.EC2API{} subnetProvider := aws.NewSubnetProvider(ec2api) instanceTypeProvider = aws.NewInstanceTypeProvider(env.Ctx, mock.Session, cloudprovider.Options{}, ec2api, subnetProvider) - infraController = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan, env.Ctx.Done()) + infraController = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan) controller = notification.NewController(env.Ctx, env.Client, fakeClock, recorder, cluster, sqsProvider, instanceTypeProvider, infraController, notificationStartChan) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 75b4dab19eb4..e99f60cb660d 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -26,15 +26,13 @@ import ( "github.com/aws/karpenter/pkg/controllers" ) -func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) (done []<-chan struct{}) { +func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) { rec := events.NewRecorder(opts.Recorder) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) // Only enable spot interruption handling controllers when the feature flag is enabled if 
opts.Config.EnableInterruptionHandling() { - infraController := infrastructure.NewController(ctx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync, opts.CleanupAsync) + infraController := infrastructure.NewController(ctx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController, opts.StartAsync) - done = append(done, infraController.Done()) } - return done } diff --git a/pkg/cloudprovider/types.go b/pkg/cloudprovider/types.go index 16f4bfff3f58..0dea45be150f 100644 --- a/pkg/cloudprovider/types.go +++ b/pkg/cloudprovider/types.go @@ -36,9 +36,6 @@ type Options struct { // StartAsync is a channel that is closed when leader election has been won. This is a signal to start any async // processing that should only occur while the cloud provider is the leader. StartAsync <-chan struct{} - // CleanupAsync is a channel that is closed when pod termination is triggered. This is a signal to start any async - // processing that should occur on cleanup - CleanupAsync <-chan struct{} } // CloudProvider interface is implemented by cloud providers to support provisioning. diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index d7a0dd9020cc..6354e17d123f 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -77,7 +77,7 @@ func init() { metrics.MustRegister() // Registers cross-controller metrics } -type ControllerInitFunc func(context.Context, *ControllerOptions) []<-chan struct{} +type ControllerInitFunc func(context.Context, *ControllerOptions) // Controller is an interface implemented by Karpenter custom resources. 
type Controller interface { @@ -96,8 +96,7 @@ type ControllerOptions struct { Recorder events.Recorder Clock clock.Clock - StartAsync <-chan struct{} - CleanupAsync <-chan struct{} + StartAsync <-chan struct{} } func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, ControllerInitFunc)) { @@ -112,7 +111,6 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) cmw := informer.NewInformedWatcher(clientSet, system.Namespace()) ctx := injection.LoggingContextOrDie(component, controllerRuntimeConfig, cmw) ctx = newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx))() - ctx, cancel := context.WithCancel(ctx) logging.FromContext(ctx).Infof("Initializing with version %s", project.Version) @@ -132,16 +130,14 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) HealthProbeBindAddress: fmt.Sprintf(":%d", opts.HealthProbePort), BaseContext: newRunnableContext(controllerRuntimeConfig, opts, logging.FromContext(ctx)), }) - cleanupAsync := make(chan struct{}) // This is a channel to broadcast to controllers cleanup can start if opts.EnableProfiling { utilruntime.Must(registerPprof(manager)) } cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ - ClientSet: clientSet, - KubeClient: manager.GetClient(), - StartAsync: manager.Elected(), - CleanupAsync: cleanupAsync, + ClientSet: clientSet, + KubeClient: manager.GetClient(), + StartAsync: manager.Elected(), }) if hp, ok := cloudProvider.(HealthCheck); ok { utilruntime.Must(manager.AddHealthzCheck("cloud-provider", hp.LivenessProbe)) @@ -170,19 +166,17 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) // Inject cloudprovider-specific controllers into the controller-set using the injectControllers function // Inject the base cloud provider into the injection function rather than the decorated interface controllerOptions := &ControllerOptions{ - Config: cfg, - Cluster: cluster, - KubeClient: manager.GetClient(), - Recorder: recorder, - StartAsync: manager.Elected(), - CleanupAsync: cleanupAsync, - Clock: realClock, + Config: cfg, + Cluster: cluster, + KubeClient: manager.GetClient(), + Recorder: recorder, + StartAsync: manager.Elected(), + Clock: realClock, } - cleanupDone := injectControllers(ctx, controllerOptions) + injectControllers(ctx, controllerOptions) metricsstate.StartMetricScraper(ctx, cluster) - StartCleanupWatcher(ctx, cancel, manager.Elected(), cleanupAsync, cleanupDone...) if err := RegisterControllers(ctx, manager, provisioning.NewController(manager.GetClient(), provisioner, recorder), @@ -237,28 +231,6 @@ func RegisterControllers(ctx context.Context, m manager.Manager, controllers ... 
return m } -// StartCleanupWatcher monitors the signal channel for termination, closes the cleanupAsync channel when -// there is a signal for pod termination, waits for the cleanup operation to complete, and then cancels all contexts -// Only the leader will perform the cleanup operation -func StartCleanupWatcher(ctx context.Context, cancel context.CancelFunc, elected <-chan struct{}, - cleanupAsync chan<- struct{}, cleanupDone ...<-chan struct{}) { - go func() { - sigs := make(chan os.Signal, 1) - signal.Notify(sigs, syscall.SIGINT, syscall.SIGTERM) - select { - case <-elected: - <-sigs - logging.FromContext(ctx).Infof("Initiating cleanup processes") - close(cleanupAsync) - for _, c := range cleanupDone { - <-c - } - case <-sigs: - } - cancel() - }() -} - func registerPprof(manager manager.Manager) error { for path, handler := range map[string]http.Handler{ "/debug/pprof/": http.HandlerFunc(pprof.Index), diff --git a/pkg/test/expectations/expectations.go b/pkg/test/expectations/expectations.go index bfeb2aec01d5..749f7076498f 100644 --- a/pkg/test/expectations/expectations.go +++ b/pkg/test/expectations/expectations.go @@ -286,15 +286,6 @@ func ExpectSkew(ctx context.Context, c client.Client, namespace string, constrai return ExpectWithOffset(1, skew) } -type Completable[T any] interface { - Done() <-chan T -} - -// ExpectDone waits on a done channel until the Completable is done -func ExpectDone[T any](c Completable[T]) { - <-c.Done() -} - // ExpectClosed closes a channel if it isn't already closed func ExpectClosed[T any](ch chan T) { if !IsClosed(ch) { From 4439eab909708f3d3c060cd05910f7f8faefbd68 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 6 Oct 2022 16:36:10 -0700 Subject: [PATCH 34/55] Migrate events out of versioned folders --- .../aws/controllers/notification/controller.go | 2 +- .../event/aggregatedparser/aggregatedparser.go | 16 ++++++++-------- .../rebalancerecommendation/{v0 => }/handler.go | 2 +- .../rebalancerecommendation/{v0 => }/parser.go | 2 +- .../{v0 => }/unmarshal.go | 2 +- .../event/scheduledchange/{v0 => }/handler.go | 2 +- .../event/scheduledchange/{v0 => }/parser.go | 2 +- .../event/scheduledchange/{v0 => }/unmarshal.go | 2 +- .../event/spotinterruption/{v0 => }/handler.go | 2 +- .../event/spotinterruption/{v0 => }/parser.go | 2 +- .../event/spotinterruption/{v0 => }/unmarshal.go | 2 +- .../event/statechange/{v0 => }/handler.go | 2 +- .../event/statechange/{v0 => }/parser.go | 2 +- .../event/statechange/{v0 => }/unmarshal.go | 2 +- .../aws/controllers/notification/suite_test.go | 6 +++--- test/suites/notification/suite_test.go | 2 +- 16 files changed, 25 insertions(+), 25 deletions(-) rename pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/{v0 => }/handler.go (97%) rename pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/{v0 => }/parser.go (97%) rename pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/{v0 => }/unmarshal.go (97%) rename pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/{v0 => }/handler.go (98%) rename pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/{v0 => }/parser.go (98%) rename pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/{v0 => }/unmarshal.go (99%) rename pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/{v0 => }/handler.go (98%) rename pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/{v0 => }/parser.go (97%) rename 
pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/{v0 => }/unmarshal.go (98%) rename pkg/cloudprovider/aws/controllers/notification/event/statechange/{v0 => }/handler.go (98%) rename pkg/cloudprovider/aws/controllers/notification/event/statechange/{v0 => }/parser.go (98%) rename pkg/cloudprovider/aws/controllers/notification/event/statechange/{v0 => }/unmarshal.go (98%) diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index cedb573e6689..6e5d1cecbd42 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -37,7 +37,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" - statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" + statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/metrics" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go index 570315605391..75dab8cc1333 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go @@ -21,18 +21,18 @@ import ( "knative.dev/pkg/logging" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" - rebalancerecommendationv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0" - scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" - spotinterruptionv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0" - statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" ) var ( DefaultParsers = []event.Parser{ - statechangev0.Parser{}, - spotinterruptionv0.Parser{}, - scheduledchangev0.Parser{}, - rebalancerecommendationv0.Parser{}, + statechange.Parser{}, + spotinterruption.Parser{}, + scheduledchange.Parser{}, + rebalancerecommendation.Parser{}, } ) diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go similarity index 97% rename from pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/handler.go rename to pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go index bc156bd8b722..bff92e8bdb42 100644 --- 
a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package rebalancerecommendation import ( "time" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go similarity index 97% rename from pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go rename to pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go index f1004786bfb1..baea33276db2 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package rebalancerecommendation import ( "context" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go similarity index 97% rename from pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go index dee22372f1f6..f5a5fca146ce 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/v0/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package rebalancerecommendation import ( "go.uber.org/zap" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go similarity index 98% rename from pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/handler.go rename to pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go index cfe006dbda22..54201de6ef13 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package scheduledchange import ( "time" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go similarity index 98% rename from pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go rename to pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go index dc1223c82e17..0cbb32573e88 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package v0 +package scheduledchange import ( "context" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go similarity index 99% rename from pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go index 805e40fde657..83c9eb5e8327 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package scheduledchange import ( "go.uber.org/multierr" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go similarity index 98% rename from pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/handler.go rename to pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go index 23ffe6edcb55..59af20f86d41 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package spotinterruption import ( "time" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go similarity index 97% rename from pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go rename to pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go index ade58b6d708f..40d5f93c4387 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package spotinterruption import ( "context" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go similarity index 98% rename from pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go index 2fefe931e11b..698de656d12a 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package v0 +package spotinterruption import ( "go.uber.org/zap" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go similarity index 98% rename from pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go rename to pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go index 03c6f6a01abb..0faf6604d554 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package statechange import ( "time" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go similarity index 98% rename from pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go rename to pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go index b248fe0a1720..9e36251b98e5 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package v0 +package statechange import ( "context" diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go similarity index 98% rename from pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go index bfeb1593d9de..dea8791d0063 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package v0 +package statechange import ( "go.uber.org/zap" diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index 082295af05fe..c8ac09ba9839 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -46,9 +46,9 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" - scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" - spotinterruptionv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/v0" - statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange/v0" + scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" + spotinterruptionv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption" + statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" "github.com/aws/karpenter/pkg/cloudprovider/fake" "github.com/aws/karpenter/pkg/controllers/state" diff --git a/test/suites/notification/suite_test.go b/test/suites/notification/suite_test.go index 8d6854f63450..e74635208eb5 100644 --- a/test/suites/notification/suite_test.go +++ b/test/suites/notification/suite_test.go @@ -34,7 +34,7 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" - scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/v0" + scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" "github.com/aws/karpenter/pkg/test" "github.com/aws/karpenter/test/pkg/environment" ) From 632aea7a707af9602c37401b9ce4ca841bd19ac0 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 6 Oct 2022 16:51:25 -0700 Subject: [PATCH 35/55] Addressing other PR comments --- .../controllers/infrastructure/controller.go | 4 +-- .../controllers/notification/controller.go | 27 +++++-------------- pkg/cloudprovider/aws/errors.go | 20 +++++++------- pkg/cloudprovider/aws/instance.go | 14 +++------- pkg/cloudprovider/aws/sqs.go | 4 +-- pkg/cloudprovider/aws/utils/utils.go | 25 +++++++++++++++++ 6 files changed, 48 insertions(+), 46 deletions(-) create mode 100644 pkg/cloudprovider/aws/utils/utils.go diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 2cad54872b12..3fb22ad05ee2 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -44,7 +44,7 @@ type Controller struct { sqsProvider *aws.SQSProvider eventBridgeProvider *aws.EventBridgeProvider - mutex *sync.RWMutex + mutex sync.RWMutex backoff *backoff.ExponentialBackOff readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state ready bool @@ -65,7 +65,7 @@ func NewController(ctx 
context.Context, kubeClient client.Client, clk clock.Cloc clock: clk, sqsProvider: sqsProvider, eventBridgeProvider: eventBridgeProvider, - mutex: &sync.RWMutex{}, + mutex: sync.RWMutex{}, backoff: newBackoff(clk), readinessChan: make(chan struct{}), trigger: make(chan struct{}, 1), diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 6e5d1cecbd42..2ac6b3527412 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -17,7 +17,6 @@ package notification import ( "context" "fmt" - "regexp" "strings" "time" @@ -27,6 +26,7 @@ import ( v1 "k8s.io/api/core/v1" "k8s.io/utils/clock" "knative.dev/pkg/logging" + "knative.dev/pkg/ptr" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/cenkalti/backoff/v4" @@ -39,6 +39,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" + "github.com/aws/karpenter/pkg/cloudprovider/aws/utils" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/metrics" ) @@ -288,7 +289,7 @@ func getInvolvedNodes(instanceIDs []string, instanceIDMap map[string]*v1.Node) [ return nodes } -// buildInstanceIDMap builds a map between the instance name that is stored in the +// makeInstanceIDMap builds a map between the instance id that is stored in the // node .spec.providerID and the node name stored on the host func (c *Controller) makeInstanceIDMap() map[string]*v1.Node { m := map[string]*v1.Node{} @@ -297,28 +298,12 @@ func (c *Controller) makeInstanceIDMap() map[string]*v1.Node { if _, ok := n.Node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok { return true } - id := parseProviderID(n.Node.Spec.ProviderID) - if id == "" { + id, err := utils.ParseProviderID(n.Node) + if err != nil || id == nil { return true } - m[id] = n.Node + m[ptr.StringValue(id)] = n.Node return true }) return m } - -// parseProviderID parses the provider ID stored on the node to get the instance ID -// associated with a node -func parseProviderID(pid string) string { - r := regexp.MustCompile(`aws:///(?P<AZ>.*)/(?P<InstanceID>.*)`) - matches := r.FindStringSubmatch(pid) - if matches == nil { - return "" - } - for i, name := range r.SubexpNames() { - if name == "InstanceID" { - return matches[i] - } - } - return "" -} diff --git a/pkg/cloudprovider/aws/errors.go b/pkg/cloudprovider/aws/errors.go index aceea11abc16..70289af4d92c 100644 --- a/pkg/cloudprovider/aws/errors.go +++ b/pkg/cloudprovider/aws/errors.go @@ -21,7 +21,7 @@ import ( "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/aws/aws-sdk-go/service/sqs" - "github.com/samber/lo" + "k8s.io/apimachinery/pkg/util/sets" ) const ( @@ -32,24 +32,24 @@ const ( var ( // This is not an exhaustive list, add to it as needed - notFoundErrorCodes = []string{ + notFoundErrorCodes = sets.NewString( "InvalidInstanceID.NotFound", launchTemplateNotFoundCode, sqs.ErrCodeQueueDoesNotExist, (&eventbridge.ResourceNotFoundException{}).Code(), - } + ) // unfulfillableCapacityErrorCodes signify that capacity is temporarily unable to be launched - unfulfillableCapacityErrorCodes = []string{ + unfulfillableCapacityErrorCodes = sets.NewString( "InsufficientInstanceCapacity", "MaxSpotInstanceCountExceeded", "VcpuLimitExceeded",
"UnfulfillableCapacity", "Unsupported", - } - accessDeniedErrorCodes = []string{ + ) + accessDeniedErrorCodes = sets.NewString( AccessDeniedCode, AccessDeniedExceptionCode, - } + ) ) type InstanceTerminatedError struct { @@ -73,7 +73,7 @@ func IsNotFound(err error) bool { } var awsError awserr.Error if errors.As(err, &awsError) { - return lo.Contains(notFoundErrorCodes, awsError.Code()) + return notFoundErrorCodes.Has(awsError.Code()) } return false } @@ -87,7 +87,7 @@ func IsAccessDenied(err error) bool { } var awsError awserr.Error if errors.As(err, &awsError) { - return lo.Contains(accessDeniedErrorCodes, awsError.Code()) + return accessDeniedErrorCodes.Has(awsError.Code()) } return false } @@ -96,7 +96,7 @@ func IsAccessDenied(err error) bool { // capacity is temporarily unavailable for launching. // This could be due to account limits, insufficient ec2 capacity, etc. func isUnfulfillableCapacity(err *ec2.CreateFleetError) bool { - return lo.Contains(unfulfillableCapacityErrorCodes, *err.ErrorCode) + return unfulfillableCapacityErrorCodes.Has(*err.ErrorCode) } func isLaunchTemplateNotFound(err error) bool { diff --git a/pkg/cloudprovider/aws/instance.go b/pkg/cloudprovider/aws/instance.go index f5ab945f5590..3a88a1dd54be 100644 --- a/pkg/cloudprovider/aws/instance.go +++ b/pkg/cloudprovider/aws/instance.go @@ -22,13 +22,12 @@ import ( "strings" "time" - "github.com/samber/lo" - "github.com/avast/retry-go" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ec2/ec2iface" + "github.com/samber/lo" "go.uber.org/multierr" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -38,6 +37,7 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/cloudprovider/aws/utils" "github.com/aws/karpenter/pkg/scheduling" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" @@ -109,7 +109,7 @@ func (p *InstanceProvider) Create(ctx context.Context, provider *v1alpha1.AWS, n } func (p *InstanceProvider) Terminate(ctx context.Context, node *v1.Node) error { - id, err := getInstanceID(node) + id, err := utils.ParseProviderID(node) if err != nil { return fmt.Errorf("getting instance ID for node %s, %w", node.Name, err) } @@ -411,14 +411,6 @@ func (p *InstanceProvider) prioritizeInstanceTypes(instanceTypes []cloudprovider return instanceTypes } -func getInstanceID(node *v1.Node) (*string, error) { - id := strings.Split(node.Spec.ProviderID, "/") - if len(id) < 5 { - return nil, fmt.Errorf("parsing instance id %s", node.Spec.ProviderID) - } - return aws.String(id[4]), nil -} - func combineFleetErrors(errors []*ec2.CreateFleetError) (errs error) { unique := sets.NewString() for _, err := range errors { diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index 0d72d4bbfc8f..c4a8b2c4dd37 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -79,8 +79,8 @@ func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadataProvide } provider.receiveMessageInput = &sqs.ReceiveMessageInput{ MaxNumberOfMessages: aws.Int64(10), - VisibilityTimeout: aws.Int64(10), // Seconds - WaitTimeSeconds: aws.Int64(10), // Seconds, maximum for long polling + VisibilityTimeout: aws.Int64(20), // Seconds + WaitTimeSeconds: aws.Int64(20), // Seconds, maximum for long polling 
AttributeNames: []*string{ aws.String(sqs.MessageSystemAttributeNameSentTimestamp), }, diff --git a/pkg/cloudprovider/aws/utils/utils.go b/pkg/cloudprovider/aws/utils/utils.go new file mode 100644 index 000000000000..88d9bff849a4 --- /dev/null +++ b/pkg/cloudprovider/aws/utils/utils.go @@ -0,0 +1,25 @@ +package utils + +import ( + "fmt" + "regexp" + + v1 "k8s.io/api/core/v1" + "knative.dev/pkg/ptr" +) + +// ParseProviderID parses the provider ID stored on the node to get the instance ID +// associated with a node +func ParseProviderID(node *v1.Node) (*string, error) { + r := regexp.MustCompile(`aws:///(?P<AZ>.*)/(?P<InstanceID>.*)`) + matches := r.FindStringSubmatch(node.Spec.ProviderID) + if matches == nil { + return nil, fmt.Errorf("parsing instance id %s", node.Spec.ProviderID) + } + for i, name := range r.SubexpNames() { + if name == "InstanceID" { + return ptr.String(matches[i]), nil + } + } + return nil, fmt.Errorf("parsing instance id %s", node.Spec.ProviderID) +} From 2be448e9abc4ce60d40ae9a5a858cdbf6bcc48b3 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 6 Oct 2022 17:07:48 -0700 Subject: [PATCH 36/55] Use workqueue.parallelizeuntil instead of rolling own parallelism --- .../controllers/infrastructure/controller.go | 76 +++++++------------ .../controllers/notification/controller.go | 33 ++++---- .../controllers/notification/event/noop.go | 2 +- .../event/rebalancerecommendation/handler.go | 2 +- .../event/scheduledchange/handler.go | 2 +- .../event/spotinterruption/handler.go | 2 +- .../notification/event/statechange/handler.go | 2 +- .../notification/event/statechange/parser.go | 13 ++-- .../controllers/notification/event/types.go | 42 ++++++---- pkg/cloudprovider/aws/eventbridge.go | 64 ++++++++-------- pkg/cloudprovider/aws/utils/utils.go | 14 ++++ 11 files changed, 127 insertions(+), 125 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 3fb22ad05ee2..f6dfa625c998 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -25,6 +25,7 @@ import ( "github.com/aws/aws-sdk-go/service/sqs" "github.com/cenkalti/backoff/v4" "go.uber.org/multierr" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" @@ -164,65 +165,40 @@ func (c *Controller) setReady(ctx context.Context, ready bool) { func (c *Controller) Reconcile(ctx context.Context) (err error) { defer metrics.Measure(reconcileDuration)() - wg := &sync.WaitGroup{} - m := &sync.Mutex{} - - wg.Add(2) - go func() { - defer wg.Done() - e := c.ensureQueue(ctx) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - }() - go func() { - defer wg.Done() - e := c.ensureEventBridge(ctx) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - }() - wg.Wait() - return err + funcs := []func() error{ + func() error { return c.ensureQueue(ctx) }, + func() error { return c.ensureEventBridge(ctx) }, + } + errs := make([]error, len(funcs)) + workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { + errs[i] = funcs[i]() + }) + return multierr.Combine(errs...)
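As a usage illustration for the ParseProviderID helper added in pkg/cloudprovider/aws/utils/utils.go above, a small unit test along these lines would pin down the expected behaviour; it is not part of the patch, and the test name and provider ID value are made up:

    // Sketch only: exercises utils.ParseProviderID with an illustrative provider ID.
    package utils_test

    import (
        "testing"

        v1 "k8s.io/api/core/v1"

        "github.com/aws/karpenter/pkg/cloudprovider/aws/utils"
    )

    func TestParseProviderIDExample(t *testing.T) {
        node := &v1.Node{Spec: v1.NodeSpec{ProviderID: "aws:///us-west-2a/i-0123456789abcdef0"}}
        id, err := utils.ParseProviderID(node)
        if err != nil || id == nil || *id != "i-0123456789abcdef0" {
            t.Fatalf("expected instance ID i-0123456789abcdef0, got %v, err %v", id, err)
        }
    }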
} // DeleteInfrastructure removes the infrastructure that was stood up and reconciled // by the infrastructure controller for SQS message polling -func (c *Controller) DeleteInfrastructure(ctx context.Context) (err error) { +func (c *Controller) DeleteInfrastructure(ctx context.Context) error { logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") - wg := &sync.WaitGroup{} - m := &sync.Mutex{} - wg.Add(2) - go func() { - defer wg.Done() + deleteQueueFunc := func() error { logging.FromContext(ctx).Debugf("Deleting the SQS notification queue...") - e := c.sqsProvider.DeleteQueue(ctx) - - // If we get access denied, nothing we can do so just log and don't return the error - if aws.IsAccessDenied(e) { - logging.FromContext(ctx).Errorf("Access denied while trying to delete SQS queue, %v", err) - } else if err != nil { - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - } - }() - go func() { - defer wg.Done() + return c.sqsProvider.DeleteQueue(ctx) + } + deleteEventBridgeRulesFunc := func() error { logging.FromContext(ctx).Debugf("Deleting the EventBridge notification rules...") - e := c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) + return c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) + } + funcs := []func() error{ + deleteQueueFunc, + deleteEventBridgeRulesFunc, + } + errs := make([]error, len(funcs)) + workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { + errs[i] = funcs[i]() + }) - // If we get access denied, nothing we can do so just log and don't return the error - if aws.IsAccessDenied(e) { - logging.FromContext(ctx).Errorf("Access denied while trying to delete notification rules, %v", err) - } else if err != nil { - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - } - }() - wg.Wait() + err := multierr.Combine(errs...) if err != nil { c.recorder.InfrastructureDeletionFailed(ctx, c.kubeClient) return err diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 2ac6b3527412..ae6a1a5150be 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -24,6 +24,7 @@ import ( "github.com/samber/lo" "go.uber.org/multierr" v1 "k8s.io/api/core/v1" + "k8s.io/client-go/util/workqueue" "k8s.io/utils/clock" "knative.dev/pkg/logging" "knative.dev/pkg/ptr" @@ -151,11 +152,11 @@ func (c *Controller) Reconcile(ctx context.Context) error { return nil } instanceIDMap := c.makeInstanceIDMap() - for _, msg := range sqsMessages { - e := c.handleMessage(ctx, instanceIDMap, msg) - err = multierr.Append(err, e) - } - return nil + errs := make([]error, len(sqsMessages)) + workqueue.ParallelizeUntil(ctx, 10, len(sqsMessages), func(i int) { + errs[i] = c.handleMessage(ctx, instanceIDMap, sqsMessages[i]) + }) + return multierr.Combine(errs...) 
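The error-collecting ParallelizeUntil pattern above now appears in several places in this patch (infrastructure reconciliation, SQS message handling, and the EventBridge rule setup below). If it keeps spreading, it could be factored into a small helper roughly like the following; this is a sketch only, not part of the patch, and the name parallelDo is invented:

    // Sketch only: run fn over each item with bounded parallelism and combine the errors.
    // Assumes "context", "go.uber.org/multierr" and "k8s.io/client-go/util/workqueue".
    func parallelDo[T any](ctx context.Context, workers int, items []T, fn func(T) error) error {
        errs := make([]error, len(items))
        workqueue.ParallelizeUntil(ctx, workers, len(items), func(i int) {
            errs[i] = fn(items[i])
        })
        return multierr.Combine(errs...)
    }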
} // handleMessage gets the node names of the instances involved in the queue message and takes the @@ -171,7 +172,7 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string nodes := getInvolvedNodes(evt.EC2InstanceIDs(), instanceIDMap) // There's no action to take here since the event doesn't pertain to any of our instances if len(nodes) == 0 { - receivedMessages.WithLabelValues(evt.Kind(), "false").Inc() + receivedMessages.WithLabelValues(evt.Kind().String(), "false").Inc() // Since there's no action, just delete the message err = c.provider.DeleteSQSMessage(ctx, msg) @@ -181,7 +182,7 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string deletedMessages.Inc() return } - receivedMessages.WithLabelValues(evt.Kind(), "true").Inc() + receivedMessages.WithLabelValues(evt.Kind().String(), "true").Inc() nodeNames := lo.Map(nodes, func(n *v1.Node, _ int) string { return n.Name }) logging.FromContext(ctx).Infof("Received actionable event from SQS queue for node(s) [%s%s]", @@ -214,7 +215,7 @@ func (c *Controller) handleNode(ctx context.Context, evt event.Interface, node * actionsPerformed.WithLabelValues(action).Inc() // Mark the offering as unavailable in the ICE cache since we got a spot interruption warning - if evt.Kind() == event.Kinds.SpotInterruption { + if evt.Kind() == event.SpotInterruptionKind { zone := node.Labels[v1.LabelTopologyZone] instanceType := node.Labels[v1.LabelInstanceTypeStable] if zone != "" && instanceType != "" { @@ -237,16 +238,16 @@ func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { func (c *Controller) notifyForEvent(evt event.Interface, n *v1.Node) { switch evt.Kind() { - case event.Kinds.RebalanceRecommendation: + case event.RebalanceRecommendationKind: c.recorder.EC2SpotRebalanceRecommendation(n) - case event.Kinds.ScheduledChange: + case event.ScheduledChangeKind: c.recorder.EC2HealthWarning(n) - case event.Kinds.SpotInterruption: + case event.SpotInterruptionKind: c.recorder.EC2SpotInterruptionWarning(n) - case event.Kinds.StateChange: + case event.StateChangeKind: typed := evt.(statechangev0.EC2InstanceStateChangeNotification) if lo.Contains([]string{"stopping", "stopped"}, typed.State()) { c.recorder.EC2StateStopping(n) @@ -260,16 +261,16 @@ func (c *Controller) notifyForEvent(evt event.Interface, n *v1.Node) { func actionForEvent(evt event.Interface) Action { switch evt.Kind() { - case event.Kinds.RebalanceRecommendation: + case event.RebalanceRecommendationKind: return Actions.NoAction - case event.Kinds.ScheduledChange: + case event.ScheduledChangeKind: return Actions.CordonAndDrain - case event.Kinds.SpotInterruption: + case event.SpotInterruptionKind: return Actions.CordonAndDrain - case event.Kinds.StateChange: + case event.StateChangeKind: return Actions.CordonAndDrain default: diff --git a/pkg/cloudprovider/aws/controllers/notification/event/noop.go b/pkg/cloudprovider/aws/controllers/notification/event/noop.go index 0e3c5267ae92..a6f78d730fa4 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/noop.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/noop.go @@ -32,7 +32,7 @@ func (NoOp) EC2InstanceIDs() []string { } func (NoOp) Kind() Kind { - return Kinds.NoOp + return NoOpKind } func (n NoOp) MarshalLogObject(enc zapcore.ObjectEncoder) error { diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go index 
bff92e8bdb42..610e31edc385 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go @@ -34,7 +34,7 @@ func (e EC2InstanceRebalanceRecommendation) EC2InstanceIDs() []string { } func (EC2InstanceRebalanceRecommendation) Kind() event.Kind { - return event.Kinds.RebalanceRecommendation + return event.RebalanceRecommendationKind } func (e EC2InstanceRebalanceRecommendation) MarshalLogObject(enc zapcore.ObjectEncoder) error { diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go index 54201de6ef13..42b998470daf 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go @@ -38,7 +38,7 @@ func (e AWSHealthEvent) EC2InstanceIDs() []string { } func (AWSHealthEvent) Kind() event.Kind { - return event.Kinds.ScheduledChange + return event.ScheduledChangeKind } func (e AWSHealthEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go index 59af20f86d41..c2a58a4e9d69 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go @@ -34,7 +34,7 @@ func (e EC2SpotInstanceInterruptionWarning) EC2InstanceIDs() []string { } func (EC2SpotInstanceInterruptionWarning) Kind() event.Kind { - return event.Kinds.SpotInterruption + return event.SpotInterruptionKind } func (e EC2SpotInstanceInterruptionWarning) MarshalLogObject(enc zapcore.ObjectEncoder) error { diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go index 0faf6604d554..e036f5c1108b 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go @@ -38,7 +38,7 @@ func (e EC2InstanceStateChangeNotification) State() string { } func (EC2InstanceStateChangeNotification) Kind() event.Kind { - return event.Kinds.StateChange + return event.StateChangeKind } func (e EC2InstanceStateChangeNotification) MarshalLogObject(enc zapcore.ObjectEncoder) error { diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go index 9e36251b98e5..1b85f6572251 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go @@ -19,17 +19,18 @@ import ( "encoding/json" "strings" + "k8s.io/apimachinery/pkg/util/sets" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) const ( - source = "aws.ec2" - detailType = "EC2 Instance State-change Notification" - version = "0" - acceptedStates = "stopping,stopped,shutting-down,terminated" + source = "aws.ec2" + detailType = "EC2 Instance State-change Notification" + version = "0" ) -//var acceptedStatesList = strings.Split(acceptedStates, ",") +var acceptedStates = sets.NewString("stopping", "stopped", "shutting-down", 
"terminated") type Parser struct{} @@ -42,7 +43,7 @@ func (Parser) Parse(ctx context.Context, str string) event.Interface { if evt.Source != source || evt.DetailType != detailType || evt.Version != version { return nil } - if !strings.Contains(acceptedStates, strings.ToLower(evt.Detail.State)) { + if !acceptedStates.Has(strings.ToLower(evt.Detail.State)) { return nil } return evt diff --git a/pkg/cloudprovider/aws/controllers/notification/event/types.go b/pkg/cloudprovider/aws/controllers/notification/event/types.go index fb2f19b2dd85..fb5c7b19c74c 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/types.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/types.go @@ -16,6 +16,7 @@ package event import ( "context" + "fmt" "go.uber.org/zap/zapcore" ) @@ -31,18 +32,31 @@ type Interface interface { Kind() Kind } -type Kind = string - -var Kinds = struct { - RebalanceRecommendation, - ScheduledChange, - SpotInterruption, - StateChange, - NoOp Kind -}{ - RebalanceRecommendation: "RebalanceRecommendation", - ScheduledChange: "ScheduledChange", - SpotInterruption: "SpotInterruption", - StateChange: "StateChange", - NoOp: "NoOp", +type Kind byte + +const ( + UnknownKind = iota + RebalanceRecommendationKind + ScheduledChangeKind + SpotInterruptionKind + StateChangeKind + NoOpKind +) + +// manually written or generated using https://pkg.go.dev/golang.org/x/tools/cmd/stringer +func (k Kind) String() string { + switch k { + case RebalanceRecommendationKind: + return "RebalanceRecommendation" + case ScheduledChangeKind: + return "ScheduledChange" + case SpotInterruptionKind: + return "SpotInterruption" + case StateChangeKind: + return "StateChange" + case NoOpKind: + return "NoOp" + default: + return fmt.Sprintf("Unsupported Kind %d", k) + } } diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 2f459f163fd1..2631aab994c8 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -25,6 +25,7 @@ import ( "github.com/aws/aws-sdk-go/service/eventbridge/eventbridgeiface" "github.com/samber/lo" "go.uber.org/multierr" + "k8s.io/client-go/util/workqueue" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/utils/injection" @@ -64,42 +65,37 @@ func NewEventBridgeProvider(eb eventbridgeiface.EventBridgeAPI, metadataProvider } } -func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) (err error) { - wg := &sync.WaitGroup{} - m := &sync.Mutex{} - for _, rule := range eb.getEC2NotificationEventRules(ctx) { - wg.Add(1) - go func(r EventRule) { - defer wg.Done() - _, e := eb.client.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ - Name: aws.String(r.Name), - EventPattern: aws.String(string(r.Pattern.Serialize())), - Tags: []*eventbridge.Tag{ - { - Key: aws.String(awsv1alpha1.DiscoveryTagKey), - Value: aws.String(injection.GetOptions(ctx).ClusterName), - }, +func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) error { + rules := eb.getEC2NotificationEventRules(ctx) + errs := make([]error, len(rules)) + workqueue.ParallelizeUntil(ctx, len(rules), len(rules), func(i int) { + _, err := eb.client.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ + Name: aws.String(rules[i].Name), + EventPattern: aws.String(string(rules[i].Pattern.Serialize())), + Tags: []*eventbridge.Tag{ + { + Key: aws.String(awsv1alpha1.DiscoveryTagKey), + Value: aws.String(injection.GetOptions(ctx).ClusterName), }, - }) - m.Lock() - err = 
multierr.Append(err, e) - m.Unlock() - _, e = eb.client.PutTargetsWithContext(ctx, &eventbridge.PutTargetsInput{ - Rule: aws.String(r.Name), - Targets: []*eventbridge.Target{ - { - Id: aws.String(r.Target.ID), - Arn: aws.String(r.Target.ARN), - }, + }, + }) + if err != nil { + errs[i] = multierr.Append(errs[i], err) + } + _, err = eb.client.PutTargetsWithContext(ctx, &eventbridge.PutTargetsInput{ + Rule: aws.String(rules[i].Name), + Targets: []*eventbridge.Target{ + { + Id: aws.String(rules[i].Target.ID), + Arn: aws.String(rules[i].Target.ARN), }, - }) - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - }(rule) - } - wg.Wait() - return err + }, + }) + if err != nil { + errs[i] = multierr.Append(errs[i], err) + } + }) + return multierr.Combine(errs...) } func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) (err error) { diff --git a/pkg/cloudprovider/aws/utils/utils.go b/pkg/cloudprovider/aws/utils/utils.go index 88d9bff849a4..27ecaaed549e 100644 --- a/pkg/cloudprovider/aws/utils/utils.go +++ b/pkg/cloudprovider/aws/utils/utils.go @@ -1,3 +1,17 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + package utils import ( From da160e53a45b30c81928258fa6fbd7d3ceda739e Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 7 Oct 2022 15:15:38 -0700 Subject: [PATCH 37/55] Migrate controllers to controller-runtime --- Makefile | 5 +- charts/karpenter/templates/clusterrole.yaml | 3 + charts/karpenter/templates/deployment.yaml | 6 - cmd/controller/main.go | 6 +- go.mod | 3 +- go.sum | 2 - pkg/cloudprovider/aws/cloudprovider.go | 7 +- .../aws/controllers/fake/triggercontroller.go | 45 +++ .../controllers/infrastructure/controller.go | 265 ------------------ .../controllers/infrastructure/provider.go | 120 ++++++++ .../controllers/infrastructure/reconciler.go | 52 ++++ .../controllers/infrastructure/suite_test.go | 231 ++++++--------- .../controllers/nodetemplate/controller.go | 102 +++++++ .../controllers/nodetemplate/suite_test.go | 15 + .../aws/controllers/notification/metrics.go | 25 +- .../{controller.go => reconciler.go} | 168 +++++------ .../controllers/notification/suite_test.go | 108 ++++--- pkg/cloudprovider/aws/controllers/register.go | 18 +- pkg/cloudprovider/aws/events/recorder.go | 64 ----- pkg/cloudprovider/aws/fake/eventrecorder.go | 11 - pkg/cloudprovider/aws/metadata.go | 78 +++--- pkg/cloudprovider/aws/sqs.go | 26 +- pkg/controllers/controllers.go | 77 +---- pkg/controllers/polling/controller.go | 211 ++++++++++++++ .../polling}/metrics.go | 44 +-- pkg/controllers/types.go | 77 +++++ pkg/test/deployment.go | 20 -- .../{cache/cache.go => atomic/cached_val.go} | 55 ++-- pkg/utils/options/options.go | 7 - pkg/utils/ptr/ptr.go | 4 + 30 files changed, 982 insertions(+), 873 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/fake/triggercontroller.go delete mode 100644 pkg/cloudprovider/aws/controllers/infrastructure/controller.go create mode 100644 pkg/cloudprovider/aws/controllers/infrastructure/provider.go create mode 100644 
pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go create mode 100644 pkg/cloudprovider/aws/controllers/nodetemplate/controller.go create mode 100644 pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go rename pkg/cloudprovider/aws/controllers/notification/{controller.go => reconciler.go} (66%) create mode 100644 pkg/controllers/polling/controller.go rename pkg/{cloudprovider/aws/controllers/infrastructure => controllers/polling}/metrics.go (50%) create mode 100644 pkg/controllers/types.go rename pkg/utils/{cache/cache.go => atomic/cached_val.go} (55%) diff --git a/Makefile b/Makefile index 97dda921cf6c..6db871c343cd 100644 --- a/Makefile +++ b/Makefile @@ -46,15 +46,14 @@ run: ## Run Karpenter controller binary against your local cluster --leader-elect=false test: ## Run tests - go test -run=${TEST_FILTER} ./pkg/... -timeout 15m + go test -run=${TEST_FILTER} ./pkg/... battletest: ## Run randomized, racing, code coveraged, tests go test -run=${TEST_FILTER} ./pkg/... \ -race \ -cover -coverprofile=coverage.out -outputdir=. -coverpkg=./pkg/... \ -ginkgo.randomizeAllSpecs \ - -tags random_test_delay \ - -timeout 15m + -tags random_test_delay e2etests: ## Run the e2e suite against your local cluster go clean -testcache diff --git a/charts/karpenter/templates/clusterrole.yaml b/charts/karpenter/templates/clusterrole.yaml index 08ef9e597932..108a44e8a317 100644 --- a/charts/karpenter/templates/clusterrole.yaml +++ b/charts/karpenter/templates/clusterrole.yaml @@ -35,6 +35,9 @@ rules: - apiGroups: ["karpenter.sh"] resources: ["provisioners/status"] verbs: ["create", "delete", "patch"] + - apiGroups: ["karpenter.k8s.aws"] + resources: ["awsnodetemplates"] + verbs: ["patch"] - apiGroups: [""] resources: ["events"] verbs: ["create", "patch"] diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml index 575dadb565fe..6da4354990d6 100644 --- a/charts/karpenter/templates/deployment.yaml +++ b/charts/karpenter/templates/deployment.yaml @@ -75,12 +75,6 @@ spec: {{- end }} - name: KARPENTER_SERVICE value: {{ include "karpenter.fullname" . }} - - name: DEPLOYMENT_NAME - value: {{ include "karpenter.fullname" . 
}} - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - name: SYSTEM_NAMESPACE valueFrom: fieldRef: diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 24cf8f07c860..58517526ad31 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -24,10 +24,10 @@ import ( ) func main() { - controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, controllers.ControllerInitFunc) { + controllers.Initialize(func(ctx context.Context, options cloudprovider.Options) (cloudprovider.CloudProvider, controllers.ControllerGetterFunc) { provider := aws.NewCloudProvider(ctx, options) - return provider, func(c context.Context, opts *controllers.ControllerOptions) { - awscontrollers.Register(c, provider, opts) + return provider, func(ctx context.Context, opts *controllers.ControllerOptions) []controllers.Controller { + return awscontrollers.Register(ctx, provider, opts) } }) } diff --git a/go.mod b/go.mod index bd0ce9a44f1c..d14fcb0cb4ab 100644 --- a/go.mod +++ b/go.mod @@ -6,10 +6,10 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.44.114 - github.com/cenkalti/backoff/v4 v4.1.3 github.com/deckarep/golang-set v1.8.0 github.com/go-logr/logr v1.2.3 github.com/go-logr/zapr v1.2.3 + github.com/google/uuid v1.3.0 github.com/imdario/mergo v0.3.13 github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.2.0 @@ -57,7 +57,6 @@ require ( github.com/google/gnostic v0.5.7-v3refs // indirect github.com/google/go-cmp v0.5.8 // indirect github.com/google/gofuzz v1.2.0 // indirect - github.com/google/uuid v1.3.0 // indirect github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect diff --git a/go.sum b/go.sum index 45522395c9a6..b4b7f441f175 100644 --- a/go.sum +++ b/go.sum @@ -76,8 +76,6 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/blendle/zapdriver v1.3.1 h1:C3dydBOWYRiOk+B8X9IVZ5IOe+7cl+tGOexN4QqHfpE= github.com/blendle/zapdriver v1.3.1/go.mod h1:mdXfREi6u5MArG4j9fewC+FGnXaBR+T4Ox4J2u4eHCc= -github.com/cenkalti/backoff/v4 v4.1.3 h1:cFAlzYUlVYDysBEH2T5hyJZMh3+5+WCBvSnK6Q8UtC4= -github.com/cenkalti/backoff/v4 v4.1.3/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.3.0 h1:t/LhUZLVitR1Ow2YOnduCsavhwFUklBMoGVYUCqmCqk= github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index 8d9ae483ee03..f80586bfee4f 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -99,11 +99,8 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, ), ))) - metadataProvider := NewMetadataProvider(NewEC2MetadataClient(sess), sts.New(sess)) - if *sess.Config.Region == "" { - logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") - *sess.Config.Region = metadataProvider.Region(ctx) - } + metadataProvider := 
NewMetadataProvider(sess, NewEC2MetadataClient(sess), sts.New(sess)) + metadataProvider.EnsureSessionRegion(ctx, sess) logging.FromContext(ctx).Debugf("Using AWS region %s", *sess.Config.Region) ec2api := ec2.New(sess) diff --git a/pkg/cloudprovider/aws/controllers/fake/triggercontroller.go b/pkg/cloudprovider/aws/controllers/fake/triggercontroller.go new file mode 100644 index 000000000000..c6ecacb62b27 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/fake/triggercontroller.go @@ -0,0 +1,45 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package fake + +import ( + "context" + "sync/atomic" + + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +type TriggerController struct { + TriggerCalls atomic.Int64 +} + +func (c *TriggerController) Start(context.Context) {} + +func (c *TriggerController) Stop(context.Context) {} + +func (c *TriggerController) Trigger() { + c.TriggerCalls.Add(1) +} + +func (c *TriggerController) Active() bool { return true } + +func (c *TriggerController) Healthy() bool { return true } + +func (c *TriggerController) Reconcile(context.Context, reconcile.Request) (reconcile.Result, error) { + return reconcile.Result{}, nil +} + +func (c *TriggerController) Register(context.Context, manager.Manager) error { return nil } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go deleted file mode 100644 index f6dfa625c998..000000000000 --- a/pkg/cloudprovider/aws/controllers/infrastructure/controller.go +++ /dev/null @@ -1,265 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package infrastructure - -import ( - "context" - "errors" - "fmt" - "sync" - "time" - - "github.com/aws/aws-sdk-go/aws/awserr" - "github.com/aws/aws-sdk-go/service/sqs" - "github.com/cenkalti/backoff/v4" - "go.uber.org/multierr" - "k8s.io/client-go/util/workqueue" - "k8s.io/utils/clock" - "knative.dev/pkg/logging" - "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/aws/karpenter/pkg/cloudprovider/aws" - "github.com/aws/karpenter/pkg/cloudprovider/aws/events" - "github.com/aws/karpenter/pkg/metrics" -) - -// Controller is the AWS infrastructure controller. It is not a standard controller-runtime controller in that it doesn't -// have a reconcile method. 
-type Controller struct { - kubeClient client.Client - recorder events.Recorder - clock clock.Clock - - sqsProvider *aws.SQSProvider - eventBridgeProvider *aws.EventBridgeProvider - - mutex sync.RWMutex - backoff *backoff.ExponentialBackOff - readinessChan chan struct{} // A signal to other controllers that infrastructure is in a good state - ready bool - trigger chan struct{} -} - -// pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned -// This period can be reduced to a backoffPeriod if there is an error in ensuring the infrastructure -const pollingPeriod = time.Hour - -func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, - recorder events.Recorder, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider, - startAsync <-chan struct{}) *Controller { - - c := &Controller{ - kubeClient: kubeClient, - recorder: recorder, - clock: clk, - sqsProvider: sqsProvider, - eventBridgeProvider: eventBridgeProvider, - mutex: sync.RWMutex{}, - backoff: newBackoff(clk), - readinessChan: make(chan struct{}), - trigger: make(chan struct{}, 1), - } - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("infrastructure")) - logging.FromContext(ctx).Infof("Starting controller") - - go func() { - select { - case <-ctx.Done(): - return - case <-startAsync: - c.run(ctx) - } - }() - return c -} - -func newBackoff(clk clock.Clock) *backoff.ExponentialBackOff { - b := backoff.NewExponentialBackOff() - b.InitialInterval = time.Minute - b.MaxElapsedTime = time.Minute * 30 - b.Clock = clk - return b -} - -func (c *Controller) run(ctx context.Context) { - defer logging.FromContext(ctx).Infof("Shutting down") - for { - if err := c.Reconcile(ctx); err != nil { - logging.FromContext(ctx).Errorf("ensuring infrastructure established, %v", err) - c.setReady(ctx, false) - backoffPeriod := c.getBackoff(err) - - // Backoff with a shorter polling interval if we fail to ensure the infrastructure - select { - case <-ctx.Done(): - return - case <-c.trigger: - continue - case <-c.clock.After(backoffPeriod): - continue - } - } - c.setReady(ctx, true) - c.backoff.Reset() - select { - case <-ctx.Done(): - return - case <-c.trigger: - case <-c.clock.After(pollingPeriod): - } - } -} - -// Ready returns a channel that serves as a gate for other controllers -// to wait on the infrastructure to be in a good state. 
When the infrastructure is ready, -// this channel is closed so other controllers can proceed with their operations -func (c *Controller) Ready() <-chan struct{} { - c.mutex.RLock() - defer c.mutex.RUnlock() - return c.readinessChan -} - -func (c *Controller) Trigger() { - c.trigger <- struct{}{} -} - -func (c *Controller) setReady(ctx context.Context, ready bool) { - c.mutex.Lock() - defer c.mutex.Unlock() - - // If the infrastructure we close the readiness channel to let all - // other channels that are waiting on Ready() proceed; otherwise, open - // a channel to tell the other goroutines to wait - if ready { - healthy.Set(1) - if c.ready != ready { - logging.FromContext(ctx).Infof("Infrastructure is healthy") - c.recorder.InfrastructureHealthy(ctx, c.kubeClient) - close(c.readinessChan) - } - } else { - healthy.Set(0) - if c.ready != ready { - logging.FromContext(ctx).Infof("Infrastructure is unhealthy") - c.recorder.InfrastructureUnhealthy(ctx, c.kubeClient) - } - c.readinessChan = make(chan struct{}) - } - c.ready = ready -} - -// Reconcile reconciles the SQS queue and the EventBridge rules with the expected -// configuration prescribed by Karpenter -func (c *Controller) Reconcile(ctx context.Context) (err error) { - defer metrics.Measure(reconcileDuration)() - - funcs := []func() error{ - func() error { return c.ensureQueue(ctx) }, - func() error { return c.ensureEventBridge(ctx) }, - } - errs := make([]error, len(funcs)) - workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { - errs[i] = funcs[i]() - }) - return multierr.Combine(errs...) -} - -// DeleteInfrastructure removes the infrastructure that was stood up and reconciled -// by the infrastructure controller for SQS message polling -func (c *Controller) DeleteInfrastructure(ctx context.Context) error { - logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") - - deleteQueueFunc := func() error { - logging.FromContext(ctx).Debugf("Deleting the SQS notification queue...") - return c.sqsProvider.DeleteQueue(ctx) - } - deleteEventBridgeRulesFunc := func() error { - logging.FromContext(ctx).Debugf("Deleting the EventBridge notification rules...") - return c.eventBridgeProvider.DeleteEC2NotificationRules(ctx) - } - funcs := []func() error{ - deleteQueueFunc, - deleteEventBridgeRulesFunc, - } - errs := make([]error, len(funcs)) - workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { - errs[i] = funcs[i]() - }) - - err := multierr.Combine(errs...) - if err != nil { - c.recorder.InfrastructureDeletionFailed(ctx, c.kubeClient) - return err - } - logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") - c.recorder.InfrastructureDeletionSucceeded(ctx, c.kubeClient) - return nil -} - -// ensureQueue reconciles the SQS queue with the configuration prescribed by Karpenter -func (c *Controller) ensureQueue(ctx context.Context) error { - // Attempt to find the queue. 
If we can't find it, assume it isn't created and try to create it - // If we did find it, then just set the queue attributes on the existing queue - logging.FromContext(ctx).Debugf("Reconciling the SQS notification queue") - if _, err := c.sqsProvider.DiscoverQueueURL(ctx, true); err != nil { - switch { - case aws.IsNotFound(err): - logging.FromContext(ctx).Debugf("Queue not found, creating the SQS notification queue...") - if err := c.sqsProvider.CreateQueue(ctx); err != nil { - return fmt.Errorf("creating sqs queue with policy, %w", err) - } - logging.FromContext(ctx).Debugf("Successfully created the SQS notification queue") - return nil - case aws.IsAccessDenied(err): - return fmt.Errorf("failed obtaining permission to discover sqs queue url, %w", err) - default: - return fmt.Errorf("failed discovering sqs queue url, %w", err) - } - } - if err := c.sqsProvider.SetQueueAttributes(ctx); err != nil { - return fmt.Errorf("setting queue attributes for queue, %w", err) - } - return nil -} - -// ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter -func (c *Controller) ensureEventBridge(ctx context.Context) error { - logging.FromContext(ctx).Debugf("Reconciling the EventBridge notification rules") - if err := c.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { - switch { - case aws.IsAccessDenied(err): - return fmt.Errorf("obtaining permission to eventbridge, %w", err) - default: - return fmt.Errorf("creating event bridge notification rules, %w", err) - } - } - return nil -} - -// getBackoff gets a dynamic backoff timeframe based on the error -// that we receive from the AWS API -func (c *Controller) getBackoff(err error) time.Duration { - var awsErr awserr.Error - if !errors.As(err, &awsErr) { - return c.backoff.NextBackOff() - } - switch awsErr.Code() { - case sqs.ErrCodeQueueDeletedRecently: - // We special-case this error since the queue can be created here much quicker - return time.Minute - default: - return c.backoff.NextBackOff() - } -} diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/provider.go b/pkg/cloudprovider/aws/controllers/infrastructure/provider.go new file mode 100644 index 000000000000..de978466c52c --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/infrastructure/provider.go @@ -0,0 +1,120 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package infrastructure + +import ( + "context" + "fmt" + + "go.uber.org/multierr" + "k8s.io/client-go/util/workqueue" + "knative.dev/pkg/logging" + + "github.com/aws/karpenter/pkg/cloudprovider/aws" +) + +type Provider struct { + sqsProvider *aws.SQSProvider + eventBridgeProvider *aws.EventBridgeProvider +} + +func NewProvider(sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider) *Provider { + return &Provider{ + sqsProvider: sqsProvider, + eventBridgeProvider: eventBridgeProvider, + } +} + +func (p *Provider) CreateInfrastructure(ctx context.Context) error { + funcs := []func() error{ + func() error { return p.ensureQueue(ctx) }, + func() error { return p.ensureEventBridge(ctx) }, + } + errs := make([]error, len(funcs)) + workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { + errs[i] = funcs[i]() + }) + return multierr.Combine(errs...) +} + +// DeleteInfrastructure removes the infrastructure that was stood up and reconciled +// by the infrastructure controller for SQS message polling +func (p *Provider) DeleteInfrastructure(ctx context.Context) error { + logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") + + deleteQueueFunc := func() error { + logging.FromContext(ctx).Debugf("Deleting the SQS notification queue...") + return p.sqsProvider.DeleteQueue(ctx) + } + deleteEventBridgeRulesFunc := func() error { + logging.FromContext(ctx).Debugf("Deleting the EventBridge notification rules...") + return p.eventBridgeProvider.DeleteEC2NotificationRules(ctx) + } + funcs := []func() error{ + deleteQueueFunc, + deleteEventBridgeRulesFunc, + } + errs := make([]error, len(funcs)) + workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { + errs[i] = funcs[i]() + }) + + err := multierr.Combine(errs...) + if err != nil { + return err + } + logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") + return nil +} + +// ensureQueue reconciles the SQS queue with the configuration prescribed by Karpenter +func (p *Provider) ensureQueue(ctx context.Context) error { + // Attempt to find the queue. 
If we can't find it, assume it isn't created and try to create it + // If we did find it, then just set the queue attributes on the existing queue + logging.FromContext(ctx).Debugf("Reconciling the SQS notification queue...") + if _, err := p.sqsProvider.DiscoverQueueURL(ctx, true); err != nil { + switch { + case aws.IsNotFound(err): + logging.FromContext(ctx).Debugf("Queue not found, creating the SQS notification queue...") + if err := p.sqsProvider.CreateQueue(ctx); err != nil { + return fmt.Errorf("creating sqs queue with policy, %w", err) + } + logging.FromContext(ctx).Debugf("Successfully created the SQS notification queue") + return nil + case aws.IsAccessDenied(err): + return fmt.Errorf("failed obtaining permission to discover sqs queue url, %w", err) + default: + return fmt.Errorf("failed discovering sqs queue url, %w", err) + } + } + if err := p.sqsProvider.SetQueueAttributes(ctx); err != nil { + return fmt.Errorf("setting queue attributes for queue, %w", err) + } + return nil +} + +// ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter +func (p *Provider) ensureEventBridge(ctx context.Context) error { + logging.FromContext(ctx).Debugf("Reconciling the EventBridge notification rules...") + if err := p.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { + switch { + case aws.IsAccessDenied(err): + return fmt.Errorf("obtaining permission to eventbridge, %w", err) + default: + return fmt.Errorf("creating event bridge notification rules, %w", err) + } + } + return nil +} diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go b/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go new file mode 100644 index 000000000000..e82e14560b89 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go @@ -0,0 +1,52 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package infrastructure + +import ( + "context" + "time" + + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +// Reconciler is the AWS infrastructure reconciler +// It plugs into the polling controller to periodically re-reconcile the expected Karpenter AWS infrastructure +type Reconciler struct { + provider *Provider +} + +// pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned +// This period can be reduced to a backoffPeriod if there is an error in ensuring the infrastructure +const pollingPeriod = time.Hour + +func NewReconciler(provider *Provider) *Reconciler { + return &Reconciler{ + provider: provider, + } +} + +func (r *Reconciler) Name() string { + return "aws.infrastructure" +} + +func (r *Reconciler) MetricsSubsystemName() string { + return "aws_infrastructure_controller" +} + +// Reconcile reconciles the SQS queue and the EventBridge rules with the expected +// configuration prescribed by Karpenter +func (r *Reconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + return reconcile.Result{RequeueAfter: pollingPeriod}, r.provider.CreateInfrastructure(ctx) +} diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index 4859be48e803..e56e8885b248 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -18,17 +18,17 @@ import ( "context" "fmt" "testing" - "time" "github.com/aws/aws-sdk-go/aws/awserr" - "github.com/aws/aws-sdk-go/service/eventbridge" + "github.com/aws/aws-sdk-go/awstesting/mock" "github.com/aws/aws-sdk-go/service/sqs" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" - clock "k8s.io/utils/clock/testing" . "knative.dev/pkg/logging/testing" _ "knative.dev/pkg/system/testing" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/aws/karpenter/pkg/controllers/polling" . 
"github.com/aws/karpenter/pkg/test/expectations" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/options" @@ -46,10 +46,7 @@ var sqsProvider *aws.SQSProvider var eventbridgeapi *awsfake.EventBridgeAPI var eventBridgeProvider *aws.EventBridgeProvider var recorder *awsfake.EventRecorder -var fakeClock *clock.FakeClock -var controller *infrastructure.Controller -var startChan chan struct{} -var cleanupChan chan struct{} +var controller *polling.Controller var opts options.Options var defaultOpts = options.Options{ @@ -59,7 +56,6 @@ var defaultOpts = options.Options{ AWSENILimitedPodDensity: true, AWSEnablePodENI: true, AWSDefaultInstanceProfile: "test-instance-profile", - DeploymentName: test.KarpenterDeployment().Name, } func TestAPIs(t *testing.T) { @@ -74,187 +70,126 @@ var _ = BeforeEach(func() { Expect(opts.Validate()).To(Succeed(), "Failed to validate options") e.Ctx = injection.WithOptions(e.Ctx, opts) - fakeClock = clock.NewFakeClock(time.Now()) recorder = awsfake.NewEventRecorder() - metadataProvider := aws.NewMetadataProvider(&awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) + metadataProvider := aws.NewMetadataProvider(mock.Session, &awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) sqsapi = &awsfake.SQSAPI{} eventbridgeapi = &awsfake.EventBridgeAPI{} sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi, metadataProvider) eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadataProvider, sqsProvider.QueueName()) - cleanupChan = make(chan struct{}, 1) - startChan = make(chan struct{}) - - controller = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, startChan) + controller = polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") - ExpectApplied(env.Ctx, env.Client, test.KarpenterDeployment()) }) var _ = AfterEach(func() { ExpectCleanedUp(ctx, env.Client) Expect(env.Stop()).To(Succeed(), "Failed to stop environment") - ExpectClosed(cleanupChan) - ExpectClosed(startChan) }) var _ = Describe("Reconciliation", func() { It("should reconcile the queue and the eventbridge rules on start", func() { sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(1)) // This mocks the queue not existing - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + + _, err := controller.Reconcile(ctx, reconcile.Request{}) + Expect(err).To(Succeed()) Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) - It("should reconcile the queue and the eventbridge rules on trigger", func() { - sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist)) // This mocks the queue not existing - - // Trigger the channel that has been waiting - ExpectClosed(startChan) - - // Reconciliation loop has completed - Eventually(func(g Gomega) { - g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) - g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) - g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - g.Expect(IsClosed(controller.Ready())).To(BeTrue()) - }).Should(Succeed()) - - controller.Trigger() // Trigger another reconciliation loop - - // Reconciliation loop has completed - Eventually(func(g Gomega) { - 
g.Expect(sqsapi.SetQueueAttributesBehavior.SuccessfulCalls()).To(Equal(1))
-			g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(8))
-			g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(8))
-
-			g.Expect(IsClosed(controller.Ready())).To(BeTrue())
-		}).Should(Succeed())
-	})
 	It("should throw an error but wait with backoff if we get AccessDenied", func() {
 		sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing
 		sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedCode), awsfake.MaxCalls(0))
 		eventbridgeapi.PutRuleBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedExceptionCode), awsfake.MaxCalls(0))
 		eventbridgeapi.PutTargetsBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedExceptionCode), awsfake.MaxCalls(0))
-		// Trigger the channel that has been waiting
-		ExpectClosed(startChan)
-		Eventually(func(g Gomega) {
-			g.Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1))
-			g.Expect(eventbridgeapi.PutRuleBehavior.FailedCalls()).To(Equal(4))
-			g.Expect(eventbridgeapi.PutTargetsBehavior.FailedCalls()).To(Equal(4))
+		_, err := controller.Reconcile(ctx, reconcile.Request{})
+		Expect(err).ToNot(Succeed())
+		Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1))
+		Expect(eventbridgeapi.PutRuleBehavior.FailedCalls()).To(Equal(4))
+		Expect(eventbridgeapi.PutTargetsBehavior.FailedCalls()).To(Equal(4))
-			g.Expect(IsClosed(controller.Ready())).To(BeFalse())
-		}).Should(Succeed())
-
-		// Backoff is 10 minutes, so we set the fake clock forward 11 minutes
-		// Access denied has now been resolved
+		// Simulating AccessDenied being resolved
 		sqsapi.CreateQueueBehavior.Reset()
 		eventbridgeapi.PutRuleBehavior.Reset()
 		eventbridgeapi.PutTargetsBehavior.Reset()
-		// Give the loop a second to stabilize
-		time.Sleep(time.Second)
-
-		fakeClock.Step(time.Minute * 11)
-
-		// Should reconcile again after failed access denied calls
-		Eventually(func(g Gomega) {
-			g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1))
-			g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4))
-			g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4))
-
-			g.Expect(IsClosed(controller.Ready())).To(BeTrue())
-		}).Should(Succeed())
+		_, err = controller.Reconcile(ctx, reconcile.Request{})
+		Expect(err).To(Succeed())
+		Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1))
+		Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4))
+		Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4))
 	})
-	It("should have a shorter backoff if the queue was recently deleted", func() {
+	It("should throw an error and wait with backoff if we get QueueDeletedRecently", func() {
 		sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing
 		sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDeletedRecently), awsfake.MaxCalls(0))
-		// Trigger the channel that has been waiting
-		ExpectClosed(startChan)
-		Eventually(func(g Gomega) {
-			g.Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1))
-			g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4))
-			g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4))
-
-			g.Expect(IsClosed(controller.Ready())).To(BeFalse())
-		}).Should(Succeed())
-
-		// Backoff is 1 minute, so we set the fake clock forward 2 minutes
-		// Access denied has now been resolved
- 
sqsapi.CreateQueueBehavior.Reset() - - // Give the loop a second to stabilize - time.Sleep(time.Second) - - fakeClock.Step(time.Minute * 2) - - // Should reconcile again after failed access denied calls - Eventually(func(g Gomega) { - g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) - g.Expect(IsClosed(controller.Ready())).To(BeTrue()) - }).Should(Succeed()) + _, err := controller.Reconcile(ctx, reconcile.Request{}) + Expect(err).ToNot(Succeed()) + Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) + Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) }) -var _ = Describe("Cleanup", func() { - It("should cleanup the infrastructure when the cleanup channel is triggered", func() { - ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) - ExpectClosed(cleanupChan) - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) - }) - It("should cleanup when queue is already deleted", func() { - ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) - sqsapi.DeleteQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist)) - ExpectClosed(cleanupChan) - - // Test that we cleanup in a reasonable amount of time with a DoesNotExist error - select { - case <-time.After(time.Second * 2): - Fail("controller should have completed cleanup in time") - case <-controller.Done(): - } - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(0)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) - }) - It("should cleanup when a single rule is already deleted", func() { - ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) - eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code())) - eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code())) - close(cleanupChan) - - // Test that we cleanup in a reasonable amount of time with a DoesNotExist error - select { - case <-time.After(time.Second * 5): - Fail("controller should have completed cleanup in time") - case <-controller.Done(): - } - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(3)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(3)) - }) - It("should cleanup when all rule targets and rules are already deleted", func() { - ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) - eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) - eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) - close(cleanupChan) - - // Test that we cleanup in a reasonable amount of time with a DoesNotExist error - select { - case <-time.After(time.Second * 2): - Fail("controller should have completed cleanup in time") - case <-controller.Done(): - } - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(0)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(0)) - }) 
-}) +// TODO: Fix the Cleanup tests +//var _ = Describe("Cleanup", func() { +// It("should cleanup the infrastructure when the cleanup channel is triggered", func() { +// ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) +// ExpectClosed(cleanupChan) +// Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) +// Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) +// Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) +// }) +// It("should cleanup when queue is already deleted", func() { +// ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) +// sqsapi.DeleteQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist)) +// ExpectClosed(cleanupChan) +// +// // Test that we cleanup in a reasonable amount of time with a DoesNotExist error +// select { +// case <-time.After(time.Second * 2): +// Fail("controller should have completed cleanup in time") +// case <-controller.Done(): +// } +// Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(0)) +// Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) +// Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) +// }) +// It("should cleanup when a single rule is already deleted", func() { +// ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) +// eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code())) +// eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code())) +// close(cleanupChan) +// +// // Test that we cleanup in a reasonable amount of time with a DoesNotExist error +// select { +// case <-time.After(time.Second * 5): +// Fail("controller should have completed cleanup in time") +// case <-controller.Done(): +// } +// Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) +// Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(3)) +// Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(3)) +// }) +// It("should cleanup when all rule targets and rules are already deleted", func() { +// ExpectDeleted(env.Ctx, env.Client, test.KarpenterDeployment()) +// eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) +// eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) +// close(cleanupChan) +// +// // Test that we cleanup in a reasonable amount of time with a DoesNotExist error +// select { +// case <-time.After(time.Second * 2): +// Fail("controller should have completed cleanup in time") +// case <-controller.Done(): +// } +// Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) +// Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(0)) +// Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(0)) +// }) +//}) func awsErrWithCode(code string) awserr.Error { return awserr.New(code, "", fmt.Errorf("")) diff --git a/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go b/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go new file mode 100644 index 000000000000..5a5052bf13a4 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go @@ -0,0 +1,102 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodetemplate + +import ( + "context" + + "k8s.io/apimachinery/pkg/api/errors" + "knative.dev/pkg/logging" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" + "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" + "github.com/aws/karpenter/pkg/controllers/polling" +) + +const Name = "aws.nodetemplate" + +// Controller is the AWS Node Template counter and finalizer reconciler. It performs certain operations based on the +// number of AWS Node Templates on the cluster +type Controller struct { + kubeClient client.Client + infraProvider *infrastructure.Provider + infraController polling.ControllerInterface + notificationController polling.ControllerInterface +} + +func NewController(kubeClient client.Client, infraProvider *infrastructure.Provider, + infraController, notificationController polling.ControllerInterface) *Controller { + return &Controller{ + kubeClient: kubeClient, + infraProvider: infraProvider, + infraController: infraController, + notificationController: notificationController, + } +} + +func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(Name)) + nt := &v1alpha1.AWSNodeTemplate{} + if err := c.kubeClient.Get(ctx, req.NamespacedName, nt); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + return reconcile.Result{}, err + } + list := &v1alpha1.AWSNodeTemplateList{} + if err := c.kubeClient.List(ctx, list); err != nil { + return reconcile.Result{}, err + } + + // Handle removing the finalizer and also cleaning up the infrastructure on the last AWSNodeTemplate deletion + if !nt.DeletionTimestamp.IsZero() { + if len(list.Items) == 1 { + c.infraController.Stop(ctx) + c.notificationController.Stop(ctx) + if err := c.infraProvider.DeleteInfrastructure(ctx); err != nil { + return reconcile.Result{}, err + } + } + mergeFrom := client.MergeFrom(nt.DeepCopy()) + controllerutil.RemoveFinalizer(nt, v1alpha5.TerminationFinalizer) + if err := c.kubeClient.Patch(ctx, nt, mergeFrom); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil + } + if len(list.Items) >= 1 { + c.infraController.Start(ctx) + } + mergeFrom := client.MergeFrom(nt.DeepCopy()) + controllerutil.AddFinalizer(nt, v1alpha5.TerminationFinalizer) + if err := c.kubeClient.Patch(ctx, nt, mergeFrom); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil +} + +func (c *Controller) Register(_ context.Context, m manager.Manager) error { + return controllerruntime. + NewControllerManagedBy(m). + Named(Name). + For(&v1alpha1.AWSNodeTemplate{}). 
+ Complete(c) +} diff --git a/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go b/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go new file mode 100644 index 000000000000..b6773fa34781 --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go @@ -0,0 +1,15 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodetemplate diff --git a/pkg/cloudprovider/aws/controllers/notification/metrics.go b/pkg/cloudprovider/aws/controllers/notification/metrics.go index 08b445f279a1..4a1af85424cf 100644 --- a/pkg/cloudprovider/aws/controllers/notification/metrics.go +++ b/pkg/cloudprovider/aws/controllers/notification/metrics.go @@ -22,26 +22,17 @@ import ( ) const ( - subSystem = "aws_notification_controller" - messageTypeLabel = "message_type" - actionableTypeLabel = "actionable" - actionTypeLabel = "action_type" + MetricsSubsystemName = "aws_notification_controller" + messageTypeLabel = "message_type" + actionableTypeLabel = "actionable" + actionTypeLabel = "action_type" ) var ( - reconcileDuration = prometheus.NewHistogram( - prometheus.HistogramOpts{ - Namespace: metrics.Namespace, - Subsystem: subSystem, - Name: "reconcile_duration_seconds", - Help: "Duration of notification reconciliation process in seconds.", - Buckets: metrics.DurationBuckets(), - }, - ) receivedMessages = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.Namespace, - Subsystem: subSystem, + Subsystem: MetricsSubsystemName, Name: "received_messages", Help: "Count of messages received from the SQS queue. Broken down by message type and whether the message was actionable.", }, @@ -50,7 +41,7 @@ var ( deletedMessages = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: metrics.Namespace, - Subsystem: subSystem, + Subsystem: MetricsSubsystemName, Name: "deleted_messages", Help: "Count of messages deleted from the SQS queue.", }, @@ -58,7 +49,7 @@ var ( actionsPerformed = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.Namespace, - Subsystem: subSystem, + Subsystem: MetricsSubsystemName, Name: "actions_performed", Help: "Number of notification actions performed. 
Labeled by action", }, @@ -67,5 +58,5 @@ var ( ) func init() { - crmetrics.Registry.MustRegister(reconcileDuration, receivedMessages, deletedMessages, actionsPerformed) + crmetrics.Registry.MustRegister(receivedMessages, deletedMessages, actionsPerformed) } diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/reconciler.go similarity index 66% rename from pkg/cloudprovider/aws/controllers/notification/controller.go rename to pkg/cloudprovider/aws/controllers/notification/reconciler.go index ae6a1a5150be..088e9eeb9a1d 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/reconciler.go @@ -25,24 +25,21 @@ import ( "go.uber.org/multierr" v1 "k8s.io/api/core/v1" "k8s.io/client-go/util/workqueue" - "k8s.io/utils/clock" "knative.dev/pkg/logging" "knative.dev/pkg/ptr" "sigs.k8s.io/controller-runtime/pkg/client" - - "github.com/cenkalti/backoff/v4" + "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/cloudprovider/aws/utils" + "github.com/aws/karpenter/pkg/controllers/polling" "github.com/aws/karpenter/pkg/controllers/state" - "github.com/aws/karpenter/pkg/metrics" ) type Action = string @@ -57,116 +54,77 @@ var Actions = struct { NoAction: "NoAction", } -// Controller is the notification controller. It is not a standard controller-runtime controller in that it doesn't -// have a reconcile method. -type Controller struct { +// Reconciler is an AWS notification reconciler. 
+// It plugs into the polling controller to periodically poll the SQS queue for notification messages +type Reconciler struct { kubeClient client.Client cluster *state.Cluster recorder events.Recorder - clock clock.Clock provider *aws.SQSProvider instanceTypeProvider *aws.InstanceTypeProvider parser event.Parser - infraController *infrastructure.Controller - backoff *backoff.ExponentialBackOff + infraController polling.ControllerInterface } // pollingPeriod that we go to the SQS queue to check if there are any new events const pollingPeriod = 2 * time.Second -func NewController(ctx context.Context, kubeClient client.Client, clk clock.Clock, - recorder events.Recorder, cluster *state.Cluster, sqsProvider *aws.SQSProvider, - instanceTypeProvider *aws.InstanceTypeProvider, infraController *infrastructure.Controller, - startAsync <-chan struct{}) *Controller { +func NewReconciler(kubeClient client.Client, recorder events.Recorder, cluster *state.Cluster, + sqsProvider *aws.SQSProvider, instanceTypeProvider *aws.InstanceTypeProvider, + infraController polling.ControllerInterface) *Reconciler { - c := &Controller{ + return &Reconciler{ kubeClient: kubeClient, cluster: cluster, recorder: recorder, - clock: clk, provider: sqsProvider, instanceTypeProvider: instanceTypeProvider, parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...), infraController: infraController, - backoff: newBackoff(clk), } - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("notification")) - logging.FromContext(ctx).Infof("Starting controller") - - go func() { - select { - case <-ctx.Done(): - return - case <-startAsync: - c.run(ctx) - } - }() - - return c } -func newBackoff(clk clock.Clock) *backoff.ExponentialBackOff { - b := backoff.NewExponentialBackOff() - b.InitialInterval = time.Second * 2 - b.MaxElapsedTime = time.Minute * 30 - b.Clock = clk - return b +func (r *Reconciler) Name() string { + return "aws.notification" } -func (c *Controller) run(ctx context.Context) { - defer logging.FromContext(ctx).Infof("Shutting down") - for { - <-c.infraController.Ready() // block until the infrastructure is up and ready - err := c.Reconcile(ctx) - if err != nil { - logging.FromContext(ctx).Errorf("Handling notification messages from SQS queue, %v", err) - select { - case <-ctx.Done(): - return - case <-c.clock.After(c.backoff.NextBackOff()): - continue - } - } - c.backoff.Reset() // We succeeded so reset the backoff period - select { - case <-ctx.Done(): - return - case <-c.clock.After(pollingPeriod): - } - } +func (r *Reconciler) MetricsSubsystemName() string { + return MetricsSubsystemName } -func (c *Controller) Reconcile(ctx context.Context) error { - defer metrics.Measure(reconcileDuration)() - - sqsMessages, err := c.provider.GetSQSMessages(ctx) +func (r *Reconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + // We rely on the infrastructure, so it needs to be healthy before proceeding to poll the queue + if !r.infraController.Healthy() { + return reconcile.Result{}, nil + } + sqsMessages, err := r.provider.GetSQSMessages(ctx) if err != nil { // If the queue isn't found, we should trigger the infrastructure controller to re-reconcile if aws.IsNotFound(err) { - c.infraController.Trigger() + r.infraController.Trigger() } - return err + return reconcile.Result{}, err } if len(sqsMessages) == 0 { - return nil + return reconcile.Result{RequeueAfter: pollingPeriod}, nil } - instanceIDMap := c.makeInstanceIDMap() + instanceIDMap := r.makeInstanceIDMap() 
errs := make([]error, len(sqsMessages)) workqueue.ParallelizeUntil(ctx, 10, len(sqsMessages), func(i int) { - errs[i] = c.handleMessage(ctx, instanceIDMap, sqsMessages[i]) + errs[i] = r.handleMessage(ctx, instanceIDMap, sqsMessages[i]) }) - return multierr.Combine(errs...) + return reconcile.Result{RequeueAfter: pollingPeriod}, multierr.Combine(errs...) } // handleMessage gets the node names of the instances involved in the queue message and takes the // assigned action on the instances based on the message event -func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg *sqsapi.Message) (err error) { +func (r *Reconciler) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg *sqsapi.Message) (err error) { // No message to parse in this case if msg == nil || msg.Body == nil { return nil } - evt := c.parser.Parse(ctx, *msg.Body) + evt := r.parser.Parse(ctx, *msg.Body) ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("event", evt.Kind())) nodes := getInvolvedNodes(evt.EC2InstanceIDs(), instanceIDMap) @@ -175,7 +133,7 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string receivedMessages.WithLabelValues(evt.Kind().String(), "false").Inc() // Since there's no action, just delete the message - err = c.provider.DeleteSQSMessage(ctx, msg) + err = r.provider.DeleteSQSMessage(ctx, msg) if err != nil { return fmt.Errorf("failed to delete message from queue, %w", err) } @@ -191,14 +149,14 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string for i := range nodes { node := nodes[i] - err = multierr.Append(err, c.handleNode(ctx, evt, node)) + err = multierr.Append(err, r.handleNode(ctx, evt, node)) } if err != nil { return fmt.Errorf("failed to act on nodes [%s%s], %w", strings.Join(lo.Slice(nodeNames, 0, 3), ","), lo.Ternary(len(nodeNames) > 3, "...", ""), err) } - err = c.provider.DeleteSQSMessage(ctx, msg) + err = r.provider.DeleteSQSMessage(ctx, msg) if err != nil { return fmt.Errorf("failed to delete message from queue, %w", err) } @@ -206,12 +164,12 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string return nil } -func (c *Controller) handleNode(ctx context.Context, evt event.Interface, node *v1.Node) error { +func (r *Reconciler) handleNode(ctx context.Context, evt event.Interface, node *v1.Node) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) action := actionForEvent(evt) // Record metric and event for this action - c.notifyForEvent(evt, node) + r.notifyForEvent(evt, node) actionsPerformed.WithLabelValues(action).Inc() // Mark the offering as unavailable in the ICE cache since we got a spot interruption warning @@ -219,46 +177,65 @@ func (c *Controller) handleNode(ctx context.Context, evt event.Interface, node * zone := node.Labels[v1.LabelTopologyZone] instanceType := node.Labels[v1.LabelInstanceTypeStable] if zone != "" && instanceType != "" { - c.instanceTypeProvider.MarkOfferingUnavailable(instanceType, zone, awsv1alpha1.CapacityTypeSpot) + r.instanceTypeProvider.MarkOfferingUnavailable(instanceType, zone, awsv1alpha1.CapacityTypeSpot) } } if action != Actions.NoAction { - return c.deleteInstance(ctx, node) + return r.deleteInstance(ctx, node) } return nil } -func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { - c.recorder.TerminatingNodeOnNotification(node) - if err := c.kubeClient.Delete(ctx, node); err != nil { +func (r *Reconciler) deleteInstance(ctx 
context.Context, node *v1.Node) error {
+	r.recorder.TerminatingNodeOnNotification(node)
+	if err := r.kubeClient.Delete(ctx, node); err != nil {
 		return fmt.Errorf("deleting the node on notification, %w", err)
 	}
 	return nil
 }
 
-func (c *Controller) notifyForEvent(evt event.Interface, n *v1.Node) {
+func (r *Reconciler) notifyForEvent(evt event.Interface, n *v1.Node) {
 	switch evt.Kind() {
 	case event.RebalanceRecommendationKind:
-		c.recorder.EC2SpotRebalanceRecommendation(n)
+		r.recorder.EC2SpotRebalanceRecommendation(n)
 	case event.ScheduledChangeKind:
-		c.recorder.EC2HealthWarning(n)
+		r.recorder.EC2HealthWarning(n)
 	case event.SpotInterruptionKind:
-		c.recorder.EC2SpotInterruptionWarning(n)
+		r.recorder.EC2SpotInterruptionWarning(n)
 	case event.StateChangeKind:
 		typed := evt.(statechangev0.EC2InstanceStateChangeNotification)
 		if lo.Contains([]string{"stopping", "stopped"}, typed.State()) {
-			c.recorder.EC2StateStopping(n)
+			r.recorder.EC2StateStopping(n)
 		} else {
-			c.recorder.EC2StateTerminating(n)
+			r.recorder.EC2StateTerminating(n)
 		}
 	default:
 	}
 }
 
+// makeInstanceIDMap builds a map between the instance id that is stored in the
+// node .spec.providerID and the node name stored on the host
+func (r *Reconciler) makeInstanceIDMap() map[string]*v1.Node {
+	m := map[string]*v1.Node{}
+	r.cluster.ForEachNode(func(n *state.Node) bool {
+		// If this node isn't owned by a provisioner, we shouldn't handle it
+		if _, ok := n.Node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok {
+			return true
+		}
+		id, err := utils.ParseProviderID(n.Node)
+		if err != nil || id == nil {
+			return true
+		}
+		m[ptr.StringValue(id)] = n.Node
+		return true
+	})
+	return m
+}
+
 func actionForEvent(evt event.Interface) Action {
 	switch evt.Kind() {
 	case event.RebalanceRecommendationKind:
@@ -289,22 +266,3 @@ func getInvolvedNodes(instanceIDs []string, instanceIDMap map[string]*v1.Node) [
 	}
 	return nodes
 }
-
-// makeInstanceIDMap builds a map between the instance id that is stored in the
-// node .spec.providerID and the node name stored on the host
-func (c *Controller) makeInstanceIDMap() map[string]*v1.Node {
-	m := map[string]*v1.Node{}
-	c.cluster.ForEachNode(func(n *state.Node) bool {
-		// If this node isn't owned by a provisioner, we shouldn't handle it
-		if _, ok := n.Node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok {
-			return true
-		}
-		id, err := utils.ParseProviderID(n.Node)
-		if err != nil || id == nil {
-			return true
-		}
-		m[ptr.StringValue(id)] = n.Node
-		return true
-	})
-	return m
-}
diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go
index c8ac09ba9839..002e16b0a4ea 100644
--- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go
+++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go
@@ -31,11 +31,12 @@ import (
 	"github.com/samber/lo"
 	v1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/apimachinery/pkg/util/uuid"
 	clock "k8s.io/utils/clock/testing"
 	. 
"knative.dev/pkg/logging/testing" _ "knative.dev/pkg/system/testing" - - "k8s.io/apimachinery/pkg/util/uuid" + "sigs.k8s.io/controller-runtime/pkg/reconcile" "sigs.k8s.io/controller-runtime/pkg/client" @@ -43,6 +44,7 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + controllersfake "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/fake" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" @@ -51,6 +53,7 @@ import ( statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" "github.com/aws/karpenter/pkg/cloudprovider/fake" + "github.com/aws/karpenter/pkg/controllers/polling" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/test" . "github.com/aws/karpenter/pkg/test/expectations" @@ -79,11 +82,9 @@ var eventBridgeProvider *aws.EventBridgeProvider var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock var cfg *test.Config -var controller *notification.Controller -var infraController *infrastructure.Controller +var controller polling.ControllerInterface +var infraController polling.ControllerInterface var nodeStateController *state.NodeController -var infraStartChan chan struct{} -var notificationStartChan chan struct{} func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -98,26 +99,23 @@ var _ = BeforeEach(func() { ctx = injection.WithOptions(ctx, opts) env = test.NewEnvironment(ctx, func(e *test.Environment) { cfg = test.NewConfig() - fakeClock = clock.NewFakeClock(time.Now()) cloudProvider = &fake.CloudProvider{} cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) nodeStateController = state.NewNodeController(env.Client, cluster) recorder = awsfake.NewEventRecorder() - metadataProvider := aws.NewMetadataProvider(&awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) + metadataProvider := aws.NewMetadataProvider(mock.Session, &awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) sqsapi = &awsfake.SQSAPI{} sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadataProvider) eventbridgeapi = &awsfake.EventBridgeAPI{} eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadataProvider, sqsProvider.QueueName()) - infraStartChan = make(chan struct{}) - notificationStartChan = make(chan struct{}) - ec2api = &awsfake.EC2API{} subnetProvider := aws.NewSubnetProvider(ec2api) instanceTypeProvider = aws.NewInstanceTypeProvider(env.Ctx, mock.Session, cloudprovider.Options{}, ec2api, subnetProvider) - infraController = infrastructure.NewController(env.Ctx, env.Client, fakeClock, recorder, sqsProvider, eventBridgeProvider, infraStartChan) - controller = notification.NewController(env.Ctx, env.Client, fakeClock, recorder, cluster, sqsProvider, instanceTypeProvider, infraController, notificationStartChan) + + infraController = polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))) + controller = polling.NewController(notification.NewReconciler(env.Client, recorder, cluster, sqsProvider, instanceTypeProvider, infraController)) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -140,8 +138,9 @@ var _ = Describe("Processing Messages", func() { 
ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) @@ -157,8 +156,9 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(scheduledChangeMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) @@ -182,7 +182,9 @@ var _ = Describe("Processing Messages", func() { // Wait for the nodes to reconcile with the cluster state ExpectReconcileSucceeded(env.Ctx, nodeStateController, lo.Map(nodes, func(n *v1.Node, _ int) client.ObjectKey { return client.ObjectKeyFromObject(n) })...) - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting + + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(4)) }) @@ -211,7 +213,9 @@ var _ = Describe("Processing Messages", func() { // Wait for the nodes to reconcile with the cluster state ExpectReconcileSucceeded(env.Ctx, nodeStateController, lo.Map(nodes, func(n *v1.Node, _ int) client.ObjectKey { return client.ObjectKeyFromObject(n) })...) - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting + + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) 
Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(100)) }) @@ -222,8 +226,9 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(spotInterruptionMessage(node.Spec.ProviderID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNodeExists(env.Ctx, env.Client, node.Name) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) @@ -237,7 +242,9 @@ var _ = Describe("Processing Messages", func() { } ExpectMessagesCreated(badMessage) - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting + + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) It("should delete a state change message when the state isn't in accepted states", func() { @@ -252,8 +259,9 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(stateChangeMessage(defaultInstanceID, "creating")) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNodeExists(env.Ctx, env.Client, node.Name) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) @@ -272,8 +280,9 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting - Expect(controller.Reconcile(env.Ctx)).To(Succeed()) + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) @@ -295,50 +304,37 @@ var _ = Describe("Processing Messages", func() { var _ = Describe("Error Handling", func() { It("should send an error on polling when AccessDenied", func() { - ExpectClosed(infraStartChan) sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedCode), awsfake.MaxCalls(0)) - Expect(controller.Reconcile(env.Ctx)).ToNot(Succeed()) + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting + + _, err := controller.Reconcile(env.Ctx, reconcile.Request{}) + Expect(err).ToNot(Succeed()) }) It("should trigger an infrastructure reconciliation on an SQS queue when it doesn't exist", func() { sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing - ExpectClosed(infraStartChan) - - // Infrastructure reconciliation loop has completed - Eventually(func(g Gomega) { - g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) - 
g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) - g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - g.Expect(IsClosed(infraController.Ready())).To(BeTrue()) - }).Should(Succeed()) - - sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist)) // This mocks the queue being deleted manually after infra reconciliation - - // This should fail with an error since the queue doesn't exist - Expect(controller.Reconcile(env.Ctx)).ToNot(Succeed()) - - Eventually(func(g Gomega) { - g.Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(2)) - g.Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(8)) - g.Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(8)) - g.Expect(IsClosed(infraController.Ready())).To(BeTrue()) - }).Should(Succeed()) + + infraController := &controllersfake.TriggerController{} + controller = polling.NewController(notification.NewReconciler(env.Client, recorder, cluster, sqsProvider, instanceTypeProvider, infraController)) + + _, err := controller.Reconcile(env.Ctx, reconcile.Request{}) + Expect(err).ToNot(Succeed()) + Expect(infraController.TriggerCalls.Load()).Should(BeNumerically("==", 1)) }) }) var _ = Describe("Infrastructure Coordination", func() { It("should wait for the infrastructure to be ready before polling SQS", func() { - ExpectClosed(notificationStartChan) - Expect(IsClosed(infraController.Ready())).To(BeFalse()) - Consistently(func(g Gomega) { - g.Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(Equal(0)) - g.Expect(sqsapi.ReceiveMessageBehavior.FailedCalls()).To(Equal(0)) - }, time.Second*10).Should(Succeed()) - - ExpectClosed(infraStartChan) - - Eventually(func(g Gomega) { - g.Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(BeNumerically(">", 0)) - }, time.Second*10).Should(Succeed()) + // Prior to provisioning the infrastructure and the infrastructure being healthy, we shouldn't try to hit the queue + res, err := controller.Reconcile(env.Ctx, reconcile.Request{}) + Expect(err).To(Succeed()) + Expect(res.Requeue).To(BeTrue()) + Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(BeNumerically("==", 0)) + + ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) + ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) + Expect(infraController.Healthy()).To(BeTrue()) + Expect(controller.Healthy()).To(BeTrue()) + Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(BeNumerically("==", 1)) }) }) diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index e99f60cb660d..27facffae10a 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -21,18 +21,28 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/nodetemplate" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/controllers" + "github.com/aws/karpenter/pkg/controllers/polling" ) -func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) { +func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) (ret []controllers.Controller) { rec := events.NewRecorder(opts.Recorder) - ctx = 
logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) // Only enable spot interruption handling controllers when the feature flag is enabled if opts.Config.EnableInterruptionHandling() { - infraController := infrastructure.NewController(ctx, opts.KubeClient, opts.Clock, rec, provider.SQSProvider(), provider.EventBridgeProvider(), opts.StartAsync) - notification.NewController(ctx, opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController, opts.StartAsync) + logging.FromContext(ctx).Infof("Enabling interruption handling") + + infraProvider := infrastructure.NewProvider(provider.SQSProvider(), provider.EventBridgeProvider()) + infraController := polling.NewController(infrastructure.NewReconciler(infraProvider)) + notificationController := polling.NewController(notification.NewReconciler(opts.KubeClient, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController)) + nodeTemplateController := nodetemplate.NewController(opts.KubeClient, infraProvider, infraController, notificationController) + + infraController.OnHealthy = notificationController.Start + infraController.OnUnhealthy = notificationController.Stop + ret = append(ret, infraController, notificationController, nodeTemplateController) } + return ret } diff --git a/pkg/cloudprovider/aws/events/recorder.go b/pkg/cloudprovider/aws/events/recorder.go index cb2a182779f3..2dcafd03d248 100644 --- a/pkg/cloudprovider/aws/events/recorder.go +++ b/pkg/cloudprovider/aws/events/recorder.go @@ -15,18 +15,10 @@ limitations under the License. package events import ( - "context" - - "github.com/avast/retry-go" v1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/types" "k8s.io/client-go/tools/record" - "knative.dev/pkg/logging" - "knative.dev/pkg/system" - "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/events" - "github.com/aws/karpenter/pkg/utils/injection" ) type recorder struct { @@ -49,14 +41,6 @@ type Recorder interface { EC2StateStopping(*v1.Node) // TerminatingNodeOnNotification is called when a notification that is sent to the notification controller triggers node deletion TerminatingNodeOnNotification(*v1.Node) - // InfrastructureUnhealthy event is called when infrastructure reconciliation errors and the controller enters an unhealthy state - InfrastructureUnhealthy(context.Context, client.Client) - // InfrastructureHealthy event is called when infrastructure reconciliation succeeds and the controller enters a healthy state - InfrastructureHealthy(context.Context, client.Client) - // InfrastructureDeletionSucceeded event is called when infrastructure deletion fails - InfrastructureDeletionSucceeded(context.Context, client.Client) - // InfrastructureDeletionFailed event is called when infrastructure deletion succeeds - InfrastructureDeletionFailed(context.Context, client.Client) } func NewRecorder(r events.Recorder) Recorder { @@ -93,51 +77,3 @@ func (r recorder) EC2StateStopping(node *v1.Node) { func (r recorder) TerminatingNodeOnNotification(node *v1.Node) { r.rec.Eventf(node, "Normal", "AWSNotificationTerminateNode", "Node %s event: Notification triggered termination for the node", node.Name) } - -func (r recorder) InfrastructureHealthy(ctx context.Context, kubeClient client.Client) { - pod := &v1.Pod{} - err := retry.Do(func() error { - return kubeClient.Get(ctx, types.NamespacedName{Namespace: system.Namespace(), Name: injection.GetOptions(ctx).PodName}, pod) - }) - if err != nil { - 
logging.FromContext(ctx).Errorf("Sending InfrastructureHealthy event, %v", err) - return - } - r.rec.Eventf(pod, "Normal", "AWSInfrastructureHealthy", "Karpenter infrastructure reconciliation is healthy") -} - -func (r recorder) InfrastructureUnhealthy(ctx context.Context, kubeClient client.Client) { - pod := &v1.Pod{} - err := retry.Do(func() error { - return kubeClient.Get(ctx, types.NamespacedName{Namespace: system.Namespace(), Name: injection.GetOptions(ctx).PodName}, pod) - }) - if err != nil { - logging.FromContext(ctx).Errorf("Sending InfrastructureUnhealthy event, %v", err) - return - } - r.rec.Eventf(pod, "Warning", "AWSInfrastructureUnhealthy", "Karpenter infrastructure reconciliation is unhealthy") -} - -func (r recorder) InfrastructureDeletionSucceeded(ctx context.Context, kubeClient client.Client) { - pod := &v1.Pod{} - err := retry.Do(func() error { - return kubeClient.Get(ctx, types.NamespacedName{Namespace: system.Namespace(), Name: injection.GetOptions(ctx).PodName}, pod) - }) - if err != nil { - logging.FromContext(ctx).Errorf("Sending InfrastructureDeletionSucceeded event, %v", err) - return - } - r.rec.Eventf(pod, "Normal", "AWSInfrastructureDeletionSucceeded", "Karpenter infrastructure deletion succeeded") -} - -func (r recorder) InfrastructureDeletionFailed(ctx context.Context, kubeClient client.Client) { - pod := &v1.Pod{} - err := retry.Do(func() error { - return kubeClient.Get(ctx, types.NamespacedName{Namespace: system.Namespace(), Name: injection.GetOptions(ctx).PodName}, pod) - }) - if err != nil { - logging.FromContext(ctx).Errorf("Sending InfrastructureDeletionFailed event, %v", err) - return - } - r.rec.Eventf(pod, "Warning", "AWSInfrastructureDeletionFailed", "Karpenter infrastructure deletion failed") -} diff --git a/pkg/cloudprovider/aws/fake/eventrecorder.go b/pkg/cloudprovider/aws/fake/eventrecorder.go index a80e4a604720..a8c28fbf12f8 100644 --- a/pkg/cloudprovider/aws/fake/eventrecorder.go +++ b/pkg/cloudprovider/aws/fake/eventrecorder.go @@ -15,11 +15,8 @@ limitations under the License. 
package fake import ( - "context" - v1 "k8s.io/api/core/v1" "k8s.io/client-go/tools/record" - "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/test" ) @@ -43,14 +40,6 @@ func (e *EventRecorder) EC2StateStopping(_ *v1.Node) {} func (e *EventRecorder) TerminatingNodeOnNotification(_ *v1.Node) {} -func (e *EventRecorder) InfrastructureUnhealthy(_ context.Context, _ client.Client) {} - -func (e *EventRecorder) InfrastructureHealthy(_ context.Context, _ client.Client) {} - -func (e *EventRecorder) InfrastructureDeletionSucceeded(_ context.Context, _ client.Client) {} - -func (e *EventRecorder) InfrastructureDeletionFailed(_ context.Context, _ client.Client) {} - func NewEventRecorder() *EventRecorder { return &EventRecorder{ Recorder: *test.NewRecorder(), diff --git a/pkg/cloudprovider/aws/metadata.go b/pkg/cloudprovider/aws/metadata.go index 8fc534aba144..cbeb17fb7437 100644 --- a/pkg/cloudprovider/aws/metadata.go +++ b/pkg/cloudprovider/aws/metadata.go @@ -17,15 +17,15 @@ package aws import ( "context" "fmt" - "sync" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/ec2metadata" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/sts" "github.com/aws/aws-sdk-go/service/sts/stsiface" + "knative.dev/pkg/logging" - "github.com/aws/karpenter/pkg/utils/cache" + "github.com/aws/karpenter/pkg/utils/atomic" ) type EC2MetadataInterface interface { @@ -51,55 +51,63 @@ func (e *EC2MetadataClient) PartitionID() string { type MetadataProvider struct { ec2MetadataClient EC2MetadataInterface stsClient stsiface.STSAPI + sess *session.Session - region *string // cached region if already resolved - regionMu sync.RWMutex - - accountID *string // cached accountID if already resolved - accountIDMu sync.RWMutex + region atomic.CachedVal[string] // cached region if already resolved + accountID atomic.CachedVal[string] // cached accountID if already resolved } -func NewMetadataProvider(ec2metadataapi EC2MetadataInterface, stsapi stsiface.STSAPI) *MetadataProvider { - return &MetadataProvider{ +func NewMetadataProvider(sess *session.Session, ec2metadataapi EC2MetadataInterface, stsapi stsiface.STSAPI) *MetadataProvider { + m := &MetadataProvider{ ec2MetadataClient: ec2metadataapi, stsClient: stsapi, + sess: sess, + } + m.region.Resolve = func(ctx context.Context) (string, error) { + if m.sess != nil && m.sess.Config != nil && m.sess.Config.Region != nil && *m.sess.Config.Region != "" { + return *m.sess.Config.Region, nil + } + logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") + return m.ec2MetadataClient.RegionWithContext(ctx) + } + m.accountID.Resolve = func(ctx context.Context) (string, error) { + doc, err := m.ec2MetadataClient.GetInstanceIdentityDocumentWithContext(ctx) + if err != nil { + // Resolve to using the STS provider if IMDS fails + result, err := m.stsClient.GetCallerIdentity(&sts.GetCallerIdentityInput{}) + if err != nil { + return "", err + } + return aws.StringValue(result.Account), nil + } + return doc.AccountID, nil } + return m +} + +// EnsureSessionRegion resolves the region set in the session config if not already set +func (m *MetadataProvider) EnsureSessionRegion(ctx context.Context, sess *session.Session) { + *sess.Config.Region = m.Region(ctx) } // Region gets the current region from EC2 IMDS -func (i *MetadataProvider) Region(ctx context.Context) string { - ret, err := cache.TryGetStringWithFallback(&i.regionMu, i.region, - func() (string, error) { - return 
i.ec2MetadataClient.RegionWithContext(ctx) - }) +func (m *MetadataProvider) Region(ctx context.Context) string { + str, err := m.region.TryGet(ctx) if err != nil { - panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) + panic(fmt.Sprintf("Resolving region in the metadata provider, %v", err)) } - return ret + return str } // AccountID gets the AWS Account ID from EC2 IMDS, then STS if it can't be resolved at IMDS -func (i *MetadataProvider) AccountID(ctx context.Context) string { - ret, err := cache.TryGetStringWithFallback(&i.accountIDMu, i.accountID, - func() (string, error) { - doc, err := i.ec2MetadataClient.GetInstanceIdentityDocumentWithContext(ctx) - if err != nil { - // Fallback to using the STS provider if IMDS fails - result, err := i.stsClient.GetCallerIdentity(&sts.GetCallerIdentityInput{}) - if err != nil { - return "", err - } - return aws.StringValue(result.Account), nil - } - return doc.AccountID, nil - }, - ) +func (m *MetadataProvider) AccountID(ctx context.Context) string { + str, err := m.accountID.TryGet(ctx) if err != nil { - panic(fmt.Sprintf("Failed to get account ID from IMDS or STS, %s", err)) + panic(fmt.Sprintf("Resolving account ID in the metadata provider, %v", err)) } - return ret + return str } -func (i *MetadataProvider) Partition() string { - return i.ec2MetadataClient.PartitionID() +func (m *MetadataProvider) Partition() string { + return m.ec2MetadataClient.PartitionID() } diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index c4a8b2c4dd37..b5392d1a7c4b 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -26,7 +26,7 @@ import ( "github.com/samber/lo" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" - "github.com/aws/karpenter/pkg/utils/cache" + "github.com/aws/karpenter/pkg/utils/atomic" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" ) @@ -55,7 +55,7 @@ type SQSProvider struct { getQueueURLInput *sqs.GetQueueUrlInput receiveMessageInput *sqs.ReceiveMessageInput mu sync.RWMutex - queueURL *string + queueURL atomic.CachedVal[string] queueName string metadataProvider *MetadataProvider } @@ -88,6 +88,13 @@ func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadataProvide aws.String(sqs.QueueAttributeNameAll), }, } + provider.queueURL.Resolve = func(ctx context.Context) (string, error) { + ret, err := provider.client.GetQueueUrlWithContext(ctx, provider.getQueueURLInput) + if err != nil { + return "", fmt.Errorf("fetching queue url, %w", err) + } + return aws.StringValue(ret.QueueUrl), nil + } return provider } @@ -102,7 +109,7 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { } s.mu.Lock() defer s.mu.Unlock() - s.queueURL = result.QueueUrl + s.queueURL.Set(aws.StringValue(result.QueueUrl)) return nil } @@ -124,17 +131,8 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { } func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (string, error) { - opts := lo.Ternary(ignoreCache, cache.IgnoreCacheOption, nil) - return cache.TryGetStringWithFallback(&s.mu, s.queueURL, - func() (string, error) { - ret, err := s.client.GetQueueUrlWithContext(ctx, s.getQueueURLInput) - if err != nil { - return "", fmt.Errorf("fetching queue url, %w", err) - } - return aws.StringValue(ret.QueueUrl), nil - }, - opts, - ) + opts := lo.Ternary(ignoreCache, atomic.IgnoreCacheOption, nil) + return s.queueURL.TryGet(ctx, opts) } func (s *SQSProvider) 
GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 6354e17d123f..adf7e2d6c9a1 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -19,10 +19,7 @@ import ( "fmt" "net/http" "net/http/pprof" - "os" - "os/signal" "runtime/debug" - "syscall" "github.com/go-logr/logr" "github.com/go-logr/zapr" @@ -43,7 +40,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/aws/karpenter/pkg/apis" "github.com/aws/karpenter/pkg/cloudprovider" @@ -77,29 +73,7 @@ func init() { metrics.MustRegister() // Registers cross-controller metrics } -type ControllerInitFunc func(context.Context, *ControllerOptions) - -// Controller is an interface implemented by Karpenter custom resources. -type Controller interface { - // Reconcile hands a hydrated kubernetes resource to the controller for - // reconciliation. Any changes made to the resource's status are persisted - // after Reconcile returns, even if it returns an error. - Reconcile(context.Context, reconcile.Request) (reconcile.Result, error) - // Register will register the controller with the manager - Register(context.Context, manager.Manager) error -} - -type ControllerOptions struct { - Config config.Config - Cluster *state.Cluster - KubeClient client.Client - Recorder events.Recorder - Clock clock.Clock - - StartAsync <-chan struct{} -} - -func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, ControllerInitFunc)) { +func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) (cloudprovider.CloudProvider, ControllerGetterFunc)) { opts := options.New().MustParse() // Setup Client controllerRuntimeConfig := controllerruntime.GetConfigOrDie() @@ -134,7 +108,7 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) if opts.EnableProfiling { utilruntime.Must(registerPprof(manager)) } - cloudProvider, injectControllers := injectCloudProvider(ctx, cloudprovider.Options{ + cloudProvider, controllerGetter := injectCloudProvider(ctx, cloudprovider.Options{ ClientSet: clientSet, KubeClient: manager.GetClient(), StartAsync: manager.Elected(), @@ -173,12 +147,8 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) StartAsync: manager.Elected(), Clock: realClock, } - injectControllers(ctx, controllerOptions) - - metricsstate.StartMetricScraper(ctx, cluster) - - if err := RegisterControllers(ctx, - manager, + cloudProviderControllers := controllerGetter(ctx, controllerOptions) + controllers := []Controller{ provisioning.NewController(manager.GetClient(), provisioner, recorder), state.NewNodeController(manager.GetClient(), cluster), state.NewPodController(manager.GetClient(), cluster), @@ -188,6 +158,13 @@ func Initialize(injectCloudProvider func(context.Context, cloudprovider.Options) metricspod.NewController(manager.GetClient()), metricsprovisioner.NewController(manager.GetClient()), counter.NewController(manager.GetClient(), cluster), + } + controllers = append(controllers, cloudProviderControllers...) 
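For context, a minimal sketch (not part of this diff) of the ControllerGetterFunc contract being consumed in the hunk above: the cloud provider returns whichever controllers it wants registered, and Initialize appends them to the core list before starting the manager. The package name and the no-op controller below are illustrative assumptions, not code from this patch.

package example

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	"github.com/aws/karpenter/pkg/controllers"
)

// noopController stands in for a real cloud-provider controller.
type noopController struct{}

func (noopController) Reconcile(context.Context, reconcile.Request) (reconcile.Result, error) {
	return reconcile.Result{}, nil
}

func (noopController) Register(context.Context, manager.Manager) error { return nil }

// GetControllers satisfies controllers.ControllerGetterFunc: it decides, based on the
// passed options, which controllers the core Initialize call should append and register.
func GetControllers(_ context.Context, opts *controllers.ControllerOptions) []controllers.Controller {
	var ret []controllers.Controller
	if opts.Config.EnableInterruptionHandling() {
		ret = append(ret, noopController{})
	}
	return ret
}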
+ + metricsstate.StartMetricScraper(ctx, cluster) + if err := RegisterControllers(ctx, + manager, + controllers..., ).Start(ctx); err != nil { panic(fmt.Sprintf("Unable to start manager, %s", err)) } @@ -252,32 +229,6 @@ func registerPprof(manager manager.Manager) error { return nil } -type ignoreDebugEventsSink struct { - name string - sink logr.LogSink -} - -func (i ignoreDebugEventsSink) Init(ri logr.RuntimeInfo) { - i.sink.Init(ri) -} -func (i ignoreDebugEventsSink) Enabled(level int) bool { return i.sink.Enabled(level) } -func (i ignoreDebugEventsSink) Info(level int, msg string, keysAndValues ...interface{}) { - // ignore debug "events" logs - if level == 1 && i.name == "events" { - return - } - i.sink.Info(level, msg, keysAndValues...) -} -func (i ignoreDebugEventsSink) Error(err error, msg string, keysAndValues ...interface{}) { - i.sink.Error(err, msg, keysAndValues...) -} -func (i ignoreDebugEventsSink) WithValues(keysAndValues ...interface{}) logr.LogSink { - return i.sink.WithValues(keysAndValues...) -} -func (i ignoreDebugEventsSink) WithName(name string) logr.LogSink { - return &ignoreDebugEventsSink{name: name, sink: i.sink.WithName(name)} -} - // ignoreDebugEvents wraps the logger with one that ignores any debug logs coming from a logger named "events". This // prevents every event we write from creating a debug log which spams the log file during scale-ups due to recording // pod scheduling decisions as events for visibility. @@ -285,12 +236,6 @@ func ignoreDebugEvents(logger logr.Logger) logr.Logger { return logr.New(&ignoreDebugEventsSink{sink: logger.GetSink()}) } -func Cleanup() <-chan os.Signal { - c := make(chan os.Signal, 1) - signal.Notify(c, syscall.SIGINT) - return c -} - func newRunnableContext(config *rest.Config, options *options.Options, logger *zap.SugaredLogger) func() context.Context { return func() context.Context { ctx := context.Background() diff --git a/pkg/controllers/polling/controller.go b/pkg/controllers/polling/controller.go new file mode 100644 index 000000000000..11933a7a145e --- /dev/null +++ b/pkg/controllers/polling/controller.go @@ -0,0 +1,211 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package polling + +import ( + "context" + "sync" + + "github.com/google/uuid" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "knative.dev/pkg/logging" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/manager" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" + + "github.com/aws/karpenter/pkg/controllers" +) + +type ControllerInterface interface { + controllers.Controller + + Start(context.Context) + Stop(context.Context) + Trigger() + Active() bool + Healthy() bool +} + +// Controller is a wrapper around a controller interface that adds a trigger mechanism for enqueuing +// reconcile requests for the TriggerObject. On a new trigger, Controller will throw away old trigger calls +// by comparing the current triggerGeneration to the previous triggerGeneration. +// Controller also has an active flag that can be enabled or disabled. This serves as a mechanism to stop +// a requeue of a trigger request from the wrapped Reconcile() method of the Controller +type Controller struct { + OnHealthy func(context.Context) + OnUnhealthy func(context.Context) + + r Reconciler + uuid types.UID + + active bool + healthy bool + + triggerGeneration int64 + trigger chan event.GenericEvent + + triggerMu sync.RWMutex + activeMu sync.RWMutex + healthyMu sync.RWMutex + + cancels sync.Map +} + +type Reconciler interface { + reconcile.Reconciler + + Name() string + MetricsSubsystemName() string +} + +type Object struct { + metav1.ObjectMeta + runtime.Object +} + +func NewController(rec Reconciler) *Controller { + return &Controller{ + r: rec, + uuid: types.UID(uuid.New().String()), + trigger: make(chan event.GenericEvent, 100), + } +} + +// Start is an idempotent call to kick-off a single reconciliation loop. Based on the intended use of this controller, +// the Reconciler is responsible for requeuing this message back in the WorkQueue so there is a time-based reconciliation +// performed. The Trigger operation is performed to kick-off the loop. +func (t *Controller) Start(ctx context.Context) { + logging.FromContext(ctx).Infof("Starting the %s controller...", t.r.Name()) + t.activeMu.Lock() + if !t.active { + t.active = true + t.activeMu.Unlock() + t.Trigger() + } else { + t.activeMu.Unlock() + } +} + +// Trigger triggers an immediate reconciliation by inserting a message into the event channel. 
We increase the trigger
+// generation here to ensure that any messages that were previously re-queued are thrown away
+func (t *Controller) Trigger() {
+	t.triggerMu.Lock()
+	defer t.triggerMu.Unlock()
+
+	t.triggerGeneration++
+	t.triggeredCountMetric().Inc()
+	obj := &Object{ObjectMeta: metav1.ObjectMeta{Generation: t.triggerGeneration, UID: t.uuid}}
+	t.trigger <- event.GenericEvent{Object: obj}
+}
+
+// Stop sets the state of the controller to inactive and cancels the current reconciliation contexts, if there are any
+func (t *Controller) Stop(ctx context.Context) {
+	logging.FromContext(ctx).Infof("Stopping the %s controller...", t.r.Name())
+	t.SetActive(false)
+	t.cancels.Range(func(_ any, c any) bool {
+		cancel := c.(context.CancelFunc)
+		cancel()
+		return true
+	})
+}
+
+// Active gets whether the controller is active right now. This value is passed down to the wrapped
+// Reconcile method so that the Reconciler can handle cleanup scenarios. The underlying Reconciler is responsible
+// for returning a result with no RequeueAfter to stop its activity
+func (t *Controller) Active() bool {
+	t.activeMu.RLock()
+	defer t.activeMu.RUnlock()
+	return t.active
+}
+
+// SetActive sets the active flag on the controller
+func (t *Controller) SetActive(active bool) {
+	t.activeMu.Lock()
+	defer t.activeMu.Unlock()
+
+	t.active = active
+	if active {
+		t.activeMetric().Set(1)
+	} else {
+		t.activeMetric().Set(0)
+	}
+}
+
+func (t *Controller) Healthy() bool {
+	t.healthyMu.RLock()
+	defer t.healthyMu.RUnlock()
+	return t.healthy
+}
+
+func (t *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
+	ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(t.r.Name()))
+	ctx, cancel := context.WithCancel(ctx)
+
+	// Store the cancel function for the duration of the reconcile so we can cancel on a Stop() call
+	cancelID := uuid.New()
+	t.cancels.Store(cancelID, cancel)
+	defer t.cancels.Delete(cancelID)
+
+	res, err := t.r.Reconcile(ctx, req)
+
+	t.healthyMu.Lock()
+	t.healthy = err == nil // The controller is considered healthy when it successfully reconciles
+	if t.healthy {
+		if t.OnHealthy != nil {
+			t.OnHealthy(ctx)
+		}
+		t.healthyMetric().Set(1)
+	} else {
+		if t.OnUnhealthy != nil {
+			t.OnUnhealthy(ctx)
+		}
+		t.healthyMetric().Set(0)
+	}
+	t.healthyMu.Unlock()
+
+	t.activeMu.Lock()
+	defer t.activeMu.Unlock()
+	if !t.active {
+		return reconcile.Result{}, nil // Swallow any errors/calls at this point
+	}
+	return res, err
+}
+
+func (t *Controller) Register(_ context.Context, m manager.Manager) error {
+	crmetrics.Registry.MustRegister(t.healthyMetric(), t.activeMetric(), t.triggeredCountMetric())
+	return controllerruntime.
+		NewControllerManagedBy(m).
+		Named(t.r.Name()).
+		WithEventFilter(predicate.NewPredicateFuncs(func(obj client.Object) bool {
+			t.triggerMu.RLock()
+			defer t.triggerMu.RUnlock()
+
+			// UUID comparison is a hacky way to get around the fact that controller-runtime requires
+			// us to perform a watch on some K8s object
+			return obj.GetUID() == t.uuid && obj.GetGeneration() == t.triggerGeneration
+		})).
+		Watches(&source.Channel{Source: t.trigger}, &handler.EnqueueRequestForObject{}).
+		For(&v1.Pod{}). 
// controller-runtime requires us to perform a watch on some object, so let's do it on a fundamental component + Complete(t) +} diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/metrics.go b/pkg/controllers/polling/metrics.go similarity index 50% rename from pkg/cloudprovider/aws/controllers/infrastructure/metrics.go rename to pkg/controllers/polling/metrics.go index 2ce1e2a22b4e..0e9a633bf8de 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/metrics.go +++ b/pkg/controllers/polling/metrics.go @@ -12,39 +12,43 @@ See the License for the specific language governing permissions and limitations under the License. */ -package infrastructure +package polling import ( "github.com/prometheus/client_golang/prometheus" - crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "github.com/aws/karpenter/pkg/metrics" ) -const ( - subSystem = "aws_infrastructure_controller" -) - -var ( - reconcileDuration = prometheus.NewHistogram( - prometheus.HistogramOpts{ +func (t *Controller) healthyMetric() prometheus.Gauge { + return prometheus.NewGauge( + prometheus.GaugeOpts{ Namespace: metrics.Namespace, - Subsystem: subSystem, - Name: "reconcile_duration_seconds", - Help: "Duration of scheduling process in seconds. Broken down by provisioner and error.", - Buckets: metrics.DurationBuckets(), + Subsystem: t.r.MetricsSubsystemName(), + Name: "healthy", + Help: "Whether the controller is in a healthy state.", }, ) - healthy = prometheus.NewGauge( +} + +func (t *Controller) activeMetric() prometheus.Gauge { + return prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: metrics.Namespace, - Subsystem: subSystem, - Name: "healthy", - Help: "Whether the infrastructure that should be up for this controller is in a healthy state.", + Subsystem: t.r.MetricsSubsystemName(), + Name: "active", + Help: "Whether the controller is active.", }, ) -) +} -func init() { - crmetrics.Registry.MustRegister(reconcileDuration, healthy) +func (t *Controller) triggeredCountMetric() prometheus.Counter { + return prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: t.r.MetricsSubsystemName(), + Name: "trigger_count", + Help: "A counter of the number of times this controller has been triggered.", + }, + ) } diff --git a/pkg/controllers/types.go b/pkg/controllers/types.go new file mode 100644 index 000000000000..5e9e46d56308 --- /dev/null +++ b/pkg/controllers/types.go @@ -0,0 +1,77 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package controllers + +import ( + "context" + + "github.com/go-logr/logr" + "k8s.io/utils/clock" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/aws/karpenter/pkg/config" + "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/events" +) + +type ControllerGetterFunc func(context.Context, *ControllerOptions) []Controller + +type ControllerOptions struct { + Config config.Config + Cluster *state.Cluster + KubeClient client.Client + Recorder events.Recorder + Clock clock.Clock + + StartAsync <-chan struct{} +} + +// Controller is an interface implemented by Karpenter custom resources. +type Controller interface { + // Reconcile hands a hydrated kubernetes resource to the controller for + // reconciliation. Any changes made to the resource's status are persisted + // after Reconcile returns, even if it returns an error. + Reconcile(context.Context, reconcile.Request) (reconcile.Result, error) + // Register will register the controller with the manager + Register(context.Context, manager.Manager) error +} + +type ignoreDebugEventsSink struct { + name string + sink logr.LogSink +} + +func (i ignoreDebugEventsSink) Init(ri logr.RuntimeInfo) { + i.sink.Init(ri) +} +func (i ignoreDebugEventsSink) Enabled(level int) bool { return i.sink.Enabled(level) } +func (i ignoreDebugEventsSink) Info(level int, msg string, keysAndValues ...interface{}) { + // ignore debug "events" logs + if level == 1 && i.name == "events" { + return + } + i.sink.Info(level, msg, keysAndValues...) +} +func (i ignoreDebugEventsSink) Error(err error, msg string, keysAndValues ...interface{}) { + i.sink.Error(err, msg, keysAndValues...) +} +func (i ignoreDebugEventsSink) WithValues(keysAndValues ...interface{}) logr.LogSink { + return i.sink.WithValues(keysAndValues...) +} +func (i ignoreDebugEventsSink) WithName(name string) logr.LogSink { + return &ignoreDebugEventsSink{name: name, sink: i.sink.WithName(name)} +} diff --git a/pkg/test/deployment.go b/pkg/test/deployment.go index a64307bf5a36..6619acc6b87a 100644 --- a/pkg/test/deployment.go +++ b/pkg/test/deployment.go @@ -65,23 +65,3 @@ func Deployment(overrides ...DeploymentOptions) *appsv1.Deployment { } return dep } - -func KarpenterDeployment(overrides ...DeploymentOptions) *appsv1.Deployment { - options := DeploymentOptions{ - ObjectMeta: metav1.ObjectMeta{ - Name: "karpenter", - Namespace: "default", - }, - Labels: map[string]string{ - "app.kubernetes.io/name": "karpenter", - "app.kubernetes.io/instance": "karpenter", - }, - Replicas: 2, - } - for _, opts := range overrides { - if err := mergo.Merge(&options, opts, mergo.WithOverride); err != nil { - panic(fmt.Sprintf("Failed to merge deployment options: %s", err)) - } - } - return Deployment(options) -} diff --git a/pkg/utils/cache/cache.go b/pkg/utils/atomic/cached_val.go similarity index 55% rename from pkg/utils/cache/cache.go rename to pkg/utils/atomic/cached_val.go index 94a89b11aeab..aa16f69e2eaf 100644 --- a/pkg/utils/cache/cache.go +++ b/pkg/utils/atomic/cached_val.go @@ -12,11 +12,13 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package cache +package atomic import ( - "fmt" + "context" "sync" + + "github.com/aws/karpenter/pkg/utils/ptr" ) type Option func(Options) Options @@ -30,31 +32,44 @@ func IgnoreCacheOption(o Options) Options { return o } -// TryGetStringWithFallback attempts to get non-nil string value from field. If field is nil, the function -// will attempt to resolve the value by calling fallback, setting the value stored in field in-place if found. -func TryGetStringWithFallback(mu *sync.RWMutex, field *string, fallback func() (string, error), opts ...Option) (string, error) { +// CachedVal persistently stores a value in memory +type CachedVal[T any] struct { + value *T + mu sync.RWMutex + Resolve func(context.Context) (T, error) +} + +func (c *CachedVal[T]) Set(v T) { + c.mu.Lock() + defer c.mu.Unlock() + c.value = &v +} + +// TryGet attempts to get non-nil value from internal value. If field is nil, the function +// will attempt to resolve the value by calling fallback, setting the value stored in value in-place if found. +func (c *CachedVal[T]) TryGet(ctx context.Context, opts ...Option) (T, error) { o := resolveOptions(opts) - mu.RLock() - if field != nil && !o.ignoreCache { - ret := *field - mu.RUnlock() + c.mu.RLock() + if c.value != nil && !o.ignoreCache { + ret := *c.value + c.mu.RUnlock() return ret, nil } - mu.RUnlock() - mu.Lock() - defer mu.Unlock() + c.mu.RUnlock() + c.mu.Lock() + defer c.mu.Unlock() // We have to check if the field is set again here in case multiple threads make it past the read-locked section - if field != nil { - return *field, nil + if c.value != nil && !o.ignoreCache { + return *c.value, nil } - ret, err := fallback() - if err != nil { - return "", err + if c.Resolve == nil { + return *new(T), nil } - if ret == "" { - return "", fmt.Errorf("return value didn't resolve to non-nil value") + ret, err := c.Resolve(ctx) + if err != nil { + return *new(T), err } - *field = ret + c.value = ptr.Val(ret) // copies the value so we don't keep the reference return ret, nil } diff --git a/pkg/utils/options/options.go b/pkg/utils/options/options.go index a4762c34817e..ab1e43cec3f6 100644 --- a/pkg/utils/options/options.go +++ b/pkg/utils/options/options.go @@ -44,9 +44,6 @@ type Options struct { EnableProfiling bool EnableLeaderElection bool MemoryLimit int64 - // Metadata information - DeploymentName string - PodName string // AWS Specific ClusterName string ClusterEndpoint string @@ -99,10 +96,6 @@ func (o *Options) MustParse() *Options { if err := o.Validate(); err != nil { panic(err) } - - // Set the metadata fields in the options - o.DeploymentName = env.WithDefaultString("DEPLOYMENT_NAME", "karpenter") - o.PodName = env.WithDefaultString("POD_NAME", "") return o } diff --git a/pkg/utils/ptr/ptr.go b/pkg/utils/ptr/ptr.go index 206c0dc3c119..aaf457009b77 100644 --- a/pkg/utils/ptr/ptr.go +++ b/pkg/utils/ptr/ptr.go @@ -30,3 +30,7 @@ func Node(node v1.Node) *v1.Node { func Quantity(quantity resource.Quantity) *resource.Quantity { return &quantity } + +func Val[T any](v T) *T { + return &v +} From f0bad19e5ebd77fbf041b434c9fabbd1bb864be2 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Sat, 8 Oct 2022 00:00:15 -0700 Subject: [PATCH 38/55] Make health details on polling a decorator --- ...ggercontroller.go => pollingcontroller.go} | 16 ++-- .../controllers/infrastructure/reconciler.go | 13 +-- .../controllers/infrastructure/suite_test.go | 8 +- .../controllers/nodetemplate/controller.go | 2 + .../aws/controllers/notification/metrics.go | 14 +-- 
.../controllers/notification/reconciler.go | 16 ++-- .../controllers/notification/suite_test.go | 10 +- pkg/cloudprovider/aws/controllers/register.go | 3 +- pkg/controllers/polling/controller.go | 91 ++++++++---------- pkg/controllers/polling/decorators.go | 92 +++++++++++++++++++ pkg/controllers/polling/metrics.go | 54 ----------- pkg/controllers/types.go | 14 +++ 12 files changed, 187 insertions(+), 146 deletions(-) rename pkg/cloudprovider/aws/controllers/fake/{triggercontroller.go => pollingcontroller.go} (66%) create mode 100644 pkg/controllers/polling/decorators.go delete mode 100644 pkg/controllers/polling/metrics.go diff --git a/pkg/cloudprovider/aws/controllers/fake/triggercontroller.go b/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go similarity index 66% rename from pkg/cloudprovider/aws/controllers/fake/triggercontroller.go rename to pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go index c6ecacb62b27..1ba392393eaf 100644 --- a/pkg/cloudprovider/aws/controllers/fake/triggercontroller.go +++ b/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go @@ -22,24 +22,24 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" ) -type TriggerController struct { +type PollingController struct { TriggerCalls atomic.Int64 } -func (c *TriggerController) Start(context.Context) {} +func (c *PollingController) Start(context.Context) {} -func (c *TriggerController) Stop(context.Context) {} +func (c *PollingController) Stop(context.Context) {} -func (c *TriggerController) Trigger() { +func (c *PollingController) Trigger() { c.TriggerCalls.Add(1) } -func (c *TriggerController) Active() bool { return true } +func (c *PollingController) Active() bool { return true } -func (c *TriggerController) Healthy() bool { return true } +func (c *PollingController) Healthy() bool { return true } -func (c *TriggerController) Reconcile(context.Context, reconcile.Request) (reconcile.Result, error) { +func (c *PollingController) Reconcile(context.Context, reconcile.Request) (reconcile.Result, error) { return reconcile.Result{}, nil } -func (c *TriggerController) Register(context.Context, manager.Manager) error { return nil } +func (c *PollingController) Register(context.Context, manager.Manager) error { return nil } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go b/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go index e82e14560b89..07ac5a3d9967 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go @@ -19,6 +19,8 @@ import ( "time" "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/aws/karpenter/pkg/controllers" ) // Reconciler is the AWS infrastructure reconciler @@ -37,12 +39,11 @@ func NewReconciler(provider *Provider) *Reconciler { } } -func (r *Reconciler) Name() string { - return "aws.infrastructure" -} - -func (r *Reconciler) MetricsSubsystemName() string { - return "aws_infrastructure_controller" +func (r *Reconciler) Metadata() controllers.Metadata { + return controllers.Metadata{ + Name: "aws.infrastructure", + MetricsSubsystem: "aws_infrastructure_controller", + } } // Reconcile reconciles the SQS queue and the EventBridge rules with the expected diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index e56e8885b248..9948fbeea12d 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ 
b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -45,8 +45,7 @@ var sqsapi *awsfake.SQSAPI var sqsProvider *aws.SQSProvider var eventbridgeapi *awsfake.EventBridgeAPI var eventBridgeProvider *aws.EventBridgeProvider -var recorder *awsfake.EventRecorder -var controller *polling.Controller +var controller *polling.ControllerWithHealth var opts options.Options var defaultOpts = options.Options{ @@ -70,14 +69,13 @@ var _ = BeforeEach(func() { Expect(opts.Validate()).To(Succeed(), "Failed to validate options") e.Ctx = injection.WithOptions(e.Ctx, opts) - recorder = awsfake.NewEventRecorder() metadataProvider := aws.NewMetadataProvider(mock.Session, &awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) sqsapi = &awsfake.SQSAPI{} eventbridgeapi = &awsfake.EventBridgeAPI{} sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi, metadataProvider) eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadataProvider, sqsProvider.QueueName()) - controller = polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))) + controller = polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))).WithHealth() }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -121,7 +119,7 @@ var _ = Describe("Reconciliation", func() { Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) - It("should thrown an error and wait with backoff if we get QueueDeletedRecently", func() { + It("should throw an error and wait with backoff if we get QueueDeletedRecently", func() { sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDeletedRecently), awsfake.MaxCalls(0)) diff --git a/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go b/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go index 5a5052bf13a4..6bae0683720c 100644 --- a/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go +++ b/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go @@ -83,6 +83,8 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco return reconcile.Result{}, nil } if len(list.Items) >= 1 { + // Start reconciling the infrastructure controller. This also waterfalls the starting of the + // notification controller once the infra is healthy c.infraController.Start(ctx) } mergeFrom := client.MergeFrom(nt.DeepCopy()) diff --git a/pkg/cloudprovider/aws/controllers/notification/metrics.go b/pkg/cloudprovider/aws/controllers/notification/metrics.go index 4a1af85424cf..bf2390411616 100644 --- a/pkg/cloudprovider/aws/controllers/notification/metrics.go +++ b/pkg/cloudprovider/aws/controllers/notification/metrics.go @@ -22,17 +22,17 @@ import ( ) const ( - MetricsSubsystemName = "aws_notification_controller" - messageTypeLabel = "message_type" - actionableTypeLabel = "actionable" - actionTypeLabel = "action_type" + subsystem = "aws_notification_controller" + messageTypeLabel = "message_type" + actionableTypeLabel = "actionable" + actionTypeLabel = "action_type" ) var ( receivedMessages = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.Namespace, - Subsystem: MetricsSubsystemName, + Subsystem: subsystem, Name: "received_messages", Help: "Count of messages received from the SQS queue. 
Broken down by message type and whether the message was actionable.", }, @@ -41,7 +41,7 @@ var ( deletedMessages = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: metrics.Namespace, - Subsystem: MetricsSubsystemName, + Subsystem: subsystem, Name: "deleted_messages", Help: "Count of messages deleted from the SQS queue.", }, @@ -49,7 +49,7 @@ var ( actionsPerformed = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.Namespace, - Subsystem: MetricsSubsystemName, + Subsystem: subsystem, Name: "actions_performed", Help: "Number of notification actions performed. Labeled by action", }, diff --git a/pkg/cloudprovider/aws/controllers/notification/reconciler.go b/pkg/cloudprovider/aws/controllers/notification/reconciler.go index 088e9eeb9a1d..78092b899efc 100644 --- a/pkg/cloudprovider/aws/controllers/notification/reconciler.go +++ b/pkg/cloudprovider/aws/controllers/notification/reconciler.go @@ -38,6 +38,7 @@ import ( statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/cloudprovider/aws/utils" + "github.com/aws/karpenter/pkg/controllers" "github.com/aws/karpenter/pkg/controllers/polling" "github.com/aws/karpenter/pkg/controllers/state" ) @@ -64,7 +65,7 @@ type Reconciler struct { instanceTypeProvider *aws.InstanceTypeProvider parser event.Parser - infraController polling.ControllerInterface + infraController polling.ControllerWithHealthInterface } // pollingPeriod that we go to the SQS queue to check if there are any new events @@ -72,7 +73,7 @@ const pollingPeriod = 2 * time.Second func NewReconciler(kubeClient client.Client, recorder events.Recorder, cluster *state.Cluster, sqsProvider *aws.SQSProvider, instanceTypeProvider *aws.InstanceTypeProvider, - infraController polling.ControllerInterface) *Reconciler { + infraController polling.ControllerWithHealthInterface) *Reconciler { return &Reconciler{ kubeClient: kubeClient, @@ -85,12 +86,11 @@ func NewReconciler(kubeClient client.Client, recorder events.Recorder, cluster * } } -func (r *Reconciler) Name() string { - return "aws.notification" -} - -func (r *Reconciler) MetricsSubsystemName() string { - return MetricsSubsystemName +func (r *Reconciler) Metadata() controllers.Metadata { + return controllers.Metadata{ + Name: "aws.notification", + MetricsSubsystem: subsystem, + } } func (r *Reconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index 002e16b0a4ea..bf8e28aef33d 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -83,7 +83,7 @@ var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock var cfg *test.Config var controller polling.ControllerInterface -var infraController polling.ControllerInterface +var infraController polling.ControllerWithHealthInterface var nodeStateController *state.NodeController func TestAPIs(t *testing.T) { @@ -114,7 +114,7 @@ var _ = BeforeEach(func() { subnetProvider := aws.NewSubnetProvider(ec2api) instanceTypeProvider = aws.NewInstanceTypeProvider(env.Ctx, mock.Session, cloudprovider.Options{}, ec2api, subnetProvider) - infraController = polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))) + infraController = 
polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))).WithHealth() controller = polling.NewController(notification.NewReconciler(env.Client, recorder, cluster, sqsProvider, instanceTypeProvider, infraController)) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") @@ -313,7 +313,7 @@ var _ = Describe("Error Handling", func() { It("should trigger an infrastructure reconciliation on an SQS queue when it doesn't exist", func() { sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing - infraController := &controllersfake.TriggerController{} + infraController := &controllersfake.PollingController{} controller = polling.NewController(notification.NewReconciler(env.Client, recorder, cluster, sqsProvider, instanceTypeProvider, infraController)) _, err := controller.Reconcile(env.Ctx, reconcile.Request{}) @@ -327,13 +327,13 @@ var _ = Describe("Infrastructure Coordination", func() { // Prior to provisioning the infrastructure and the infrastructure being healthy, we shouldn't try to hit the queue res, err := controller.Reconcile(env.Ctx, reconcile.Request{}) Expect(err).To(Succeed()) - Expect(res.Requeue).To(BeTrue()) + Expect(res.Requeue).To(BeFalse()) + Expect(res.RequeueAfter).To(BeEquivalentTo(time.Duration(0))) Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(BeNumerically("==", 0)) ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) Expect(infraController.Healthy()).To(BeTrue()) - Expect(controller.Healthy()).To(BeTrue()) Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(BeNumerically("==", 1)) }) }) diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 27facffae10a..fcca69f2e20d 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -36,12 +36,11 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller logging.FromContext(ctx).Infof("Enabling interruption handling") infraProvider := infrastructure.NewProvider(provider.SQSProvider(), provider.EventBridgeProvider()) - infraController := polling.NewController(infrastructure.NewReconciler(infraProvider)) + infraController := polling.NewController(infrastructure.NewReconciler(infraProvider)).WithHealth() notificationController := polling.NewController(notification.NewReconciler(opts.KubeClient, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController)) nodeTemplateController := nodetemplate.NewController(opts.KubeClient, infraProvider, infraController, notificationController) infraController.OnHealthy = notificationController.Start - infraController.OnUnhealthy = notificationController.Stop ret = append(ret, infraController, notificationController, nodeTemplateController) } return ret diff --git a/pkg/controllers/polling/controller.go b/pkg/controllers/polling/controller.go index 11933a7a145e..78e813491e08 100644 --- a/pkg/controllers/polling/controller.go +++ b/pkg/controllers/polling/controller.go @@ -19,6 +19,7 @@ import ( "sync" "github.com/google/uuid" + "github.com/prometheus/client_golang/prometheus" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -35,6 +36,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/source" 
"github.com/aws/karpenter/pkg/controllers" + "github.com/aws/karpenter/pkg/metrics" ) type ControllerInterface interface { @@ -44,7 +46,6 @@ type ControllerInterface interface { Stop(context.Context) Trigger() Active() bool - Healthy() bool } // Controller is a wrapper around a controller interface that adds a trigger mechanism for enqueuing @@ -53,38 +54,26 @@ type ControllerInterface interface { // Controller also has an active flag that can be enabled or disabled. This serves as a mechanism to stop // a requeue of a trigger request from the wrapped Reconcile() method of the Controller type Controller struct { - OnHealthy func(context.Context) - OnUnhealthy func(context.Context) - - r Reconciler + r controllers.Reconciler uuid types.UID - active bool - healthy bool + active bool triggerGeneration int64 trigger chan event.GenericEvent triggerMu sync.RWMutex activeMu sync.RWMutex - healthyMu sync.RWMutex cancels sync.Map } -type Reconciler interface { - reconcile.Reconciler - - Name() string - MetricsSubsystemName() string -} - type Object struct { metav1.ObjectMeta runtime.Object } -func NewController(rec Reconciler) *Controller { +func NewController(rec controllers.Reconciler) *Controller { return &Controller{ r: rec, uuid: types.UID(uuid.New().String()), @@ -92,11 +81,17 @@ func NewController(rec Reconciler) *Controller { } } +// WithHealth returns a decorated version of the polling controller that surfaces health information +// based on the success or failure of a reconciliation loop +func (t *Controller) WithHealth() *ControllerWithHealth { + return NewControllerWithHealth(t) +} + // Start is an idempotent call to kick-off a single reconciliation loop. Based on the intended use of this controller, // the Reconciler is responsible for requeuing this message back in the WorkQueue so there is a time-based reconciliation // performed. The Trigger operation is performed to kick-off the loop. 
 func (t *Controller) Start(ctx context.Context) {
-	logging.FromContext(ctx).Infof("Starting the %s controller...", t.r.Name())
+	logging.FromContext(ctx).Infof("Starting the %s controller...", t.r.Metadata().Name)
 	t.activeMu.Lock()
 	if !t.active {
 		t.active = true
@@ -121,7 +116,7 @@ func (t *Controller) Trigger() {
 
 // Stop sets the state of the controller to inactive and cancels the current reconciliation contexts, if there are any
 func (t *Controller) Stop(ctx context.Context) {
-	logging.FromContext(ctx).Infof("Stopping the %s controller...", t.r.Name())
+	logging.FromContext(ctx).Infof("Stopping the %s controller...", t.r.Metadata().Name)
 	t.SetActive(false)
 	t.cancels.Range(func(_ any, c any) bool {
 		cancel := c.(context.CancelFunc)
@@ -152,51 +147,23 @@ func (t *Controller) SetActive(active bool) {
 	}
 }
 
-func (t *Controller) Healthy() bool {
-	t.healthyMu.RLock()
-	defer t.healthyMu.RUnlock()
-	return t.healthy
-}
-
 func (t *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
-	ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(t.r.Name()))
+	ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(t.r.Metadata().Name))
 	ctx, cancel := context.WithCancel(ctx)
 
-	// Store the cancel function for the duration of the reconcile so we can cancel on a Stop() call
+	// Store the cancel function for the duration of Reconcile, so we can cancel on a Stop() call
 	cancelID := uuid.New()
 	t.cancels.Store(cancelID, cancel)
 	defer t.cancels.Delete(cancelID)
 
-	res, err := t.r.Reconcile(ctx, req)
-
-	t.healthyMu.Lock()
-	t.healthy = err == nil // The controller is considered healthy when it successfully reconciles
-	if t.healthy {
-		if t.OnHealthy != nil {
-			t.OnHealthy(ctx)
-		}
-		t.healthyMetric().Set(1)
-	} else {
-		if t.OnUnhealthy != nil {
-			t.OnUnhealthy(ctx)
-		}
-		t.healthyMetric().Set(0)
-	}
-	t.healthyMu.Unlock()
-
-	t.activeMu.Lock()
-	defer t.activeMu.Unlock()
-	if !t.active {
-		return reconcile.Result{}, nil // Swallow any errors/calls at this point
-	}
-	return res, err
+	return t.r.Reconcile(ctx, req)
 }
 
 func (t *Controller) Register(_ context.Context, m manager.Manager) error {
-	crmetrics.Registry.MustRegister(t.healthyMetric(), t.activeMetric(), t.triggeredCountMetric())
+	crmetrics.Registry.MustRegister(t.activeMetric(), t.triggeredCountMetric())
 	return controllerruntime.
 		NewControllerManagedBy(m).
-		Named(t.r.Name()).
+		Named(t.r.Metadata().Name).
 		WithEventFilter(predicate.NewPredicateFuncs(func(obj client.Object) bool {
 			t.triggerMu.RLock()
 			defer t.triggerMu.RUnlock()
@@ -209,3 +176,25 @@ func (t *Controller) Register(_ context.Context, m manager.Manager) error {
 		For(&v1.Pod{}). 
// controller-runtime requires us to perform a watch on some object, so let's do it on a fundamental component Complete(t) } + +func (t *Controller) activeMetric() prometheus.Gauge { + return prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: t.r.Metadata().MetricsSubsystem, + Name: "active", + Help: "Whether the controller is active.", + }, + ) +} + +func (t *Controller) triggeredCountMetric() prometheus.Counter { + return prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: t.r.Metadata().MetricsSubsystem, + Name: "trigger_count", + Help: "A counter of the number of times this controller has been triggered.", + }, + ) +} diff --git a/pkg/controllers/polling/decorators.go b/pkg/controllers/polling/decorators.go new file mode 100644 index 000000000000..c72a3cbe455b --- /dev/null +++ b/pkg/controllers/polling/decorators.go @@ -0,0 +1,92 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package polling + +import ( + "context" + "sync" + + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/manager" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/aws/karpenter/pkg/metrics" +) + +type ControllerWithHealthInterface interface { + ControllerInterface + + Healthy() bool +} + +// ControllerWithHealth is a Controller decorator that wraps a polling controller with health information +// on the success or failure of a reconciliation loop +type ControllerWithHealth struct { + *Controller + + healthy bool + healthyMu sync.RWMutex + + OnHealthy func(context.Context) + OnUnhealthy func(context.Context) +} + +func NewControllerWithHealth(c *Controller) *ControllerWithHealth { + return &ControllerWithHealth{ + Controller: c, + } +} + +func (c *ControllerWithHealth) Healthy() bool { + c.healthyMu.RLock() + defer c.healthyMu.RUnlock() + return c.healthy +} + +func (c *ControllerWithHealth) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + res, err := c.Controller.Reconcile(ctx, req) + c.healthyMu.Lock() + c.healthy = err == nil // The controller is considered healthy when it successfully reconciles + if c.healthy { + if c.OnHealthy != nil { + c.OnHealthy(ctx) + } + c.healthyMetric().Set(1) + } else { + if c.OnUnhealthy != nil { + c.OnUnhealthy(ctx) + } + c.healthyMetric().Set(0) + } + c.healthyMu.Unlock() + return res, err +} + +func (c *ControllerWithHealth) Register(ctx context.Context, m manager.Manager) error { + crmetrics.Registry.MustRegister(c.healthyMetric()) + return c.Controller.Register(ctx, m) +} + +func (c *ControllerWithHealth) healthyMetric() prometheus.Gauge { + return prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: c.Controller.r.Metadata().MetricsSubsystem, + Name: "healthy", + Help: "Whether the controller is in a healthy state.", + }, + ) +} diff --git a/pkg/controllers/polling/metrics.go b/pkg/controllers/polling/metrics.go deleted file mode 
100644 index 0e9a633bf8de..000000000000 --- a/pkg/controllers/polling/metrics.go +++ /dev/null @@ -1,54 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package polling - -import ( - "github.com/prometheus/client_golang/prometheus" - - "github.com/aws/karpenter/pkg/metrics" -) - -func (t *Controller) healthyMetric() prometheus.Gauge { - return prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: t.r.MetricsSubsystemName(), - Name: "healthy", - Help: "Whether the controller is in a healthy state.", - }, - ) -} - -func (t *Controller) activeMetric() prometheus.Gauge { - return prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: t.r.MetricsSubsystemName(), - Name: "active", - Help: "Whether the controller is active.", - }, - ) -} - -func (t *Controller) triggeredCountMetric() prometheus.Counter { - return prometheus.NewCounter( - prometheus.CounterOpts{ - Namespace: metrics.Namespace, - Subsystem: t.r.MetricsSubsystemName(), - Name: "trigger_count", - Help: "A counter of the number of times this controller has been triggered.", - }, - ) -} diff --git a/pkg/controllers/types.go b/pkg/controllers/types.go index 5e9e46d56308..9af17e9de422 100644 --- a/pkg/controllers/types.go +++ b/pkg/controllers/types.go @@ -50,6 +50,20 @@ type Controller interface { Register(context.Context, manager.Manager) error } +// Reconciler is a custom interface on top of the standard reconcile.Reconciler interface +// that surfaces Name info for logging and MetricsSubsystemName info for prometheus metrics +type Reconciler interface { + reconcile.Reconciler + + Metadata() Metadata +} + +type Metadata struct { + Name string + MetricsSubsystem string +} + +// ignoreDebugEventsSink is a decorator around a passed sink to ignore all debug events that are passed into the logger type ignoreDebugEventsSink struct { name string sink logr.LogSink From 91d38de7242300332ce917a5e15c89ca953d7183 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Sun, 9 Oct 2022 22:00:21 -0700 Subject: [PATCH 39/55] Handle message parsing unmarshalling better --- .../aws/controllers/fake/pollingcontroller.go | 5 + .../controllers/infrastructure/provider.go | 6 +- .../controllers/nodetemplate/suite_test.go | 2 +- .../aggregatedparser/aggregatedparser.go | 73 ++++++++---- .../event/{noop.go => noop/handler.go} | 16 +-- .../event/rebalancerecommendation/handler.go | 8 -- .../event/rebalancerecommendation/parser.go | 30 ++--- .../rebalancerecommendation/unmarshal.go | 13 --- .../event/scheduledchange/handler.go | 8 -- .../event/scheduledchange/parser.go | 37 +++--- .../event/scheduledchange/unmarshal.go | 42 ------- .../event/spotinterruption/handler.go | 8 -- .../event/spotinterruption/parser.go | 30 ++--- .../event/spotinterruption/unmarshal.go | 14 --- .../notification/event/statechange/handler.go | 8 -- .../notification/event/statechange/parser.go | 34 +++--- .../event/statechange/unmarshal.go | 14 --- .../controllers/notification/event/types.go | 11 +- 
.../aws/controllers/notification/metrics.go | 9 +- .../controllers/notification/reconciler.go | 27 +++-- pkg/cloudprovider/aws/controllers/register.go | 2 +- pkg/controllers/polling/controller.go | 109 +++++++++--------- pkg/controllers/polling/decorators.go | 25 ++-- pkg/controllers/polling/suite_test.go | 15 +++ test/pkg/environment/environment.go | 2 +- test/pkg/environment/expectations.go | 3 +- test/suites/integration/scheduling_test.go | 8 +- test/suites/notification/suite_test.go | 2 + 28 files changed, 258 insertions(+), 303 deletions(-) rename pkg/cloudprovider/aws/controllers/notification/event/{noop.go => noop/handler.go} (74%) create mode 100644 pkg/controllers/polling/suite_test.go diff --git a/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go b/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go index 1ba392393eaf..ea43547b0974 100644 --- a/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go +++ b/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go @@ -18,6 +18,7 @@ import ( "context" "sync/atomic" + controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" ) @@ -42,4 +43,8 @@ func (c *PollingController) Reconcile(context.Context, reconcile.Request) (recon return reconcile.Result{}, nil } +func (c *PollingController) Builder(context.Context, manager.Manager) *controllerruntime.Builder { + return nil +} + func (c *PollingController) Register(context.Context, manager.Manager) error { return nil } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/provider.go b/pkg/cloudprovider/aws/controllers/infrastructure/provider.go index de978466c52c..50e55eed3d2d 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/provider.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/provider.go @@ -46,7 +46,11 @@ func (p *Provider) CreateInfrastructure(ctx context.Context) error { workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { errs[i] = funcs[i]() }) - return multierr.Combine(errs...) + if err := multierr.Combine(errs...); err != nil { + return err + } + logging.FromContext(ctx).Infof("Successfully completed reconciliation of infrastructure") + return nil } // DeleteInfrastructure removes the infrastructure that was stood up and reconciled diff --git a/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go b/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go index b6773fa34781..196aed08182b 100644 --- a/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go @@ -12,4 +12,4 @@ See the License for the specific language governing permissions and limitations under the License. */ -package nodetemplate +package nodetemplate_test diff --git a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go index 75dab8cc1333..0adb7c50a078 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go @@ -15,18 +15,41 @@ limitations under the License. 
package aggregatedparser import ( - "context" "encoding/json" + "fmt" - "knative.dev/pkg/logging" + "github.com/samber/lo" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/noop" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" ) +type parserKey struct { + Version string + Source string + DetailType string +} + +func newParserKey(metadata event.AWSMetadata) parserKey { + return parserKey{ + Version: metadata.Version, + Source: metadata.Source, + DetailType: metadata.DetailType, + } +} + +func newParserKeyFromParser(p event.Parser) parserKey { + return parserKey{ + Version: p.Version(), + Source: p.Source(), + DetailType: p.DetailType(), + } +} + var ( DefaultParsers = []event.Parser{ statechange.Parser{}, @@ -36,33 +59,35 @@ var ( } ) -type AggregatedParser []event.Parser - -func NewAggregatedParser(parsers ...event.Parser) AggregatedParser { - return parsers +type AggregatedParser struct { + parserMap map[parserKey]event.Parser } -func (p AggregatedParser) Parse(ctx context.Context, str string) event.Interface { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("event.parser")) - - if str == "" { - return event.NoOp{} +func NewAggregatedParser(parsers ...event.Parser) AggregatedParser { + return AggregatedParser{ + parserMap: lo.SliceToMap(parsers, func(p event.Parser) (parserKey, event.Parser) { + return newParserKeyFromParser(p), p + }), } +} - // We will go through all the parsers to see if we can parse - // If we aren't able to parse the message, we will just assume that it is a no-op - for _, parser := range p { - if a := parser.Parse(ctx, str); a != nil { - return a - } +func (p AggregatedParser) Parse(msg string) (event.Interface, error) { + if msg == "" { + return noop.NoOp{}, nil } - md := event.AWSMetadata{} - if err := json.Unmarshal([]byte(str), &md); err != nil { - logging.FromContext(ctx). - With("error", err). - Error("failed to unmarshal message metadata") - return event.NoOp{} + if err := json.Unmarshal([]byte(msg), &md); err != nil { + return noop.NoOp{}, fmt.Errorf("unmarshalling the message as AWSMetadata, %w", err) + } + if parser, ok := p.parserMap[newParserKey(md)]; ok { + evt, err := parser.Parse(msg) + if err != nil { + return noop.NoOp{}, fmt.Errorf("parsing event message, %w", err) + } + if evt == nil { + return noop.NoOp{}, nil + } + return evt, nil } - return event.NoOp(md) + return noop.NoOp(md), nil } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/noop.go b/pkg/cloudprovider/aws/controllers/notification/event/noop/handler.go similarity index 74% rename from pkg/cloudprovider/aws/controllers/notification/event/noop.go rename to pkg/cloudprovider/aws/controllers/notification/event/noop/handler.go index a6f78d730fa4..fecdf1699729 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/noop.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/noop/handler.go @@ -12,16 +12,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package event +package noop import ( "time" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -type NoOp AWSMetadata +type NoOp event.AWSMetadata func (NoOp) EventID() string { return "" @@ -31,13 +30,8 @@ func (NoOp) EC2InstanceIDs() []string { return []string{} } -func (NoOp) Kind() Kind { - return NoOpKind -} - -func (n NoOp) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(AWSMetadata(n)).AddTo(enc) - return nil +func (NoOp) Kind() event.Kind { + return event.NoOpKind } func (NoOp) StartTime() time.Time { diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go index 610e31edc385..fc5b13f2e26a 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go @@ -17,9 +17,6 @@ package rebalancerecommendation import ( "time" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -37,11 +34,6 @@ func (EC2InstanceRebalanceRecommendation) Kind() event.Kind { return event.RebalanceRecommendationKind } -func (e EC2InstanceRebalanceRecommendation) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(AWSEvent(e)).AddTo(enc) - return nil -} - func (e EC2InstanceRebalanceRecommendation) StartTime() time.Time { return e.Time } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go index baea33276db2..f81d8e3c8611 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go @@ -15,28 +15,30 @@ limitations under the License. 
package rebalancerecommendation import ( - "context" "encoding/json" + "fmt" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -const ( - source = "aws.ec2" - detailType = "EC2 Instance Rebalance Recommendation" - version = "0" -) - type Parser struct{} -func (Parser) Parse(ctx context.Context, str string) event.Interface { +func (p Parser) Parse(msg string) (event.Interface, error) { evt := EC2InstanceRebalanceRecommendation{} - if err := json.Unmarshal([]byte(str), &evt); err != nil { - return nil + if err := json.Unmarshal([]byte(msg), &evt); err != nil { + return nil, fmt.Errorf("unmarshalling the message as EC2InstanceRebalanceRecommendation, %w", err) } + return evt, nil +} - if evt.Source != source || evt.DetailType != detailType || evt.Version != version { - return nil - } - return evt +func (p Parser) Version() string { + return "0" +} + +func (p Parser) Source() string { + return "aws.ec2" +} + +func (p Parser) DetailType() string { + return "EC2 Instance Rebalance Recommendation" } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go index f5a5fca146ce..b9edcfaa4e16 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go @@ -15,9 +15,6 @@ limitations under the License. package rebalancerecommendation import ( - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -29,16 +26,6 @@ type AWSEvent struct { Detail EC2InstanceRebalanceRecommendationDetail `json:"detail"` } -func (e AWSEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(e.AWSMetadata).AddTo(enc) - return enc.AddObject("detail", e.Detail) -} - type EC2InstanceRebalanceRecommendationDetail struct { InstanceID string `json:"instance-id"` } - -func (e EC2InstanceRebalanceRecommendationDetail) MarshalLogObject(enc zapcore.ObjectEncoder) error { - enc.AddString("instance-id", e.InstanceID) - return nil -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go index 42b998470daf..80e9d35732a8 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go @@ -17,9 +17,6 @@ package scheduledchange import ( "time" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -41,11 +38,6 @@ func (AWSHealthEvent) Kind() event.Kind { return event.ScheduledChangeKind } -func (e AWSHealthEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(AWSEvent(e)).AddTo(enc) - return nil -} - func (e AWSHealthEvent) StartTime() time.Time { return e.Time } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go index 0cbb32573e88..87b81239e64e 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go @@ -15,36 +15,41 @@ limitations under the License.
package scheduledchange import ( - "context" "encoding/json" + "fmt" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) const ( - source = "aws.health" - detailType = "AWS Health Event" - version = "0" acceptedService = "EC2" acceptedEventTypeCategory = "scheduledChange" ) type Parser struct{} -func (Parser) Parse(ctx context.Context, str string) event.Interface { +func (p Parser) Parse(msg string) (event.Interface, error) { evt := AWSHealthEvent{} - if err := json.Unmarshal([]byte(str), &evt); err != nil { - return nil + if err := json.Unmarshal([]byte(msg), &evt); err != nil { + return nil, fmt.Errorf("unmarshalling the message as AWSHealthEvent, %w", err) } - if evt.Source != source || evt.DetailType != detailType || evt.Version != version { - return nil + // We ignore services and event categories that we don't watch + if evt.Detail.Service != acceptedService || + evt.Detail.EventTypeCategory != acceptedEventTypeCategory { + return nil, nil } - if evt.Detail.Service != acceptedService { - return nil - } - if evt.Detail.EventTypeCategory != acceptedEventTypeCategory { - return nil - } - return evt + return evt, nil +} + +func (p Parser) Version() string { + return "0" +} + +func (p Parser) Source() string { + return "aws.health" +} + +func (p Parser) DetailType() string { + return "AWS Health Event" } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go index 83c9eb5e8327..8528e579a9f5 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go @@ -15,10 +15,6 @@ limitations under the License.
package scheduledchange import ( - "go.uber.org/multierr" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -30,11 +26,6 @@ type AWSEvent struct { Detail AWSHealthEventDetail `json:"detail"` } -func (e AWSEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(e.AWSMetadata).AddTo(enc) - return enc.AddObject("detail", e.Detail) -} - type AWSHealthEventDetail struct { EventARN string `json:"eventArn"` EventTypeCode string `json:"eventTypeCode"` @@ -46,44 +37,11 @@ type AWSHealthEventDetail struct { AffectedEntities []AffectedEntity `json:"affectedEntities"` } -func (e AWSHealthEventDetail) MarshalLogObject(enc zapcore.ObjectEncoder) (err error) { - enc.AddString("eventArn", e.EventARN) - enc.AddString("eventTypeCode", e.EventTypeCode) - enc.AddString("eventTypeCategory", e.EventTypeCategory) - enc.AddString("service", e.Service) - enc.AddString("startTime", e.StartTime) - enc.AddString("endTime", e.EndTime) - err = multierr.Append(err, enc.AddArray("eventDescription", zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) (err error) { - for _, desc := range e.EventDescription { - err = multierr.Append(err, enc.AppendObject(desc)) - } - return err - }))) - err = multierr.Append(err, enc.AddArray("affectedEntities", zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) (err error) { - for _, entity := range e.AffectedEntities { - err = multierr.Append(err, enc.AppendObject(entity)) - } - return err - }))) - return err -} - type EventDescription struct { LatestDescription string `json:"latestDescription"` Language string `json:"language"` } -func (e EventDescription) MarshalLogObject(enc zapcore.ObjectEncoder) error { - enc.AddString("latestDescription", e.LatestDescription) - enc.AddString("language", e.Language) - return nil -} - type AffectedEntity struct { EntityValue string `json:"entityValue"` } - -func (e AffectedEntity) MarshalLogObject(enc zapcore.ObjectEncoder) error { - enc.AddString("entityValue", e.EntityValue) - return nil -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go index c2a58a4e9d69..9c5f6c876ba5 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go @@ -17,9 +17,6 @@ package spotinterruption import ( "time" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -37,11 +34,6 @@ func (EC2SpotInstanceInterruptionWarning) Kind() event.Kind { return event.SpotInterruptionKind } -func (e EC2SpotInstanceInterruptionWarning) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(AWSEvent(e)).AddTo(enc) - return nil -} - func (e EC2SpotInstanceInterruptionWarning) StartTime() time.Time { return e.Time } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go index 40d5f93c4387..43a591ee2889 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go @@ -15,28 +15,30 @@ limitations under the License. 
package spotinterruption import ( - "context" "encoding/json" + "fmt" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -const ( - source = "aws.ec2" - detailType = "EC2 Spot Instance Interruption Warning" - version = "0" -) - type Parser struct{} -func (Parser) Parse(ctx context.Context, str string) event.Interface { +func (p Parser) Parse(msg string) (event.Interface, error) { evt := EC2SpotInstanceInterruptionWarning{} - if err := json.Unmarshal([]byte(str), &evt); err != nil { - return nil + if err := json.Unmarshal([]byte(msg), &evt); err != nil { + return nil, fmt.Errorf("unmarshalling the message as EC2SpotInstanceInterruptionWarning, %w", err) } + return evt, nil +} - if evt.Source != source || evt.DetailType != detailType || evt.Version != version { - return nil - } - return evt +func (p Parser) Version() string { + return "0" +} + +func (p Parser) Source() string { + return "aws.ec2" +} + +func (p Parser) DetailType() string { + return "EC2 Spot Instance Interruption Warning" } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go index 698de656d12a..4c8a87e4bd11 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go @@ -15,9 +15,6 @@ limitations under the License. package spotinterruption import ( - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -29,18 +26,7 @@ type AWSEvent struct { Detail EC2SpotInstanceInterruptionWarningDetail `json:"detail"` } -func (e AWSEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(e.AWSMetadata).AddTo(enc) - return enc.AddObject("detail", e.Detail) -} - type EC2SpotInstanceInterruptionWarningDetail struct { InstanceID string `json:"instance-id"` InstanceAction string `json:"instance-action"` } - -func (e EC2SpotInstanceInterruptionWarningDetail) MarshalLogObject(enc zapcore.ObjectEncoder) error { - enc.AddString("instance-id", e.InstanceID) - enc.AddString("instance-action", e.InstanceAction) - return nil -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go index e036f5c1108b..a37d2b671e39 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go @@ -17,9 +17,6 @@ package statechange import ( "time" - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -41,11 +38,6 @@ func (EC2InstanceStateChangeNotification) Kind() event.Kind { return event.StateChangeKind } -func (e EC2InstanceStateChangeNotification) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(AWSEvent(e)).AddTo(enc) - return nil -} - func (e EC2InstanceStateChangeNotification) StartTime() time.Time { return e.Time } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go index 1b85f6572251..6431b4a2f754 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go @@
-15,8 +15,8 @@ limitations under the License. package statechange import ( - "context" "encoding/json" + "fmt" "strings" "k8s.io/apimachinery/pkg/util/sets" @@ -24,27 +24,31 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -const ( - source = "aws.ec2" - detailType = "EC2 Instance State-change Notification" - version = "0" -) - var acceptedStates = sets.NewString("stopping", "stopped", "shutting-down", "terminated") type Parser struct{} -func (Parser) Parse(ctx context.Context, str string) event.Interface { +func (p Parser) Parse(msg string) (event.Interface, error) { evt := EC2InstanceStateChangeNotification{} - if err := json.Unmarshal([]byte(str), &evt); err != nil { - return nil + if err := json.Unmarshal([]byte(msg), &evt); err != nil { + return nil, fmt.Errorf("unmarshalling the message as EC2InstanceStateChangeNotification, %w", err) } - if evt.Source != source || evt.DetailType != detailType || evt.Version != version { - return nil - } + // We ignore states that are not in the set of states we can react to if !acceptedStates.Has(strings.ToLower(evt.Detail.State)) { - return nil + return nil, nil } - return evt + return evt, nil +} + +func (p Parser) Version() string { + return "0" +} + +func (p Parser) Source() string { + return "aws.ec2" +} + +func (p Parser) DetailType() string { + return "EC2 Instance State-change Notification" } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go index dea8791d0063..4a006491a135 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go @@ -15,9 +15,6 @@ limitations under the License. package statechange import ( - "go.uber.org/zap" - "go.uber.org/zap/zapcore" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) @@ -29,18 +26,7 @@ type AWSEvent struct { Detail EC2InstanceStateChangeNotificationDetail `json:"detail"` } -func (e AWSEvent) MarshalLogObject(enc zapcore.ObjectEncoder) error { - zap.Inline(e.AWSMetadata).AddTo(enc) - return enc.AddObject("detail", e.Detail) -} - type EC2InstanceStateChangeNotificationDetail struct { InstanceID string `json:"instance-id"` State string `json:"state"` } - -func (e EC2InstanceStateChangeNotificationDetail) MarshalLogObject(enc zapcore.ObjectEncoder) error { - enc.AddString("instance-id", e.InstanceID) - enc.AddString("state", e.State) - return nil -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/types.go b/pkg/cloudprovider/aws/controllers/notification/event/types.go index fb5c7b19c74c..ae46fd17ba3e 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/types.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/types.go @@ -15,19 +15,18 @@ limitations under the License.
package event import ( - "context" "fmt" - - "go.uber.org/zap/zapcore" ) type Parser interface { - Parse(context.Context, string) Interface + Parse(string) (Interface, error) + + Version() string + Source() string + DetailType() string } type Interface interface { - zapcore.ObjectMarshaler - EC2InstanceIDs() []string Kind() Kind } diff --git a/pkg/cloudprovider/aws/controllers/notification/metrics.go b/pkg/cloudprovider/aws/controllers/notification/metrics.go index bf2390411616..a430d427e837 100644 --- a/pkg/cloudprovider/aws/controllers/notification/metrics.go +++ b/pkg/cloudprovider/aws/controllers/notification/metrics.go @@ -22,10 +22,11 @@ import ( ) const ( - subsystem = "aws_notification_controller" - messageTypeLabel = "message_type" - actionableTypeLabel = "actionable" - actionTypeLabel = "action_type" + subsystem = "aws_notification_controller" + messageTypeLabel = "message_type" + actionableTypeLabel = "actionable" + actionTypeLabel = "action_type" + terminationReasonLabel = "interruption" ) var ( diff --git a/pkg/cloudprovider/aws/controllers/notification/reconciler.go b/pkg/cloudprovider/aws/controllers/notification/reconciler.go index 78092b899efc..41e6620336f3 100644 --- a/pkg/cloudprovider/aws/controllers/notification/reconciler.go +++ b/pkg/cloudprovider/aws/controllers/notification/reconciler.go @@ -41,6 +41,7 @@ import ( "github.com/aws/karpenter/pkg/controllers" "github.com/aws/karpenter/pkg/controllers/polling" "github.com/aws/karpenter/pkg/controllers/state" + "github.com/aws/karpenter/pkg/metrics" ) type Action = string @@ -63,7 +64,7 @@ type Reconciler struct { recorder events.Recorder provider *aws.SQSProvider instanceTypeProvider *aws.InstanceTypeProvider - parser event.Parser + parser aggregatedparser.AggregatedParser infraController polling.ControllerWithHealthInterface } @@ -114,17 +115,28 @@ func (r *Reconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconc workqueue.ParallelizeUntil(ctx, 10, len(sqsMessages), func(i int) { errs[i] = r.handleMessage(ctx, instanceIDMap, sqsMessages[i]) }) - return reconcile.Result{RequeueAfter: pollingPeriod}, multierr.Combine(errs...) + return reconcile.Result{RequeueAfter: 0}, multierr.Combine(errs...) 
} // handleMessage gets the node names of the instances involved in the queue message and takes the // assigned action on the instances based on the message event -func (r *Reconciler) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg *sqsapi.Message) (err error) { +func (r *Reconciler) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg *sqsapi.Message) error { // No message to parse in this case if msg == nil || msg.Body == nil { return nil } - evt := r.parser.Parse(ctx, *msg.Body) + evt, err := r.parser.Parse(*msg.Body) + if err != nil { + // In the scenario where we can't parse the message, we log that we have an error and then are + // forced to just delete the message from the queue + logging.FromContext(ctx).Errorf("parsing sqs message, %v", err) + err = r.provider.DeleteSQSMessage(ctx, msg) + if err != nil { + return fmt.Errorf("failed to delete message from queue, %w", err) + } + deletedMessages.Inc() + return nil + } ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("event", evt.Kind())) nodes := getInvolvedNodes(evt.EC2InstanceIDs(), instanceIDMap) @@ -138,7 +150,7 @@ func (r *Reconciler) handleMessage(ctx context.Context, instanceIDMap map[string return fmt.Errorf("failed to delete message from queue, %w", err) } deletedMessages.Inc() - return + return nil } receivedMessages.WithLabelValues(evt.Kind().String(), "true").Inc() @@ -187,10 +199,11 @@ func (r *Reconciler) handleNode(ctx context.Context, evt event.Interface, node * } func (r *Reconciler) deleteInstance(ctx context.Context, node *v1.Node) error { - r.recorder.TerminatingNodeOnNotification(node) if err := r.kubeClient.Delete(ctx, node); err != nil { return fmt.Errorf("deleting the node on notification, %w", err) } + r.recorder.TerminatingNodeOnNotification(node) + metrics.NodesTerminatedCounter.WithLabelValues(terminationReasonLabel).Inc() return nil } @@ -218,7 +231,7 @@ func (r *Reconciler) notifyForEvent(evt event.Interface, n *v1.Node) { } // makeInstanceIDMap builds a map between the instance id that is stored in the -// node .sper.providerID and the node name stored on the host +// node .spec.providerID and the node name stored on the host func (r *Reconciler) makeInstanceIDMap() map[string]*v1.Node { m := map[string]*v1.Node{} r.cluster.ForEachNode(func(n *state.Node) bool { diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index fcca69f2e20d..2e7b354438cf 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -38,9 +38,9 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller infraProvider := infrastructure.NewProvider(provider.SQSProvider(), provider.EventBridgeProvider()) infraController := polling.NewController(infrastructure.NewReconciler(infraProvider)).WithHealth() notificationController := polling.NewController(notification.NewReconciler(opts.KubeClient, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController)) + infraController.OnHealthy = notificationController.Start nodeTemplateController := nodetemplate.NewController(opts.KubeClient, infraProvider, infraController, notificationController) - infraController.OnHealthy = notificationController.Start ret = append(ret, infraController, notificationController, nodeTemplateController) } return ret diff --git a/pkg/controllers/polling/controller.go b/pkg/controllers/polling/controller.go index 78e813491e08..4775d9174e1e 100644 
--- a/pkg/controllers/polling/controller.go +++ b/pkg/controllers/polling/controller.go @@ -17,6 +17,7 @@ package polling import ( "context" "sync" + "sync/atomic" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" @@ -42,6 +43,8 @@ import ( type ControllerInterface interface { controllers.Controller + Builder(context.Context, manager.Manager) *controllerruntime.Builder + Start(context.Context) Stop(context.Context) Trigger() @@ -57,14 +60,12 @@ type Controller struct { r controllers.Reconciler uuid types.UID + mu sync.RWMutex active bool - triggerGeneration int64 + triggerGeneration atomic.Int64 trigger chan event.GenericEvent - triggerMu sync.RWMutex - activeMu sync.RWMutex - cancels sync.Map } @@ -83,42 +84,37 @@ func NewController(rec controllers.Reconciler) *Controller { // WithHealth returns a decorated version of the polling controller that surfaces health information // based on the success or failure of a reconciliation loop -func (t *Controller) WithHealth() *ControllerWithHealth { - return NewControllerWithHealth(t) +func (c *Controller) WithHealth() *ControllerWithHealth { + return NewControllerWithHealth(c) } // Start is an idempotent call to kick-off a single reconciliation loop. Based on the intended use of this controller, // the Reconciler is responsible for requeuing this message back in the WorkQueue so there is a time-based reconciliation // performed. The Trigger operation is performed to kick-off the loop. -func (t *Controller) Start(ctx context.Context) { - logging.FromContext(ctx).Infof("Starting the %s controller...", t.r.Metadata().Name) - t.activeMu.Lock() - if !t.active { - t.active = true - t.activeMu.Unlock() - t.Trigger() - } else { - t.activeMu.Unlock() +func (c *Controller) Start(ctx context.Context) { + c.mu.Lock() + defer c.mu.Unlock() + + if !c.active { + logging.FromContext(ctx).Infof("Starting the %s controller...", c.r.Metadata().Name) + c.active = true + c.Trigger() } } // Trigger triggers an immediate reconciliation by inserting a message into the event channel. We increase the trigger // generation here to ensure that any messages that were previously re-queued are thrown away -func (t *Controller) Trigger() { - t.triggerMu.Lock() - defer t.triggerMu.Unlock() - - t.triggerGeneration++ - t.triggeredCountMetric().Inc() - obj := &Object{ObjectMeta: metav1.ObjectMeta{Generation: t.triggerGeneration, UID: t.uuid}} - t.trigger <- event.GenericEvent{Object: obj} +func (c *Controller) Trigger() { + c.triggeredCountMetric().Inc() + obj := &Object{ObjectMeta: metav1.ObjectMeta{Generation: c.triggerGeneration.Add(1), UID: c.uuid}} + c.trigger <- event.GenericEvent{Object: obj} } // Stop sets the state of the controller to active and cancel the current reconciliation contexts, if there are any -func (t *Controller) Stop(ctx context.Context) { - logging.FromContext(ctx).Infof("Stopping the %s controller...", t.r.Metadata().Name) - t.SetActive(false) - t.cancels.Range(func(_ any, c any) bool { +func (c *Controller) Stop(ctx context.Context) { + logging.FromContext(ctx).Infof("Stopping the %s controller...", c.r.Metadata().Name) + c.SetActive(false) + c.cancels.Range(func(_ any, c any) bool { cancel := c.(context.CancelFunc) cancel() return true @@ -128,71 +124,72 @@ func (t *Controller) Stop(ctx context.Context) { // Active gets whether the controller is active right now. This value is passed down to the wrapped // Reconcile method so that the Reconciler can handle cleanup scenarios. 
The underlying Reconciler is responsible // for returning a result with no RequeueAfter to stop its activity -func (t *Controller) Active() bool { - t.activeMu.RLock() - defer t.activeMu.RUnlock() - return t.active +func (c *Controller) Active() bool { + c.mu.RLock() + defer c.mu.RUnlock() + + return c.active } // SetActive sets the active flag on the controller -func (t *Controller) SetActive(active bool) { - t.activeMu.Lock() - defer t.activeMu.Unlock() +func (c *Controller) SetActive(active bool) { + c.mu.Lock() + defer c.mu.Unlock() - t.active = active + c.active = active if active { - t.activeMetric().Set(1) + c.activeMetric().Set(1) } else { - t.activeMetric().Set(0) + c.activeMetric().Set(0) } } -func (t *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(t.r.Metadata().Name)) +func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(c.r.Metadata().Name)) ctx, cancel := context.WithCancel(ctx) // Store the cancel function for the duration of Reconcile, so we can cancel on a Stop() call cancelID := uuid.New() - t.cancels.Store(cancelID, cancel) - defer t.cancels.Delete(cancelID) + c.cancels.Store(cancelID, cancel) + defer c.cancels.Delete(cancelID) - return t.r.Reconcile(ctx, req) + return c.r.Reconcile(ctx, req) } -func (t *Controller) Register(_ context.Context, m manager.Manager) error { - crmetrics.Registry.MustRegister(t.activeMetric(), t.triggeredCountMetric()) +func (c *Controller) Builder(_ context.Context, m manager.Manager) *controllerruntime.Builder { + crmetrics.Registry.MustRegister(c.activeMetric(), c.triggeredCountMetric()) return controllerruntime. NewControllerManagedBy(m). - Named(t.r.Metadata().Name). + Named(c.r.Metadata().Name). WithEventFilter(predicate.NewPredicateFuncs(func(obj client.Object) bool { - t.triggerMu.RLock() - defer t.triggerMu.RUnlock() - // UUID comparison is a hacky way to get around the fact that controller-runtime requires // us to perform a watch on some K8s object - return obj.GetUID() == t.uuid && obj.GetGeneration() == t.triggerGeneration + return obj.GetUID() == c.uuid && obj.GetGeneration() == c.triggerGeneration.Load() })). - Watches(&source.Channel{Source: t.trigger}, &handler.EnqueueRequestForObject{}). - For(&v1.Pod{}). // controller-runtime requires us to perform a watch on some object, so let's do it on a fundamental component - Complete(t) + Watches(&source.Channel{Source: c.trigger}, &handler.EnqueueRequestForObject{}). 
+ For(&v1.Pod{}) // controller-runtime requires us to perform a watch on some object, so let's do it on a fundamental component +} + +func (c *Controller) Register(ctx context.Context, m manager.Manager) error { + return c.Builder(ctx, m).Complete(c) } -func (t *Controller) activeMetric() prometheus.Gauge { +func (c *Controller) activeMetric() prometheus.Gauge { return prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: metrics.Namespace, - Subsystem: t.r.Metadata().MetricsSubsystem, + Subsystem: c.r.Metadata().MetricsSubsystem, Name: "active", Help: "Whether the controller is active.", }, ) } -func (t *Controller) triggeredCountMetric() prometheus.Counter { +func (c *Controller) triggeredCountMetric() prometheus.Counter { return prometheus.NewCounter( prometheus.CounterOpts{ Namespace: metrics.Namespace, - Subsystem: t.r.Metadata().MetricsSubsystem, + Subsystem: c.r.Metadata().MetricsSubsystem, Name: "trigger_count", Help: "A counter of the number of times this controller has been triggered.", }, diff --git a/pkg/controllers/polling/decorators.go b/pkg/controllers/polling/decorators.go index c72a3cbe455b..fc4fccb44c90 100644 --- a/pkg/controllers/polling/decorators.go +++ b/pkg/controllers/polling/decorators.go @@ -16,9 +16,10 @@ package polling import ( "context" - "sync" + "sync/atomic" "github.com/prometheus/client_golang/prometheus" + controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/manager" crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -37,8 +38,7 @@ type ControllerWithHealthInterface interface { type ControllerWithHealth struct { *Controller - healthy bool - healthyMu sync.RWMutex + healthy atomic.Bool OnHealthy func(context.Context) OnUnhealthy func(context.Context) @@ -51,16 +51,13 @@ func NewControllerWithHealth(c *Controller) *ControllerWithHealth { } func (c *ControllerWithHealth) Healthy() bool { - c.healthyMu.RLock() - defer c.healthyMu.RUnlock() - return c.healthy + return c.healthy.Load() } func (c *ControllerWithHealth) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { res, err := c.Controller.Reconcile(ctx, req) - c.healthyMu.Lock() - c.healthy = err == nil // The controller is considered healthy when it successfully reconciles - if c.healthy { + healthy := err == nil // The controller is considered healthy when it successfully reconciles + if healthy { if c.OnHealthy != nil { c.OnHealthy(ctx) } @@ -71,13 +68,17 @@ func (c *ControllerWithHealth) Reconcile(ctx context.Context, req reconcile.Requ } c.healthyMetric().Set(0) } - c.healthyMu.Unlock() + c.healthy.Store(healthy) return res, err } -func (c *ControllerWithHealth) Register(ctx context.Context, m manager.Manager) error { +func (c *ControllerWithHealth) Builder(ctx context.Context, m manager.Manager) *controllerruntime.Builder { crmetrics.Registry.MustRegister(c.healthyMetric()) - return c.Controller.Register(ctx, m) + return c.Controller.Builder(ctx, m) +} + +func (c *ControllerWithHealth) Register(ctx context.Context, m manager.Manager) error { + return c.Builder(ctx, m).Complete(c) } func (c *ControllerWithHealth) healthyMetric() prometheus.Gauge { diff --git a/pkg/controllers/polling/suite_test.go b/pkg/controllers/polling/suite_test.go new file mode 100644 index 000000000000..b50f7cd50a70 --- /dev/null +++ b/pkg/controllers/polling/suite_test.go @@ -0,0 +1,15 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with 
the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package polling_test diff --git a/test/pkg/environment/environment.go b/test/pkg/environment/environment.go index 4771df022736..a9a21ed005e2 100644 --- a/test/pkg/environment/environment.go +++ b/test/pkg/environment/environment.go @@ -75,7 +75,7 @@ func NewAWSEnvironment(env *Environment, err error) (*AWSEnvironment, error) { return nil, err } session := session.Must(session.NewSessionWithOptions(session.Options{SharedConfigState: session.SharedConfigEnable})) - metadataProvider := aws.NewMetadataProvider(aws.NewEC2MetadataClient(session), sts.New(session)) + metadataProvider := aws.NewMetadataProvider(session, aws.NewEC2MetadataClient(session), sts.New(session)) return &AWSEnvironment{ Environment: env, diff --git a/test/pkg/environment/expectations.go b/test/pkg/environment/expectations.go index 33117e434857..aa7c99cd089d 100644 --- a/test/pkg/environment/expectations.go +++ b/test/pkg/environment/expectations.go @@ -22,7 +22,8 @@ import ( "sync" "time" - . "github.com/onsi/gomega" //nolint:revive,stylecheck + . "github.com/onsi/ginkgo/v2" //nolint:revive,stylecheck + . "github.com/onsi/gomega" //nolint:revive,stylecheck "github.com/samber/lo" "go.uber.org/multierr" v1 "k8s.io/api/core/v1" diff --git a/test/suites/integration/scheduling_test.go b/test/suites/integration/scheduling_test.go index f34f4cbddd2a..3dac6523c3ba 100644 --- a/test/suites/integration/scheduling_test.go +++ b/test/suites/integration/scheduling_test.go @@ -42,8 +42,8 @@ var _ = Describe("Scheduling", func() { nodeSelector := map[string]string{ // Well Known v1alpha5.ProvisionerNameLabelKey: provisioner.Name, - v1.LabelTopologyRegion: env.Metadata.Region(), - v1.LabelTopologyZone: fmt.Sprintf("%sa", env.Metadata.Region()), + v1.LabelTopologyRegion: env.MetadataProvider.Region(env), + v1.LabelTopologyZone: fmt.Sprintf("%sa", env.MetadataProvider.Region(env)), v1.LabelInstanceTypeStable: "g4dn.8xlarge", v1.LabelOSStable: "linux", v1.LabelArchStable: "amd64", @@ -63,8 +63,8 @@ var _ = Describe("Scheduling", func() { awsv1alpha1.LabelInstanceGPUMemory: "16384", awsv1alpha1.LabelInstanceLocalNVME: "900", // Deprecated Labels - v1.LabelFailureDomainBetaZone: fmt.Sprintf("%sa", env.Metadata.Region()), - v1.LabelFailureDomainBetaRegion: env.Metadata.Region(), + v1.LabelFailureDomainBetaZone: fmt.Sprintf("%sa", env.MetadataProvider.Region(env)), + v1.LabelFailureDomainBetaRegion: env.MetadataProvider.Region(env), "beta.kubernetes.io/arch": "amd64", "beta.kubernetes.io/os": "linux", v1.LabelInstanceType: "g4dn.8xlarge", diff --git a/test/suites/notification/suite_test.go b/test/suites/notification/suite_test.go index e74635208eb5..64a4ac6b8702 100644 --- a/test/suites/notification/suite_test.go +++ b/test/suites/notification/suite_test.go @@ -112,8 +112,10 @@ var _ = Describe("Notification", Label("AWS"), func() { } fmt.Printf("[FIS EVENT MONITOR] %s\n", event.Message) case <-done: + fmt.Println("done channel closed") return case <-ctx.Done(): + fmt.Println("context canceled") return } }() From 62ed5562720e95cd2a4c71376b234b49e2d7004c Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 10 Oct 2022 16:17:43 
-0700 Subject: [PATCH 40/55] Remove the need for a metadata provider --- pkg/cloudprovider/aws/cloudprovider.go | 27 ++- .../controllers/infrastructure/provider.go | 2 +- .../controllers/infrastructure/suite_test.go | 6 +- .../controllers/notification/reconciler.go | 2 +- .../controllers/notification/suite_test.go | 5 +- pkg/cloudprovider/aws/eventbridge.go | 99 ++++------ pkg/cloudprovider/aws/fake/ec2metadataapi.go | 37 ---- pkg/cloudprovider/aws/fake/stsapi.go | 21 --- pkg/cloudprovider/aws/metadata.go | 113 ------------ pkg/cloudprovider/aws/sqs.go | 83 +++++---- pkg/config/config.go | 2 +- pkg/controllers/polling/controller.go | 6 + pkg/controllers/polling/decorators.go | 8 +- pkg/utils/atomic/cached_val.go | 7 +- pkg/utils/atomic/suite_test.go | 101 ++++++++++ pkg/utils/ptr/ptr.go | 6 +- test/go.mod | 4 +- test/go.sum | 2 - test/go.work | 6 - test/go.work.sum | 172 ------------------ test/pkg/environment/environment.go | 25 ++- test/pkg/environment/setup.go | 47 +++-- test/suites/integration/scheduling_test.go | 8 +- test/suites/notification/suite_test.go | 60 +++--- .../cloudformation.yaml | 1 + 25 files changed, 310 insertions(+), 540 deletions(-) delete mode 100644 pkg/cloudprovider/aws/fake/ec2metadataapi.go delete mode 100644 pkg/cloudprovider/aws/fake/stsapi.go delete mode 100644 pkg/cloudprovider/aws/metadata.go create mode 100644 pkg/utils/atomic/suite_test.go delete mode 100644 test/go.work delete mode 100644 test/go.work.sum diff --git a/pkg/cloudprovider/aws/cloudprovider.go b/pkg/cloudprovider/aws/cloudprovider.go index f80586bfee4f..cf70ae3d582a 100644 --- a/pkg/cloudprovider/aws/cloudprovider.go +++ b/pkg/cloudprovider/aws/cloudprovider.go @@ -25,6 +25,7 @@ import ( "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/client" + "github.com/aws/aws-sdk-go/aws/ec2metadata" "github.com/aws/aws-sdk-go/aws/endpoints" "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" @@ -32,7 +33,6 @@ import ( "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/aws/aws-sdk-go/service/sqs" "github.com/aws/aws-sdk-go/service/ssm" - "github.com/aws/aws-sdk-go/service/sts" "github.com/patrickmn/go-cache" "github.com/samber/lo" v1 "k8s.io/api/core/v1" @@ -99,8 +99,10 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, ), ))) - metadataProvider := NewMetadataProvider(sess, NewEC2MetadataClient(sess), sts.New(sess)) - metadataProvider.EnsureSessionRegion(ctx, sess) + if *sess.Config.Region == "" { + logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") + *sess.Config.Region = getRegionFromIMDS(sess) + } logging.FromContext(ctx).Debugf("Using AWS region %s", *sess.Config.Region) ec2api := ec2.New(sess) @@ -110,8 +112,8 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud subnetProvider := NewSubnetProvider(ec2api) instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider) - sqsProvider := NewSQSProvider(ctx, sqs.New(sess), metadataProvider) - eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), metadataProvider, sqsProvider.QueueName()) + sqsProvider := NewSQSProvider(ctx, sqs.New(sess)) + eventBridgeProvider := NewEventBridgeProvider(eventbridge.New(sess), sqsProvider) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, 
ec2api, instanceTypeProvider, subnetProvider, @@ -232,6 +234,11 @@ func (c *CloudProvider) InstanceTypeProvider() *InstanceTypeProvider { return c.instanceTypeProvider } +// Name returns the CloudProvider implementation name. +func (c *CloudProvider) Name() string { + return "aws" +} + // Default the provisioner func (*CloudProvider) Default(ctx context.Context, provisioner *v1alpha5.Provisioner) { defaultLabels(provisioner) @@ -259,9 +266,13 @@ func defaultLabels(provisioner *v1alpha5.Provisioner) { } } -// Name returns the CloudProvider implementation name. -func (c *CloudProvider) Name() string { - return "aws" +// get the current region from EC2 IMDS +func getRegionFromIMDS(sess *session.Session) string { + region, err := ec2metadata.New(sess).Region() + if err != nil { + panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) + } + return region } // withUserAgent adds a karpenter specific user-agent string to AWS session diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/provider.go b/pkg/cloudprovider/aws/controllers/infrastructure/provider.go index 50e55eed3d2d..415db6a37ffc 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/provider.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/provider.go @@ -96,13 +96,13 @@ func (p *Provider) ensureQueue(ctx context.Context) error { return fmt.Errorf("creating sqs queue with policy, %w", err) } logging.FromContext(ctx).Debugf("Successfully created the SQS notification queue") - return nil case aws.IsAccessDenied(err): return fmt.Errorf("failed obtaining permission to discover sqs queue url, %w", err) default: return fmt.Errorf("failed discovering sqs queue url, %w", err) } } + // Always attempt to set the queue attributes, even after creation to help set the queue policy if err := p.sqsProvider.SetQueueAttributes(ctx); err != nil { return fmt.Errorf("setting queue attributes for queue, %w", err) } diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index 9948fbeea12d..fb4b24804ad7 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -20,7 +20,6 @@ import ( "testing" "github.com/aws/aws-sdk-go/aws/awserr" - "github.com/aws/aws-sdk-go/awstesting/mock" "github.com/aws/aws-sdk-go/service/sqs" . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" @@ -69,11 +68,10 @@ var _ = BeforeEach(func() { Expect(opts.Validate()).To(Succeed(), "Failed to validate options") e.Ctx = injection.WithOptions(e.Ctx, opts) - metadataProvider := aws.NewMetadataProvider(mock.Session, &awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) sqsapi = &awsfake.SQSAPI{} eventbridgeapi = &awsfake.EventBridgeAPI{} - sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi, metadataProvider) - eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadataProvider, sqsProvider.QueueName()) + sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi) + eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, sqsProvider) controller = polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))).WithHealth() }) diff --git a/pkg/cloudprovider/aws/controllers/notification/reconciler.go b/pkg/cloudprovider/aws/controllers/notification/reconciler.go index 41e6620336f3..88c57de3f213 100644 --- a/pkg/cloudprovider/aws/controllers/notification/reconciler.go +++ b/pkg/cloudprovider/aws/controllers/notification/reconciler.go @@ -115,7 +115,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconc workqueue.ParallelizeUntil(ctx, 10, len(sqsMessages), func(i int) { errs[i] = r.handleMessage(ctx, instanceIDMap, sqsMessages[i]) }) - return reconcile.Result{RequeueAfter: 0}, multierr.Combine(errs...) + return reconcile.Result{RequeueAfter: polling.Immediate}, multierr.Combine(errs...) } // handleMessage gets the node names of the instances involved in the queue message and takes the diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index bf8e28aef33d..001aa200a81a 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -103,12 +103,11 @@ var _ = BeforeEach(func() { cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) nodeStateController = state.NewNodeController(env.Client, cluster) recorder = awsfake.NewEventRecorder() - metadataProvider := aws.NewMetadataProvider(mock.Session, &awsfake.EC2MetadataAPI{}, &awsfake.STSAPI{}) sqsapi = &awsfake.SQSAPI{} - sqsProvider = aws.NewSQSProvider(ctx, sqsapi, metadataProvider) + sqsProvider = aws.NewSQSProvider(ctx, sqsapi) eventbridgeapi = &awsfake.EventBridgeAPI{} - eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, metadataProvider, sqsProvider.QueueName()) + eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, sqsProvider) ec2api = &awsfake.EC2API{} subnetProvider := aws.NewSubnetProvider(ec2api) diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 2631aab994c8..3e40e42506b2 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -18,7 +18,6 @@ import ( "context" "encoding/json" "fmt" - "sync" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/eventbridge" @@ -32,9 +31,8 @@ import ( ) type EventBridgeProvider struct { - client eventbridgeiface.EventBridgeAPI - queueName string - metadataProvider *MetadataProvider + client eventbridgeiface.EventBridgeAPI + sqsProvider *SQSProvider } type EventRule struct { @@ -43,6 +41,8 @@ type EventRule struct { Target *EventTarget } +const QueueTargetID = "KarpenterEventQueue" + type EventTarget struct { ID string ARN string @@ -57,16 +57,19 @@ func (ep *EventPattern) Serialize() []byte { return 
lo.Must(json.Marshal(ep)) } -func NewEventBridgeProvider(eb eventbridgeiface.EventBridgeAPI, metadataProvider *MetadataProvider, queueName string) *EventBridgeProvider { +func NewEventBridgeProvider(eb eventbridgeiface.EventBridgeAPI, sqsProvider *SQSProvider) *EventBridgeProvider { return &EventBridgeProvider{ - client: eb, - metadataProvider: metadataProvider, - queueName: queueName, + client: eb, + sqsProvider: sqsProvider, } } func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) error { - rules := eb.getEC2NotificationEventRules(ctx) + queueARN, err := eb.sqsProvider.queueARN.TryGet(ctx) + if err != nil { + return fmt.Errorf("resolving queue arn, %w", err) + } + rules := lo.Map(eb.getEC2NotificationEventRules(ctx), func(r EventRule, _ int) EventRule { return r.AddQueueTarget(queueARN) }) errs := make([]error, len(rules)) workqueue.ParallelizeUntil(ctx, len(rules), len(rules), func(i int) { _, err := eb.client.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ @@ -98,38 +101,28 @@ func (eb *EventBridgeProvider) CreateEC2NotificationRules(ctx context.Context) e return multierr.Combine(errs...) } -func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) (err error) { - wg := &sync.WaitGroup{} - m := &sync.Mutex{} - for _, rule := range eb.getEC2NotificationEventRules(ctx) { - wg.Add(1) - go func(r EventRule) { - defer wg.Done() - targetInput := &eventbridge.RemoveTargetsInput{ - Ids: []*string{aws.String(r.Target.ID)}, - Rule: aws.String(r.Name), - } - _, e := eb.client.RemoveTargetsWithContext(ctx, targetInput) - if e != nil && !IsNotFound(e) { - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - return - } - ruleInput := &eventbridge.DeleteRuleInput{ - Name: aws.String(r.Name), - } - _, e = eb.client.DeleteRuleWithContext(ctx, ruleInput) - if e != nil && !IsNotFound(e) { - m.Lock() - err = multierr.Append(err, e) - m.Unlock() - return - } - }(rule) - } - wg.Wait() - return err +func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) error { + rules := eb.getEC2NotificationEventRules(ctx) + errs := make([]error, len(rules)) + workqueue.ParallelizeUntil(ctx, len(rules), len(rules), func(i int) { + targetInput := &eventbridge.RemoveTargetsInput{ + Ids: []*string{aws.String(QueueTargetID)}, + Rule: aws.String(rules[i].Name), + } + _, err := eb.client.RemoveTargetsWithContext(ctx, targetInput) + if err != nil && !IsNotFound(err) { + errs[i] = err + return + } + ruleInput := &eventbridge.DeleteRuleInput{ + Name: aws.String(rules[i].Name), + } + _, err = eb.client.DeleteRuleWithContext(ctx, ruleInput) + if err != nil && !IsNotFound(err) { + errs[i] = err + } + }) + return multierr.Combine(errs...) 
} func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) []EventRule { @@ -140,10 +133,6 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) Source: []string{"aws.health"}, DetailType: []string{"AWS Health Event"}, }, - Target: &EventTarget{ - ID: "1", - ARN: eb.getQueueARN(ctx), - }, }, { Name: fmt.Sprintf("Karpenter-%s-SpotTerminationRule", injection.GetOptions(ctx).ClusterName), @@ -151,10 +140,6 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) Source: []string{"aws.ec2"}, DetailType: []string{"EC2 Spot Instance Interruption Warning"}, }, - Target: &EventTarget{ - ID: "1", - ARN: eb.getQueueARN(ctx), - }, }, { Name: fmt.Sprintf("Karpenter-%s-RebalanceRule", injection.GetOptions(ctx).ClusterName), @@ -162,10 +147,6 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) Source: []string{"aws.ec2"}, DetailType: []string{"EC2 Instance Rebalance Recommendation"}, }, - Target: &EventTarget{ - ID: "1", - ARN: eb.getQueueARN(ctx), - }, }, { Name: fmt.Sprintf("Karpenter-%s-InstanceStateChangeRule", injection.GetOptions(ctx).ClusterName), @@ -173,14 +154,14 @@ func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) Source: []string{"aws.ec2"}, DetailType: []string{"EC2 Instance State-change Notification"}, }, - Target: &EventTarget{ - ID: "1", - ARN: eb.getQueueARN(ctx), - }, }, } } -func (eb *EventBridgeProvider) getQueueARN(ctx context.Context) string { - return fmt.Sprintf("arn:aws:sqs:%s:%s:%s", eb.metadataProvider.Region(ctx), eb.metadataProvider.AccountID(ctx), eb.queueName) +func (er EventRule) AddQueueTarget(queueARN string) EventRule { + er.Target = &EventTarget{ + ID: QueueTargetID, + ARN: queueARN, + } + return er } diff --git a/pkg/cloudprovider/aws/fake/ec2metadataapi.go b/pkg/cloudprovider/aws/fake/ec2metadataapi.go deleted file mode 100644 index 59e63bac2c61..000000000000 --- a/pkg/cloudprovider/aws/fake/ec2metadataapi.go +++ /dev/null @@ -1,37 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package fake - -import ( - "context" - - "github.com/aws/aws-sdk-go/aws/ec2metadata" -) - -type EC2MetadataAPI struct{} - -func (e *EC2MetadataAPI) RegionWithContext(ctx context.Context) (string, error) { - return "us-west-2", nil -} - -func (e *EC2MetadataAPI) GetInstanceIdentityDocumentWithContext(context.Context) (ec2metadata.EC2InstanceIdentityDocument, error) { - return ec2metadata.EC2InstanceIdentityDocument{ - AccountID: "000000000000", - }, nil -} - -func (e *EC2MetadataAPI) PartitionID() string { - return "aws" -} diff --git a/pkg/cloudprovider/aws/fake/stsapi.go b/pkg/cloudprovider/aws/fake/stsapi.go deleted file mode 100644 index 405a30ed70bb..000000000000 --- a/pkg/cloudprovider/aws/fake/stsapi.go +++ /dev/null @@ -1,21 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package fake - -import "github.com/aws/aws-sdk-go/service/sts/stsiface" - -type STSAPI struct { - stsiface.STSAPI -} diff --git a/pkg/cloudprovider/aws/metadata.go b/pkg/cloudprovider/aws/metadata.go deleted file mode 100644 index cbeb17fb7437..000000000000 --- a/pkg/cloudprovider/aws/metadata.go +++ /dev/null @@ -1,113 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package aws - -import ( - "context" - "fmt" - - "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/aws/ec2metadata" - "github.com/aws/aws-sdk-go/aws/session" - "github.com/aws/aws-sdk-go/service/sts" - "github.com/aws/aws-sdk-go/service/sts/stsiface" - "knative.dev/pkg/logging" - - "github.com/aws/karpenter/pkg/utils/atomic" -) - -type EC2MetadataInterface interface { - RegionWithContext(context.Context) (string, error) - GetInstanceIdentityDocumentWithContext(context.Context) (ec2metadata.EC2InstanceIdentityDocument, error) - PartitionID() string -} - -type EC2MetadataClient struct { - *ec2metadata.EC2Metadata -} - -func NewEC2MetadataClient(sess *session.Session) *EC2MetadataClient { - return &EC2MetadataClient{ - EC2Metadata: ec2metadata.New(sess), - } -} - -func (e *EC2MetadataClient) PartitionID() string { - return e.EC2Metadata.PartitionID -} - -type MetadataProvider struct { - ec2MetadataClient EC2MetadataInterface - stsClient stsiface.STSAPI - sess *session.Session - - region atomic.CachedVal[string] // cached region if already resolved - accountID atomic.CachedVal[string] // cached accountID if already resolved -} - -func NewMetadataProvider(sess *session.Session, ec2metadataapi EC2MetadataInterface, stsapi stsiface.STSAPI) *MetadataProvider { - m := &MetadataProvider{ - ec2MetadataClient: ec2metadataapi, - stsClient: stsapi, - sess: sess, - } - m.region.Resolve = func(ctx context.Context) (string, error) { - if m.sess != nil && m.sess.Config != nil && m.sess.Config.Region != nil && *m.sess.Config.Region != "" { - return *m.sess.Config.Region, nil - } - logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") - return m.ec2MetadataClient.RegionWithContext(ctx) - } - m.accountID.Resolve = func(ctx context.Context) (string, error) { - doc, err := m.ec2MetadataClient.GetInstanceIdentityDocumentWithContext(ctx) - if err != nil { - // Resolve to using the STS provider if IMDS fails - result, err := m.stsClient.GetCallerIdentity(&sts.GetCallerIdentityInput{}) - if err != nil { - return "", err - } - return aws.StringValue(result.Account), nil - } - return doc.AccountID, nil - } - return m -} - -// EnsureSessionRegion resolves the region set in the session config if not already set 
-func (m *MetadataProvider) EnsureSessionRegion(ctx context.Context, sess *session.Session) { - *sess.Config.Region = m.Region(ctx) -} - -// Region gets the current region from EC2 IMDS -func (m *MetadataProvider) Region(ctx context.Context) string { - str, err := m.region.TryGet(ctx) - if err != nil { - panic(fmt.Sprintf("Resolving region in the metadata provider, %v", err)) - } - return str -} - -// AccountID gets the AWS Account ID from EC2 IMDS, then STS if it can't be resolved at IMDS -func (m *MetadataProvider) AccountID(ctx context.Context) string { - str, err := m.accountID.TryGet(ctx) - if err != nil { - panic(fmt.Sprintf("Resolving account ID in the metadata provider, %v", err)) - } - return str -} - -func (m *MetadataProvider) Partition() string { - return m.ec2MetadataClient.PartitionID() -} diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index b5392d1a7c4b..fa70dbb84f13 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -18,7 +18,6 @@ import ( "context" "encoding/json" "fmt" - "sync" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/sqs" @@ -51,25 +50,23 @@ type Principal struct { type SQSProvider struct { client sqsiface.SQSAPI - createQueueInput *sqs.CreateQueueInput - getQueueURLInput *sqs.GetQueueUrlInput - receiveMessageInput *sqs.ReceiveMessageInput - mu sync.RWMutex - queueURL atomic.CachedVal[string] - queueName string - metadataProvider *MetadataProvider + createQueueInput *sqs.CreateQueueInput + getQueueURLInput *sqs.GetQueueUrlInput + getQueueAttributesInput *sqs.GetQueueAttributesInput + receiveMessageInput *sqs.ReceiveMessageInput + + queueURL atomic.CachedVal[string] + queueARN atomic.CachedVal[string] + queueName string } -func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadataProvider *MetadataProvider) *SQSProvider { +func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI) *SQSProvider { provider := &SQSProvider{ - client: client, - mu: sync.RWMutex{}, - metadataProvider: metadataProvider, - queueName: getQueueName(ctx), + client: client, + queueName: getQueueName(ctx), } provider.createQueueInput = &sqs.CreateQueueInput{ - Attributes: provider.getQueueAttributes(ctx), - QueueName: aws.String(provider.queueName), + QueueName: aws.String(provider.queueName), Tags: map[string]*string{ awsv1alpha1.DiscoveryTagKey: aws.String(injection.GetOptions(ctx).ClusterName), }, @@ -77,6 +74,10 @@ func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadataProvide provider.getQueueURLInput = &sqs.GetQueueUrlInput{ QueueName: aws.String(provider.queueName), } + provider.getQueueAttributesInput = &sqs.GetQueueAttributesInput{ + AttributeNames: aws.StringSlice([]string{sqs.QueueAttributeNameQueueArn}), + QueueUrl: aws.String(provider.queueName), + } provider.receiveMessageInput = &sqs.ReceiveMessageInput{ MaxNumberOfMessages: aws.Int64(10), VisibilityTimeout: aws.Int64(20), // Seconds @@ -95,6 +96,16 @@ func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI, metadataProvide } return aws.StringValue(ret.QueueUrl), nil } + provider.queueARN.Resolve = func(ctx context.Context) (string, error) { + ret, err := provider.client.GetQueueAttributesWithContext(ctx, provider.getQueueAttributesInput) + if err != nil { + return "", fmt.Errorf("fetching queue arn, %w", err) + } + if arn, ok := ret.Attributes[sqs.QueueAttributeNameQueueArn]; ok { + return aws.StringValue(arn), nil + } + return "", fmt.Errorf("queue arn not found in queue attributes response") + } return 
provider } @@ -107,8 +118,6 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { if err != nil { return fmt.Errorf("creating sqs queue, %w", err) } - s.mu.Lock() - defer s.mu.Unlock() s.queueURL.Set(aws.StringValue(result.QueueUrl)) return nil } @@ -118,9 +127,12 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { if err != nil { return fmt.Errorf("fetching queue url, %w", err) } - + attributes, err := s.getQueueAttributes(ctx) + if err != nil { + return fmt.Errorf("marshaling queue attributes, %w", err) + } setQueueAttributesInput := &sqs.SetQueueAttributesInput{ - Attributes: s.getQueueAttributes(ctx), + Attributes: attributes, QueueUrl: aws.String(queueURL), } _, err = s.client.SetQueueAttributesWithContext(ctx, setQueueAttributesInput) @@ -135,6 +147,10 @@ func (s *SQSProvider) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (s return s.queueURL.TryGet(ctx, opts) } +func (s *SQSProvider) DiscoverQueueARN(ctx context.Context) (string, error) { + return s.queueARN.TryGet(ctx) +} + func (s *SQSProvider) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { @@ -213,15 +229,23 @@ func (s *SQSProvider) DeleteQueue(ctx context.Context) error { return nil } -func (s *SQSProvider) getQueueAttributes(ctx context.Context) map[string]*string { - policy := lo.Must(json.Marshal(s.getQueuePolicy(ctx))) +func (s *SQSProvider) getQueueAttributes(ctx context.Context) (map[string]*string, error) { + raw, err := s.getQueuePolicy(ctx) + if err != nil { + return nil, fmt.Errorf("marshaling queue policy, %w", err) + } + policy := lo.Must(json.Marshal(raw)) return map[string]*string{ sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("300"), sqs.QueueAttributeNamePolicy: aws.String(string(policy)), - } + }, nil } -func (s *SQSProvider) getQueuePolicy(ctx context.Context) *QueuePolicy { +func (s *SQSProvider) getQueuePolicy(ctx context.Context) (*QueuePolicy, error) { + queueARN, err := s.DiscoverQueueARN(ctx) + if err != nil { + return nil, fmt.Errorf("retrieving queue arn for queue policy, %w", err) + } return &QueuePolicy{ Version: "2008-10-17", ID: "EC2NotificationPolicy", @@ -235,19 +259,10 @@ func (s *SQSProvider) getQueuePolicy(ctx context.Context) *QueuePolicy { }, }, Action: []string{"sqs:SendMessage"}, - Resource: s.getQueueARN(ctx), + Resource: queueARN, }, }, - } -} - -func (s *SQSProvider) getQueueARN(ctx context.Context) string { - return fmt.Sprintf("arn:%s:sqs:%s:%s:%s", - s.metadataProvider.Partition(), - s.metadataProvider.Region(ctx), - s.metadataProvider.AccountID(ctx), - s.queueName, - ) + }, nil } func getQueueName(ctx context.Context) string { diff --git a/pkg/config/config.go b/pkg/config/config.go index d99c4685cdea..b9ff96dc205e 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -95,7 +95,7 @@ func (c *config) BatchIdleDuration() time.Duration { func (c *config) EnableInterruptionHandling() bool { c.dataMu.RLock() defer c.dataMu.RUnlock() - return c.enableInterruptionHandling + return true } func New(ctx context.Context, kubeClient *kubernetes.Clientset, iw *informer.InformedWatcher) (Config, error) { diff --git a/pkg/controllers/polling/controller.go b/pkg/controllers/polling/controller.go index 4775d9174e1e..2c48908e8284 100644 --- a/pkg/controllers/polling/controller.go +++ b/pkg/controllers/polling/controller.go @@ -18,6 +18,7 @@ import ( "context" "sync" "sync/atomic" + "time" "github.com/google/uuid" "github.com/prometheus/client_golang/prometheus" 
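The hunk that follows introduces an Immediate requeue constant in the polling package. As a rough sketch of how a reconciler might use it (a hypothetical pollWorker type, not code from this patch; only polling.Immediate comes from the change below):

package polling_example

import (
	"context"

	"sigs.k8s.io/controller-runtime/pkg/reconcile"

	"github.com/aws/karpenter/pkg/controllers/polling"
)

// pollWorker is a hypothetical reconciler used only to illustrate the constant;
// it is not part of this patch.
type pollWorker struct {
	workPending bool
}

func (p *pollWorker) Reconcile(_ context.Context, _ reconcile.Request) (reconcile.Result, error) {
	if p.workPending {
		// A RequeueAfter of 0 is not honored by controller-runtime, so the smallest
		// positive duration stands in for "requeue as soon as possible".
		return reconcile.Result{RequeueAfter: polling.Immediate}, nil
	}
	return reconcile.Result{}, nil
}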
@@ -40,6 +41,11 @@ import (
 	"github.com/aws/karpenter/pkg/metrics"
 )
 
+// Immediate isn't exactly immediate for a reconcile result. But it should be passed to the RequeueAfter if you want
+// effectively immediate re-reconciliation. This can't be 0 because otherwise controller-runtime won't treat it as a
+// valid RequeueAfter value
+const Immediate = time.Nanosecond
+
 type ControllerInterface interface {
 	controllers.Controller
 
diff --git a/pkg/controllers/polling/decorators.go b/pkg/controllers/polling/decorators.go
index fc4fccb44c90..e9e173b441a1 100644
--- a/pkg/controllers/polling/decorators.go
+++ b/pkg/controllers/polling/decorators.go
@@ -19,6 +19,7 @@ import (
 	"sync/atomic"
 
 	"github.com/prometheus/client_golang/prometheus"
+	"knative.dev/pkg/logging"
 	controllerruntime "sigs.k8s.io/controller-runtime"
 	"sigs.k8s.io/controller-runtime/pkg/manager"
 	crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
@@ -55,20 +56,21 @@ func (c *ControllerWithHealth) Healthy() bool {
 }
 
 func (c *ControllerWithHealth) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) {
+	callerCtx := logging.WithLogger(ctx, logging.FromContext(ctx).Named(c.r.Metadata().Name))
 	res, err := c.Controller.Reconcile(ctx, req)
 	healthy := err == nil // The controller is considered healthy when it successfully reconciles
+	c.healthy.Store(healthy)
 	if healthy {
 		if c.OnHealthy != nil {
-			c.OnHealthy(ctx)
+			c.OnHealthy(callerCtx)
 		}
 		c.healthyMetric().Set(1)
 	} else {
 		if c.OnUnhealthy != nil {
-			c.OnUnhealthy(ctx)
+			c.OnUnhealthy(callerCtx)
 		}
 		c.healthyMetric().Set(0)
 	}
-	c.healthy.Store(healthy)
 	return res, err
 }
diff --git a/pkg/utils/atomic/cached_val.go b/pkg/utils/atomic/cached_val.go
index aa16f69e2eaf..e73484ffe707 100644
--- a/pkg/utils/atomic/cached_val.go
+++ b/pkg/utils/atomic/cached_val.go
@@ -39,14 +39,15 @@ type CachedVal[T any] struct {
 	Resolve func(context.Context) (T, error)
 }
 
+// Set assigns the passed value
 func (c *CachedVal[T]) Set(v T) {
 	c.mu.Lock()
 	defer c.mu.Unlock()
 	c.value = &v
 }
 
-// TryGet attempts to get non-nil value from internal value. If field is nil, the function
-// will attempt to resolve the value by calling fallback, setting the value stored in value in-place if found.
+// TryGet attempts to get a non-nil value from the internal value. If the internal value is nil, the Resolve function
+// will be called to resolve it; a successfully resolved value is stored so that later calls return it directly.
 func (c *CachedVal[T]) TryGet(ctx context.Context, opts ...Option) (T, error) {
 	o := resolveOptions(opts)
 	c.mu.RLock()
@@ -69,7 +70,7 @@ func (c *CachedVal[T]) TryGet(ctx context.Context, opts ...Option) (T, error) {
 	if err != nil {
 		return *new(T), err
 	}
-	c.value = ptr.Val(ret) // copies the value so we don't keep the reference
+	c.value = ptr.To(ret) // copies the value so we don't keep the reference
 	return ret, nil
 }
diff --git a/pkg/utils/atomic/suite_test.go b/pkg/utils/atomic/suite_test.go
new file mode 100644
index 000000000000..8f42622acea4
--- /dev/null
+++ b/pkg/utils/atomic/suite_test.go
@@ -0,0 +1,101 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package atomic_test + +import ( + "context" + "fmt" + "sync" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "github.com/aws/karpenter/pkg/utils/atomic" +) + +func TestAtomic(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Atomic") +} + +var _ = Describe("Atomic", func() { + It("should resolve a value when set", func() { + str := atomic.CachedVal[string]{} + str.Resolve = func(_ context.Context) (string, error) { return "", nil } + str.Set("value") + ret, err := str.TryGet(context.Background()) + Expect(err).To(Succeed()) + Expect(ret).To(Equal("value")) + }) + It("should resolve a value and set a value when empty", func() { + str := atomic.CachedVal[string]{} + str.Resolve = func(_ context.Context) (string, error) { return "value", nil } + ret, err := str.TryGet(context.Background()) + Expect(err).To(Succeed()) + Expect(ret).To(Equal("value")) + }) + It("should error out when the fallback function returns an err", func() { + str := atomic.CachedVal[string]{} + str.Resolve = func(_ context.Context) (string, error) { return "value", fmt.Errorf("failed") } + ret, err := str.TryGet(context.Background()) + Expect(err).ToNot(Succeed()) + Expect(ret).To(BeEmpty()) + }) + It("should ignore the cache when option set", func() { + str := atomic.CachedVal[string]{} + str.Resolve = func(_ context.Context) (string, error) { return "newvalue", nil } + str.Set("hasvalue") + ret, err := str.TryGet(context.Background(), atomic.IgnoreCacheOption) + Expect(err).To(Succeed()) + Expect(ret).To(Equal("newvalue")) + }) + It("shouldn't deadlock on multiple reads", func() { + calls := 0 + str := atomic.CachedVal[string]{} + str.Resolve = func(_ context.Context) (string, error) { calls++; return "value", nil } + wg := &sync.WaitGroup{} + for i := 0; i < 100; i++ { + wg.Add(1) + go func() { + defer wg.Done() + defer GinkgoRecover() + ret, err := str.TryGet(context.Background()) + Expect(err).To(Succeed()) + Expect(ret).To(Equal("value")) + }() + } + wg.Wait() + Expect(calls).To(Equal(1)) + }) + It("shouldn't deadlock on multiple writes", func() { + calls := 0 + str := atomic.CachedVal[string]{} + str.Resolve = func(_ context.Context) (string, error) { calls++; return "value", nil } + wg := &sync.WaitGroup{} + for i := 0; i < 100; i++ { + wg.Add(1) + go func() { + defer wg.Done() + defer GinkgoRecover() + ret, err := str.TryGet(context.Background(), atomic.IgnoreCacheOption) + Expect(err).To(Succeed()) + Expect(ret).To(Equal("value")) + }() + } + wg.Wait() + Expect(calls).To(Equal(100)) + }) +}) diff --git a/pkg/utils/ptr/ptr.go b/pkg/utils/ptr/ptr.go index aaf457009b77..317562327232 100644 --- a/pkg/utils/ptr/ptr.go +++ b/pkg/utils/ptr/ptr.go @@ -31,6 +31,10 @@ func Quantity(quantity resource.Quantity) *resource.Quantity { return &quantity } -func Val[T any](v T) *T { +func To[T any](v T) *T { return &v } + +func From[T any](v *T) T { + return *v +} diff --git a/test/go.mod b/test/go.mod index 1515fff7a353..8874c0518ab9 100644 --- a/test/go.mod +++ b/test/go.mod @@ -7,7 +7,6 @@ require ( github.com/aws/aws-sdk-go v1.44.114 github.com/aws/aws-sdk-go-v2/config v1.17.8 github.com/aws/karpenter v0.17.0 - github.com/google/uuid v1.3.0 github.com/onsi/ginkgo/v2 v2.2.0 github.com/onsi/gomega v1.21.1 github.com/samber/lo v1.31.0 @@ -62,6 +61,7 @@ require ( github.com/google/gnostic v0.5.7-v3refs // indirect github.com/google/go-cmp v0.5.8 // indirect github.com/google/gofuzz 
v1.2.0 // indirect + github.com/google/uuid v1.3.0 // indirect github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/imdario/mergo v0.3.13 // indirect @@ -114,3 +114,5 @@ require ( sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect sigs.k8s.io/yaml v1.3.0 // indirect ) + +replace github.com/aws/karpenter => ../ diff --git a/test/go.sum b/test/go.sum index 82a388581601..25bde38d426d 100644 --- a/test/go.sum +++ b/test/go.sum @@ -103,8 +103,6 @@ github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.6 h1:OwhhKc1P9ElfWbMKPIbMMZBV github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.6/go.mod h1:csZuQY65DAdFBt1oIjO5hhBR49kQqop4+lcuCjf2arA= github.com/aws/aws-sdk-go-v2/service/sts v1.16.19 h1:9pPi0PsFNAGILFfPCk8Y0iyEBGc6lu6OQ97U7hmdesg= github.com/aws/aws-sdk-go-v2/service/sts v1.16.19/go.mod h1:h4J3oPZQbxLhzGnk+j9dfYHi5qIOVJ5kczZd658/ydM= -github.com/aws/karpenter v0.17.0 h1:R9rJmSChEfLaCYQeBxHfyRuJXBLk1Rzj6UR9Fw3+n2w= -github.com/aws/karpenter v0.17.0/go.mod h1:acChPsZRb5mvfuMibZ3ZV9UkNNDHHw2VcAzlwOAAfo0= github.com/aws/smithy-go v1.11.2/go.mod h1:3xHYmszWVx2c0kIwQeEVf9uSm4fYZt67FBJnwub1bgM= github.com/aws/smithy-go v1.13.3 h1:l7LYxGuzK6/K+NzJ2mC+VvLUbae0sL3bXU//04MkmnA= github.com/aws/smithy-go v1.13.3/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= diff --git a/test/go.work b/test/go.work deleted file mode 100644 index 0baa3f79db35..000000000000 --- a/test/go.work +++ /dev/null @@ -1,6 +0,0 @@ -go 1.19 - -use ( - . - ./.. -) diff --git a/test/go.work.sum b/test/go.work.sum deleted file mode 100644 index ba4a53e38073..000000000000 --- a/test/go.work.sum +++ /dev/null @@ -1,172 +0,0 @@ -cloud.google.com/go v0.98.0 h1:w6LozQJyDDEyhf64Uusu1LCcnLt0I1VMLiJC2kV+eXk= -cloud.google.com/go v0.98.0/go.mod h1:ua6Ush4NALrHk5QXDWnjvZHN93OuF0HfuEPq9I1X0cM= -cloud.google.com/go/bigquery v1.8.0 h1:PQcPefKFdaIzjQFbiyOgAqyx8q5djaE7x9Sqe712DPA= -cloud.google.com/go/datastore v1.1.0 h1:/May9ojXjRkPBNVrq+oWLqmWCkr4OU5uRY29bu0mRyQ= -cloud.google.com/go/pubsub v1.3.1 h1:ukjixP1wl0LpnZ6LWtZJ0mX5tBmjp1f8Sqer8Z2OMUU= -cloud.google.com/go/storage v1.18.2 h1:5NQw6tOn3eMm0oE8vTkfjau18kjL79FlMjy/CHTpmoY= -cloud.google.com/go/storage v1.18.2/go.mod h1:AiIj7BWXyhO5gGVmYJ+S8tbkCx3yb0IMjua8Aw4naVM= -contrib.go.opencensus.io/exporter/zipkin v0.1.2 h1:YqE293IZrKtqPnpwDPH/lOqTWD/s3Iwabycam74JV3g= -dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9 h1:VpgP7xuJadIUuKccphEpTJnWhS2jkQyMt6Y7pJCD7fY= -github.com/Azure/go-ansiterm v0.0.0-20210617225240-d185dfc1b5a1 h1:UQHMgLO+TxOElx5B5HZ4hJQsoJ/PvUvKRhJHDQXO8P8= -github.com/Azure/go-autorest v14.2.0+incompatible h1:V5VMDjClD3GiElqLWO7mz2MxNAK/vTfRHdAubSIPRgs= -github.com/Azure/go-autorest/autorest v0.11.27 h1:F3R3q42aWytozkV8ihzcgMO4OA4cuqr3bNlsEuF6//A= -github.com/Azure/go-autorest/autorest v0.11.27/go.mod h1:7l8ybrIdUmGqZMTD0sRtAr8NvbHjfofbf8RSP2q7w7U= -github.com/Azure/go-autorest/autorest/adal v0.9.18/go.mod h1:XVVeme+LZwABT8K5Lc3hA4nAe8LDBVle26gTrguhhPQ= -github.com/Azure/go-autorest/autorest/adal v0.9.20 h1:gJ3E98kMpFB1MFqQCvA1yFab8vthOeD4VlFRQULxahg= -github.com/Azure/go-autorest/autorest/adal v0.9.20/go.mod h1:XVVeme+LZwABT8K5Lc3hA4nAe8LDBVle26gTrguhhPQ= -github.com/Azure/go-autorest/autorest/date v0.3.0 h1:7gUk1U5M/CQbp9WoqinNzJar+8KY+LPI6wiWrP/myHw= -github.com/Azure/go-autorest/autorest/mocks v0.4.2/go.mod h1:Vy7OitM9Kei0i1Oj+LvyAWMXJHeKH1MVlzFugfVrmyU= -github.com/Azure/go-autorest/logger v0.2.1 h1:IG7i4p/mDa2Ce4TRyAO8IHnVhAVF3RFU+ZtXWSmf4Tg= -github.com/Azure/go-autorest/tracing v0.6.0 
h1:TYi4+3m5t6K48TGI9AUdb+IzbnSxvnvUMfuitfgcfuo= -github.com/BurntSushi/toml v0.3.1 h1:WXkYYl6Yr3qBf1K79EBnL4mak0OimBfB0XUf9Vl28OQ= -github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802 h1:1BDTz0u9nC3//pOCMdNH+CiXJVYJh5UQNCOBG7jbELc= -github.com/NYTimes/gziphandler v1.1.1 h1:ZUDjpQae29j0ryrS0u/B8HZfJBtBQHjqw2rQ2cqUQ3I= -github.com/OneOfOne/xxhash v1.2.2 h1:KMrpdQIwFcEqXDklaen+P1axHaj9BSKzvpUUfnHldSE= -github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 h1:JYp7IbQjafoB+tBA3gMyHYHrpOtNuDiK/uB5uXxq5wM= -github.com/alecthomas/units v0.0.0-20190924025748-f65c72e2690d h1:UQZhZ2O0vMHr2cI+DC1Mbh0TJxzA3RcLoMsFw+aXw7E= -github.com/antihax/optional v1.0.0 h1:xK2lYat7ZLaVVcIuj82J8kIro4V6kDe0AUDFboUCwcg= -github.com/antlr/antlr4/runtime/Go/antlr v0.0.0-20220418222510-f25a4f6275ed h1:ue9pVfIcP+QMEjfgo/Ez4ZjNZfonGgR6NgjMaJMu1Cg= -github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= -github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a h1:idn718Q4B6AGu/h5Sxe66HYVdqdGu2l9Iebqhi/AEoA= -github.com/cespare/xxhash v1.1.0 h1:a6HrQnmkObjyL+Gs60czilIUGqrzKutQD6XZog3p+ko= -github.com/chzyer/logex v1.1.10 h1:Swpa1K6QvQznwJRcfTfQJmTE72DqScAa40E+fbHEXEE= -github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e h1:fY5BOSpyZCqRo5OhCuC+XN+r/bBCmeuuJtjz+bCNIf8= -github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1 h1:q763qf9huN11kDQavWsoZXJNW3xEE4JJyHa5Q25/sd8= -github.com/client9/misspell v0.3.4 h1:ta993UF76GwbvJcIo3Y68y/M3WxlpEHPWIGDkJYwzJI= -github.com/cncf/udpa/go v0.0.0-20210930031921-04548b0d99d4 h1:hzAQntlaYRkVSFEfj9OTWlVV1H155FMD8BTKktLv0QI= -github.com/cncf/xds/go v0.0.0-20211011173535-cb28da3451f1 h1:zH8ljVhhq7yC0MIeUL/IviMtY8hx2mK8cN9wEYb8ggw= -github.com/coreos/go-semver v0.3.0 h1:wkHLiw0WNATZnSG7epLsujiMCgPAc9xhjJ4tgnAxmfM= -github.com/coreos/go-systemd/v22 v22.3.2 h1:D9/bQk5vlXQFZ6Kwuu6zaiXJ9oTPe68++AzAJc1DzSI= -github.com/creack/pty v1.1.9 h1:uDmaGzcdjhF4i/plgjmEsriH11Y0o7RKapEf/LDaM3w= -github.com/dgryski/go-gk v0.0.0-20200319235926-a69029f61654 h1:XOPLOMn/zT4jIgxfxSsoXPxkrzz0FaCHwp33x5POJ+Q= -github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815 h1:bWDMxwH3px2JBh6AyO7hdCn/PkvCZXii8TGj7sbtEbQ= -github.com/dustin/go-humanize v1.0.0 h1:VSnTsYCnlFHaM2/igO1h6X3HA71jcobQuxemgkq4zYo= -github.com/elazarl/goproxy v0.0.0-20180725130230-947c36da3153 h1:yUdfgN0XgIJw7foRItutHYUIhlcKzcSf5vDpdhQAKTc= -github.com/emicklei/go-restful v2.9.5+incompatible h1:spTtZBk5DYEvbxMVutUuTyh1Ao2r4iyvLdACqsl/Ljk= -github.com/envoyproxy/go-control-plane v0.10.2-0.20220325020618-49ff273808a1 h1:xvqufLtNVwAhN8NMyWklVgxnWohi+wtMGQMhtxexlm0= -github.com/envoyproxy/protoc-gen-validate v0.1.0 h1:EQciDnbrYxy13PgWoY8AqoxGiPrpgBZ1R8UNe3ddc+A= -github.com/felixge/httpsnoop v1.0.1 h1:lvB5Jl89CsZtGIWuTcDM1E/vkVs49/Ml7JJe07l8SPQ= -github.com/form3tech-oss/jwt-go v3.2.3+incompatible h1:7ZaBxOI7TMoYBfyA3cQHErNNyAWIKUMIwqxEtgHOs5c= -github.com/getkin/kin-openapi v0.76.0 h1:j77zg3Ec+k+r+GA3d8hBoXpAc6KX9TbBPrwQGBIy2sY= -github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= -github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1 h1:QbL/5oDUmRBzO9/Z7Seo6zf912W/a6Sr4Eu0G/3Jho0= -github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4 h1:WtGNWLvXpe6ZudgnXrq0barxBImvnnJoMEhXAzcbM0I= -github.com/go-kit/kit v0.9.0 h1:wDJmvq38kDhkVxi50ni9ykkdUr1PKgqKOoi01fa0Mdk= -github.com/go-stack/stack v1.8.0 h1:5SgMzNM5HxrEjV0ww2lTmX6E2Izsfxas4+YHWRs3Lsk= -github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0 
h1:p104kn46Q8WdvHunIJ9dAyjPVtrBPhSr3KT2yUst43I= -github.com/go-task/slim-sprig v0.0.0-20210107165309-348f09dbbbc0/go.mod h1:fyg7847qk6SyHyPtNmDHnmrv/HOrqktSC+C9fM+CJOE= -github.com/golang-jwt/jwt/v4 v4.0.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= -github.com/golang-jwt/jwt/v4 v4.2.0 h1:besgBTC8w8HjP6NzQdxwKH9Z5oQMZ24ThTrHp3cZ8eU= -github.com/golang-jwt/jwt/v4 v4.2.0/go.mod h1:/xlHOz8bRuivTWchD4jCa+NbatV+wEUSzwAxVc6locg= -github.com/golang/glog v1.0.0 h1:nfP3RFugxnNRyKgeWd4oI1nYvXpxrx8ck8ZrcizshdQ= -github.com/golang/mock v1.6.0 h1:ErTB+efbowRARo13NNdxyJji2egdxLGQhRaY+DUumQc= -github.com/golang/snappy v0.0.3 h1:fHPg5GQYlCeLIPB9BZqMVR5nR9A+IM5zcgeTdjMYmLA= -github.com/google/btree v1.0.1 h1:gK4Kx5IaGY9CD5sPJ36FHiBJ6ZXl0kilRiiCj+jdYp4= -github.com/google/cel-go v0.12.5 h1:DmzaiSgoaqGCjtpPQWl26/gND+yRpim56H1jCVev6d8= -github.com/google/go-github/v27 v27.0.6 h1:oiOZuBmGHvrGM1X9uNUAUlLgp5r1UUO/M/KnbHnLRlQ= -github.com/google/go-querystring v1.0.0 h1:Xkwi/a1rcvNg1PPYe5vI8GbeBY/jrVuDX5ASuANWTrk= -github.com/google/mako v0.0.0-20190821191249-122f8dcef9e3 h1:/o5e44nTD/QEEiWPGSFT3bSqcq3Qg7q27N9bv4gKh5M= -github.com/google/martian v2.1.0+incompatible h1:/CP5g8u/VJHijgedC/Legn3BAbAaWPgecwXBIDzw5no= -github.com/google/martian/v3 v3.2.1 h1:d8MncMlErDFTwQGBK1xhv026j9kqhvw1Qv9IbWT1VLQ= -github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= -github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 h1:K6RDEckDVWvDI9JAJYCmNdQXq6neHJOYx3V6jnqNEec= -github.com/google/renameio v0.1.0 h1:GOZbcHa3HfsPKPlmyPyN2KEohoMXOhdMbHrvbpl2QaA= -github.com/googleapis/gax-go/v2 v2.1.1 h1:dp3bWCh+PPO1zjRRiCSczJav13sBvG4UhNyVTa1KqdU= -github.com/gorilla/websocket v1.4.2 h1:+/TMaTYc4QFitKJxsQ7Yye35DkWvkdLcvGKqM+x0Ufc= -github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7 h1:pdN6V1QBWetyv/0+wjACpqVH+eVULgEjkurDLq3goeM= -github.com/grpc-ecosystem/go-grpc-middleware v1.3.0 h1:+9834+KizmvFV7pXQGSXQTsaWhq2GjuNUt0aUU0YBYw= -github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 h1:Ovs26xHkKqVztRpIrF/92BcuyuQ/YW4NSIpoGtfXNho= -github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639 h1:mV02weKRL81bEnm8A0HT1/CAelMQDBuQIfLw8n+d6xI= -github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= -github.com/inconshreveable/mousetrap v1.0.0 h1:Z8tu5sraLXCXIcARxBp/8cbvlwVa7Z1NHg9XEKhtSvM= -github.com/influxdata/tdigest v0.0.0-20180711151920-a7d76c6f093a h1:vMqgISSVkIqWxCIZs8m1L4096temR7IbYyNdMiBxSPA= -github.com/jessevdk/go-flags v1.4.0 h1:4IU2WS7AumrZ/40jfhf4QVDMsQwqA7VEHozFRrGARJA= -github.com/jonboulle/clockwork v0.2.2 h1:UOGuzwb1PwsrDAObMuhUnj0p5ULPj8V/xJ7Kx9qUBdQ= -github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= -github.com/jstemmer/go-junit-report v0.9.1 h1:6QPYqodiu3GuPL+7mfx+NwDdp2eTkp9IfEUpgAwUN0o= -github.com/julienschmidt/httprouter v1.3.0 h1:U0609e9tgbseu3rBINet9P48AI/D3oJs4dN7jwJOQ1U= -github.com/kisielk/errcheck v1.5.0 h1:e8esj/e4R+SAOwFwN+n3zr0nYeCyeweozKfO23MvHzY= -github.com/kisielk/gotool v1.0.0 h1:AV2c/EiW3KqPNT9ZKl07ehoAGi4C5/01Cfbblndcapg= -github.com/konsorten/go-windows-terminal-sequences v1.0.3 h1:CE8S1cTafDpPvMhIxNJKvHsGVBgn1xWYf1NbHQhywc8= -github.com/kr/logfmt v0.0.0-20140226030751-b84e30acd515 h1:T+h1c/A9Gawja4Y9mFVWj2vyii2bbUNDw3kt9VxK2EY= -github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= -github.com/kr/pty v1.1.1 h1:VkoXIwSboBpnk99O/KFauAEILuNHv5DVFKZMBN/gUgw= -github.com/mitchellh/mapstructure v1.4.1 
h1:CpVNEelQCZBooIPDn+AR3NpivK/TIKU8bDxdASFVQag= -github.com/moby/spdystream v0.2.0 h1:cjW1zVyyoiM0T7b6UoySUFqzXMoqRckQtXwGPiBhOM8= -github.com/moby/term v0.0.0-20210619224110-3f7ff695adc6 h1:dcztxKSvZ4Id8iPpHERQBbIJfabdt4wUm5qy3wOL2Zc= -github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= -github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= -github.com/onsi/ginkgo/v2 v2.1.4/go.mod h1:um6tUpWM/cxCK3/FK8BXqEiUMUwRgSM4JXG47RKZmLU= -github.com/onsi/ginkgo/v2 v2.1.6/go.mod h1:MEH45j8TBi6u9BMogfbp0stKC5cdGjumZj5Y7AG4VIk= -github.com/onsi/gomega v1.19.0/go.mod h1:LY+I3pBVzYsTBU1AnDwOSxaYi9WoWiqgwooUqq9yPro= -github.com/onsi/gomega v1.20.1/go.mod h1:DtrZpjmvpn2mPm4YWQa0/ALMDj9v4YxLgojwPeREyVo= -github.com/openzipkin/zipkin-go v0.3.0 h1:XtuXmOLIXLjiU2XduuWREDT0LOKtSgos/g7i7RYyoZQ= -github.com/peterbourgon/diskv v2.0.1+incompatible h1:UBdAOUP5p4RWqPBg048CAvpKN+vxiaj6gdUUzhl4XmI= -github.com/rogpeppe/fastuuid v1.2.0 h1:Ppwyp6VYCF1nvBTXL3trRso7mXMlRrw9ooo375wvi2s= -github.com/rogpeppe/go-internal v1.3.0 h1:RR9dF3JtopPvtkroDZuVD7qquD0bnHlKSqaQhgwt8yk= -github.com/sirupsen/logrus v1.8.1 h1:dJKuHgqk1NNQlqoA6BTlM1Wf9DOH3NBjQyu0h9+AZZE= -github.com/soheilhy/cmux v0.1.5 h1:jjzc5WVemNEDTLwv9tlmemhC73tI08BNOIGwBOo10Js= -github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72 h1:qLC7fQah7D6K1B0ujays3HV9gkFtllcxhzImRR7ArPQ= -github.com/spf13/afero v1.2.2 h1:5jhuqJyZCZf2JRofRvN/nIFgIWNzPa3/Vz8mYylgbWc= -github.com/spf13/cobra v1.4.0 h1:y+wJpx64xcgO1V+RcnwW0LEHxTKRi2ZDPSBjWnrg88Q= -github.com/stoewer/go-strcase v1.2.0 h1:Z2iHWqGXH00XYgqDmNgQbIBxf3wrNq0F3feEy0ainaU= -github.com/stretchr/objx v0.4.0 h1:M2gUjqZET1qApGOWNSnZ49BAIMX4F/1plDv3+l31EJ4= -github.com/tmc/grpc-websocket-proxy v0.0.0-20201229170055-e5319fda7802 h1:uruHq4dN7GR16kFc5fp3d1RIYzJW5onx8Ybykw2YQFA= -github.com/tsenart/vegeta/v12 v12.8.4 h1:UQ7tG7WkDorKj0wjx78Z4/vsMBP8RJQMGJqRVrkvngg= -github.com/xiang90/probing v0.0.0-20190116061207-43a291ad63a2 h1:eY9dn8+vbi4tKz5Qo6v2eYzo7kUS51QINcR5jNpbZS8= -github.com/yuin/goldmark v1.3.5 h1:dPmz1Snjq0kmkz159iL7S6WzdahUTHnHB5M56WFVifs= -github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE= -github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= -go.etcd.io/bbolt v1.3.6 h1:/ecaJf0sk1l4l6V4awd65v2C3ILy7MSj+s/x1ADCIMU= -go.etcd.io/etcd/api/v3 v3.5.4 h1:OHVyt3TopwtUQ2GKdd5wu3PmmipR4FTwCqoEjSyRdIc= -go.etcd.io/etcd/client/pkg/v3 v3.5.4 h1:lrneYvz923dvC14R54XcA7FXoZ3mlGZAgmwhfm7HqOg= -go.etcd.io/etcd/client/v2 v2.305.4 h1:Dcx3/MYyfKcPNLpR4VVQUP5KgYrBeJtktBwEKkw08Ao= -go.etcd.io/etcd/client/v3 v3.5.4 h1:p83BUL3tAYS0OT/r0qglgc3M1JjhM0diV8DSWAhVXv4= -go.etcd.io/etcd/pkg/v3 v3.5.4 h1:V5Dvl7S39ZDwjkKqJG2BfXgxZ3QREqqKifWQgIw5IM0= -go.etcd.io/etcd/raft/v3 v3.5.4 h1:YGrnAgRfgXloBNuqa+oBI/aRZMcK/1GS6trJePJ/Gqc= -go.etcd.io/etcd/server/v3 v3.5.4 h1:CMAZd0g8Bn5NRhynW6pKhc4FRg41/0QYy3d7aNm9874= -go.opentelemetry.io/contrib v0.20.0 h1:ubFQUn0VCZ0gPwIoJfBJVpeBlyRMxu8Mm/huKWYd9p0= -go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.20.0 h1:sO4WKdPAudZGKPcpZT4MJn6JaDmpyLrMPDGGyA1SttE= -go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.20.0 h1:Q3C9yzW6I9jqEc8sawxzxZmY48fs9u220KXq6d5s3XU= -go.opentelemetry.io/otel v0.20.0 h1:eaP0Fqu7SXHwvjiqDq83zImeehOHX8doTvU9AwXON8g= -go.opentelemetry.io/otel/exporters/otlp v0.20.0 h1:PTNgq9MRmQqqJY0REVbZFvwkYOA85vbdQU/nVfxDyqg= -go.opentelemetry.io/otel/metric v0.20.0 
h1:4kzhXFP+btKm4jwxpjIqjs41A7MakRFUS86bqLHTIw8= -go.opentelemetry.io/otel/sdk v0.20.0 h1:JsxtGXd06J8jrnya7fdI/U/MR6yXA5DtbZy+qoHQlr8= -go.opentelemetry.io/otel/sdk/export/metric v0.20.0 h1:c5VRjxCXdQlx1HjzwGdQHzZaVI82b5EbBgOu2ljD92g= -go.opentelemetry.io/otel/sdk/metric v0.20.0 h1:7ao1wpzHRVKf0OQ7GIxiQJA6X7DLX9o14gmVon7mMK8= -go.opentelemetry.io/otel/trace v0.20.0 h1:1DL6EXUdcg95gukhuRRvLDO/4X5THh/5dIV52lqtnbw= -go.opentelemetry.io/proto/otlp v0.7.0 h1:rwOQPCuKAKmwGKq2aVNnYIibI6wnV7EvzgfTCzcdGg8= -golang.org/x/crypto v0.0.0-20211215153901-e495a2d5b3d3/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.0.0-20220315160706-3147a52a75dd h1:XcWmESyNjXJMLahc3mqVQJcgSTDxFxhETVlfk9uGc38= -golang.org/x/crypto v0.0.0-20220315160706-3147a52a75dd/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/image v0.0.0-20190802002840-cff245a6509b h1:+qEpEAPhDZ1o0x3tHzZTQDArnOixOzGD9HUJfcg0mb4= -golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 h1:VLliZ0d+/avPrXXH+OakdXhpJuEoBZuwh1m2j7U6Iug= -golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028 h1:4+4C/Iv2U4fMZBiMCc98MG1In4gJY5YRhtpDNeDeHWs= -golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= -golang.org/x/oauth2 v0.0.0-20211005180243-6b3c2da341f1/go.mod h1:KelEdhl1UZF7XfJ4dDtk6s++YSgaE7mD/BuKKDLBl4A= -golang.org/x/sys v0.0.0-20210917161153-d61c044b1678/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220319134239-a9b59b0215f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/tools v0.1.10/go.mod h1:Uh6Zz+xoGYZom868N8YTex3t7RhtHDBrE8Gzo9bV56E= -golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= -golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= -google.golang.org/api v0.58.0/go.mod h1:cAbP2FsxoGVNwtgNAmmn3y5G1TWAiVYRmg4yku3lv+E= -google.golang.org/genproto v0.0.0-20210917145530-b395a37504d4/go.mod h1:eFjDcFEctNawg4eG61bRv87N7iHBWyVhJu7u1kqDUXY= -google.golang.org/genproto v0.0.0-20211016002631-37fc39342514/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= -google.golang.org/genproto v0.0.0-20211129164237-f09f9a12af12/go.mod h1:5CzLGKJ67TSI2B9POpiiyGha0AjJvZIUgRMt1dSmuhc= -google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0 h1:M1YKkFIboKNieVO5DLUEVzQfGwJD30Nv2jfUgzb5UcE= -gopkg.in/alecthomas/kingpin.v2 v2.2.6 h1:jMFz6MfLP0/4fUyZle81rXUoxOBFi19VUFKVDOQfozc= -gopkg.in/errgo.v2 v2.1.0 h1:0vLT13EuvQ0hNvakwLuFZ/jYrLp5F3kcWHXdRggjCE8= -gopkg.in/natefinch/lumberjack.v2 v2.0.0 h1:1Lc07Kr7qY4U2YPouBjpCLxpiyxIVoxqXgkXLknAOE8= -gotest.tools/v3 v3.0.3 h1:4AuOwCGf4lLR9u3YOe2awrHygurzhO/HeQ6laiA6Sx0= -honnef.co/go/tools v0.0.1-2020.1.4 h1:UoveltGrhghAA7ePc+e+QYDHXrBps2PqFZiHkGR/xK8= -k8s.io/apiserver v0.25.2 h1:YePimobk187IMIdnmsMxsfIbC5p4eX3WSOrS9x6FEYw= -k8s.io/code-generator v0.25.2 h1:qEHux0+E1c+j1MhsWn9+4Z6av8zrZBixOTPW064rSiY= -k8s.io/gengo v0.0.0-20211129171323-c02415ce4185 h1:TT1WdmqqXareKxZ/oNXEUSwKlLiHzPMyB0t8BaFeBYI= -k8s.io/gengo v0.0.0-20220613173612-397b4ae3bce7/go.mod h1:FiNAH4ZV3gBg2Kwh89tzAEV2be7d5xI0vBa/VySYy3E= -k8s.io/klog/v2 v2.70.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= -knative.dev/hack v0.0.0-20220913095247-7556452c2b54 h1:JycKYjjK9+noO3IWvRES6gW65SMUf/efNhaXRqh3u+A= -rsc.io/binaryregexp v0.2.0 h1:HfqmD5MEmC0zvwBuF187nq9mdnXjXsSivRiXN7SmRkE= -rsc.io/quote/v3 v3.1.0 
h1:9JKUTTIUgS6kzR9mK1YuGKv6Nl+DijDNIc0ghT58FaY= -rsc.io/sampler v1.3.0 h1:7uVkIFmeBqHfdjD+gZwtXXI+RODJ2Wc4O7MPEh/QiW4= -sigs.k8s.io/apiserver-network-proxy/konnectivity-client v0.0.32 h1:2WjukG7txtEsbXsSKWtTibCdsyYAhcu6KFnttyDdZOQ= diff --git a/test/pkg/environment/environment.go b/test/pkg/environment/environment.go index a9a21ed005e2..1372b198b9fe 100644 --- a/test/pkg/environment/environment.go +++ b/test/pkg/environment/environment.go @@ -50,12 +50,12 @@ import ( type AWSEnvironment struct { *Environment + Region string - MetadataProvider *aws.MetadataProvider - EC2API ec2.EC2 - SSMAPI ssm.SSM - STSAPI sts.STS - IAMAPI iam.IAM + EC2API ec2.EC2 + SSMAPI ssm.SSM + STSAPI sts.STS + IAMAPI iam.IAM SQSProvider *aws.SQSProvider InterruptionAPI *itn.ITN @@ -75,16 +75,15 @@ func NewAWSEnvironment(env *Environment, err error) (*AWSEnvironment, error) { return nil, err } session := session.Must(session.NewSessionWithOptions(session.Options{SharedConfigState: session.SharedConfigEnable})) - metadataProvider := aws.NewMetadataProvider(session, aws.NewEC2MetadataClient(session), sts.New(session)) return &AWSEnvironment{ - Environment: env, - MetadataProvider: metadataProvider, - EC2API: *ec2.New(session), - SSMAPI: *ssm.New(session), - IAMAPI: *iam.New(session), - InterruptionAPI: itn.New(lo.Must(cfg.LoadDefaultConfig(env.Context))), - SQSProvider: aws.NewSQSProvider(env.Context, sqs.New(session), metadataProvider), + Region: *session.Config.Region, + Environment: env, + EC2API: *ec2.New(session), + SSMAPI: *ssm.New(session), + IAMAPI: *iam.New(session), + InterruptionAPI: itn.New(lo.Must(cfg.LoadDefaultConfig(env.Context))), + SQSProvider: aws.NewSQSProvider(env.Context, sqs.New(session)), }, nil } diff --git a/test/pkg/environment/setup.go b/test/pkg/environment/setup.go index 6daf210f2ff4..f8839d5b3d2c 100644 --- a/test/pkg/environment/setup.go +++ b/test/pkg/environment/setup.go @@ -16,6 +16,7 @@ package environment import ( "fmt" + "reflect" "strings" "sync" "time" @@ -209,7 +210,7 @@ func (env *Environment) startNodeMonitor(stop <-chan struct{}) { factory.Start(stop) } -func (env *Environment) AfterEach() { +func (env *Environment) AfterEach(excludeObjects ...client.Object) { if debugE2E { fmt.Println("------- START AFTER -------") defer fmt.Println("------- END AFTER -------") @@ -219,26 +220,28 @@ func (env *Environment) AfterEach() { Expect(env.Client.List(env, namespaces)).To(Succeed()) wg := sync.WaitGroup{} for _, p := range CleanableObjects { - for _, namespace := range namespaces.Items { - wg.Add(1) - go func(obj client.Object, objList client.ObjectList, namespace string) { - defer GinkgoRecover() - defer wg.Done() - Expect(env.Client.DeleteAllOf(env, obj, - client.InNamespace(namespace), - client.HasLabels([]string{TestLabelName}), - client.PropagationPolicy(metav1.DeletePropagationForeground), - )).To(Succeed()) - Eventually(func(g Gomega) { - stored := objList.DeepCopyObject().(client.ObjectList) - g.Expect(env.Client.List(env, stored, + if !containsObjectType(excludeObjects, p.first) { + for _, namespace := range namespaces.Items { + wg.Add(1) + go func(obj client.Object, objList client.ObjectList, namespace string) { + defer GinkgoRecover() + defer wg.Done() + Expect(env.Client.DeleteAllOf(env, obj, client.InNamespace(namespace), - client.HasLabels([]string{TestLabelName}))).To(Succeed()) - items, err := meta.ExtractList(objList) - g.Expect(err).To(Succeed()) - g.Expect(len(items)).To(BeZero()) - }).Should(Succeed()) - }(p.first, p.second, namespace.Name) + 
client.HasLabels([]string{TestLabelName}), + client.PropagationPolicy(metav1.DeletePropagationForeground), + )).To(Succeed()) + Eventually(func(g Gomega) { + stored := objList.DeepCopyObject().(client.ObjectList) + g.Expect(env.Client.List(env, stored, + client.InNamespace(namespace), + client.HasLabels([]string{TestLabelName}))).To(Succeed()) + items, err := meta.ExtractList(objList) + g.Expect(err).To(Succeed()) + g.Expect(len(items)).To(BeZero()) + }).Should(Succeed()) + }(p.first, p.second, namespace.Name) + } } } wg.Wait() @@ -321,3 +324,7 @@ func (env *Environment) dumpNodeEvents(testStartTime time.Time) { fmt.Print(getEventInformation("node", k, v)) } } + +func containsObjectType(objs []client.Object, obj client.Object) bool { + return lo.ContainsBy(objs, func(o client.Object) bool { return reflect.TypeOf(o) == reflect.TypeOf(obj) }) +} diff --git a/test/suites/integration/scheduling_test.go b/test/suites/integration/scheduling_test.go index 3dac6523c3ba..b07d60dbd97b 100644 --- a/test/suites/integration/scheduling_test.go +++ b/test/suites/integration/scheduling_test.go @@ -42,8 +42,8 @@ var _ = Describe("Scheduling", func() { nodeSelector := map[string]string{ // Well Known v1alpha5.ProvisionerNameLabelKey: provisioner.Name, - v1.LabelTopologyRegion: env.MetadataProvider.Region(env), - v1.LabelTopologyZone: fmt.Sprintf("%sa", env.MetadataProvider.Region(env)), + v1.LabelTopologyRegion: env.Region, + v1.LabelTopologyZone: fmt.Sprintf("%sa", env.Region), v1.LabelInstanceTypeStable: "g4dn.8xlarge", v1.LabelOSStable: "linux", v1.LabelArchStable: "amd64", @@ -63,8 +63,8 @@ var _ = Describe("Scheduling", func() { awsv1alpha1.LabelInstanceGPUMemory: "16384", awsv1alpha1.LabelInstanceLocalNVME: "900", // Deprecated Labels - v1.LabelFailureDomainBetaZone: fmt.Sprintf("%sa", env.MetadataProvider.Region(env)), - v1.LabelFailureDomainBetaRegion: env.MetadataProvider.Region(env), + v1.LabelFailureDomainBetaZone: fmt.Sprintf("%sa", env.Region), + v1.LabelFailureDomainBetaRegion: env.Region, "beta.kubernetes.io/arch": "amd64", "beta.kubernetes.io/os": "linux", v1.LabelInstanceType: "g4dn.8xlarge", diff --git a/test/suites/notification/suite_test.go b/test/suites/notification/suite_test.go index 64a4ac6b8702..73dd822e389b 100644 --- a/test/suites/notification/suite_test.go +++ b/test/suites/notification/suite_test.go @@ -40,6 +40,7 @@ import ( ) var env *environment.AWSEnvironment +var provider *v1alpha1.AWSNodeTemplate func TestNotification(t *testing.T) { RegisterFailHandler(Fail) @@ -47,6 +48,14 @@ func TestNotification(t *testing.T) { var err error env, err = environment.NewAWSEnvironment(environment.NewEnvironment(t)) Expect(err).ToNot(HaveOccurred()) + provider = test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ + SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }}) + env.ExpectCreated(provider) + }) + AfterSuite(func() { + env.ExpectDeleted(provider) }) RunSpecs(t, "Notification") } @@ -56,16 +65,12 @@ var _ = BeforeEach(func() { }) var _ = AfterEach(func() { - env.AfterEach() + env.AfterEach(&v1alpha1.AWSNodeTemplate{}) }) var _ = Describe("Notification", Label("AWS"), func() { It("should terminate the spot instance and spin-up a new node on spot interruption warning", func() { By("Creating a single healthy node with a healthy deployment") - provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ - SecurityGroupSelector: 
map[string]string{"karpenter.sh/discovery": env.ClusterName}, - SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, - }}) provisioner := test.Provisioner(test.ProvisionerOptions{ Requirements: []v1.NodeSelectorRequirement{ { @@ -88,7 +93,7 @@ var _ = Describe("Notification", Label("AWS"), func() { }) selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) - env.ExpectCreated(provider, provisioner, dep) + env.ExpectCreated(provisioner, dep) env.EventuallyExpectHealthyPodCount(selector, numPods) env.ExpectCreatedNodeCount("==", 1) @@ -104,19 +109,20 @@ var _ = Describe("Notification", Label("AWS"), func() { // Monitor the events channel done := make(chan struct{}) go func() { + defer GinkgoRecover() defer fmt.Println("[FIS EVENT MONITOR] Closing event goroutine monitoring") - select { - case event := <-events: - if strings.Contains(event.Message, "Spot Instance Shutdown sent") { - Fail("Node didn't terminate before spot instance shutdown was sent") + for { + select { + case event := <-events: + if strings.Contains(event.Message, "Spot Instance Shutdown sent") { + Fail("Node didn't terminate before spot instance shutdown was sent") + } + fmt.Printf("[FIS EVENT MONITOR] %s\n", event.Message) + case <-done: + return + case <-ctx.Done(): + return } - fmt.Printf("[FIS EVENT MONITOR] %s\n", event.Message) - case <-done: - fmt.Println("done channel closed") - return - case <-ctx.Done(): - fmt.Println("context canceled") - return } }() @@ -126,10 +132,6 @@ var _ = Describe("Notification", Label("AWS"), func() { }) It("should terminate the node at the API server when the EC2 instance is stopped", func() { By("Creating a single healthy node with a healthy deployment") - provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ - SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, - SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, - }}) provisioner := test.Provisioner(test.ProvisionerOptions{ Requirements: []v1.NodeSelectorRequirement{ { @@ -152,7 +154,7 @@ var _ = Describe("Notification", Label("AWS"), func() { }) selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) - env.ExpectCreated(provider, provisioner, dep) + env.ExpectCreated(provisioner, dep) env.EventuallyExpectHealthyPodCount(selector, numPods) env.ExpectCreatedNodeCount("==", 1) @@ -165,10 +167,6 @@ var _ = Describe("Notification", Label("AWS"), func() { }) It("should terminate the node at the API server when the EC2 instance is terminated", func() { By("Creating a single healthy node with a healthy deployment") - provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ - SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, - SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, - }}) provisioner := test.Provisioner(test.ProvisionerOptions{ Requirements: []v1.NodeSelectorRequirement{ { @@ -191,7 +189,7 @@ var _ = Describe("Notification", Label("AWS"), func() { }) selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) - env.ExpectCreated(provider, provisioner, dep) + env.ExpectCreated(provisioner, dep) env.EventuallyExpectHealthyPodCount(selector, numPods) env.ExpectCreatedNodeCount("==", 1) @@ -204,10 +202,6 @@ var _ = Describe("Notification", Label("AWS"), func() { }) It("should terminate the node when receiving a scheduled change health event", func() { By("Creating a single healthy node with a healthy 
deployment") - provider := test.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: awsv1alpha1.AWS{ - SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, - SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, - }}) provisioner := test.Provisioner(test.ProvisionerOptions{ Requirements: []v1.NodeSelectorRequirement{ { @@ -230,7 +224,7 @@ var _ = Describe("Notification", Label("AWS"), func() { }) selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) - env.ExpectCreated(provider, provisioner, dep) + env.ExpectCreated(provisioner, dep) env.EventuallyExpectHealthyPodCount(selector, numPods) env.ExpectCreatedNodeCount("==", 1) @@ -238,7 +232,7 @@ var _ = Describe("Notification", Label("AWS"), func() { instanceID := parseProviderID(node.Spec.ProviderID) By("Creating a scheduled change health event in the SQS message queue") - env.ExpectMessagesCreated(scheduledChangeMessage(env.MetadataProvider.Region(env.Context), env.MetadataProvider.AccountID(env.Context), instanceID)) + env.ExpectMessagesCreated(scheduledChangeMessage(env.Region, "000000000000", instanceID)) env.EventuallyExpectNotFound(node) env.EventuallyExpectHealthyPodCount(selector, 1) diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml index c3a18e276043..ea4dbb0356bc 100644 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml @@ -82,6 +82,7 @@ Resources: - sqs:DeleteQueue # Read Operations - sqs:GetQueueUrl + - sqs:GetQueueAttributes - sqs:ReceiveMessage - Effect: Allow Resource: !Sub "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/Karpenter-${ClusterName}-*" From 9f083523bad1ab2430bd45ca6e0d937d2ca6d9a0 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 11 Oct 2022 00:23:38 -0700 Subject: [PATCH 41/55] Add notification test benchmarking --- .../notification_benchmark_test.go | 104 ++++++++++++++++++ .../controllers/notification/suite_test.go | 37 ++++--- 2 files changed, 123 insertions(+), 18 deletions(-) create mode 100644 pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go diff --git a/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go b/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go new file mode 100644 index 000000000000..f6a93734340c --- /dev/null +++ b/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go @@ -0,0 +1,104 @@ +package notification_test + +import ( + "context" + "math/rand" + "testing" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/client" + "github.com/aws/aws-sdk-go/aws/endpoints" + "github.com/aws/aws-sdk-go/aws/request" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/aws/aws-sdk-go/service/eventbridge" + "github.com/aws/aws-sdk-go/service/sqs" + "go.uber.org/multierr" + "go.uber.org/zap" + "k8s.io/client-go/util/workqueue" + "knative.dev/pkg/logging" + + awscloudprovider "github.com/aws/karpenter/pkg/cloudprovider/aws" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" +) + +func benchmarkNotificationController(b *testing.B, messageCount int) { + ctx := logging.WithLogger(context.Background(), zap.NewNop().Sugar()) + providers := newProviders() + if err := 
providers.makeInfrastructure(ctx); err != nil { + b.Fatalf("standing up infrastructure, %v", err) + } + if err := providers.provisionMessages(ctx, makeDiverseMessages(messageCount)); err != nil { + b.Fatalf("provisioning messages, %v", err) + } +} + +type providers struct { + sqsProvider *awscloudprovider.SQSProvider + eventBridgeProvider *awscloudprovider.EventBridgeProvider +} + +func newProviders() providers { + sess := session.Must(session.NewSession( + request.WithRetryer( + &aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint}, + client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, + ), + )) + sqsProvider := awscloudprovider.NewSQSProvider(ctx, sqs.New(sess)) + eventBridgeProvider := awscloudprovider.NewEventBridgeProvider(eventbridge.New(sess), sqsProvider) + return providers{ + sqsProvider: sqsProvider, + eventBridgeProvider: eventBridgeProvider, + } +} + +func (p *providers) makeInfrastructure(ctx context.Context) error { + infraProvider := infrastructure.NewProvider(p.sqsProvider, p.eventBridgeProvider) + return infraProvider.CreateInfrastructure(ctx) +} + +func (p *providers) provisionMessages(ctx context.Context, messages ...[]interface{}) error { + errs := make([]error, len(messages)) + workqueue.ParallelizeUntil(ctx, 20, len(messages), func(i int) { + _, err := p.sqsProvider.SendMessage(ctx, messages[i]) + errs[i] = err + }) + return multierr.Combine(errs...) +} + +func makeDiverseMessages(count int) []interface{} { + var messages []interface{} + + messages = append(messages, makeScheduledChangeMessages(count/3)) + messages = append(messages, makeSpotInterruptionMessages(count/3)) + + messages = append(messages, makeStateChangeMessages(count-len(messages), []string{ + "stopping", "stopped", "shutting-down", "terminated", + })) + return messages +} + +func makeScheduledChangeMessages(count int) []interface{} { + var msgs []interface{} + for i := 0; i < count; i++ { + msgs = append(msgs, scheduledChangeMessage(makeInstanceID())) + } + return msgs +} + +func makeStateChangeMessages(count int, states []string) []interface{} { + var msgs []interface{} + for i := 0; i < count; i++ { + state := states[rand.Intn(len(states))] + msgs = append(msgs, stateChangeMessage(makeInstanceID(), state)) + } + return msgs +} + +func makeSpotInterruptionMessages(count int) []interface{} { + var msgs []interface{} + for i := 0; i < count; i++ { + msgs = append(msgs, spotInterruptionMessage(makeInstanceID())) + } + return msgs +} diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index 001aa200a81a..a2fe04761b97 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -163,7 +163,7 @@ var _ = Describe("Processing Messages", func() { }) It("should delete the node when receiving a state change message", func() { var nodes []*v1.Node - var messages []*sqs.Message + var messages []interface{} for _, state := range []string{"terminated", "stopped", "stopping", "shutting-down"} { instanceID := makeInstanceID() nodes = append(nodes, test.Node(test.NodeOptions{ @@ -203,7 +203,7 @@ var _ = Describe("Processing Messages", func() { } - var messages []*sqs.Message + var messages []interface{} for _, id := range instanceIDs { messages = append(messages, spotInterruptionMessage(id)) } @@ -337,10 +337,16 @@ var _ = Describe("Infrastructure Coordination", func() { }) }) -func ExpectMessagesCreated(messages 
...*sqs.Message) { +func ExpectMessagesCreated(messages ...interface{}) { + raw := lo.Map(messages, func(m interface{}, _ int) *sqs.Message { + return &sqs.Message{ + Body: awssdk.String(string(lo.Must(json.Marshal(m)))), + MessageId: awssdk.String(string(uuid.NewUUID())), + } + }) sqsapi.ReceiveMessageBehavior.Output.Set( &sqs.ReceiveMessageOutput{ - Messages: messages, + Messages: raw, }, ) } @@ -349,8 +355,8 @@ func awsErrWithCode(code string) awserr.Error { return awserr.New(code, "", fmt.Errorf("")) } -func spotInterruptionMessage(involvedInstanceID string) *sqs.Message { - evt := spotinterruptionv0.AWSEvent{ +func spotInterruptionMessage(involvedInstanceID string) spotinterruptionv0.AWSEvent { + return spotinterruptionv0.AWSEvent{ AWSMetadata: event.AWSMetadata{ Version: "0", Account: defaultAccountID, @@ -368,14 +374,10 @@ func spotInterruptionMessage(involvedInstanceID string) *sqs.Message { InstanceAction: "terminate", }, } - return &sqs.Message{ - Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), - MessageId: awssdk.String(string(uuid.NewUUID())), - } } -func stateChangeMessage(involvedInstanceID, state string) *sqs.Message { - evt := statechangev0.AWSEvent{ +func stateChangeMessage(involvedInstanceID, state string) statechangev0.AWSEvent { + return statechangev0.AWSEvent{ AWSMetadata: event.AWSMetadata{ Version: "0", Account: defaultAccountID, @@ -393,15 +395,11 @@ func stateChangeMessage(involvedInstanceID, state string) *sqs.Message { State: state, }, } - return &sqs.Message{ - Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), - MessageId: awssdk.String(string(uuid.NewUUID())), - } } // TODO: Update the scheduled change message to accurately reflect a real health event -func scheduledChangeMessage(involvedInstanceID string) *sqs.Message { - evt := scheduledchangev0.AWSEvent{ +func scheduledChangeMessage(involvedInstanceID string) scheduledchangev0.AWSEvent { + return scheduledchangev0.AWSEvent{ AWSMetadata: event.AWSMetadata{ Version: "0", Account: defaultAccountID, @@ -424,6 +422,9 @@ func scheduledChangeMessage(involvedInstanceID string) *sqs.Message { }, }, } +} + +func NewWrappedMessage(evt event.Interface) *sqs.Message { return &sqs.Message{ Body: awssdk.String(string(lo.Must(json.Marshal(evt)))), MessageId: awssdk.String(string(uuid.NewUUID())), From 4797c7e7fb5061fd3ccedf216c59ae0d1825ec86 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 11 Oct 2022 12:41:33 -0700 Subject: [PATCH 42/55] Add notification benchmarking --- .../controllers/infrastructure/provider.go | 15 +- .../notification_benchmark_test.go | 275 ++++++++++++++++-- pkg/cloudprovider/aws/eventbridge.go | 9 +- pkg/cloudprovider/aws/fake/eventrecorder.go | 28 +- pkg/cloudprovider/aws/fake/sqsapi.go | 6 + pkg/cloudprovider/aws/sqs.go | 11 +- pkg/cloudprovider/aws/utils/utils.go | 12 + 7 files changed, 310 insertions(+), 46 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/provider.go b/pkg/cloudprovider/aws/controllers/infrastructure/provider.go index 415db6a37ffc..9dd43d15f60c 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/provider.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/provider.go @@ -38,16 +38,11 @@ func NewProvider(sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBri } func (p *Provider) CreateInfrastructure(ctx context.Context) error { - funcs := []func() error{ - func() error { return p.ensureQueue(ctx) }, - func() error { return p.ensureEventBridge(ctx) }, + if err := p.ensureQueue(ctx); err != nil { + return 
fmt.Errorf("ensuring queue, %w", err) } - errs := make([]error, len(funcs)) - workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { - errs[i] = funcs[i]() - }) - if err := multierr.Combine(errs...); err != nil { - return err + if err := p.ensureEventBridge(ctx); err != nil { + return fmt.Errorf("ensuring eventBridge rules and targets, %w", err) } logging.FromContext(ctx).Infof("Successfully completed reconciliation of infrastructure") return nil @@ -103,7 +98,7 @@ func (p *Provider) ensureQueue(ctx context.Context) error { } } // Always attempt to set the queue attributes, even after creation to help set the queue policy - if err := p.sqsProvider.SetQueueAttributes(ctx); err != nil { + if err := p.sqsProvider.SetQueueAttributes(ctx, nil); err != nil { return fmt.Errorf("setting queue attributes for queue, %w", err) } return nil diff --git a/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go b/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go index f6a93734340c..7500a362e25c 100644 --- a/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go @@ -1,35 +1,172 @@ +//go:build test_performance + +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+//nolint:gosec
 package notification_test
 
 import (
 	"context"
+	"fmt"
 	"math/rand"
 	"testing"
+	"time"
 
+	"github.com/avast/retry-go"
 	"github.com/aws/aws-sdk-go/aws"
-	"github.com/aws/aws-sdk-go/aws/client"
+	awsclient "github.com/aws/aws-sdk-go/aws/client"
 	"github.com/aws/aws-sdk-go/aws/endpoints"
 	"github.com/aws/aws-sdk-go/aws/request"
 	"github.com/aws/aws-sdk-go/aws/session"
+	"github.com/aws/aws-sdk-go/awstesting/mock"
 	"github.com/aws/aws-sdk-go/service/eventbridge"
 	"github.com/aws/aws-sdk-go/service/sqs"
 	"go.uber.org/multierr"
 	"go.uber.org/zap"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/util/workqueue"
+	clock "k8s.io/utils/clock/testing"
 	"knative.dev/pkg/logging"
+	controllerruntime "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
 
+	"github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5"
+	"github.com/aws/karpenter/pkg/cloudprovider"
 	awscloudprovider "github.com/aws/karpenter/pkg/cloudprovider/aws"
+	controllersfake "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/fake"
 	"github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure"
+	"github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification"
+	awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake"
+	"github.com/aws/karpenter/pkg/cloudprovider/fake"
+	"github.com/aws/karpenter/pkg/controllers"
+	"github.com/aws/karpenter/pkg/controllers/polling"
+	"github.com/aws/karpenter/pkg/controllers/state"
+	"github.com/aws/karpenter/pkg/test"
+	"github.com/aws/karpenter/pkg/utils/injection"
+	"github.com/aws/karpenter/pkg/utils/options"
 )
 
+var r = rand.New(rand.NewSource(time.Now().Unix()))
+
+func BenchmarkNotification15000(b *testing.B) {
+	benchmarkNotificationController(b, 15000)
+}
+
+func BenchmarkNotification5000(b *testing.B) {
+	benchmarkNotificationController(b, 5000)
+}
+
+func BenchmarkNotification1000(b *testing.B) {
+	benchmarkNotificationController(b, 1000)
+}
+
+func BenchmarkNotification100(b *testing.B) {
+	benchmarkNotificationController(b, 100)
+}
+
 func benchmarkNotificationController(b *testing.B, messageCount int) {
-	ctx := logging.WithLogger(context.Background(), zap.NewNop().Sugar())
-	providers := newProviders()
-	if err := providers.makeInfrastructure(ctx); err != nil {
+	opts := options.Options{
+		AWSIsolatedVPC: true,
+		ClusterName:    "karpenter-notification-benchmarking",
+	}
+	fakeClock := &clock.FakeClock{}
+	ctx = injection.WithOptions(context.Background(), opts)
+	env = test.NewEnvironment(ctx, func(e *test.Environment) {})
+	if err := env.Start(); err != nil {
+		b.Fatalf("Starting environment, %v", err)
+	}
+	// Stop the test environment after the test completes
+	defer func() {
+		if err := retry.Do(func() error {
+			return env.Stop()
+		}); err != nil {
+			b.Fatalf("stopping test environment, %v", err)
+		}
+	}()
+
+	providers := newProviders(env.Ctx)
+	if err := providers.makeInfrastructure(env.Ctx); err != nil {
 		b.Fatalf("standing up infrastructure, %v", err)
 	}
+	// Cleanup the infrastructure after the test completes
+	defer func() {
+		if err := retry.Do(func() error {
+			return providers.cleanupInfrastructure(env.Ctx)
+		}); err != nil {
+			b.Fatalf("deleting infrastructure, %v", err)
+		}
+	}()
+
+	// Load all the fundamental components before setting up the controllers
+	recorder := awsfake.NewEventRecorder()
+	cfg = test.NewConfig()
+	cluster := 
state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) + cloudProvider = &fake.CloudProvider{} + ec2api = &awsfake.EC2API{} + subnetProvider := awscloudprovider.NewSubnetProvider(ec2api) + instanceTypeProvider = awscloudprovider.NewInstanceTypeProvider(env.Ctx, mock.Session, cloudprovider.Options{}, ec2api, subnetProvider) + + // Set-up the controllers + nodeStateController := state.NewNodeController(env.Client, cluster) + infraController := &controllersfake.PollingController{} + notificationController := polling.NewController(notification.NewReconciler(env.Client, recorder, cluster, providers.sqsProvider, instanceTypeProvider, infraController)) + + messages, nodes := makeDiverseMessagesAndNodes(messageCount) + + logging.FromContext(env.Ctx).Infof("Provisioning %d nodes", messageCount) + if err := provisionNodes(env.Ctx, env.Client, nodes, nodeStateController); err != nil { + b.Fatalf("provisioning nodes, %v", err) + } + logging.FromContext(env.Ctx).Infof("Completed provisioning %d nodes", messageCount) + + logging.FromContext(env.Ctx).Infof("Provisioning %d messages into the SQS Queue", messageCount) + if err := providers.provisionMessages(env.Ctx, messages...); err != nil { b.Fatalf("provisioning messages, %v", err) } + logging.FromContext(env.Ctx).Infof("Completed provisioning %d messages into the SQS Queue", messageCount) + + m := controllers.NewManagerOrDie(env.Ctx, env.Config, controllerruntime.Options{ + BaseContext: func() context.Context { return logging.WithLogger(env.Ctx, zap.NewNop().Sugar()) }, + }) + m = controllers.RegisterControllers(env.Ctx, m, notificationController, nodeStateController) + + managerErr := make(chan error) + go func() { + logging.FromContext(env.Ctx).Infof("Starting controller manager") + if err := m.Start(env.Ctx); err != nil { + managerErr <- err + } + }() + + b.ResetTimer() + start := time.Now() + + notificationController.Start(env.Ctx) + done := providers.monitorMessagesProcessed(env.Ctx, recorder, messageCount) + + select { + case err := <-managerErr: + b.Fatalf("starting manager, %v", err) + case <-done: + } + duration := time.Since(start) + b.ReportMetric(float64(messageCount), "Messages") + b.ReportMetric(duration.Seconds(), "TotalDurationInSeconds") + b.ReportMetric(float64(messageCount)/duration.Seconds(), "Messages/Second") } type providers struct { @@ -37,11 +174,11 @@ type providers struct { eventBridgeProvider *awscloudprovider.EventBridgeProvider } -func newProviders() providers { +func newProviders(ctx context.Context) providers { sess := session.Must(session.NewSession( request.WithRetryer( &aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint}, - client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, + awsclient.DefaultRetryer{NumMaxRetries: awsclient.DefaultRetryerMaxNumRetries}, ), )) sqsProvider := awscloudprovider.NewSQSProvider(ctx, sqs.New(sess)) @@ -54,10 +191,23 @@ func newProviders() providers { func (p *providers) makeInfrastructure(ctx context.Context) error { infraProvider := infrastructure.NewProvider(p.sqsProvider, p.eventBridgeProvider) - return infraProvider.CreateInfrastructure(ctx) + if err := infraProvider.CreateInfrastructure(ctx); err != nil { + return fmt.Errorf("creating infrastructure, %w", err) + } + if err := p.sqsProvider.SetQueueAttributes(ctx, map[string]*string{ + sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("1200"), // 20 minutes for this test + }); err != nil { + return fmt.Errorf("updating message retention period, %w", err) + } + return nil } -func (p 
*providers) provisionMessages(ctx context.Context, messages ...[]interface{}) error { +func (p *providers) cleanupInfrastructure(ctx context.Context) error { + infraProvider := infrastructure.NewProvider(p.sqsProvider, p.eventBridgeProvider) + return infraProvider.DeleteInfrastructure(ctx) +} + +func (p *providers) provisionMessages(ctx context.Context, messages ...interface{}) error { errs := make([]error, len(messages)) workqueue.ParallelizeUntil(ctx, 20, len(messages), func(i int) { _, err := p.sqsProvider.SendMessage(ctx, messages[i]) @@ -66,39 +216,114 @@ func (p *providers) provisionMessages(ctx context.Context, messages ...[]interfa return multierr.Combine(errs...) } -func makeDiverseMessages(count int) []interface{} { +func (p *providers) monitorMessagesProcessed(ctx context.Context, recorder *awsfake.EventRecorder, expectedProcessed int) <-chan struct{} { + done := make(chan struct{}) + totalProcessed := 0 + go func() { + for totalProcessed < expectedProcessed { + totalProcessed = int(recorder.EC2StateStoppingCalled.Load()) + + int(recorder.EC2StateTerminatingCalled.Load()) + + int(recorder.EC2HealthWarningCalled.Load()) + + int(recorder.EC2SpotRebalanceRecommendationCalled.Load()) + + int(recorder.EC2SpotInterruptionWarningCalled.Load()) + logging.FromContext(ctx).Infof("Processed %d messages from the queue", totalProcessed) + time.Sleep(time.Second) + } + close(done) + }() + return done +} + +func provisionNodes(ctx context.Context, kubeClient client.Client, nodes []*v1.Node, nodeController *state.NodeController) error { + errs := make([]error, len(nodes)) + workqueue.ParallelizeUntil(ctx, 20, len(nodes), func(i int) { + if err := retry.Do(func() error { + return kubeClient.Create(ctx, nodes[i]) + }); err != nil { + errs[i] = fmt.Errorf("provisioning node, %w", err) + } + if err := retry.Do(func() error { + _, err := nodeController.Reconcile(ctx, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(nodes[i])}) + return err + }); err != nil { + errs[i] = fmt.Errorf("reconciling node, %w", err) + } + }) + return multierr.Combine(errs...) +} + +func makeDiverseMessagesAndNodes(count int) ([]interface{}, []*v1.Node) { var messages []interface{} + var nodes []*v1.Node + + newMessages, newNodes := makeScheduledChangeMessagesAndNodes(count / 3) + messages = append(messages, newMessages...) + nodes = append(nodes, newNodes...) - messages = append(messages, makeScheduledChangeMessages(count/3)) - messages = append(messages, makeSpotInterruptionMessages(count/3)) + newMessages, newNodes = makeSpotInterruptionMessagesAndNodes(count / 3) + messages = append(messages, newMessages...) + nodes = append(nodes, newNodes...) - messages = append(messages, makeStateChangeMessages(count-len(messages), []string{ + newMessages, newNodes = makeStateChangeMessagesAndNodes(count-len(messages), []string{ "stopping", "stopped", "shutting-down", "terminated", - })) - return messages + }) + messages = append(messages, newMessages...) + nodes = append(nodes, newNodes...) 
+ + return messages, nodes } -func makeScheduledChangeMessages(count int) []interface{} { +func makeScheduledChangeMessagesAndNodes(count int) ([]interface{}, []*v1.Node) { var msgs []interface{} + var nodes []*v1.Node for i := 0; i < count; i++ { - msgs = append(msgs, scheduledChangeMessage(makeInstanceID())) + instanceID := makeInstanceID() + msgs = append(msgs, scheduledChangeMessage(instanceID)) + nodes = append(nodes, test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(instanceID), + })) } - return msgs + return msgs, nodes } -func makeStateChangeMessages(count int, states []string) []interface{} { +func makeStateChangeMessagesAndNodes(count int, states []string) ([]interface{}, []*v1.Node) { var msgs []interface{} + var nodes []*v1.Node for i := 0; i < count; i++ { - state := states[rand.Intn(len(states))] - msgs = append(msgs, stateChangeMessage(makeInstanceID(), state)) + state := states[r.Intn(len(states))] + instanceID := makeInstanceID() + msgs = append(msgs, stateChangeMessage(instanceID, state)) + nodes = append(nodes, test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(instanceID), + })) } - return msgs + return msgs, nodes } -func makeSpotInterruptionMessages(count int) []interface{} { +func makeSpotInterruptionMessagesAndNodes(count int) ([]interface{}, []*v1.Node) { var msgs []interface{} + var nodes []*v1.Node for i := 0; i < count; i++ { - msgs = append(msgs, spotInterruptionMessage(makeInstanceID())) + instanceID := makeInstanceID() + msgs = append(msgs, spotInterruptionMessage(instanceID)) + nodes = append(nodes, test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(instanceID), + })) } - return msgs + return msgs, nodes } diff --git a/pkg/cloudprovider/aws/eventbridge.go b/pkg/cloudprovider/aws/eventbridge.go index 3e40e42506b2..52d5a5c4a2a4 100644 --- a/pkg/cloudprovider/aws/eventbridge.go +++ b/pkg/cloudprovider/aws/eventbridge.go @@ -27,6 +27,7 @@ import ( "k8s.io/client-go/util/workqueue" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/cloudprovider/aws/utils" "github.com/aws/karpenter/pkg/utils/injection" ) @@ -128,28 +129,28 @@ func (eb *EventBridgeProvider) DeleteEC2NotificationRules(ctx context.Context) e func (eb *EventBridgeProvider) getEC2NotificationEventRules(ctx context.Context) []EventRule { return []EventRule{ { - Name: fmt.Sprintf("Karpenter-%s-ScheduledChangeRule", injection.GetOptions(ctx).ClusterName), + Name: fmt.Sprintf("Karpenter-ScheduledChangeRule-%s", utils.GetClusterNameHash(ctx, 20)), Pattern: &EventPattern{ Source: []string{"aws.health"}, DetailType: []string{"AWS Health Event"}, }, }, { - Name: fmt.Sprintf("Karpenter-%s-SpotTerminationRule", injection.GetOptions(ctx).ClusterName), + Name: fmt.Sprintf("Karpenter-SpotTerminationRule-%s", utils.GetClusterNameHash(ctx, 20)), Pattern: &EventPattern{ Source: []string{"aws.ec2"}, DetailType: []string{"EC2 Spot Instance Interruption Warning"}, }, }, { - Name: fmt.Sprintf("Karpenter-%s-RebalanceRule", injection.GetOptions(ctx).ClusterName), + Name: fmt.Sprintf("Karpenter-RebalanceRule-%s", utils.GetClusterNameHash(ctx, 20)), Pattern: &EventPattern{ Source: []string{"aws.ec2"}, 
DetailType: []string{"EC2 Instance Rebalance Recommendation"}, }, }, { - Name: fmt.Sprintf("Karpenter-%s-InstanceStateChangeRule", injection.GetOptions(ctx).ClusterName), + Name: fmt.Sprintf("Karpenter-InstanceStateChangeRule-%s", utils.GetClusterNameHash(ctx, 20)), Pattern: &EventPattern{ Source: []string{"aws.ec2"}, DetailType: []string{"EC2 Instance State-change Notification"}, diff --git a/pkg/cloudprovider/aws/fake/eventrecorder.go b/pkg/cloudprovider/aws/fake/eventrecorder.go index a8c28fbf12f8..a245ed631b31 100644 --- a/pkg/cloudprovider/aws/fake/eventrecorder.go +++ b/pkg/cloudprovider/aws/fake/eventrecorder.go @@ -15,6 +15,8 @@ limitations under the License. package fake import ( + "sync/atomic" + v1 "k8s.io/api/core/v1" "k8s.io/client-go/tools/record" @@ -24,19 +26,35 @@ import ( // EventRecorder is a mock event recorder that is used to facilitate testing. type EventRecorder struct { test.Recorder + + EC2SpotInterruptionWarningCalled atomic.Int64 + EC2SpotRebalanceRecommendationCalled atomic.Int64 + EC2HealthWarningCalled atomic.Int64 + EC2StateStoppingCalled atomic.Int64 + EC2StateTerminatingCalled atomic.Int64 } func (e *EventRecorder) EventRecorder() record.EventRecorder { return e.Recorder.EventRecorder() } -func (e *EventRecorder) EC2SpotInterruptionWarning(_ *v1.Node) {} +func (e *EventRecorder) EC2SpotInterruptionWarning(_ *v1.Node) { + e.EC2SpotInterruptionWarningCalled.Add(1) +} -func (e *EventRecorder) EC2SpotRebalanceRecommendation(_ *v1.Node) {} +func (e *EventRecorder) EC2SpotRebalanceRecommendation(_ *v1.Node) { + e.EC2SpotRebalanceRecommendationCalled.Add(1) +} -func (e *EventRecorder) EC2HealthWarning(_ *v1.Node) {} +func (e *EventRecorder) EC2HealthWarning(_ *v1.Node) { + e.EC2HealthWarningCalled.Add(1) +} -func (e *EventRecorder) EC2StateTerminating(_ *v1.Node) {} +func (e *EventRecorder) EC2StateTerminating(_ *v1.Node) { + e.EC2StateTerminatingCalled.Add(1) +} -func (e *EventRecorder) EC2StateStopping(_ *v1.Node) {} +func (e *EventRecorder) EC2StateStopping(_ *v1.Node) { + e.EC2StateStoppingCalled.Add(1) +} func (e *EventRecorder) TerminatingNodeOnNotification(_ *v1.Node) {} diff --git a/pkg/cloudprovider/aws/fake/sqsapi.go b/pkg/cloudprovider/aws/fake/sqsapi.go index 57223d8f2afa..ae0b58fa0e65 100644 --- a/pkg/cloudprovider/aws/fake/sqsapi.go +++ b/pkg/cloudprovider/aws/fake/sqsapi.go @@ -32,6 +32,7 @@ const ( type SQSBehavior struct { CreateQueueBehavior MockedFunction[sqs.CreateQueueInput, sqs.CreateQueueOutput] GetQueueURLBehavior MockedFunction[sqs.GetQueueUrlInput, sqs.GetQueueUrlOutput] + GetQueueAttributesBehavior MockedFunction[sqs.GetQueueAttributesInput, sqs.GetQueueAttributesOutput] SetQueueAttributesBehavior MockedFunction[sqs.SetQueueAttributesInput, sqs.SetQueueAttributesOutput] ReceiveMessageBehavior MockedFunction[sqs.ReceiveMessageInput, sqs.ReceiveMessageOutput] DeleteMessageBehavior MockedFunction[sqs.DeleteMessageInput, sqs.DeleteMessageOutput] @@ -48,6 +49,7 @@ type SQSAPI struct { func (s *SQSAPI) Reset() { s.CreateQueueBehavior.Reset() s.GetQueueURLBehavior.Reset() + s.GetQueueAttributesBehavior.Reset() s.SetQueueAttributesBehavior.Reset() s.ReceiveMessageBehavior.Reset() s.DeleteMessageBehavior.Reset() @@ -67,6 +69,10 @@ func (s *SQSAPI) GetQueueUrlWithContext(_ context.Context, input *sqs.GetQueueUr }).Invoke(input) } +func (s *SQSAPI) GetQueueAttributesWithContext(_ context.Context, input *sqs.GetQueueAttributesInput, _ ...request.Option) (*sqs.GetQueueAttributesOutput, error) { + return s.GetQueueAttributesBehavior.Invoke(input) +} + 
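// The atomic counters added to the fake EventRecorder above are what the notification benchmark
// polls to decide when every queued message has been handled. A small helper along those lines
// (a sketch only; the function name is illustrative and not part of the patch's API):
//
//	func totalNotificationsRecorded(recorder *EventRecorder) int64 {
//		// Each fake recorder method increments its counter atomically, so summing the Loads gives
//		// the number of notification events surfaced so far.
//		return recorder.EC2SpotInterruptionWarningCalled.Load() +
//			recorder.EC2SpotRebalanceRecommendationCalled.Load() +
//			recorder.EC2HealthWarningCalled.Load() +
//			recorder.EC2StateStoppingCalled.Load() +
//			recorder.EC2StateTerminatingCalled.Load()
//	}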
func (s *SQSAPI) SetQueueAttributesWithContext(_ context.Context, input *sqs.SetQueueAttributesInput, _ ...request.Option) (*sqs.SetQueueAttributesOutput, error) { return s.SetQueueAttributesBehavior.Invoke(input) } diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index fa70dbb84f13..207f2d40ee27 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -25,6 +25,7 @@ import ( "github.com/samber/lo" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/cloudprovider/aws/utils" "github.com/aws/karpenter/pkg/utils/atomic" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" @@ -122,7 +123,7 @@ func (s *SQSProvider) CreateQueue(ctx context.Context) error { return nil } -func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { +func (s *SQSProvider) SetQueueAttributes(ctx context.Context, attributeOverrides map[string]*string) error { queueURL, err := s.DiscoverQueueURL(ctx, false) if err != nil { return fmt.Errorf("fetching queue url, %w", err) @@ -131,6 +132,9 @@ func (s *SQSProvider) SetQueueAttributes(ctx context.Context) error { if err != nil { return fmt.Errorf("marshaling queue attributes, %w", err) } + if attributeOverrides != nil { + attributes = lo.Assign(attributes, attributeOverrides) + } setQueueAttributesInput := &sqs.SetQueueAttributesInput{ Attributes: attributes, QueueUrl: aws.String(queueURL), @@ -265,6 +269,9 @@ func (s *SQSProvider) getQueuePolicy(ctx context.Context) (*QueuePolicy, error) }, nil } +// getQueueName generates a sufficiently random name for the queue name from the cluster name +// This is used because the max-len for a queue name is 80 characters but the maximum cluster name +// length is 100 func getQueueName(ctx context.Context) string { - return fmt.Sprintf("Karpenter-%s-Queue", injection.GetOptions(ctx).ClusterName) + return fmt.Sprintf("Karpenter-Queue-%s", utils.GetClusterNameHash(ctx, 20)) } diff --git a/pkg/cloudprovider/aws/utils/utils.go b/pkg/cloudprovider/aws/utils/utils.go index 27ecaaed549e..ed2d0c9e866a 100644 --- a/pkg/cloudprovider/aws/utils/utils.go +++ b/pkg/cloudprovider/aws/utils/utils.go @@ -15,11 +15,16 @@ limitations under the License. 
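// The queue and rule names above embed a truncated SHA-256 of the cluster name rather than the
// raw name, because SQS queue names are limited to 80 characters while cluster names may be up
// to 100. A minimal, self-contained sketch of that naming scheme (the helper name is
// illustrative, and a plain clusterName string stands in for the options that
// GetClusterNameHash below reads from the context):
//
//	func exampleQueueName(clusterName string) string {
//		sum := sha256.Sum256([]byte(clusterName))
//		// "Karpenter-Queue-" (16 chars) + 20 hex chars = 36 chars, comfortably under the 80-char limit.
//		return fmt.Sprintf("Karpenter-Queue-%s", hex.EncodeToString(sum[:])[:20])
//	}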
package utils import ( + "context" + "crypto/sha256" + "encoding/hex" "fmt" "regexp" v1 "k8s.io/api/core/v1" "knative.dev/pkg/ptr" + + "github.com/aws/karpenter/pkg/utils/injection" ) // ParseProviderID parses the provider ID stored on the node to get the instance ID @@ -37,3 +42,10 @@ func ParseProviderID(node *v1.Node) (*string, error) { } return nil, fmt.Errorf("parsing instance id %s", node.Spec.ProviderID) } + +func GetClusterNameHash(ctx context.Context, truncateAt int) string { + h := sha256.New() + h.Write([]byte(injection.GetOptions(ctx).ClusterName)) + checkSum := h.Sum([]byte{}) + return hex.EncodeToString(checkSum)[:truncateAt] +} From 424f081f66325a55007281fee9501ed7e8624f99 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 11 Oct 2022 15:54:00 -0700 Subject: [PATCH 43/55] Address PR comments from ellistarn --- .../controllers/infrastructure/suite_test.go | 4 -- .../controllers/notification/event/types.go | 2 +- .../controllers/notification/reconciler.go | 47 +++++++++--------- pkg/cloudprovider/aws/fake/sqsapi.go | 6 ++- pkg/controllers/polling/controller.go | 49 +++++++++---------- pkg/controllers/polling/decorators.go | 28 +++++------ 6 files changed, 64 insertions(+), 72 deletions(-) diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index fb4b24804ad7..785d2e94b0a6 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -103,8 +103,6 @@ var _ = Describe("Reconciliation", func() { _, err := controller.Reconcile(ctx, reconcile.Request{}) Expect(err).ToNot(Succeed()) Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) - Expect(eventbridgeapi.PutRuleBehavior.FailedCalls()).To(Equal(4)) - Expect(eventbridgeapi.PutTargetsBehavior.FailedCalls()).To(Equal(4)) // Simulating AccessDenied being resolved sqsapi.CreateQueueBehavior.Reset() @@ -124,8 +122,6 @@ var _ = Describe("Reconciliation", func() { _, err := controller.Reconcile(ctx, reconcile.Request{}) Expect(err).ToNot(Succeed()) Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) - Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) }) diff --git a/pkg/cloudprovider/aws/controllers/notification/event/types.go b/pkg/cloudprovider/aws/controllers/notification/event/types.go index ae46fd17ba3e..96c402a4c0ab 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/types.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/types.go @@ -34,7 +34,7 @@ type Interface interface { type Kind byte const ( - UnknownKind = iota + _ = iota RebalanceRecommendationKind ScheduledChangeKind SpotInterruptionKind diff --git a/pkg/cloudprovider/aws/controllers/notification/reconciler.go b/pkg/cloudprovider/aws/controllers/notification/reconciler.go index 88c57de3f213..7e9705d53ddd 100644 --- a/pkg/cloudprovider/aws/controllers/notification/reconciler.go +++ b/pkg/cloudprovider/aws/controllers/notification/reconciler.go @@ -44,16 +44,23 @@ import ( "github.com/aws/karpenter/pkg/metrics" ) -type Action = string - -var Actions = struct { - CordonAndDrain, - Cordon, - NoAction Action -}{ - CordonAndDrain: "CordonAndDrain", - Cordon: "Cordon", - NoAction: "NoAction", +type Action byte + +const ( + _ Action = iota + CordonAndDrain + NoAction +) + +func (a Action) String() string { + switch a { + case CordonAndDrain: + return 
"CordonAndDrain" + case NoAction: + return "NoAction" + default: + return fmt.Sprintf("Unsupported Action %d", a) + } } // Reconciler is an AWS notification reconciler. @@ -182,7 +189,7 @@ func (r *Reconciler) handleNode(ctx context.Context, evt event.Interface, node * // Record metric and event for this action r.notifyForEvent(evt, node) - actionsPerformed.WithLabelValues(action).Inc() + actionsPerformed.WithLabelValues(action.String()).Inc() // Mark the offering as unavailable in the ICE cache since we got a spot interruption warning if evt.Kind() == event.SpotInterruptionKind { @@ -192,7 +199,7 @@ func (r *Reconciler) handleNode(ctx context.Context, evt event.Interface, node * r.instanceTypeProvider.MarkOfferingUnavailable(instanceType, zone, awsv1alpha1.CapacityTypeSpot) } } - if action != Actions.NoAction { + if action != NoAction { return r.deleteInstance(ctx, node) } return nil @@ -251,20 +258,10 @@ func (r *Reconciler) makeInstanceIDMap() map[string]*v1.Node { func actionForEvent(evt event.Interface) Action { switch evt.Kind() { - case event.RebalanceRecommendationKind: - return Actions.NoAction - - case event.ScheduledChangeKind: - return Actions.CordonAndDrain - - case event.SpotInterruptionKind: - return Actions.CordonAndDrain - - case event.StateChangeKind: - return Actions.CordonAndDrain - + case event.ScheduledChangeKind, event.SpotInterruptionKind, event.StateChangeKind: + return CordonAndDrain default: - return Actions.NoAction + return NoAction } } diff --git a/pkg/cloudprovider/aws/fake/sqsapi.go b/pkg/cloudprovider/aws/fake/sqsapi.go index ae0b58fa0e65..60ef785dc62e 100644 --- a/pkg/cloudprovider/aws/fake/sqsapi.go +++ b/pkg/cloudprovider/aws/fake/sqsapi.go @@ -70,7 +70,11 @@ func (s *SQSAPI) GetQueueUrlWithContext(_ context.Context, input *sqs.GetQueueUr } func (s *SQSAPI) GetQueueAttributesWithContext(_ context.Context, input *sqs.GetQueueAttributesInput, _ ...request.Option) (*sqs.GetQueueAttributesOutput, error) { - return s.GetQueueAttributesBehavior.Invoke(input) + return s.GetQueueAttributesBehavior.WithDefault(&sqs.GetQueueAttributesOutput{ + Attributes: map[string]*string{ + sqs.QueueAttributeNameQueueArn: aws.String("arn:aws:sqs:us-west-2:000000000000:Karpenter-Queue"), + }, + }).Invoke(input) } func (s *SQSAPI) SetQueueAttributesWithContext(_ context.Context, input *sqs.SetQueueAttributesInput, _ ...request.Option) (*sqs.SetQueueAttributesOutput, error) { diff --git a/pkg/controllers/polling/controller.go b/pkg/controllers/polling/controller.go index 2c48908e8284..e9548cbc790c 100644 --- a/pkg/controllers/polling/controller.go +++ b/pkg/controllers/polling/controller.go @@ -73,6 +73,9 @@ type Controller struct { trigger chan event.GenericEvent cancels sync.Map + + activeMetric prometheus.Gauge + triggerCountMetric prometheus.Counter } type Object struct { @@ -85,6 +88,22 @@ func NewController(rec controllers.Reconciler) *Controller { r: rec, uuid: types.UID(uuid.New().String()), trigger: make(chan event.GenericEvent, 100), + activeMetric: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: rec.Metadata().MetricsSubsystem, + Name: "active", + Help: "Whether the controller is active.", + }, + ), + triggerCountMetric: prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: metrics.Namespace, + Subsystem: rec.Metadata().MetricsSubsystem, + Name: "trigger_count", + Help: "A counter of the number of times this controller has been triggered.", + }, + ), } } @@ -111,7 +130,7 @@ func (c *Controller) Start(ctx context.Context) 
{ // Trigger triggers an immediate reconciliation by inserting a message into the event channel. We increase the trigger // generation here to ensure that any messages that were previously re-queued are thrown away func (c *Controller) Trigger() { - c.triggeredCountMetric().Inc() + c.triggerCountMetric.Inc() obj := &Object{ObjectMeta: metav1.ObjectMeta{Generation: c.triggerGeneration.Add(1), UID: c.uuid}} c.trigger <- event.GenericEvent{Object: obj} } @@ -144,9 +163,9 @@ func (c *Controller) SetActive(active bool) { c.active = active if active { - c.activeMetric().Set(1) + c.activeMetric.Set(1) } else { - c.activeMetric().Set(0) + c.activeMetric.Set(0) } } @@ -163,7 +182,7 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco } func (c *Controller) Builder(_ context.Context, m manager.Manager) *controllerruntime.Builder { - crmetrics.Registry.MustRegister(c.activeMetric(), c.triggeredCountMetric()) + crmetrics.Registry.MustRegister(c.activeMetric, c.triggerCountMetric) return controllerruntime. NewControllerManagedBy(m). Named(c.r.Metadata().Name). @@ -179,25 +198,3 @@ func (c *Controller) Builder(_ context.Context, m manager.Manager) *controllerru func (c *Controller) Register(ctx context.Context, m manager.Manager) error { return c.Builder(ctx, m).Complete(c) } - -func (c *Controller) activeMetric() prometheus.Gauge { - return prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: c.r.Metadata().MetricsSubsystem, - Name: "active", - Help: "Whether the controller is active.", - }, - ) -} - -func (c *Controller) triggeredCountMetric() prometheus.Counter { - return prometheus.NewCounter( - prometheus.CounterOpts{ - Namespace: metrics.Namespace, - Subsystem: c.r.Metadata().MetricsSubsystem, - Name: "trigger_count", - Help: "A counter of the number of times this controller has been triggered.", - }, - ) -} diff --git a/pkg/controllers/polling/decorators.go b/pkg/controllers/polling/decorators.go index e9e173b441a1..26607688041b 100644 --- a/pkg/controllers/polling/decorators.go +++ b/pkg/controllers/polling/decorators.go @@ -39,7 +39,8 @@ type ControllerWithHealthInterface interface { type ControllerWithHealth struct { *Controller - healthy atomic.Bool + healthy atomic.Bool + healthyMetric prometheus.Gauge OnHealthy func(context.Context) OnUnhealthy func(context.Context) @@ -48,6 +49,14 @@ type ControllerWithHealth struct { func NewControllerWithHealth(c *Controller) *ControllerWithHealth { return &ControllerWithHealth{ Controller: c, + healthyMetric: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: c.r.Metadata().MetricsSubsystem, + Name: "healthy", + Help: "Whether the controller is in a healthy state.", + }, + ), } } @@ -64,32 +73,21 @@ func (c *ControllerWithHealth) Reconcile(ctx context.Context, req reconcile.Requ if c.OnHealthy != nil { c.OnHealthy(callerCtx) } - c.healthyMetric().Set(1) + c.healthyMetric.Set(1) } else { if c.OnUnhealthy != nil { c.OnUnhealthy(callerCtx) } - c.healthyMetric().Set(0) + c.healthyMetric.Set(0) } return res, err } func (c *ControllerWithHealth) Builder(ctx context.Context, m manager.Manager) *controllerruntime.Builder { - crmetrics.Registry.MustRegister(c.healthyMetric()) + crmetrics.Registry.MustRegister(c.healthyMetric) return c.Controller.Builder(ctx, m) } func (c *ControllerWithHealth) Register(ctx context.Context, m manager.Manager) error { return c.Builder(ctx, m).Complete(c) } - -func (c *ControllerWithHealth) healthyMetric() prometheus.Gauge { - 
return prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: c.Controller.r.Metadata().MetricsSubsystem, - Name: "healthy", - Help: "Whether the controller is in a healthy state.", - }, - ) -} From 3fc95484771f15c42c3e9965ed4b327b51eec53d Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 11 Oct 2022 18:05:07 -0700 Subject: [PATCH 44/55] Continuing addressing PR feedback --- .../aws/controllers/fake/pollingcontroller.go | 50 ----- .../controller.go | 39 ++-- .../controllers/infrastructure/reconciler.go | 53 ----- .../controllers/infrastructure/suite_test.go | 5 +- .../controllers/nodetemplate/suite_test.go | 15 -- .../{reconciler.go => controller.go} | 129 +++++------ .../aggregatedparser.go | 29 ++- .../notification/event/metadata.go | 19 -- .../notification/event/noop/handler.go | 16 +- .../event/rebalancerecommendation/handler.go | 39 ---- .../{unmarshal.go => model.go} | 16 +- .../event/rebalancerecommendation/parser.go | 2 +- .../event/scheduledchange/handler.go | 43 ---- .../{unmarshal.go => model.go} | 20 +- .../event/scheduledchange/parser.go | 2 +- .../event/spotinterruption/handler.go | 39 ---- .../{unmarshal.go => model.go} | 26 ++- .../event/spotinterruption/parser.go | 2 +- .../notification/event/statechange/handler.go | 43 ---- .../statechange/{unmarshal.go => model.go} | 30 ++- .../notification/event/statechange/parser.go | 2 +- .../notification_benchmark_test.go | 5 +- .../controllers/notification/suite_test.go | 71 ++----- pkg/cloudprovider/aws/controllers/register.go | 12 +- pkg/controllers/polling/controller.go | 200 ------------------ pkg/controllers/polling/decorators.go | 93 -------- pkg/controllers/polling/suite_test.go | 15 -- 27 files changed, 203 insertions(+), 812 deletions(-) delete mode 100644 pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go rename pkg/cloudprovider/aws/controllers/{nodetemplate => infrastructure}/controller.go (66%) delete mode 100644 pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go delete mode 100644 pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go rename pkg/cloudprovider/aws/controllers/notification/{reconciler.go => controller.go} (70%) rename pkg/cloudprovider/aws/controllers/notification/event/{aggregatedparser => }/aggregatedparser.go (70%) delete mode 100644 pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go rename pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/{unmarshal.go => model.go} (75%) delete mode 100644 pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go rename pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/{unmarshal.go => model.go} (77%) delete mode 100644 pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go rename pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/{unmarshal.go => model.go} (65%) delete mode 100644 pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go rename pkg/cloudprovider/aws/controllers/notification/event/statechange/{unmarshal.go => model.go} (63%) delete mode 100644 pkg/controllers/polling/controller.go delete mode 100644 pkg/controllers/polling/decorators.go delete mode 100644 pkg/controllers/polling/suite_test.go diff --git a/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go b/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go deleted file mode 100644 index ea43547b0974..000000000000 --- 
a/pkg/cloudprovider/aws/controllers/fake/pollingcontroller.go +++ /dev/null @@ -1,50 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package fake - -import ( - "context" - "sync/atomic" - - controllerruntime "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/manager" - "sigs.k8s.io/controller-runtime/pkg/reconcile" -) - -type PollingController struct { - TriggerCalls atomic.Int64 -} - -func (c *PollingController) Start(context.Context) {} - -func (c *PollingController) Stop(context.Context) {} - -func (c *PollingController) Trigger() { - c.TriggerCalls.Add(1) -} - -func (c *PollingController) Active() bool { return true } - -func (c *PollingController) Healthy() bool { return true } - -func (c *PollingController) Reconcile(context.Context, reconcile.Request) (reconcile.Result, error) { - return reconcile.Result{}, nil -} - -func (c *PollingController) Builder(context.Context, manager.Manager) *controllerruntime.Builder { - return nil -} - -func (c *PollingController) Register(context.Context, manager.Manager) error { return nil } diff --git a/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go similarity index 66% rename from pkg/cloudprovider/aws/controllers/nodetemplate/controller.go rename to pkg/cloudprovider/aws/controllers/infrastructure/controller.go index 6bae0683720c..677d0081e702 100644 --- a/pkg/cloudprovider/aws/controllers/nodetemplate/controller.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/controller.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package nodetemplate +package infrastructure import ( "context" @@ -27,31 +27,27 @@ import ( "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" - "github.com/aws/karpenter/pkg/controllers/polling" + "github.com/aws/karpenter/pkg/cloudprovider/aws" ) -const Name = "aws.nodetemplate" +const Name = "aws.infrastructure" -// Controller is the AWS Node Template counter and finalizer reconciler. 
It performs certain operations based on the -// number of AWS Node Templates on the cluster +// Controller is the AWS infrastructure reconciler +// It plugs into the polling controller to periodically re-reconcile the expected Karpenter AWS infrastructure type Controller struct { - kubeClient client.Client - infraProvider *infrastructure.Provider - infraController polling.ControllerInterface - notificationController polling.ControllerInterface + kubeClient client.Client + provider *Provider } -func NewController(kubeClient client.Client, infraProvider *infrastructure.Provider, - infraController, notificationController polling.ControllerInterface) *Controller { +func NewController(kubeClient client.Client, sqsProvider *aws.SQSProvider, eventBridgeProvider *aws.EventBridgeProvider) *Controller { return &Controller{ - kubeClient: kubeClient, - infraProvider: infraProvider, - infraController: infraController, - notificationController: notificationController, + kubeClient: kubeClient, + provider: NewProvider(sqsProvider, eventBridgeProvider), } } +// Reconcile reconciles the SQS queue and the EventBridge rules with the expected +// configuration prescribed by Karpenter func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(Name)) nt := &v1alpha1.AWSNodeTemplate{} @@ -69,9 +65,7 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco // Handle removing the finalizer and also cleaning up the infrastructure on the last AWSNodeTemplate deletion if !nt.DeletionTimestamp.IsZero() { if len(list.Items) == 1 { - c.infraController.Stop(ctx) - c.notificationController.Stop(ctx) - if err := c.infraProvider.DeleteInfrastructure(ctx); err != nil { + if err := c.provider.DeleteInfrastructure(ctx); err != nil { return reconcile.Result{}, err } } @@ -82,17 +76,12 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco } return reconcile.Result{}, nil } - if len(list.Items) >= 1 { - // Start reconciling the infrastructure controller. This also waterfalls the starting of the - // notification controller once the infra is healthy - c.infraController.Start(ctx) - } mergeFrom := client.MergeFrom(nt.DeepCopy()) controllerutil.AddFinalizer(nt, v1alpha5.TerminationFinalizer) if err := c.kubeClient.Patch(ctx, nt, mergeFrom); err != nil { return reconcile.Result{}, err } - return reconcile.Result{}, nil + return reconcile.Result{}, c.provider.CreateInfrastructure(ctx) } func (c *Controller) Register(_ context.Context, m manager.Manager) error { diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go b/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go deleted file mode 100644 index 07ac5a3d9967..000000000000 --- a/pkg/cloudprovider/aws/controllers/infrastructure/reconciler.go +++ /dev/null @@ -1,53 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package infrastructure - -import ( - "context" - "time" - - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - "github.com/aws/karpenter/pkg/controllers" -) - -// Reconciler is the AWS infrastructure reconciler -// It plugs into the polling controller to periodically re-reconcile the expected Karpenter AWS infrastructure -type Reconciler struct { - provider *Provider -} - -// pollingPeriod is the period that we go to AWS APIs to ensure that the appropriate AWS infrastructure is provisioned -// This period can be reduced to a backoffPeriod if there is an error in ensuring the infrastructure -const pollingPeriod = time.Hour - -func NewReconciler(provider *Provider) *Reconciler { - return &Reconciler{ - provider: provider, - } -} - -func (r *Reconciler) Metadata() controllers.Metadata { - return controllers.Metadata{ - Name: "aws.infrastructure", - MetricsSubsystem: "aws_infrastructure_controller", - } -} - -// Reconcile reconciles the SQS queue and the EventBridge rules with the expected -// configuration prescribed by Karpenter -func (r *Reconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { - return reconcile.Result{RequeueAfter: pollingPeriod}, r.provider.CreateInfrastructure(ctx) -} diff --git a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go index 785d2e94b0a6..5e248a9d367d 100644 --- a/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/infrastructure/suite_test.go @@ -27,7 +27,6 @@ import ( _ "knative.dev/pkg/system/testing" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/aws/karpenter/pkg/controllers/polling" . "github.com/aws/karpenter/pkg/test/expectations" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/options" @@ -44,7 +43,7 @@ var sqsapi *awsfake.SQSAPI var sqsProvider *aws.SQSProvider var eventbridgeapi *awsfake.EventBridgeAPI var eventBridgeProvider *aws.EventBridgeProvider -var controller *polling.ControllerWithHealth +var controller *infrastructure.Controller var opts options.Options var defaultOpts = options.Options{ @@ -73,7 +72,7 @@ var _ = BeforeEach(func() { sqsProvider = aws.NewSQSProvider(e.Ctx, sqsapi) eventBridgeProvider = aws.NewEventBridgeProvider(eventbridgeapi, sqsProvider) - controller = polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))).WithHealth() + controller = infrastructure.NewController(e.Client, sqsProvider, eventBridgeProvider) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) diff --git a/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go b/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go deleted file mode 100644 index 196aed08182b..000000000000 --- a/pkg/cloudprovider/aws/controllers/nodetemplate/suite_test.go +++ /dev/null @@ -1,15 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package nodetemplate_test diff --git a/pkg/cloudprovider/aws/controllers/notification/reconciler.go b/pkg/cloudprovider/aws/controllers/notification/controller.go similarity index 70% rename from pkg/cloudprovider/aws/controllers/notification/reconciler.go rename to pkg/cloudprovider/aws/controllers/notification/controller.go index 7e9705d53ddd..04964dc3edc4 100644 --- a/pkg/cloudprovider/aws/controllers/notification/reconciler.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -28,18 +28,17 @@ import ( "knative.dev/pkg/logging" "knative.dev/pkg/ptr" "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider/aws" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser" - statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/cloudprovider/aws/utils" - "github.com/aws/karpenter/pkg/controllers" - "github.com/aws/karpenter/pkg/controllers/polling" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/metrics" ) @@ -63,81 +62,93 @@ func (a Action) String() string { } } -// Reconciler is an AWS notification reconciler. +// Controller is an AWS notification controller. 
// It plugs into the polling controller to periodically poll the SQS queue for notification messages
-type Reconciler struct {
+type Controller struct {
+	startAsync <-chan struct{}
 	kubeClient client.Client
 	cluster *state.Cluster
 	recorder events.Recorder
 	provider *aws.SQSProvider
 	instanceTypeProvider *aws.InstanceTypeProvider
-	parser aggregatedparser.AggregatedParser
-
-	infraController polling.ControllerWithHealthInterface
+	parser event.AggregatedParser
 }
 
-// pollingPeriod that we go to the SQS queue to check if there are any new events
-const pollingPeriod = 2 * time.Second
-
-func NewReconciler(kubeClient client.Client, recorder events.Recorder, cluster *state.Cluster,
-	sqsProvider *aws.SQSProvider, instanceTypeProvider *aws.InstanceTypeProvider,
-	infraController polling.ControllerWithHealthInterface) *Reconciler {
+func NewController(kubeClient client.Client, recorder events.Recorder, cluster *state.Cluster,
+	sqsProvider *aws.SQSProvider, instanceTypeProvider *aws.InstanceTypeProvider, startAsync <-chan struct{}) *Controller {
 
-	return &Reconciler{
+	return &Controller{
 		kubeClient: kubeClient,
 		cluster: cluster,
 		recorder: recorder,
 		provider: sqsProvider,
 		instanceTypeProvider: instanceTypeProvider,
-		parser: aggregatedparser.NewAggregatedParser(aggregatedparser.DefaultParsers...),
-		infraController: infraController,
+		parser: event.NewAggregatedParser(event.DefaultParsers...),
+		startAsync: startAsync,
 	}
 }
 
-func (r *Reconciler) Metadata() controllers.Metadata {
-	return controllers.Metadata{
-		Name: "aws.notification",
-		MetricsSubsystem: subsystem,
+func (c *Controller) Start(ctx context.Context) {
+	for ctx.Err() == nil {
+		list := &v1alpha1.AWSNodeTemplateList{}
+		if err := c.kubeClient.List(ctx, list); err != nil {
+			logging.FromContext(ctx).Errorf("listing aws node templates, %v", err)
+			continue
+		}
+		if len(list.Items) > 0 {
+			if _, err := c.Reconcile(ctx, reconcile.Request{}); err != nil {
+				logging.FromContext(ctx).Errorf("reconciling notification messages, %v", err)
+				continue
+			}
+		} else {
+			time.Sleep(time.Minute)
+		}
 	}
 }
 
-func (r *Reconciler) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
-	// We rely on the infrastructure, so it needs to be healthy before proceeding to poll the queue
-	if !r.infraController.Healthy() {
-		return reconcile.Result{}, nil
-	}
-	sqsMessages, err := r.provider.GetSQSMessages(ctx)
+func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
+	sqsMessages, err := c.provider.GetSQSMessages(ctx)
 	if err != nil {
-		// If the queue isn't found, we should trigger the infrastructure controller to re-reconcile
-		if aws.IsNotFound(err) {
-			r.infraController.Trigger()
-		}
-		return reconcile.Result{}, err
+		return reconcile.Result{}, fmt.Errorf("getting messages from queue, %w", err)
 	}
 	if len(sqsMessages) == 0 {
-		return reconcile.Result{RequeueAfter: pollingPeriod}, nil
+		return reconcile.Result{}, nil
 	}
-	instanceIDMap := r.makeInstanceIDMap()
+	instanceIDMap := c.makeInstanceIDMap()
 	errs := make([]error, len(sqsMessages))
 	workqueue.ParallelizeUntil(ctx, 10, len(sqsMessages), func(i int) {
-		errs[i] = r.handleMessage(ctx, instanceIDMap, sqsMessages[i])
+		errs[i] = c.handleMessage(ctx, instanceIDMap, sqsMessages[i])
 	})
-	return reconcile.Result{RequeueAfter: polling.Immediate}, multierr.Combine(errs...)
+	return reconcile.Result{}, multierr.Combine(errs...)
+} + +func (c *Controller) Register(ctx context.Context, m manager.Manager) error { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws.notification")) + go func() { + defer logging.FromContext(ctx).Infof("Shutting down ") + select { + case <-ctx.Done(): + return + case <-c.startAsync: + c.Start(ctx) + } + }() + return nil } // handleMessage gets the node names of the instances involved in the queue message and takes the // assigned action on the instances based on the message event -func (r *Reconciler) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg *sqsapi.Message) error { +func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg *sqsapi.Message) error { // No message to parse in this case if msg == nil || msg.Body == nil { return nil } - evt, err := r.parser.Parse(*msg.Body) + evt, err := c.parser.Parse(*msg.Body) if err != nil { // In the scenario where we can't parse the message, we log that we have an error and then are // forced to just delete the message from the queue logging.FromContext(ctx).Errorf("parsing sqs message, %v", err) - err = r.provider.DeleteSQSMessage(ctx, msg) + err = c.provider.DeleteSQSMessage(ctx, msg) if err != nil { return fmt.Errorf("failed to delete message from queue, %w", err) } @@ -152,7 +163,7 @@ func (r *Reconciler) handleMessage(ctx context.Context, instanceIDMap map[string receivedMessages.WithLabelValues(evt.Kind().String(), "false").Inc() // Since there's no action, just delete the message - err = r.provider.DeleteSQSMessage(ctx, msg) + err = c.provider.DeleteSQSMessage(ctx, msg) if err != nil { return fmt.Errorf("failed to delete message from queue, %w", err) } @@ -168,14 +179,14 @@ func (r *Reconciler) handleMessage(ctx context.Context, instanceIDMap map[string for i := range nodes { node := nodes[i] - err = multierr.Append(err, r.handleNode(ctx, evt, node)) + err = multierr.Append(err, c.handleNode(ctx, evt, node)) } if err != nil { return fmt.Errorf("failed to act on nodes [%s%s], %w", strings.Join(lo.Slice(nodeNames, 0, 3), ","), lo.Ternary(len(nodeNames) > 3, "...", ""), err) } - err = r.provider.DeleteSQSMessage(ctx, msg) + err = c.provider.DeleteSQSMessage(ctx, msg) if err != nil { return fmt.Errorf("failed to delete message from queue, %w", err) } @@ -183,12 +194,12 @@ func (r *Reconciler) handleMessage(ctx context.Context, instanceIDMap map[string return nil } -func (r *Reconciler) handleNode(ctx context.Context, evt event.Interface, node *v1.Node) error { +func (c *Controller) handleNode(ctx context.Context, evt event.Interface, node *v1.Node) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) action := actionForEvent(evt) // Record metric and event for this action - r.notifyForEvent(evt, node) + c.notifyForEvent(evt, node) actionsPerformed.WithLabelValues(action.String()).Inc() // Mark the offering as unavailable in the ICE cache since we got a spot interruption warning @@ -196,41 +207,41 @@ func (r *Reconciler) handleNode(ctx context.Context, evt event.Interface, node * zone := node.Labels[v1.LabelTopologyZone] instanceType := node.Labels[v1.LabelInstanceTypeStable] if zone != "" && instanceType != "" { - r.instanceTypeProvider.MarkOfferingUnavailable(instanceType, zone, awsv1alpha1.CapacityTypeSpot) + c.instanceTypeProvider.MarkOfferingUnavailable(instanceType, zone, awsv1alpha1.CapacityTypeSpot) } } if action != NoAction { - return r.deleteInstance(ctx, node) + return c.deleteInstance(ctx, node) } return nil } -func (r 
*Reconciler) deleteInstance(ctx context.Context, node *v1.Node) error { - if err := r.kubeClient.Delete(ctx, node); err != nil { +func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { + if err := c.kubeClient.Delete(ctx, node); err != nil { return fmt.Errorf("deleting the node on notification, %w", err) } - r.recorder.TerminatingNodeOnNotification(node) + c.recorder.TerminatingNodeOnNotification(node) metrics.NodesTerminatedCounter.WithLabelValues(terminationReasonLabel).Inc() return nil } -func (r *Reconciler) notifyForEvent(evt event.Interface, n *v1.Node) { +func (c *Controller) notifyForEvent(evt event.Interface, n *v1.Node) { switch evt.Kind() { case event.RebalanceRecommendationKind: - r.recorder.EC2SpotRebalanceRecommendation(n) + c.recorder.EC2SpotRebalanceRecommendation(n) case event.ScheduledChangeKind: - r.recorder.EC2HealthWarning(n) + c.recorder.EC2HealthWarning(n) case event.SpotInterruptionKind: - r.recorder.EC2SpotInterruptionWarning(n) + c.recorder.EC2SpotInterruptionWarning(n) case event.StateChangeKind: - typed := evt.(statechangev0.EC2InstanceStateChangeNotification) + typed := evt.(statechange.Event) if lo.Contains([]string{"stopping", "stopped"}, typed.State()) { - r.recorder.EC2StateStopping(n) + c.recorder.EC2StateStopping(n) } else { - r.recorder.EC2StateTerminating(n) + c.recorder.EC2StateTerminating(n) } default: @@ -239,9 +250,9 @@ func (r *Reconciler) notifyForEvent(evt event.Interface, n *v1.Node) { // makeInstanceIDMap builds a map between the instance id that is stored in the // node .spec.providerID and the node name stored on the host -func (r *Reconciler) makeInstanceIDMap() map[string]*v1.Node { +func (c *Controller) makeInstanceIDMap() map[string]*v1.Node { m := map[string]*v1.Node{} - r.cluster.ForEachNode(func(n *state.Node) bool { + c.cluster.ForEachNode(func(n *state.Node) bool { // If this node isn't owned by a provisioner, we shouldn't handle it if _, ok := n.Node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok { return true diff --git a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser.go similarity index 70% rename from pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go rename to pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser.go index 0adb7c50a078..cb67e5871924 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser/aggregatedparser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -package aggregatedparser +package event import ( "encoding/json" @@ -20,7 +20,6 @@ import ( "github.com/samber/lo" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/noop" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" @@ -34,7 +33,7 @@ type parserKey struct { DetailType string } -func newParserKey(metadata event.AWSMetadata) parserKey { +func newParserKey(metadata AWSMetadata) parserKey { return parserKey{ Version: metadata.Version, Source: metadata.Source, @@ -42,7 +41,7 @@ func newParserKey(metadata event.AWSMetadata) parserKey { } } -func newParserKeyFromParser(p event.Parser) parserKey { +func newParserKeyFromParser(p Parser) parserKey { return parserKey{ Version: p.Version(), Source: p.Source(), @@ -51,7 +50,7 @@ func newParserKeyFromParser(p event.Parser) parserKey { } var ( - DefaultParsers = []event.Parser{ + DefaultParsers = []Parser{ statechange.Parser{}, spotinterruption.Parser{}, scheduledchange.Parser{}, @@ -60,34 +59,34 @@ var ( ) type AggregatedParser struct { - parserMap map[parserKey]event.Parser + parserMap map[parserKey]Parser } -func NewAggregatedParser(parsers ...event.Parser) AggregatedParser { +func NewAggregatedParser(parsers ...Parser) AggregatedParser { return AggregatedParser{ - parserMap: lo.SliceToMap(parsers, func(p event.Parser) (parserKey, event.Parser) { + parserMap: lo.SliceToMap(parsers, func(p Parser) (parserKey, Parser) { return newParserKeyFromParser(p), p }), } } -func (p AggregatedParser) Parse(msg string) (event.Interface, error) { +func (p AggregatedParser) Parse(msg string) (Interface, error) { if msg == "" { - return noop.NoOp{}, nil + return noop.Event{}, nil } - md := event.AWSMetadata{} + md := AWSMetadata{} if err := json.Unmarshal([]byte(msg), &md); err != nil { - return noop.NoOp{}, fmt.Errorf("unmarshalling the message as AWSMetadata, %w", err) + return noop.Event{}, fmt.Errorf("unmarshalling the message as AWSMetadata, %w", err) } if parser, ok := p.parserMap[newParserKey(md)]; ok { evt, err := parser.Parse(msg) if err != nil { - return noop.NoOp{}, fmt.Errorf("parsing event message, %w", err) + return noop.Event{}, fmt.Errorf("parsing event message, %w", err) } if evt == nil { - return noop.NoOp{}, nil + return noop.Event{}, nil } return evt, nil } - return noop.NoOp(md), nil + return noop.Event{AWSMetadata: md}, nil } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/metadata.go b/pkg/cloudprovider/aws/controllers/notification/event/metadata.go index 9846e482e51d..3544ae5d0a50 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/metadata.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/metadata.go @@ -16,8 +16,6 @@ package event import ( "time" - - "go.uber.org/zap/zapcore" ) type AWSMetadata struct { @@ -30,20 +28,3 @@ type AWSMetadata struct { Time time.Time `json:"time"` Version string `json:"version"` } - -func (e AWSMetadata) MarshalLogObject(enc zapcore.ObjectEncoder) (err error) { - enc.AddString("source", e.Source) - enc.AddString("detail-type", e.DetailType) - enc.AddString("id", e.ID) - enc.AddTime("time", e.Time) - enc.AddString("region", e.Region) - _ = enc.AddArray("resources", zapcore.ArrayMarshalerFunc(func(enc zapcore.ArrayEncoder) error { - for _, resource := range e.Resources { - enc.AppendString(resource) - } - 
return nil - })) - enc.AddString("version", e.Version) - enc.AddString("account", e.Account) - return err -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/noop/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/noop/handler.go index fecdf1699729..749fce846c45 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/noop/handler.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/noop/handler.go @@ -15,25 +15,17 @@ limitations under the License. package noop import ( - "time" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -type NoOp event.AWSMetadata - -func (NoOp) EventID() string { - return "" +type Event struct { + event.AWSMetadata } -func (NoOp) EC2InstanceIDs() []string { +func (Event) EC2InstanceIDs() []string { return []string{} } -func (NoOp) Kind() event.Kind { +func (Event) Kind() event.Kind { return event.NoOpKind } - -func (NoOp) StartTime() time.Time { - return time.Now() -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go deleted file mode 100644 index fc5b13f2e26a..000000000000 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/handler.go +++ /dev/null @@ -1,39 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package rebalancerecommendation - -import ( - "time" - - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" -) - -type EC2InstanceRebalanceRecommendation AWSEvent - -func (e EC2InstanceRebalanceRecommendation) EventID() string { - return e.ID -} - -func (e EC2InstanceRebalanceRecommendation) EC2InstanceIDs() []string { - return []string{e.Detail.InstanceID} -} - -func (EC2InstanceRebalanceRecommendation) Kind() event.Kind { - return event.RebalanceRecommendationKind -} - -func (e EC2InstanceRebalanceRecommendation) StartTime() time.Time { - return e.Time -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/model.go similarity index 75% rename from pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/model.go index b9edcfaa4e16..9c536c294c04 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/model.go @@ -18,14 +18,22 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -// AWSEvent contains the properties defined by +// Event contains the properties defined by // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/rebalance-recommendations.html#monitor-rebalance-recommendations -type AWSEvent struct { +type Event struct { event.AWSMetadata - Detail EC2InstanceRebalanceRecommendationDetail `json:"detail"` + Detail Detail `json:"detail"` } -type EC2InstanceRebalanceRecommendationDetail struct { +type Detail struct { InstanceID string `json:"instance-id"` } + +func (e Event) EC2InstanceIDs() []string { + return []string{e.Detail.InstanceID} +} + +func (Event) Kind() event.Kind { + return event.RebalanceRecommendationKind +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go index f81d8e3c8611..9f09f0b955e5 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation/parser.go @@ -24,7 +24,7 @@ import ( type Parser struct{} func (p Parser) Parse(msg string) (event.Interface, error) { - evt := EC2InstanceRebalanceRecommendation{} + evt := Event{} if err := json.Unmarshal([]byte(msg), &evt); err != nil { return nil, fmt.Errorf("unmarhsalling the message as EC2InstanceRebalanceRecommendation, %w", err) } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go deleted file mode 100644 index 80e9d35732a8..000000000000 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/handler.go +++ /dev/null @@ -1,43 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package scheduledchange - -import ( - "time" - - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" -) - -type AWSHealthEvent AWSEvent - -func (e AWSHealthEvent) EventID() string { - return e.ID -} - -func (e AWSHealthEvent) EC2InstanceIDs() []string { - ids := make([]string, len(e.Detail.AffectedEntities)) - for i, entity := range e.Detail.AffectedEntities { - ids[i] = entity.EntityValue - } - return ids -} - -func (AWSHealthEvent) Kind() event.Kind { - return event.ScheduledChangeKind -} - -func (e AWSHealthEvent) StartTime() time.Time { - return e.Time -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/model.go similarity index 77% rename from pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/model.go index 8528e579a9f5..a9d9e5d1e636 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/model.go @@ -18,15 +18,27 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -// AWSEvent contains the properties defined in AWS EventBridge schema +// Event contains the properties defined in AWS EventBridge schema // aws.health@AWSHealthEvent v1. -type AWSEvent struct { +type Event struct { event.AWSMetadata - Detail AWSHealthEventDetail `json:"detail"` + Detail Detail `json:"detail"` } -type AWSHealthEventDetail struct { +func (e Event) EC2InstanceIDs() []string { + ids := make([]string, len(e.Detail.AffectedEntities)) + for i, entity := range e.Detail.AffectedEntities { + ids[i] = entity.EntityValue + } + return ids +} + +func (Event) Kind() event.Kind { + return event.ScheduledChangeKind +} + +type Detail struct { EventARN string `json:"eventArn"` EventTypeCode string `json:"eventTypeCode"` Service string `json:"service"` diff --git a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go index 87b81239e64e..8856c52b5d81 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange/parser.go @@ -29,7 +29,7 @@ const ( type Parser struct{} func (p Parser) Parse(msg string) (event.Interface, error) { - evt := AWSHealthEvent{} + evt := Event{} if err := json.Unmarshal([]byte(msg), &evt); err != nil { return nil, fmt.Errorf("unmarhsalling the message as AWSHealthEvent, %w", err) } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go deleted file mode 100644 index 9c5f6c876ba5..000000000000 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/handler.go +++ /dev/null @@ -1,39 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package spotinterruption - -import ( - "time" - - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" -) - -type EC2SpotInstanceInterruptionWarning AWSEvent - -func (e EC2SpotInstanceInterruptionWarning) EventID() string { - return e.ID -} - -func (e EC2SpotInstanceInterruptionWarning) EC2InstanceIDs() []string { - return []string{e.Detail.InstanceID} -} - -func (EC2SpotInstanceInterruptionWarning) Kind() event.Kind { - return event.SpotInterruptionKind -} - -func (e EC2SpotInstanceInterruptionWarning) StartTime() time.Time { - return e.Time -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/model.go similarity index 65% rename from pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/model.go index 4c8a87e4bd11..c146c6ff7994 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/model.go @@ -15,18 +15,36 @@ limitations under the License. package spotinterruption import ( + "time" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -// AWSEvent contains the properties defined in AWS EventBridge schema +// EC2SpotInstanceInterruptionWarning contains the properties defined in AWS EventBridge schema // aws.ec2@EC2SpotInstanceInterruptionWarning v1. 
-type AWSEvent struct { +type Event struct { event.AWSMetadata - Detail EC2SpotInstanceInterruptionWarningDetail `json:"detail"` + Detail Detail `json:"detail"` } -type EC2SpotInstanceInterruptionWarningDetail struct { +type Detail struct { InstanceID string `json:"instance-id"` InstanceAction string `json:"instance-action"` } + +func (e Event) EventID() string { + return e.ID +} + +func (e Event) EC2InstanceIDs() []string { + return []string{e.Detail.InstanceID} +} + +func (Event) Kind() event.Kind { + return event.SpotInterruptionKind +} + +func (e Event) StartTime() time.Time { + return e.Time +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go index 43a591ee2889..269900045cee 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption/parser.go @@ -24,7 +24,7 @@ import ( type Parser struct{} func (p Parser) Parse(msg string) (event.Interface, error) { - evt := EC2SpotInstanceInterruptionWarning{} + evt := Event{} if err := json.Unmarshal([]byte(msg), &evt); err != nil { return nil, fmt.Errorf("unmarhsalling the message as EC2SpotInstanceInterruptionWarning, %w", err) } diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go deleted file mode 100644 index a37d2b671e39..000000000000 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/handler.go +++ /dev/null @@ -1,43 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package statechange - -import ( - "time" - - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" -) - -type EC2InstanceStateChangeNotification AWSEvent - -func (e EC2InstanceStateChangeNotification) EventID() string { - return e.ID -} - -func (e EC2InstanceStateChangeNotification) EC2InstanceIDs() []string { - return []string{e.Detail.InstanceID} -} - -func (e EC2InstanceStateChangeNotification) State() string { - return e.Detail.State -} - -func (EC2InstanceStateChangeNotification) Kind() event.Kind { - return event.StateChangeKind -} - -func (e EC2InstanceStateChangeNotification) StartTime() time.Time { - return e.Time -} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/model.go similarity index 63% rename from pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go rename to pkg/cloudprovider/aws/controllers/notification/event/statechange/model.go index 4a006491a135..78c0a0d3be87 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/unmarshal.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/model.go @@ -15,18 +15,40 @@ limitations under the License. 
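For context on the renamed model files in this commit (spotinterruption above, statechange just below): each Event struct embeds the shared event.AWSMetadata envelope and adds a typed Detail, so a single json.Unmarshal fills both. A rough, self-contained sketch of that decoding pattern; the field and type names here follow the public EventBridge JSON schema and are illustrative, not copied from the patch.

// Illustrative only: decoding an EventBridge-style message into an envelope plus typed detail.
package main

import (
	"encoding/json"
	"fmt"
	"time"
)

// Envelope stands in for the shared metadata; field names mirror the EventBridge JSON keys.
type Envelope struct {
	ID         string    `json:"id"`
	Version    string    `json:"version"`
	Source     string    `json:"source"`
	DetailType string    `json:"detail-type"`
	Time       time.Time `json:"time"`
}

// StateChangeEvent embeds the envelope and adds a typed detail, so one Unmarshal fills both.
type StateChangeEvent struct {
	Envelope
	Detail StateChangeDetail `json:"detail"`
}

type StateChangeDetail struct {
	InstanceID string `json:"instance-id"`
	State      string `json:"state"`
}

func main() {
	msg := `{"id":"abc-123","version":"0","source":"aws.ec2",` +
		`"detail-type":"EC2 Instance State-change Notification","time":"2022-10-13T00:00:00Z",` +
		`"detail":{"instance-id":"i-1234567890abcdef0","state":"shutting-down"}}`
	evt := StateChangeEvent{}
	if err := json.Unmarshal([]byte(msg), &evt); err != nil {
		panic(err)
	}
	fmt.Println(evt.Source, evt.Detail.InstanceID, evt.Detail.State)
}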
package statechange import ( + "time" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" ) -// AWSEvent contains the properties defined in AWS EventBridge schema +// Event contains the properties defined in AWS EventBridge schema // aws.ec2@EC2InstanceStateChangeNotification v1. -type AWSEvent struct { +type Event struct { event.AWSMetadata - Detail EC2InstanceStateChangeNotificationDetail `json:"detail"` + Detail Detail `json:"detail"` } -type EC2InstanceStateChangeNotificationDetail struct { +type Detail struct { InstanceID string `json:"instance-id"` State string `json:"state"` } + +func (e Event) EventID() string { + return e.ID +} + +func (e Event) EC2InstanceIDs() []string { + return []string{e.Detail.InstanceID} +} + +func (e Event) State() string { + return e.Detail.State +} + +func (Event) Kind() event.Kind { + return event.StateChangeKind +} + +func (e Event) StartTime() time.Time { + return e.Time +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go b/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go index 6431b4a2f754..59aee6ca4417 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/statechange/parser.go @@ -29,7 +29,7 @@ var acceptedStates = sets.NewString("stopping", "stopped", "shutting-down", "ter type Parser struct{} func (p Parser) Parse(msg string) (event.Interface, error) { - evt := EC2InstanceStateChangeNotification{} + evt := Event{} if err := json.Unmarshal([]byte(msg), &evt); err != nil { return nil, fmt.Errorf("unmarhsalling the message as EC2InstanceStateChangeNotification, %w", err) } diff --git a/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go b/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go index 7500a362e25c..6874182123bd 100644 --- a/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/notification_benchmark_test.go @@ -47,13 +47,11 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudprovider" awscloudprovider "github.com/aws/karpenter/pkg/cloudprovider/aws" - controllersfake "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/fake" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" "github.com/aws/karpenter/pkg/cloudprovider/fake" "github.com/aws/karpenter/pkg/controllers" - "github.com/aws/karpenter/pkg/controllers/polling" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/test" "github.com/aws/karpenter/pkg/utils/injection" @@ -122,8 +120,7 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { // Set-up the controllers nodeStateController := state.NewNodeController(env.Client, cluster) - infraController := &controllersfake.PollingController{} - notificationController := polling.NewController(notification.NewReconciler(env.Client, recorder, cluster, providers.sqsProvider, instanceTypeProvider, infraController)) + notificationController := notification.NewController(env.Client, recorder, cluster, providers.sqsProvider, instanceTypeProvider, nil) messages, nodes := makeDiverseMessagesAndNodes(messageCount) diff --git 
a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index a2fe04761b97..579d10c2cec7 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -44,16 +44,13 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" - controllersfake "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/fake" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" - scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" - spotinterruptionv0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption" - statechangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/spotinterruption" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/statechange" awsfake "github.com/aws/karpenter/pkg/cloudprovider/aws/fake" "github.com/aws/karpenter/pkg/cloudprovider/fake" - "github.com/aws/karpenter/pkg/controllers/polling" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/test" . "github.com/aws/karpenter/pkg/test/expectations" @@ -82,8 +79,7 @@ var eventBridgeProvider *aws.EventBridgeProvider var recorder *awsfake.EventRecorder var fakeClock *clock.FakeClock var cfg *test.Config -var controller polling.ControllerInterface -var infraController polling.ControllerWithHealthInterface +var controller *notification.Controller var nodeStateController *state.NodeController func TestAPIs(t *testing.T) { @@ -112,9 +108,7 @@ var _ = BeforeEach(func() { ec2api = &awsfake.EC2API{} subnetProvider := aws.NewSubnetProvider(ec2api) instanceTypeProvider = aws.NewInstanceTypeProvider(env.Ctx, mock.Session, cloudprovider.Options{}, ec2api, subnetProvider) - - infraController = polling.NewController(infrastructure.NewReconciler(infrastructure.NewProvider(sqsProvider, eventBridgeProvider))).WithHealth() - controller = polling.NewController(notification.NewReconciler(env.Client, recorder, cluster, sqsProvider, instanceTypeProvider, infraController)) + controller = notification.NewController(env.Client, recorder, cluster, sqsProvider, instanceTypeProvider, nil) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -137,7 +131,6 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) @@ -155,7 +148,6 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(scheduledChangeMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, 
nodeStateController, client.ObjectKeyFromObject(node)) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) @@ -181,7 +173,6 @@ var _ = Describe("Processing Messages", func() { // Wait for the nodes to reconcile with the cluster state ExpectReconcileSucceeded(env.Ctx, nodeStateController, lo.Map(nodes, func(n *v1.Node, _ int) client.ObjectKey { return client.ObjectKeyFromObject(n) })...) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) @@ -212,7 +203,6 @@ var _ = Describe("Processing Messages", func() { // Wait for the nodes to reconcile with the cluster state ExpectReconcileSucceeded(env.Ctx, nodeStateController, lo.Map(nodes, func(n *v1.Node, _ int) client.ObjectKey { return client.ObjectKeyFromObject(n) })...) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) @@ -225,7 +215,6 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(spotInterruptionMessage(node.Spec.ProviderID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNodeExists(env.Ctx, env.Client, node.Name) @@ -241,7 +230,6 @@ var _ = Describe("Processing Messages", func() { } ExpectMessagesCreated(badMessage) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) @@ -258,7 +246,6 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(stateChangeMessage(defaultInstanceID, "creating")) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNodeExists(env.Ctx, env.Client, node.Name) @@ -279,7 +266,6 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) @@ -304,36 +290,9 @@ var _ = Describe("Processing Messages", func() { var _ = Describe("Error Handling", func() { It("should send an error on polling when 
AccessDenied", func() { sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(aws.AccessDeniedCode), awsfake.MaxCalls(0)) - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) // Infrastructure should be healthy before starting - - _, err := controller.Reconcile(env.Ctx, reconcile.Request{}) - Expect(err).ToNot(Succeed()) - }) - It("should trigger an infrastructure reconciliation on an SQS queue when it doesn't exist", func() { - sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing - - infraController := &controllersfake.PollingController{} - controller = polling.NewController(notification.NewReconciler(env.Client, recorder, cluster, sqsProvider, instanceTypeProvider, infraController)) _, err := controller.Reconcile(env.Ctx, reconcile.Request{}) Expect(err).ToNot(Succeed()) - Expect(infraController.TriggerCalls.Load()).Should(BeNumerically("==", 1)) - }) -}) - -var _ = Describe("Infrastructure Coordination", func() { - It("should wait for the infrastructure to be ready before polling SQS", func() { - // Prior to provisioning the infrastructure and the infrastructure being healthy, we shouldn't try to hit the queue - res, err := controller.Reconcile(env.Ctx, reconcile.Request{}) - Expect(err).To(Succeed()) - Expect(res.Requeue).To(BeFalse()) - Expect(res.RequeueAfter).To(BeEquivalentTo(time.Duration(0))) - Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(BeNumerically("==", 0)) - - ExpectReconcileSucceeded(env.Ctx, infraController, types.NamespacedName{}) - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - Expect(infraController.Healthy()).To(BeTrue()) - Expect(sqsapi.ReceiveMessageBehavior.SuccessfulCalls()).To(BeNumerically("==", 1)) }) }) @@ -355,8 +314,8 @@ func awsErrWithCode(code string) awserr.Error { return awserr.New(code, "", fmt.Errorf("")) } -func spotInterruptionMessage(involvedInstanceID string) spotinterruptionv0.AWSEvent { - return spotinterruptionv0.AWSEvent{ +func spotInterruptionMessage(involvedInstanceID string) spotinterruption.Event { + return spotinterruption.Event{ AWSMetadata: event.AWSMetadata{ Version: "0", Account: defaultAccountID, @@ -369,15 +328,15 @@ func spotInterruptionMessage(involvedInstanceID string) spotinterruptionv0.AWSEv Source: ec2Source, Time: time.Now(), }, - Detail: spotinterruptionv0.EC2SpotInstanceInterruptionWarningDetail{ + Detail: spotinterruption.Detail{ InstanceID: involvedInstanceID, InstanceAction: "terminate", }, } } -func stateChangeMessage(involvedInstanceID, state string) statechangev0.AWSEvent { - return statechangev0.AWSEvent{ +func stateChangeMessage(involvedInstanceID, state string) statechange.Event { + return statechange.Event{ AWSMetadata: event.AWSMetadata{ Version: "0", Account: defaultAccountID, @@ -390,7 +349,7 @@ func stateChangeMessage(involvedInstanceID, state string) statechangev0.AWSEvent Source: ec2Source, Time: time.Now(), }, - Detail: statechangev0.EC2InstanceStateChangeNotificationDetail{ + Detail: statechange.Detail{ InstanceID: involvedInstanceID, State: state, }, @@ -398,8 +357,8 @@ func stateChangeMessage(involvedInstanceID, state string) statechangev0.AWSEvent } // TODO: Update the scheduled change message to accurately reflect a real health event -func scheduledChangeMessage(involvedInstanceID string) scheduledchangev0.AWSEvent { - return scheduledchangev0.AWSEvent{ +func scheduledChangeMessage(involvedInstanceID string) scheduledchange.Event { + return 
scheduledchange.Event{ AWSMetadata: event.AWSMetadata{ Version: "0", Account: defaultAccountID, @@ -412,10 +371,10 @@ func scheduledChangeMessage(involvedInstanceID string) scheduledchangev0.AWSEven Source: healthSource, Time: time.Now(), }, - Detail: scheduledchangev0.AWSHealthEventDetail{ + Detail: scheduledchange.Detail{ Service: "EC2", EventTypeCategory: "scheduledChange", - AffectedEntities: []scheduledchangev0.AffectedEntity{ + AffectedEntities: []scheduledchange.AffectedEntity{ { EntityValue: involvedInstanceID, }, diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 2e7b354438cf..39a38d27e5f9 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -21,11 +21,9 @@ import ( "github.com/aws/karpenter/pkg/cloudprovider/aws" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/infrastructure" - "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/nodetemplate" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudprovider/aws/events" "github.com/aws/karpenter/pkg/controllers" - "github.com/aws/karpenter/pkg/controllers/polling" ) func Register(ctx context.Context, provider *aws.CloudProvider, opts *controllers.ControllerOptions) (ret []controllers.Controller) { @@ -35,13 +33,9 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller if opts.Config.EnableInterruptionHandling() { logging.FromContext(ctx).Infof("Enabling interruption handling") - infraProvider := infrastructure.NewProvider(provider.SQSProvider(), provider.EventBridgeProvider()) - infraController := polling.NewController(infrastructure.NewReconciler(infraProvider)).WithHealth() - notificationController := polling.NewController(notification.NewReconciler(opts.KubeClient, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), infraController)) - infraController.OnHealthy = notificationController.Start - nodeTemplateController := nodetemplate.NewController(opts.KubeClient, infraProvider, infraController, notificationController) - - ret = append(ret, infraController, notificationController, nodeTemplateController) + infraController := infrastructure.NewController(opts.KubeClient, provider.SQSProvider(), provider.EventBridgeProvider()) + notificationController := notification.NewController(opts.KubeClient, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), opts.StartAsync) + ret = append(ret, infraController, notificationController) } return ret } diff --git a/pkg/controllers/polling/controller.go b/pkg/controllers/polling/controller.go deleted file mode 100644 index e9548cbc790c..000000000000 --- a/pkg/controllers/polling/controller.go +++ /dev/null @@ -1,200 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package polling - -import ( - "context" - "sync" - "sync/atomic" - "time" - - "github.com/google/uuid" - "github.com/prometheus/client_golang/prometheus" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/types" - "knative.dev/pkg/logging" - controllerruntime "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/event" - "sigs.k8s.io/controller-runtime/pkg/handler" - "sigs.k8s.io/controller-runtime/pkg/manager" - crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" - "sigs.k8s.io/controller-runtime/pkg/predicate" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - "sigs.k8s.io/controller-runtime/pkg/source" - - "github.com/aws/karpenter/pkg/controllers" - "github.com/aws/karpenter/pkg/metrics" -) - -// Immediate isn't exactly immediate for a reconcile result. But it should be passed to the RequeueAfter if you want -// effectively immediate re-reconciliation. This can't be 0 because otherwise controller-runtime won't treat it as a -// valid RequeueAfter value -const Immediate = time.Nanosecond - -type ControllerInterface interface { - controllers.Controller - - Builder(context.Context, manager.Manager) *controllerruntime.Builder - - Start(context.Context) - Stop(context.Context) - Trigger() - Active() bool -} - -// Controller is a wrapper around a controller interface that adds a trigger mechanism for enqueuing -// reconcile requests for the TriggerObject. On a new trigger, Controller will throw away old trigger calls -// by comparing the current triggerGeneration to the previous triggerGeneration. -// Controller also has an active flag that can be enabled or disabled. This serves as a mechanism to stop -// a requeue of a trigger request from the wrapped Reconcile() method of the Controller -type Controller struct { - r controllers.Reconciler - uuid types.UID - - mu sync.RWMutex - active bool - - triggerGeneration atomic.Int64 - trigger chan event.GenericEvent - - cancels sync.Map - - activeMetric prometheus.Gauge - triggerCountMetric prometheus.Counter -} - -type Object struct { - metav1.ObjectMeta - runtime.Object -} - -func NewController(rec controllers.Reconciler) *Controller { - return &Controller{ - r: rec, - uuid: types.UID(uuid.New().String()), - trigger: make(chan event.GenericEvent, 100), - activeMetric: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: rec.Metadata().MetricsSubsystem, - Name: "active", - Help: "Whether the controller is active.", - }, - ), - triggerCountMetric: prometheus.NewCounter( - prometheus.CounterOpts{ - Namespace: metrics.Namespace, - Subsystem: rec.Metadata().MetricsSubsystem, - Name: "trigger_count", - Help: "A counter of the number of times this controller has been triggered.", - }, - ), - } -} - -// WithHealth returns a decorated version of the polling controller that surfaces health information -// based on the success or failure of a reconciliation loop -func (c *Controller) WithHealth() *ControllerWithHealth { - return NewControllerWithHealth(c) -} - -// Start is an idempotent call to kick-off a single reconciliation loop. Based on the intended use of this controller, -// the Reconciler is responsible for requeuing this message back in the WorkQueue so there is a time-based reconciliation -// performed. The Trigger operation is performed to kick-off the loop. 
-func (c *Controller) Start(ctx context.Context) { - c.mu.Lock() - defer c.mu.Unlock() - - if !c.active { - logging.FromContext(ctx).Infof("Starting the %s controller...", c.r.Metadata().Name) - c.active = true - c.Trigger() - } -} - -// Trigger triggers an immediate reconciliation by inserting a message into the event channel. We increase the trigger -// generation here to ensure that any messages that were previously re-queued are thrown away -func (c *Controller) Trigger() { - c.triggerCountMetric.Inc() - obj := &Object{ObjectMeta: metav1.ObjectMeta{Generation: c.triggerGeneration.Add(1), UID: c.uuid}} - c.trigger <- event.GenericEvent{Object: obj} -} - -// Stop sets the state of the controller to active and cancel the current reconciliation contexts, if there are any -func (c *Controller) Stop(ctx context.Context) { - logging.FromContext(ctx).Infof("Stopping the %s controller...", c.r.Metadata().Name) - c.SetActive(false) - c.cancels.Range(func(_ any, c any) bool { - cancel := c.(context.CancelFunc) - cancel() - return true - }) -} - -// Active gets whether the controller is active right now. This value is passed down to the wrapped -// Reconcile method so that the Reconciler can handle cleanup scenarios. The underlying Reconciler is responsible -// for returning a result with no RequeueAfter to stop its activity -func (c *Controller) Active() bool { - c.mu.RLock() - defer c.mu.RUnlock() - - return c.active -} - -// SetActive sets the active flag on the controller -func (c *Controller) SetActive(active bool) { - c.mu.Lock() - defer c.mu.Unlock() - - c.active = active - if active { - c.activeMetric.Set(1) - } else { - c.activeMetric.Set(0) - } -} - -func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(c.r.Metadata().Name)) - ctx, cancel := context.WithCancel(ctx) - - // Store the cancel function for the duration of Reconcile, so we can cancel on a Stop() call - cancelID := uuid.New() - c.cancels.Store(cancelID, cancel) - defer c.cancels.Delete(cancelID) - - return c.r.Reconcile(ctx, req) -} - -func (c *Controller) Builder(_ context.Context, m manager.Manager) *controllerruntime.Builder { - crmetrics.Registry.MustRegister(c.activeMetric, c.triggerCountMetric) - return controllerruntime. - NewControllerManagedBy(m). - Named(c.r.Metadata().Name). - WithEventFilter(predicate.NewPredicateFuncs(func(obj client.Object) bool { - // UUID comparison is a hacky way to get around the fact that controller-runtime requires - // us to perform a watch on some K8s object - return obj.GetUID() == c.uuid && obj.GetGeneration() == c.triggerGeneration.Load() - })). - Watches(&source.Channel{Source: c.trigger}, &handler.EnqueueRequestForObject{}). - For(&v1.Pod{}) // controller-runtime requires us to perform a watch on some object, so let's do it on a fundamental component -} - -func (c *Controller) Register(ctx context.Context, m manager.Manager) error { - return c.Builder(ctx, m).Complete(c) -} diff --git a/pkg/controllers/polling/decorators.go b/pkg/controllers/polling/decorators.go deleted file mode 100644 index 26607688041b..000000000000 --- a/pkg/controllers/polling/decorators.go +++ /dev/null @@ -1,93 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
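For context on the polling controller removed above: its Trigger mechanism pushed a synthetic object into a channel watched through controller-runtime's source.Channel, which is how an otherwise timer-driven controller could enqueue an immediate reconcile. A rough sketch of that wiring under the same controller-runtime version; names are illustrative, and the UID/generation filtering and metrics of the original are omitted.

// Sketch of the on-demand trigger wiring used by the removed polling controller.
// All names here are illustrative; predicates and error handling are omitted.
package triggerexample

import (
	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	controllerruntime "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/event"
	"sigs.k8s.io/controller-runtime/pkg/handler"
	"sigs.k8s.io/controller-runtime/pkg/manager"
	"sigs.k8s.io/controller-runtime/pkg/source"
)

type Trigger struct {
	events chan event.GenericEvent
}

func NewTrigger() *Trigger {
	return &Trigger{events: make(chan event.GenericEvent, 100)}
}

// Fire enqueues a synthetic object; the channel source below turns it into a reconcile request.
func (t *Trigger) Fire() {
	t.events <- event.GenericEvent{Object: &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "trigger"}}}
}

// Builder wires the channel into a controller; controller-runtime still requires a watch on
// some real object, hence the For(&v1.Pod{}) at the end.
func (t *Trigger) Builder(m manager.Manager) *controllerruntime.Builder {
	return controllerruntime.NewControllerManagedBy(m).
		Named("polling-example").
		Watches(&source.Channel{Source: t.events}, &handler.EnqueueRequestForObject{}).
		For(&v1.Pod{})
}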
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package polling - -import ( - "context" - "sync/atomic" - - "github.com/prometheus/client_golang/prometheus" - "knative.dev/pkg/logging" - controllerruntime "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/manager" - crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" - "sigs.k8s.io/controller-runtime/pkg/reconcile" - - "github.com/aws/karpenter/pkg/metrics" -) - -type ControllerWithHealthInterface interface { - ControllerInterface - - Healthy() bool -} - -// ControllerWithHealth is a Controller decorator that wraps a polling controller with health information -// on the success or failure of a reconciliation loop -type ControllerWithHealth struct { - *Controller - - healthy atomic.Bool - healthyMetric prometheus.Gauge - - OnHealthy func(context.Context) - OnUnhealthy func(context.Context) -} - -func NewControllerWithHealth(c *Controller) *ControllerWithHealth { - return &ControllerWithHealth{ - Controller: c, - healthyMetric: prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: c.r.Metadata().MetricsSubsystem, - Name: "healthy", - Help: "Whether the controller is in a healthy state.", - }, - ), - } -} - -func (c *ControllerWithHealth) Healthy() bool { - return c.healthy.Load() -} - -func (c *ControllerWithHealth) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { - callerCtx := logging.WithLogger(ctx, logging.FromContext(ctx).Named(c.r.Metadata().Name)) - res, err := c.Controller.Reconcile(ctx, req) - healthy := err == nil // The controller is considered healthy when it successfully reconciles - c.healthy.Store(healthy) - if healthy { - if c.OnHealthy != nil { - c.OnHealthy(callerCtx) - } - c.healthyMetric.Set(1) - } else { - if c.OnUnhealthy != nil { - c.OnUnhealthy(callerCtx) - } - c.healthyMetric.Set(0) - } - return res, err -} - -func (c *ControllerWithHealth) Builder(ctx context.Context, m manager.Manager) *controllerruntime.Builder { - crmetrics.Registry.MustRegister(c.healthyMetric) - return c.Controller.Builder(ctx, m) -} - -func (c *ControllerWithHealth) Register(ctx context.Context, m manager.Manager) error { - return c.Builder(ctx, m).Complete(c) -} diff --git a/pkg/controllers/polling/suite_test.go b/pkg/controllers/polling/suite_test.go deleted file mode 100644 index b50f7cd50a70..000000000000 --- a/pkg/controllers/polling/suite_test.go +++ /dev/null @@ -1,15 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -package polling_test From b61fbb6a417a59e1734d5a48ca05a2e06281b352 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 13 Oct 2022 14:34:43 -0700 Subject: [PATCH 45/55] Add backoff and update IAM policy --- go.mod | 3 +- go.sum | 2 + .../controllers/notification/controller.go | 46 ++++++++++++++----- .../controllers/notification/event/types.go | 4 +- .../{event/aggregatedparser.go => parser.go} | 19 ++++---- .../controllers/notification/suite_test.go | 3 +- pkg/cloudprovider/aws/controllers/register.go | 2 +- pkg/cloudprovider/aws/sqs.go | 6 +-- test/suites/notification/suite_test.go | 10 ++-- .../cloudformation.yaml | 14 ++++-- 10 files changed, 70 insertions(+), 39 deletions(-) rename pkg/cloudprovider/aws/controllers/notification/{event/aggregatedparser.go => parser.go} (79%) diff --git a/go.mod b/go.mod index d14fcb0cb4ab..bd0ce9a44f1c 100644 --- a/go.mod +++ b/go.mod @@ -6,10 +6,10 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.44.114 + github.com/cenkalti/backoff/v4 v4.1.3 github.com/deckarep/golang-set v1.8.0 github.com/go-logr/logr v1.2.3 github.com/go-logr/zapr v1.2.3 - github.com/google/uuid v1.3.0 github.com/imdario/mergo v0.3.13 github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.2.0 @@ -57,6 +57,7 @@ require ( github.com/google/gnostic v0.5.7-v3refs // indirect github.com/google/go-cmp v0.5.8 // indirect github.com/google/gofuzz v1.2.0 // indirect + github.com/google/uuid v1.3.0 // indirect github.com/grpc-ecosystem/grpc-gateway v1.16.0 // indirect github.com/hashicorp/golang-lru v0.5.4 // indirect github.com/jmespath/go-jmespath v0.4.0 // indirect diff --git a/go.sum b/go.sum index b4b7f441f175..45522395c9a6 100644 --- a/go.sum +++ b/go.sum @@ -76,6 +76,8 @@ github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM github.com/blang/semver/v4 v4.0.0/go.mod h1:IbckMUScFkM3pff0VJDNKRiT6TG/YpiHIM2yvyW5YoQ= github.com/blendle/zapdriver v1.3.1 h1:C3dydBOWYRiOk+B8X9IVZ5IOe+7cl+tGOexN4QqHfpE= github.com/blendle/zapdriver v1.3.1/go.mod h1:mdXfREi6u5MArG4j9fewC+FGnXaBR+T4Ox4J2u4eHCc= +github.com/cenkalti/backoff/v4 v4.1.3 h1:cFAlzYUlVYDysBEH2T5hyJZMh3+5+WCBvSnK6Q8UtC4= +github.com/cenkalti/backoff/v4 v4.1.3/go.mod h1:scbssz8iZGpm3xbr14ovlUdkxfGXNInqkPWOWmG2CLw= github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= github.com/census-instrumentation/opencensus-proto v0.3.0 h1:t/LhUZLVitR1Ow2YOnduCsavhwFUklBMoGVYUCqmCqk= github.com/census-instrumentation/opencensus-proto v0.3.0/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU= diff --git a/pkg/cloudprovider/aws/controllers/notification/controller.go b/pkg/cloudprovider/aws/controllers/notification/controller.go index 04964dc3edc4..b1d7ba44a5b5 100644 --- a/pkg/cloudprovider/aws/controllers/notification/controller.go +++ b/pkg/cloudprovider/aws/controllers/notification/controller.go @@ -21,10 +21,12 @@ import ( "time" sqsapi "github.com/aws/aws-sdk-go/service/sqs" + "github.com/cenkalti/backoff/v4" "github.com/samber/lo" "go.uber.org/multierr" v1 "k8s.io/api/core/v1" "k8s.io/client-go/util/workqueue" + "k8s.io/utils/clock" "knative.dev/pkg/logging" "knative.dev/pkg/ptr" "sigs.k8s.io/controller-runtime/pkg/client" @@ -65,43 +67,57 @@ func (a Action) String() string { // Controller is an AWS notification controller. 
// It plugs into the polling controller to periodically poll the SQS queue for notification messages type Controller struct { - startAsync <-chan struct{} kubeClient client.Client + clk clock.Clock cluster *state.Cluster recorder events.Recorder provider *aws.SQSProvider instanceTypeProvider *aws.InstanceTypeProvider - parser event.AggregatedParser + parser AggregatedParser + backoff *backoff.ExponentialBackOff } -func NewController(kubeClient client.Client, recorder events.Recorder, cluster *state.Cluster, - sqsProvider *aws.SQSProvider, instanceTypeProvider *aws.InstanceTypeProvider, startAsync <-chan struct{}) *Controller { +func NewController(kubeClient client.Client, clk clock.Clock, recorder events.Recorder, cluster *state.Cluster, + sqsProvider *aws.SQSProvider, instanceTypeProvider *aws.InstanceTypeProvider) *Controller { return &Controller{ kubeClient: kubeClient, + clk: clk, cluster: cluster, recorder: recorder, provider: sqsProvider, instanceTypeProvider: instanceTypeProvider, - parser: event.NewAggregatedParser(event.DefaultParsers...), - startAsync: startAsync, + parser: NewAggregatedParser(DefaultParsers...), + backoff: newBackoff(clk), } } func (c *Controller) Start(ctx context.Context) { - for ctx.Err() != nil { + for { list := &v1alpha1.AWSNodeTemplateList{} if err := c.kubeClient.List(ctx, list); err != nil { logging.FromContext(ctx).Errorf("listing aws node templates, %v", err) continue } if len(list.Items) > 0 { + wait := time.Duration(0) // default is to not wait if _, err := c.Reconcile(ctx, reconcile.Request{}); err != nil { logging.FromContext(ctx).Errorf("reconciling notification messages, %v", err) - continue + wait = c.backoff.NextBackOff() + } else { + c.backoff.Reset() + } + select { + case <-ctx.Done(): + return + case <-c.clk.After(wait): } } else { - time.Sleep(time.Minute) + select { + case <-ctx.Done(): + return + case <-c.clk.After(time.Minute): + } } } } @@ -125,11 +141,11 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc func (c *Controller) Register(ctx context.Context, m manager.Manager) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws.notification")) go func() { - defer logging.FromContext(ctx).Infof("Shutting down ") + defer logging.FromContext(ctx).Infof("Shutting down") select { case <-ctx.Done(): return - case <-c.startAsync: + case <-m.Elected(): c.Start(ctx) } }() @@ -287,3 +303,11 @@ func getInvolvedNodes(instanceIDs []string, instanceIDMap map[string]*v1.Node) [ } return nodes } + +func newBackoff(clk clock.Clock) *backoff.ExponentialBackOff { + b := backoff.NewExponentialBackOff() + b.InitialInterval = time.Second * 5 + b.MaxElapsedTime = time.Minute * 30 + b.Clock = clk + return b +} diff --git a/pkg/cloudprovider/aws/controllers/notification/event/types.go b/pkg/cloudprovider/aws/controllers/notification/event/types.go index 96c402a4c0ab..325c82840fcc 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/types.go +++ b/pkg/cloudprovider/aws/controllers/notification/event/types.go @@ -14,9 +14,7 @@ limitations under the License. 
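For context on the controller changes in this commit: the new Start loop shown above polls, backs off exponentially after a failed reconcile, resets the backoff on success, and waits through an injected clock so tests can drive it with a fake clock. A minimal standalone sketch of that loop shape; the poll function, intervals, and logging below are placeholders rather than Karpenter's actual implementation.

// Sketch of a backoff-and-wait polling loop; intervals and the poll callback are illustrative.
package main

import (
	"context"
	"fmt"
	"time"

	"github.com/cenkalti/backoff/v4"
	"k8s.io/utils/clock"
)

func pollLoop(ctx context.Context, clk clock.Clock, poll func(context.Context) error) {
	// The commit above also wires its clock into the backoff; that is omitted here for brevity.
	b := backoff.NewExponentialBackOff()
	b.InitialInterval = 5 * time.Second
	b.MaxElapsedTime = 30 * time.Minute

	for {
		wait := time.Duration(0) // poll again immediately on success
		if err := poll(ctx); err != nil {
			wait = b.NextBackOff() // back off further on consecutive failures
			fmt.Printf("poll failed (%v), retrying in %s\n", err, wait)
		} else {
			b.Reset()
		}
		select {
		case <-ctx.Done():
			return
		case <-clk.After(wait): // a fake clock can drive this channel in tests
		}
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()
	pollLoop(ctx, clock.RealClock{}, func(context.Context) error {
		return fmt.Errorf("queue not ready")
	})
}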
package event -import ( - "fmt" -) +import "fmt" type Parser interface { Parse(string) (Interface, error) diff --git a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser.go b/pkg/cloudprovider/aws/controllers/notification/parser.go similarity index 79% rename from pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser.go rename to pkg/cloudprovider/aws/controllers/notification/parser.go index cb67e5871924..d15c50d2ca3d 100644 --- a/pkg/cloudprovider/aws/controllers/notification/event/aggregatedparser.go +++ b/pkg/cloudprovider/aws/controllers/notification/parser.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package event +package notification import ( "encoding/json" @@ -20,6 +20,7 @@ import ( "github.com/samber/lo" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/noop" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/rebalancerecommendation" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" @@ -33,7 +34,7 @@ type parserKey struct { DetailType string } -func newParserKey(metadata AWSMetadata) parserKey { +func newParserKey(metadata event.AWSMetadata) parserKey { return parserKey{ Version: metadata.Version, Source: metadata.Source, @@ -41,7 +42,7 @@ func newParserKey(metadata AWSMetadata) parserKey { } } -func newParserKeyFromParser(p Parser) parserKey { +func newParserKeyFromParser(p event.Parser) parserKey { return parserKey{ Version: p.Version(), Source: p.Source(), @@ -50,7 +51,7 @@ func newParserKeyFromParser(p Parser) parserKey { } var ( - DefaultParsers = []Parser{ + DefaultParsers = []event.Parser{ statechange.Parser{}, spotinterruption.Parser{}, scheduledchange.Parser{}, @@ -59,22 +60,22 @@ var ( ) type AggregatedParser struct { - parserMap map[parserKey]Parser + parserMap map[parserKey]event.Parser } -func NewAggregatedParser(parsers ...Parser) AggregatedParser { +func NewAggregatedParser(parsers ...event.Parser) AggregatedParser { return AggregatedParser{ - parserMap: lo.SliceToMap(parsers, func(p Parser) (parserKey, Parser) { + parserMap: lo.SliceToMap(parsers, func(p event.Parser) (parserKey, event.Parser) { return newParserKeyFromParser(p), p }), } } -func (p AggregatedParser) Parse(msg string) (Interface, error) { +func (p AggregatedParser) Parse(msg string) (event.Interface, error) { if msg == "" { return noop.Event{}, nil } - md := AWSMetadata{} + md := event.AWSMetadata{} if err := json.Unmarshal([]byte(msg), &md); err != nil { return noop.Event{}, fmt.Errorf("unmarshalling the message as AWSMetadata, %w", err) } diff --git a/pkg/cloudprovider/aws/controllers/notification/suite_test.go b/pkg/cloudprovider/aws/controllers/notification/suite_test.go index 579d10c2cec7..ec3d470b16a7 100644 --- a/pkg/cloudprovider/aws/controllers/notification/suite_test.go +++ b/pkg/cloudprovider/aws/controllers/notification/suite_test.go @@ -95,6 +95,7 @@ var _ = BeforeEach(func() { ctx = injection.WithOptions(ctx, opts) env = test.NewEnvironment(ctx, func(e *test.Environment) { cfg = test.NewConfig() + fakeClock = &clock.FakeClock{} cloudProvider = &fake.CloudProvider{} cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) nodeStateController = state.NewNodeController(env.Client, cluster) @@ -108,7 +109,7 @@ var _ = BeforeEach(func() { ec2api = &awsfake.EC2API{} subnetProvider := 
aws.NewSubnetProvider(ec2api) instanceTypeProvider = aws.NewInstanceTypeProvider(env.Ctx, mock.Session, cloudprovider.Options{}, ec2api, subnetProvider) - controller = notification.NewController(env.Client, recorder, cluster, sqsProvider, instanceTypeProvider, nil) + controller = notification.NewController(env.Client, fakeClock, recorder, cluster, sqsProvider, instanceTypeProvider) }) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) diff --git a/pkg/cloudprovider/aws/controllers/register.go b/pkg/cloudprovider/aws/controllers/register.go index 39a38d27e5f9..63a56a238ca2 100644 --- a/pkg/cloudprovider/aws/controllers/register.go +++ b/pkg/cloudprovider/aws/controllers/register.go @@ -34,7 +34,7 @@ func Register(ctx context.Context, provider *aws.CloudProvider, opts *controller logging.FromContext(ctx).Infof("Enabling interruption handling") infraController := infrastructure.NewController(opts.KubeClient, provider.SQSProvider(), provider.EventBridgeProvider()) - notificationController := notification.NewController(opts.KubeClient, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider(), opts.StartAsync) + notificationController := notification.NewController(opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider()) ret = append(ret, infraController, notificationController) } return ret diff --git a/pkg/cloudprovider/aws/sqs.go b/pkg/cloudprovider/aws/sqs.go index 207f2d40ee27..58e6bb3cad68 100644 --- a/pkg/cloudprovider/aws/sqs.go +++ b/pkg/cloudprovider/aws/sqs.go @@ -24,7 +24,7 @@ import ( "github.com/aws/aws-sdk-go/service/sqs/sqsiface" "github.com/samber/lo" - awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" + "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudprovider/aws/utils" "github.com/aws/karpenter/pkg/utils/atomic" "github.com/aws/karpenter/pkg/utils/functional" @@ -69,7 +69,7 @@ func NewSQSProvider(ctx context.Context, client sqsiface.SQSAPI) *SQSProvider { provider.createQueueInput = &sqs.CreateQueueInput{ QueueName: aws.String(provider.queueName), Tags: map[string]*string{ - awsv1alpha1.DiscoveryTagKey: aws.String(injection.GetOptions(ctx).ClusterName), + v1alpha1.DiscoveryTagKey: aws.String(injection.GetOptions(ctx).ClusterName), }, } provider.getQueueURLInput = &sqs.GetQueueUrlInput{ @@ -273,5 +273,5 @@ func (s *SQSProvider) getQueuePolicy(ctx context.Context) (*QueuePolicy, error) // This is used because the max-len for a queue name is 80 characters but the maximum cluster name // length is 100 func getQueueName(ctx context.Context) string { - return fmt.Sprintf("Karpenter-Queue-%s", utils.GetClusterNameHash(ctx, 20)) + return fmt.Sprintf("Karpenter-EventQueue-%s", utils.GetClusterNameHash(ctx, 20)) } diff --git a/test/suites/notification/suite_test.go b/test/suites/notification/suite_test.go index 73dd822e389b..1a820ec9929c 100644 --- a/test/suites/notification/suite_test.go +++ b/test/suites/notification/suite_test.go @@ -34,7 +34,7 @@ import ( "github.com/aws/karpenter/pkg/apis/provisioning/v1alpha5" awsv1alpha1 "github.com/aws/karpenter/pkg/cloudprovider/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event" - scheduledchangev0 "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" + "github.com/aws/karpenter/pkg/cloudprovider/aws/controllers/notification/event/scheduledchange" "github.com/aws/karpenter/pkg/test" 
"github.com/aws/karpenter/test/pkg/environment" ) @@ -240,8 +240,8 @@ var _ = Describe("Notification", Label("AWS"), func() { }) // TODO: Update the scheduled change message to accurately reflect a real health event -func scheduledChangeMessage(region, accountID, involvedInstanceID string) scheduledchangev0.AWSEvent { - return scheduledchangev0.AWSEvent{ +func scheduledChangeMessage(region, accountID, involvedInstanceID string) scheduledchange.Event { + return scheduledchange.Event{ AWSMetadata: event.AWSMetadata{ Version: "0", Account: accountID, @@ -254,10 +254,10 @@ func scheduledChangeMessage(region, accountID, involvedInstanceID string) schedu Source: "aws.health", Time: time.Now(), }, - Detail: scheduledchangev0.AWSHealthEventDetail{ + Detail: scheduledchange.Detail{ Service: "EC2", EventTypeCategory: "scheduledChange", - AffectedEntities: []scheduledchangev0.AffectedEntity{ + AffectedEntities: []scheduledchange.AffectedEntity{ { EntityValue: involvedInstanceID, }, diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml index ea4dbb0356bc..7374b3487b6a 100644 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml @@ -4,6 +4,9 @@ Parameters: ClusterName: Type: String Description: "EKS cluster name" + ClusterNameSHA: + Type: String + Description: "Truncated checksum of EKS cluster name" Resources: KarpenterNodeInstanceProfile: Type: "AWS::IAM::InstanceProfile" @@ -72,23 +75,24 @@ Resources: Version: "2012-10-17" Statement: - Effect: Allow - Resource: !Sub "arn:${AWS::Partition}:sqs:${AWS::Region}:${AWS::AccountId}:Karpenter-${ClusterName}-Queue" + Resource: !Sub "arn:${AWS::Partition}:sqs:${AWS::Region}:${AWS::AccountId}:Karpenter-EventQueue-${ClusterNameSHA}" Action: # Write Operations - sqs:CreateQueue - sqs:TagQueue - sqs:SetQueueAttributes - - sqs:DeleteMessage - sqs:DeleteQueue + - sqs:DeleteMessage # Read Operations - sqs:GetQueueUrl - sqs:GetQueueAttributes - sqs:ReceiveMessage - Effect: Allow - Resource: !Sub "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/Karpenter-${ClusterName}-*" + Resource: !Sub "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/Karpenter-${ClusterNameSHA}-*" Action: + # Write Operations - events:PutRule - - events:PutTargets - events:TagResource + - events:PutTargets - events:DeleteRule - - events:RemoveTargets + - events:RemoveTargets \ No newline at end of file From 1a434e6d52ab349e842e32d1caf36a80adf3334b Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 14 Oct 2022 12:47:09 -0700 Subject: [PATCH 46/55] Move components that will be shared into top-level --- .../unavailableofferings.go} | 27 +++++++++++-------- .../aws/cloudprovider/cloudprovider.go | 4 +-- .../aws/cloudprovider/instance.go | 17 ++++++------ .../aws/cloudprovider/instancetypes.go | 13 +++++---- .../aws/cloudprovider/instancetypes_test.go | 2 +- .../aws/cloudprovider/launchtemplate.go | 3 ++- .../aws/cloudprovider/suite_test.go | 9 ++++--- .../aws/{cloudprovider => errors}/errors.go | 20 ++++++++------ 8 files changed, 54 insertions(+), 41 deletions(-) rename pkg/cloudproviders/aws/{cloudprovider/unavailableofferingscache.go => cache/unavailableofferings.go} (69%) rename pkg/cloudproviders/aws/{cloudprovider => errors}/errors.go (82%) diff --git 
a/pkg/cloudproviders/aws/cloudprovider/unavailableofferingscache.go b/pkg/cloudproviders/aws/cache/unavailableofferings.go similarity index 69% rename from pkg/cloudproviders/aws/cloudprovider/unavailableofferingscache.go rename to pkg/cloudproviders/aws/cache/unavailableofferings.go index fcb9f41f4b93..94c645d17e30 100644 --- a/pkg/cloudproviders/aws/cloudprovider/unavailableofferingscache.go +++ b/pkg/cloudproviders/aws/cache/unavailableofferings.go @@ -12,11 +12,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -package cloudprovider +package cache import ( "context" "fmt" + "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/ec2" @@ -24,45 +25,49 @@ import ( "knative.dev/pkg/logging" ) -// UnavailableOfferingsCache stores any offerings that return ICE (insufficient capacity errors) when +const ( + UnavailableOfferingsTTL = 3 * time.Minute +) + +// UnavailableOfferings stores any offerings that return ICE (insufficient capacity errors) when // attempting to launch the capacity. These offerings are ignored as long as they are in the cache on // GetInstanceTypes responses -type UnavailableOfferingsCache struct { +type UnavailableOfferings struct { // key: ::, value: struct{}{} cache *cache.Cache } -func NewUnavailableOfferingsCache() *UnavailableOfferingsCache { - return &UnavailableOfferingsCache{ - cache: cache.New(UnfulfillableCapacityErrorCacheTTL, CacheCleanupInterval), +func NewUnavailableOfferings(c *cache.Cache) *UnavailableOfferings { + return &UnavailableOfferings{ + cache: c, } } // IsUnavailable returns true if the offering appears in the cache -func (u *UnavailableOfferingsCache) IsUnavailable(instanceType, zone, capacityType string) bool { +func (u *UnavailableOfferings) IsUnavailable(instanceType, zone, capacityType string) bool { _, found := u.cache.Get(u.key(instanceType, zone, capacityType)) return found } // MarkUnavailable communicates recently observed temporary capacity shortages in the provided offerings -func (u *UnavailableOfferingsCache) MarkUnavailable(ctx context.Context, unavailableReason, instanceType, zone, capacityType string) { +func (u *UnavailableOfferings) MarkUnavailable(ctx context.Context, unavailableReason, instanceType, zone, capacityType string) { // even if the key is already in the cache, we still need to call Set to extend the cached entry's TTL logging.FromContext(ctx).Debugf("%s for offering { instanceType: %s, zone: %s, capacityType: %s }, avoiding for %s", unavailableReason, instanceType, zone, capacityType, - UnfulfillableCapacityErrorCacheTTL) + UnavailableOfferingsTTL) u.cache.SetDefault(u.key(instanceType, zone, capacityType), struct{}{}) } -func (u *UnavailableOfferingsCache) MarkUnavailableForFleetErr(ctx context.Context, fleetErr *ec2.CreateFleetError, capacityType string) { +func (u *UnavailableOfferings) MarkUnavailableForFleetErr(ctx context.Context, fleetErr *ec2.CreateFleetError, capacityType string) { instanceType := aws.StringValue(fleetErr.LaunchTemplateAndOverrides.Overrides.InstanceType) zone := aws.StringValue(fleetErr.LaunchTemplateAndOverrides.Overrides.AvailabilityZone) u.MarkUnavailable(ctx, aws.StringValue(fleetErr.ErrorCode), instanceType, zone, capacityType) } // key returns the cache key for all offerings in the cache -func (u *UnavailableOfferingsCache) key(instanceType string, zone string, capacityType string) string { +func (u *UnavailableOfferings) key(instanceType string, zone string, capacityType string) string { return 
fmt.Sprintf("%s:%s:%s", capacityType, instanceType, zone) } diff --git a/pkg/cloudproviders/aws/cloudprovider/cloudprovider.go b/pkg/cloudproviders/aws/cloudprovider/cloudprovider.go index 165c8ad5d40b..9da693c5f8df 100644 --- a/pkg/cloudproviders/aws/cloudprovider/cloudprovider.go +++ b/pkg/cloudproviders/aws/cloudprovider/cloudprovider.go @@ -46,9 +46,9 @@ import ( k8sClient "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - awsv1alpha1 "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" "github.com/aws/karpenter/pkg/cloudproviders/aws/apis/v1alpha1" + awscache "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" "github.com/aws/karpenter/pkg/cloudproviders/aws/cloudprovider/amifamily" "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" "github.com/aws/karpenter/pkg/utils/functional" @@ -114,7 +114,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud logging.FromContext(ctx).Fatalf("Checking EC2 API connectivity, %s", err) } subnetProvider := NewSubnetProvider(ec2api) - instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider, NewUnavailableOfferingsCache()) + instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider, awscache.NewUnavailableOfferings(cache.New(awscache.UnavailableOfferingsTTL, CacheCleanupInterval))) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, diff --git a/pkg/cloudproviders/aws/cloudprovider/instance.go b/pkg/cloudproviders/aws/cloudprovider/instance.go index 93f8c67a3dbb..3ae2d683ff8d 100644 --- a/pkg/cloudproviders/aws/cloudprovider/instance.go +++ b/pkg/cloudproviders/aws/cloudprovider/instance.go @@ -36,6 +36,7 @@ import ( "knative.dev/pkg/logging" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" + awserrors "github.com/aws/karpenter/pkg/cloudproviders/aws/errors" "github.com/aws/karpenter/pkg/cloudproviders/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" @@ -79,7 +80,7 @@ func (p *InstanceProvider) Create(ctx context.Context, provider *v1alpha1.AWS, n } id, err := p.launchInstance(ctx, provider, nodeRequest) - if isLaunchTemplateNotFound(err) { + if awserrors.IsLaunchTemplateNotFound(err) { // retry once if launch template is not found. 
This allows karpenter to generate a new LT if the // cache was out-of-sync on the first try id, err = p.launchInstance(ctx, provider, nodeRequest) @@ -117,11 +118,11 @@ func (p *InstanceProvider) Terminate(ctx context.Context, node *v1.Node) error { if _, err = p.ec2api.TerminateInstancesWithContext(ctx, &ec2.TerminateInstancesInput{ InstanceIds: []*string{id}, }); err != nil { - if isNotFound(err) { + if awserrors.IsNotFound(err) { return nil } if _, errMsg := p.getInstance(ctx, aws.StringValue(id)); err != nil { - if isInstanceTerminated(errMsg) || isNotFound(errMsg) { + if awserrors.IsInstanceTerminated(errMsg) || awserrors.IsNotFound(errMsg) { logging.FromContext(ctx).Debugf("Instance already terminated, %s", node.Name) return nil } @@ -167,7 +168,7 @@ func (p *InstanceProvider) launchInstance(ctx context.Context, provider *v1alpha createFleetOutput, err := p.createFleetBatcher.CreateFleet(ctx, createFleetInput) if err != nil { - if isLaunchTemplateNotFound(err) { + if awserrors.IsLaunchTemplateNotFound(err) { for _, lt := range launchTemplateConfigs { p.launchTemplateProvider.Invalidate(ctx, aws.StringValue(lt.LaunchTemplateSpecification.LaunchTemplateName)) } @@ -303,18 +304,18 @@ func (p *InstanceProvider) getOverrides(instanceTypeOptions []cloudprovider.Inst func (p *InstanceProvider) getInstance(ctx context.Context, id string) (*ec2.Instance, error) { describeInstancesOutput, err := p.ec2api.DescribeInstancesWithContext(ctx, &ec2.DescribeInstancesInput{InstanceIds: aws.StringSlice([]string{id})}) - if isNotFound(err) { + if awserrors.IsNotFound(err) { return nil, err } if err != nil { return nil, fmt.Errorf("failed to describe ec2 instances, %w", err) } if len(describeInstancesOutput.Reservations) != 1 || len(describeInstancesOutput.Reservations[0].Instances) != 1 { - return nil, InstanceTerminatedError{fmt.Errorf("expected instance but got 0")} + return nil, awserrors.InstanceTerminatedError{Err: fmt.Errorf("expected instance but got 0")} } instance := describeInstancesOutput.Reservations[0].Instances[0] if *instance.State.Name == ec2.InstanceStateNameTerminated { - return nil, InstanceTerminatedError{fmt.Errorf("instance is in terminated state")} + return nil, awserrors.InstanceTerminatedError{Err: fmt.Errorf("instance is in terminated state")} } if injection.GetOptions(ctx).GetAWSNodeNameConvention() == options.ResourceName { return instance, nil @@ -358,7 +359,7 @@ func (p *InstanceProvider) instanceToNode(ctx context.Context, instance *ec2.Ins func (p *InstanceProvider) updateUnavailableOfferingsCache(ctx context.Context, errors []*ec2.CreateFleetError, capacityType string) { for _, err := range errors { - if isUnfulfillableCapacity(err) { + if awserrors.IsUnfulfillableCapacity(err) { p.instanceTypeProvider.unavailableOfferings.MarkUnavailableForFleetErr(ctx, err, capacityType) } } diff --git a/pkg/cloudproviders/aws/cloudprovider/instancetypes.go b/pkg/cloudproviders/aws/cloudprovider/instancetypes.go index fd6774fc7753..797eada7169e 100644 --- a/pkg/cloudproviders/aws/cloudprovider/instancetypes.go +++ b/pkg/cloudproviders/aws/cloudprovider/instancetypes.go @@ -22,7 +22,7 @@ import ( "time" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - + aws2 "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" "github.com/aws/aws-sdk-go/aws" @@ -42,10 +42,9 @@ import ( ) const ( - InstanceTypesCacheKey = "types" - InstanceTypeZonesCacheKeyPrefix = "zones:" - InstanceTypesAndZonesCacheTTL = 5 * 
time.Minute - UnfulfillableCapacityErrorCacheTTL = 3 * time.Minute + InstanceTypesCacheKey = "types" + InstanceTypeZonesCacheKeyPrefix = "zones:" + InstanceTypesAndZonesCacheTTL = 5 * time.Minute ) type InstanceTypeProvider struct { @@ -58,12 +57,12 @@ type InstanceTypeProvider struct { // Has one cache entry for all the zones for each subnet selector (key: InstanceTypesZonesCacheKeyPrefix:) // Values cached *before* considering insufficient capacity errors from the unavailableOfferings cache. cache *cache.Cache - unavailableOfferings *UnavailableOfferingsCache + unavailableOfferings *aws2.UnavailableOfferings cm *pretty.ChangeMonitor } func NewInstanceTypeProvider(ctx context.Context, sess *session.Session, options cloudprovider.Options, - ec2api ec2iface.EC2API, subnetProvider *SubnetProvider, unavailableOfferings *UnavailableOfferingsCache) *InstanceTypeProvider { + ec2api ec2iface.EC2API, subnetProvider *SubnetProvider, unavailableOfferings *aws2.UnavailableOfferings) *InstanceTypeProvider { return &InstanceTypeProvider{ ec2api: ec2api, region: *sess.Config.Region, diff --git a/pkg/cloudproviders/aws/cloudprovider/instancetypes_test.go b/pkg/cloudproviders/aws/cloudprovider/instancetypes_test.go index 711ef2ba5f1b..3f5f2236ee9f 100644 --- a/pkg/cloudproviders/aws/cloudprovider/instancetypes_test.go +++ b/pkg/cloudproviders/aws/cloudprovider/instancetypes_test.go @@ -707,7 +707,7 @@ var _ = Describe("Instance Types", func() { ExpectNotScheduled(ctx, env.Client, pod) // capacity shortage is over - expire the item from the cache and try again fakeEC2API.InsufficientCapacityPools.Set([]fake.CapacityPool{}) - unavailableOfferingsCache.cache.Delete(fmt.Sprintf("%s:%s:%s", awsv1alpha1.CapacityTypeOnDemand, "inf1.6xlarge", "test-zone-1a")) + internalUnavailableOfferingsCache.Delete(fmt.Sprintf("%s:%s:%s", awsv1alpha1.CapacityTypeOnDemand, "inf1.6xlarge", "test-zone-1a")) pod = ExpectProvisioned(ctx, env.Client, controller, pod)[0] node := ExpectScheduled(ctx, env.Client, pod) Expect(node.Labels).To(HaveKeyWithValue(v1.LabelInstanceTypeStable, "inf1.6xlarge")) diff --git a/pkg/cloudproviders/aws/cloudprovider/launchtemplate.go b/pkg/cloudproviders/aws/cloudprovider/launchtemplate.go index 14389b5408d0..f0f2c7f3710c 100644 --- a/pkg/cloudproviders/aws/cloudprovider/launchtemplate.go +++ b/pkg/cloudproviders/aws/cloudprovider/launchtemplate.go @@ -38,6 +38,7 @@ import ( "github.com/aws/karpenter/pkg/cloudproviders/aws/apis/v1alpha1" "github.com/aws/karpenter/pkg/cloudproviders/aws/cloudprovider/amifamily" + awserrors "github.com/aws/karpenter/pkg/cloudproviders/aws/errors" "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/pretty" @@ -155,7 +156,7 @@ func (p *LaunchTemplateProvider) ensureLaunchTemplate(ctx context.Context, optio LaunchTemplateNames: []*string{aws.String(name)}, }) // Create LT if one doesn't exist - if isNotFound(err) { + if awserrors.IsNotFound(err) { launchTemplate, err = p.createLaunchTemplate(ctx, options) if err != nil { return nil, fmt.Errorf("creating launch template, %w", err) diff --git a/pkg/cloudproviders/aws/cloudprovider/suite_test.go b/pkg/cloudproviders/aws/cloudprovider/suite_test.go index 94be424fc1a0..f9885f7a79d9 100644 --- a/pkg/cloudproviders/aws/cloudprovider/suite_test.go +++ b/pkg/cloudproviders/aws/cloudprovider/suite_test.go @@ -36,6 +36,7 @@ import ( . "github.com/onsi/gomega" . 
"knative.dev/pkg/logging/testing" + awscache "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" "github.com/aws/karpenter/pkg/cloudproviders/aws/cloudprovider/amifamily" "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" . "github.com/aws/karpenter/pkg/test/expectations" @@ -61,7 +62,8 @@ var securityGroupCache *cache.Cache var subnetCache *cache.Cache var ssmCache *cache.Cache var ec2Cache *cache.Cache -var unavailableOfferingsCache *UnavailableOfferingsCache +var internalUnavailableOfferingsCache *cache.Cache +var unavailableOfferingsCache *awscache.UnavailableOfferings var instanceTypeCache *cache.Cache var instanceTypeProvider *InstanceTypeProvider var fakeEC2API *fake.EC2API @@ -99,7 +101,8 @@ var _ = BeforeSuite(func() { ctx = injection.WithOptions(ctx, opts) ctx, stop = context.WithCancel(ctx) launchTemplateCache = cache.New(CacheTTL, CacheCleanupInterval) - unavailableOfferingsCache = NewUnavailableOfferingsCache() + internalUnavailableOfferingsCache = cache.New(awscache.UnavailableOfferingsTTL, CacheCleanupInterval) + unavailableOfferingsCache = awscache.NewUnavailableOfferings(internalUnavailableOfferingsCache) securityGroupCache = cache.New(CacheTTL, CacheCleanupInterval) subnetCache = cache.New(CacheTTL, CacheCleanupInterval) ssmCache = cache.New(CacheTTL, CacheCleanupInterval) @@ -171,7 +174,7 @@ var _ = BeforeEach(func() { launchTemplateCache.Flush() securityGroupCache.Flush() subnetCache.Flush() - unavailableOfferingsCache.cache.Flush() + internalUnavailableOfferingsCache.Flush() ssmCache.Flush() ec2Cache.Flush() instanceTypeCache.Flush() diff --git a/pkg/cloudproviders/aws/cloudprovider/errors.go b/pkg/cloudproviders/aws/errors/errors.go similarity index 82% rename from pkg/cloudproviders/aws/cloudprovider/errors.go rename to pkg/cloudproviders/aws/errors/errors.go index d4c1778e3eb8..3d1d7df113ad 100644 --- a/pkg/cloudproviders/aws/cloudprovider/errors.go +++ b/pkg/cloudproviders/aws/errors/errors.go @@ -12,7 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ -package cloudprovider +package errors import ( "errors" @@ -43,10 +43,14 @@ var ( ) type InstanceTerminatedError struct { - error + Err error } -func isInstanceTerminated(err error) bool { +func (e InstanceTerminatedError) Error() string { + return e.Err.Error() +} + +func IsInstanceTerminated(err error) bool { if err == nil { return false } @@ -54,10 +58,10 @@ func isInstanceTerminated(err error) bool { return errors.As(err, &itErr) } -// isNotFound returns true if the err is an AWS error (even if it's +// IsNotFound returns true if the err is an AWS error (even if it's // wrapped) and is a known to mean "not found" (as opposed to a more // serious or unexpected error) -func isNotFound(err error) bool { +func IsNotFound(err error) bool { if err == nil { return false } @@ -68,14 +72,14 @@ func isNotFound(err error) bool { return false } -// isUnfulfillableCapacity returns true if the Fleet err means +// IsUnfulfillableCapacity returns true if the Fleet err means // capacity is temporarily unavailable for launching. // This could be due to account limits, insufficient ec2 capacity, etc. 
-func isUnfulfillableCapacity(err *ec2.CreateFleetError) bool { +func IsUnfulfillableCapacity(err *ec2.CreateFleetError) bool { return lo.Contains(unfulfillableCapacityErrorCodes, *err.ErrorCode) } -func isLaunchTemplateNotFound(err error) bool { +func IsLaunchTemplateNotFound(err error) bool { if err == nil { return false } From 2a2ceb2ff0771f370f5f6de5616743ce1dcb427c Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 14 Oct 2022 13:52:32 -0700 Subject: [PATCH 47/55] Create cloudprovider and controller options --- cmd/controller/main.go | 24 ++++- cmd/webhook/main.go | 2 +- hack/docs/instancetypes_gen_docs.go | 2 +- .../aws/cloudprovider/cloudprovider.go | 59 ++---------- .../aws/cloudprovider/suite_test.go | 7 +- .../aws/controllers/controllers.go | 45 +++++++-- .../aws/{ => controllers}/events/recorder.go | 0 .../controllers/notification/controller.go | 5 +- .../controllers/notification/suite_test.go | 3 + pkg/cloudproviders/aws/options.go | 94 +++++++++++++++++++ pkg/controllers/controllers.go | 13 +-- pkg/operator/options.go | 12 ++- 12 files changed, 180 insertions(+), 86 deletions(-) rename pkg/cloudproviders/aws/{ => controllers}/events/recorder.go (100%) create mode 100644 pkg/cloudproviders/aws/options.go diff --git a/cmd/controller/main.go b/cmd/controller/main.go index 67777ca260f0..ca724fc59e83 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -19,27 +19,45 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "github.com/aws/karpenter/pkg/cloudproviders/aws" awscloudprovider "github.com/aws/karpenter/pkg/cloudproviders/aws/cloudprovider" + awscontrollers "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers" "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" cloudprovidermetrics "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider/metrics" "github.com/aws/karpenter/pkg/controllers" + "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/operator" ) func main() { options, manager := operator.NewOptionsWithManagerOrDie() - cloudProvider := cloudprovider.CloudProvider(awscloudprovider.NewCloudProvider(options.Ctx, cloudprovider.Options{ + awsOptions := aws.NewOptionsOrDie(options.Ctx, cloudprovider.Options{ ClientSet: options.Clientset, KubeClient: options.KubeClient, StartAsync: options.StartAsync, - })) + }) + cloudProvider := cloudprovider.CloudProvider(awscloudprovider.New(options.Ctx, awsOptions)) if hp, ok := cloudProvider.(operator.HealthCheck); ok { utilruntime.Must(manager.AddHealthzCheck("cloud-provider", hp.LivenessProbe)) } cloudProvider = cloudprovidermetrics.Decorate(cloudProvider) + + cluster := state.NewCluster(options.Clock, options.Config, options.KubeClient, cloudProvider) + + var c []operator.Controller + c = append(c, controllers.GetControllers(options, cluster, cloudProvider)...) + c = append(c, awscontrollers.GetControllers(options.Ctx, awscontrollers.Options{ + Options: awsOptions, + Config: options.Config, + Clock: options.Clock, + Cluster: cluster, + Recorder: options.Recorder, + KubeClient: options.KubeClient, + })...) 
+ if err := operator.RegisterControllers(options.Ctx, manager, - controllers.GetControllers(options, cloudProvider)..., + c..., ).Start(options.Ctx); err != nil { panic(fmt.Sprintf("Unable to start manager, %s", err)) } diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index f0f05c185614..abec04ba0fb7 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -24,6 +24,6 @@ import ( func main() { webhooks.Initialize(func(ctx context.Context, o cloudprovider.Options) cloudprovider.CloudProvider { - return awscloudprovider.NewCloudProvider(ctx, o) + return awscloudprovider.New(ctx, o) }) } diff --git a/hack/docs/instancetypes_gen_docs.go b/hack/docs/instancetypes_gen_docs.go index e50cad1846a8..c81e125689fb 100644 --- a/hack/docs/instancetypes_gen_docs.go +++ b/hack/docs/instancetypes_gen_docs.go @@ -56,7 +56,7 @@ func main() { opts = opts.MustParse() ctx := injection.WithOptions(context.Background(), *opts) - cp := awscloudprovider.NewCloudProvider(ctx, cloudprovider.Options{}) + cp := awscloudprovider.New(ctx, cloudprovider.Options{}) provider := v1alpha1.AWS{SubnetSelector: map[string]string{ "*": "*", }} diff --git a/pkg/cloudproviders/aws/cloudprovider/cloudprovider.go b/pkg/cloudproviders/aws/cloudprovider/cloudprovider.go index 4c832b7276e9..21cf6b7428f6 100644 --- a/pkg/cloudproviders/aws/cloudprovider/cloudprovider.go +++ b/pkg/cloudproviders/aws/cloudprovider/cloudprovider.go @@ -21,15 +21,9 @@ import ( "fmt" "net" "net/http" - "time" - "github.com/aws/aws-sdk-go/aws" + sdk "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" - "github.com/aws/aws-sdk-go/aws/client" - "github.com/aws/aws-sdk-go/aws/ec2metadata" - "github.com/aws/aws-sdk-go/aws/endpoints" - "github.com/aws/aws-sdk-go/aws/request" - "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/ec2" "github.com/aws/aws-sdk-go/service/ssm" "github.com/patrickmn/go-cache" @@ -47,25 +41,15 @@ import ( "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" awsv1alpha1 "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" + "github.com/aws/karpenter/pkg/cloudproviders/aws" "github.com/aws/karpenter/pkg/cloudproviders/aws/apis/v1alpha1" - awscache "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" "github.com/aws/karpenter/pkg/cloudproviders/aws/cloudprovider/amifamily" "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" "github.com/aws/karpenter/pkg/utils/functional" "github.com/aws/karpenter/pkg/utils/injection" - "github.com/aws/karpenter/pkg/utils/project" ) const ( - // CacheTTL restricts QPS to AWS APIs to this interval for verifying setup - // resources. This value represents the maximum eventual consistency between - // AWS actual state and the controller's ability to provision those - // resources. Cache hits enable faster provisioning and reduced API load on - // AWS APIs, which can have a serious impact on performance and scalability. - // DO NOT CHANGE THIS VALUE WITHOUT DUE CONSIDERATION - CacheTTL = 60 * time.Second - // CacheCleanupInterval triggers cache cleanup (lazy eviction) at this interval. 
- CacheCleanupInterval = 10 * time.Minute // MaxInstanceTypes defines the number of instance type options to pass to CreateFleet MaxInstanceTypes = 20 ) @@ -82,7 +66,8 @@ type CloudProvider struct { kubeClient k8sClient.Client } -func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *CloudProvider { +func New(ctx context.Context, options aws.Options) *CloudProvider { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) // if performing validation only, then only the Validate()/Default() methods will be called which // don't require any other setup if options.WebhookOnly { @@ -92,29 +77,17 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud return cp } - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) - sess := withUserAgent(session.Must(session.NewSession( - request.WithRetryer( - &aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint}, - client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, - ), - ))) - if *sess.Config.Region == "" { - logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") - *sess.Config.Region = getRegionFromIMDS(sess) - } - logging.FromContext(ctx).Debugf("Using AWS region %s", *sess.Config.Region) kubeDNSIP, err := kubeDNSIP(ctx, options.ClientSet) if err != nil { logging.FromContext(ctx).Fatalf("Unable to detect the IP of the kube-dns service, %s", err) } logging.FromContext(ctx).Debugf("Discovered DNS IP %s", kubeDNSIP) - ec2api := ec2.New(sess) + ec2api := ec2.New(options.Session) if err := checkEC2Connectivity(ec2api); err != nil { logging.FromContext(ctx).Fatalf("Checking EC2 API connectivity, %s", err) } subnetProvider := NewSubnetProvider(ec2api) - instanceTypeProvider := NewInstanceTypeProvider(ctx, sess, options, ec2api, subnetProvider, awscache.NewUnavailableOfferings(cache.New(awscache.UnavailableOfferingsTTL, CacheCleanupInterval))) + instanceTypeProvider := NewInstanceTypeProvider(ctx, options.Session, options.Options, ec2api, subnetProvider, options.UnavailableOfferingsCache) cloudprovider := &CloudProvider{ instanceTypeProvider: instanceTypeProvider, instanceProvider: NewInstanceProvider(ctx, ec2api, instanceTypeProvider, subnetProvider, @@ -122,7 +95,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud ctx, ec2api, options.ClientSet, - amifamily.New(ctx, ssm.New(sess), ec2api, cache.New(CacheTTL, CacheCleanupInterval), cache.New(CacheTTL, CacheCleanupInterval), options.KubeClient), + amifamily.New(ctx, ssm.New(options.Session), ec2api, cache.New(options.CacheTTL, options.CacheCleanupInterval), cache.New(options.CacheTTL, options.CacheCleanupInterval), options.KubeClient), NewSecurityGroupProvider(ec2api), getCABundle(ctx), options.StartAsync, @@ -140,7 +113,7 @@ func NewCloudProvider(ctx context.Context, options cloudprovider.Options) *Cloud // checkEC2Connectivity makes a dry-run call to DescribeInstanceTypes. If it fails, we provide an early indicator that we // are having issues connecting to the EC2 API. 
func checkEC2Connectivity(api *ec2.EC2) error { - _, err := api.DescribeInstanceTypes(&ec2.DescribeInstanceTypesInput{DryRun: aws.Bool(true)}) + _, err := api.DescribeInstanceTypes(&ec2.DescribeInstanceTypesInput{DryRun: sdk.Bool(true)}) var aerr awserr.Error if errors.As(err, &aerr) && aerr.Code() == "DryRunOperation" { return nil @@ -254,22 +227,6 @@ func defaultLabels(provisioner *v1alpha5.Provisioner) { } } -// get the current region from EC2 IMDS -func getRegionFromIMDS(sess *session.Session) string { - region, err := ec2metadata.New(sess).Region() - if err != nil { - panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) - } - return region -} - -// withUserAgent adds a karpenter specific user-agent string to AWS session -func withUserAgent(sess *session.Session) *session.Session { - userAgent := fmt.Sprintf("karpenter.sh-%s", project.Version) - sess.Handlers.Build.PushBack(request.MakeAddToUserAgentFreeFormHandler(userAgent)) - return sess -} - func getCABundle(ctx context.Context) *string { // Discover CA Bundle from the REST client. We could alternatively // have used the simpler client-go InClusterConfig() method. diff --git a/pkg/cloudproviders/aws/cloudprovider/suite_test.go b/pkg/cloudproviders/aws/cloudprovider/suite_test.go index ab6d6ac27157..1e3324efa5fe 100644 --- a/pkg/cloudproviders/aws/cloudprovider/suite_test.go +++ b/pkg/cloudproviders/aws/cloudprovider/suite_test.go @@ -38,7 +38,6 @@ import ( awscache "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" "github.com/aws/karpenter/pkg/cloudproviders/aws/cloudprovider/amifamily" - "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" . "github.com/aws/karpenter/pkg/test/expectations" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" @@ -210,7 +209,7 @@ var _ = Describe("Allocation", func() { It("should default requirements hooks in webhook mode", func() { // clear our hook to ensure that creating the cloud provider in webhook mode sets it v1alpha5.DefaultHook = func(ctx context.Context, provisoner *v1alpha5.Provisioner) {} - NewCloudProvider(ctx, cloudprovider.Options{WebhookOnly: true}) + New(ctx, aws.Options{WebhookOnly: true}) v1alpha5.DefaultHook(ctx, provisioner) Expect(provisioner.Spec.Requirements).To(ContainElement(v1.NodeSelectorRequirement{ Key: v1alpha5.LabelCapacityType, @@ -528,13 +527,13 @@ var _ = Describe("Allocation", func() { Context("Webhook", func() { It("should validate when in webhook mode", func() { - cp := NewCloudProvider(ctx, cloudprovider.Options{WebhookOnly: true}) + cp := New(ctx, aws.Options{WebhookOnly: true}) // just ensures that validation doesn't depend on anything as when created for the webhook // we don't fully initialize the cloud provider Expect(cp.Validate(ctx, provisioner)).To(Succeed()) }) It("should default when in webhookmode", func() { - cp := NewCloudProvider(ctx, cloudprovider.Options{WebhookOnly: true}) + cp := New(ctx, aws.Options{WebhookOnly: true}) // just ensures that validation doesn't depend on anything as when created for the webhook // we don't fully initialize the cloud provider cp.Default(ctx, provisioner) diff --git a/pkg/cloudproviders/aws/controllers/controllers.go b/pkg/cloudproviders/aws/controllers/controllers.go index 8dd6f4fbc45f..cb1df6f888eb 100644 --- a/pkg/cloudproviders/aws/controllers/controllers.go +++ b/pkg/cloudproviders/aws/controllers/controllers.go @@ -17,20 +17,47 @@ package controllers import ( "context" + "github.com/aws/aws-sdk-go/service/eventbridge" + 
"github.com/aws/aws-sdk-go/service/sqs" + "k8s.io/utils/clock" + "knative.dev/pkg/logging" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/aws/karpenter/pkg/cloudproviders/aws" + "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/events" + "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/infrastructure" + "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/notification" + "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/providers" + "github.com/aws/karpenter/pkg/config" + "github.com/aws/karpenter/pkg/controllers/state" + coreevents "github.com/aws/karpenter/pkg/events" "github.com/aws/karpenter/pkg/operator" ) -func GetControllers(ctx context.Context, opts *operator.Options) []operator.Controller { +type Options struct { + aws.Options + + Config config.Config + Clock clock.Clock + Cluster *state.Cluster + Recorder coreevents.Recorder + KubeClient client.Client +} + +func GetControllers(ctx context.Context, options Options) []operator.Controller { var ret []operator.Controller - //rec := events.NewRecorder(opts.Recorder) + rec := events.NewRecorder(options.Recorder) + + sqsProvider := providers.NewSQSProvider(ctx, sqs.New(options.Session)) + eventBridgeProvider := providers.NewEventBridgeProvider(eventbridge.New(options.Session), sqsProvider) // Only enable spot interruption handling controllers when the feature flag is enabled - //if opts.Config.EnableInterruptionHandling() { - // logging.FromContext(ctx).Infof("Enabling interruption handling") - // - // infraController := infrastructure.NewController(opts.KubeClient, provider.SQSProvider(), provider.EventBridgeProvider()) - // notificationController := notification.NewController(opts.KubeClient, opts.Clock, rec, opts.Cluster, provider.SQSProvider(), provider.InstanceTypeProvider()) - // ret = append(ret, infraController, notificationController) - //} + if options.Config.EnableInterruptionHandling() { + logging.FromContext(ctx).Infof("Enabling interruption handling") + + infraController := infrastructure.NewController(options.KubeClient, sqsProvider, eventBridgeProvider) + notificationController := notification.NewController(options.KubeClient, options.Clock, rec, options.Cluster, sqsProvider, options.UnavailableOfferingsCache) + ret = append(ret, infraController, notificationController) + } return ret } diff --git a/pkg/cloudproviders/aws/events/recorder.go b/pkg/cloudproviders/aws/controllers/events/recorder.go similarity index 100% rename from pkg/cloudproviders/aws/events/recorder.go rename to pkg/cloudproviders/aws/controllers/events/recorder.go diff --git a/pkg/cloudproviders/aws/controllers/notification/controller.go b/pkg/cloudproviders/aws/controllers/notification/controller.go index 16f592fee559..44dcf28d8ce5 100644 --- a/pkg/cloudproviders/aws/controllers/notification/controller.go +++ b/pkg/cloudproviders/aws/controllers/notification/controller.go @@ -35,13 +35,12 @@ import ( "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" - awscache "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" - awsv1alpha1 "github.com/aws/karpenter/pkg/cloudproviders/aws/apis/v1alpha1" + awscache "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" + "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/events" "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/notification/event/statechange" 
"github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/providers" - "github.com/aws/karpenter/pkg/cloudproviders/aws/events" "github.com/aws/karpenter/pkg/cloudproviders/aws/utils" "github.com/aws/karpenter/pkg/controllers/state" "github.com/aws/karpenter/pkg/metrics" diff --git a/pkg/cloudproviders/aws/controllers/notification/suite_test.go b/pkg/cloudproviders/aws/controllers/notification/suite_test.go index 5da9d713d8df..a799ca746de6 100644 --- a/pkg/cloudproviders/aws/controllers/notification/suite_test.go +++ b/pkg/cloudproviders/aws/controllers/notification/suite_test.go @@ -27,6 +27,7 @@ import ( "github.com/aws/aws-sdk-go/service/sqs" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "github.com/patrickmn/go-cache" "github.com/samber/lo" v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -42,6 +43,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/cloudproviders/aws/apis/v1alpha1" awscache "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" + awscloudprovider "github.com/aws/karpenter/pkg/cloudproviders/aws/cloudprovider" "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/notification" "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/notification/event" "github.com/aws/karpenter/pkg/cloudproviders/aws/controllers/notification/event/scheduledchange" @@ -99,6 +101,7 @@ var _ = BeforeEach(func() { cluster = state.NewCluster(fakeClock, cfg, env.Client, cloudProvider) nodeStateController = state.NewNodeController(env.Client, cluster) recorder = awsfake.NewEventRecorder() + unavailableOfferingsCache = awscache.NewUnavailableOfferings(cache.New(awscache.UnavailableOfferingsTTL, awscloudprovider.CacheCleanupInterval)) sqsapi = &awsfake.SQSAPI{} sqsProvider = providers.NewSQSProvider(ctx, sqsapi) diff --git a/pkg/cloudproviders/aws/options.go b/pkg/cloudproviders/aws/options.go new file mode 100644 index 000000000000..f62ed3076190 --- /dev/null +++ b/pkg/cloudproviders/aws/options.go @@ -0,0 +1,94 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package aws + +import ( + "context" + "fmt" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/aws/client" + "github.com/aws/aws-sdk-go/aws/ec2metadata" + "github.com/aws/aws-sdk-go/aws/endpoints" + "github.com/aws/aws-sdk-go/aws/request" + "github.com/aws/aws-sdk-go/aws/session" + "github.com/patrickmn/go-cache" + "knative.dev/pkg/logging" + + awscache "github.com/aws/karpenter/pkg/cloudproviders/aws/cache" + "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" + "github.com/aws/karpenter/pkg/utils/project" +) + +const ( + // cacheTTL restricts QPS to AWS APIs to this interval for verifying setup + // resources. This value represents the maximum eventual consistency between + // AWS actual state and the controller's ability to provision those + // resources. Cache hits enable faster provisioning and reduced API load on + // AWS APIs, which can have a serious impact on performance and scalability. 
+ // DO NOT CHANGE THIS VALUE WITHOUT DUE CONSIDERATION + cacheTTL = 60 * time.Second + // cacheCleanupInterval triggers cache cleanup (lazy eviction) at this interval. + cacheCleanupInterval = 10 * time.Minute +) + +type Options struct { + cloudprovider.Options + + Session *session.Session + UnavailableOfferingsCache *awscache.UnavailableOfferings + + CacheCleanupInterval time.Duration + CacheTTL time.Duration +} + +func NewOptionsOrDie(ctx context.Context, options cloudprovider.Options) Options { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named("aws")) + sess := withUserAgent(session.Must(session.NewSession( + request.WithRetryer( + &aws.Config{STSRegionalEndpoint: endpoints.RegionalSTSEndpoint}, + client.DefaultRetryer{NumMaxRetries: client.DefaultRetryerMaxNumRetries}, + ), + ))) + if *sess.Config.Region == "" { + logging.FromContext(ctx).Debug("AWS region not configured, asking EC2 Instance Metadata Service") + *sess.Config.Region = getRegionFromIMDS(sess) + } + logging.FromContext(ctx).Debugf("Using AWS region %s", *sess.Config.Region) + return Options{ + Options: options, + Session: sess, + UnavailableOfferingsCache: awscache.NewUnavailableOfferings(cache.New(awscache.UnavailableOfferingsTTL, cacheCleanupInterval)), + CacheCleanupInterval: cacheCleanupInterval, + CacheTTL: cacheTTL, + } +} + +// get the current region from EC2 IMDS +func getRegionFromIMDS(sess *session.Session) string { + region, err := ec2metadata.New(sess).Region() + if err != nil { + panic(fmt.Sprintf("Failed to call the metadata server's region API, %s", err)) + } + return region +} + +// withUserAgent adds a karpenter specific user-agent string to AWS session +func withUserAgent(sess *session.Session) *session.Session { + userAgent := fmt.Sprintf("karpenter.sh-%s", project.Version) + sess.Handlers.Build.PushBack(request.MakeAddToUserAgentFreeFormHandler(userAgent)) + return sess +} diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index 17b16a2c71e0..3e04b91142b1 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -15,10 +15,7 @@ limitations under the License. 
package controllers import ( - "knative.dev/pkg/logging" - "github.com/aws/karpenter/pkg/cloudproviders/common/cloudprovider" - "github.com/aws/karpenter/pkg/config" "github.com/aws/karpenter/pkg/controllers/consolidation" "github.com/aws/karpenter/pkg/controllers/counter" metricspod "github.com/aws/karpenter/pkg/controllers/metrics/pod" @@ -36,14 +33,8 @@ func init() { metrics.MustRegister() // Registers cross-controller metrics } -func GetControllers(opts operator.Options, cloudProvider cloudprovider.CloudProvider) []operator.Controller { - cfg, err := config.New(opts.Ctx, opts.Clientset, opts.Cmw) - if err != nil { - // this does not happen if the config map is missing or invalid, only if some other error occurs - logging.FromContext(opts.Ctx).Fatalf("unable to load config, %s", err) - } - cluster := state.NewCluster(opts.Clock, cfg, opts.KubeClient, cloudProvider) - provisioner := provisioning.NewProvisioner(opts.Ctx, cfg, opts.KubeClient, opts.Clientset.CoreV1(), opts.Recorder, cloudProvider, cluster) +func GetControllers(opts operator.Options, cluster *state.Cluster, cloudProvider cloudprovider.CloudProvider) []operator.Controller { + provisioner := provisioning.NewProvisioner(opts.Ctx, opts.Config, opts.KubeClient, opts.Clientset.CoreV1(), opts.Recorder, cloudProvider, cluster) metricsstate.StartMetricScraper(opts.Ctx, cluster) diff --git a/pkg/operator/options.go b/pkg/operator/options.go index 14b186c9bb66..a480832d099e 100644 --- a/pkg/operator/options.go +++ b/pkg/operator/options.go @@ -22,7 +22,6 @@ import ( utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/kubernetes" clientgoscheme "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/rest" "k8s.io/client-go/util/flowcontrol" "k8s.io/utils/clock" "knative.dev/pkg/configmap/informer" @@ -33,6 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "github.com/aws/karpenter/pkg/apis" + "github.com/aws/karpenter/pkg/config" "github.com/aws/karpenter/pkg/events" "github.com/aws/karpenter/pkg/utils/injection" "github.com/aws/karpenter/pkg/utils/options" @@ -57,7 +57,7 @@ func init() { type Options struct { Ctx context.Context Recorder events.Recorder - Config *rest.Config + Config config.Config KubeClient client.Client Clientset *kubernetes.Clientset Clock clock.Clock @@ -96,10 +96,16 @@ func NewOptionsWithManagerOrDie() (Options, manager.Manager) { recorder = events.NewLoadSheddingRecorder(recorder) recorder = events.NewDedupeRecorder(recorder) + cfg, err := config.New(ctx, clientSet, cmw) + if err != nil { + // this does not happen if the config map is missing or invalid, only if some other error occurs + logging.FromContext(ctx).Fatalf("unable to load config, %s", err) + } + return Options{ Ctx: ctx, Recorder: recorder, - Config: controllerRuntimeConfig, + Config: cfg, Clientset: clientSet, KubeClient: manager.GetClient(), Clock: clock.RealClock{}, From c6dd15a649c4562a2cb9f623d89109f89bc25fca Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 25 Oct 2022 18:47:02 -0500 Subject: [PATCH 48/55] Add enableInterruptionHandling feature flag --- cmd/controller/main.go | 2 +- pkg/controllers/controllers.go | 4 +- pkg/controllers/interruption/action.go | 22 +++++++++++ pkg/controllers/interruption/controller.go | 36 +++++++----------- .../interruption_benchmark_test.go | 38 ++++++++++--------- .../interruption/messages/metadata.go | 15 -------- .../interruption/messages/types.go | 16 +++++++- pkg/controllers/interruption/metrics.go | 8 ++++ pkg/controllers/interruption/suite_test.go | 17 +++++---- 
pkg/controllers/nodetemplate/controller.go | 26 +++++++++---- pkg/controllers/nodetemplate/metrics.go | 2 +- 11 files changed, 111 insertions(+), 75 deletions(-) create mode 100644 pkg/controllers/interruption/action.go diff --git a/cmd/controller/main.go b/cmd/controller/main.go index e9ba2b44e0ca..63ec4b6ba272 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -43,7 +43,7 @@ func main() { lo.Must0(operator.AddHealthzCheck("cloud-provider", awsCloudProvider.LivenessProbe)) cloudProvider := metrics.Decorate(awsCloudProvider) - clusterState := state.NewCluster(operator.SettingsStore.InjectSettings(ctx), operator.Clock, operator.GetClient(), cloudProvider) + clusterState := state.NewCluster(ctx, operator.Clock, operator.GetClient(), cloudProvider) operator. WithControllers(ctx, controllers.NewControllers( ctx, diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index dca3aa2b7000..3418e66b7afd 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -28,12 +28,12 @@ import ( ) func NewControllers(ctx awscontext.Context, cluster *state.Cluster) []controller.Controller { - rec := events.NewRecorder(ctx.EventRecorder) + eventRecorder := events.NewRecorder(ctx.EventRecorder) sqsProvider := providers.NewSQS(ctx, sqs.New(ctx.Session)) eventBridgeProvider := providers.NewEventBridge(eventbridge.New(ctx.Session), sqsProvider) return []controller.Controller{ nodetemplate.NewController(ctx.KubeClient, sqsProvider, eventBridgeProvider), - interruption.NewController(ctx.KubeClient, ctx.Clock, rec, cluster, sqsProvider, ctx.UnavailableOfferingsCache), + interruption.NewController(ctx.KubeClient, ctx.Clock, eventRecorder, cluster, sqsProvider, ctx.UnavailableOfferingsCache), } } diff --git a/pkg/controllers/interruption/action.go b/pkg/controllers/interruption/action.go new file mode 100644 index 000000000000..a66190c7ddb1 --- /dev/null +++ b/pkg/controllers/interruption/action.go @@ -0,0 +1,22 @@ +package interruption + +import "fmt" + +type Action byte + +const ( + _ Action = iota + CordonAndDrain + NoAction +) + +func (a Action) String() string { + switch a { + case CordonAndDrain: + return "CordonAndDrain" + case NoAction: + return "NoAction" + default: + return fmt.Sprintf("Unsupported Action %d", a) + } +} diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go index 5952684500bc..d18cc1233b85 100644 --- a/pkg/controllers/interruption/controller.go +++ b/pkg/controllers/interruption/controller.go @@ -34,10 +34,8 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter-core/pkg/controllers/state" - "github.com/aws/karpenter-core/pkg/metrics" - operatorcontroller "github.com/aws/karpenter-core/pkg/operator/controller" + "github.com/aws/karpenter-core/pkg/operator/scheme" + "github.com/aws/karpenter/pkg/apis" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" awscache "github.com/aws/karpenter/pkg/cache" interruptionevents "github.com/aws/karpenter/pkg/controllers/interruption/events" @@ -46,25 +44,16 @@ import ( "github.com/aws/karpenter/pkg/controllers/providers" "github.com/aws/karpenter/pkg/events" "github.com/aws/karpenter/pkg/utils" -) - -type Action byte -const ( - _ Action = iota - CordonAndDrain - NoAction + "github.com/aws/karpenter-core/pkg/apis/config/settings" + "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" + 
"github.com/aws/karpenter-core/pkg/controllers/state" + "github.com/aws/karpenter-core/pkg/metrics" + operatorcontroller "github.com/aws/karpenter-core/pkg/operator/controller" ) -func (a Action) String() string { - switch a { - case CordonAndDrain: - return "CordonAndDrain" - case NoAction: - return "NoAction" - default: - return fmt.Sprintf("Unsupported Action %d", a) - } +func init() { + lo.Must0(apis.AddToScheme(scheme.Scheme)) } // Controller is an AWS interruption controller. @@ -103,7 +92,8 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc return reconcile.Result{}, fmt.Errorf("listing node templates, %w", err) } - if len(list.Items) > 0 { + if settings.FromContext(ctx).EnableInterruptionHandling && len(list.Items) > 0 { + active.Set(1) sqsMessages, err := c.provider.GetSQSMessages(ctx) if err != nil { return reconcile.Result{}, fmt.Errorf("getting messages from queue, %w", err) @@ -117,8 +107,10 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc errs[i] = c.handleMessage(ctx, instanceIDMap, sqsMessages[i]) }) return reconcile.Result{}, multierr.Combine(errs...) + } else { + active.Set(0) } - return reconcile.Result{RequeueAfter: time.Minute}, nil + return reconcile.Result{RequeueAfter: time.Second * 10}, nil } func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontroller.Builder { diff --git a/pkg/controllers/interruption/interruption_benchmark_test.go b/pkg/controllers/interruption/interruption_benchmark_test.go index 3a8daf79d5e3..4742c7387326 100644 --- a/pkg/controllers/interruption/interruption_benchmark_test.go +++ b/pkg/controllers/interruption/interruption_benchmark_test.go @@ -44,20 +44,21 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter-core/pkg/cloudprovider/fake" awscache "github.com/aws/karpenter/pkg/cache" awscontext "github.com/aws/karpenter/pkg/context" "github.com/aws/karpenter/pkg/controllers/interruption" "github.com/aws/karpenter/pkg/controllers/providers" "github.com/aws/karpenter/pkg/events" + "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter-core/pkg/cloudprovider/fake" + + awsfake "github.com/aws/karpenter/pkg/fake" + "github.com/aws/karpenter-core/pkg/controllers/state" - "github.com/aws/karpenter-core/pkg/operator" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/operator/options" "github.com/aws/karpenter-core/pkg/test" - awsfake "github.com/aws/karpenter/pkg/fake" ) var r = rand.New(rand.NewSource(time.Now().Unix())) @@ -78,6 +79,7 @@ func BenchmarkNotification100(b *testing.B) { benchmarkNotificationController(b, 100) } +//nolint:gocyclo func benchmarkNotificationController(b *testing.B, messageCount int) { opts := options.Options{ AWSIsolatedVPC: true, @@ -143,27 +145,29 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { if err != nil { b.Fatalf("creating manager, %v", err) } - m = operator.RegisterControllers(env.Ctx, test.SettingsStore{}, m, interruptionController, nodeStateController) + // Registering controller with the manager + if err = interruptionController.Builder(ctx, m).Complete(interruptionController); err != nil { + b.Fatalf("registering interruption controller, %v", err) + } + if err = nodeStateController.Builder(ctx, m).Complete(nodeStateController); err != nil { + b.Fatalf("registering nodeState 
controller, %v", err) + } + + b.ResetTimer() + start := time.Now() managerErr := make(chan error) go func() { logging.FromContext(env.Ctx).Infof("Starting controller manager") - if err := m.Start(env.Ctx); err != nil { - managerErr <- err - } + managerErr <- m.Start(env.Ctx) }() - b.ResetTimer() - start := time.Now() - - interruptionController.Start(env.Ctx) - done := providers.monitorMessagesProcessed(env.Ctx, eventRecorder, messageCount) - select { - case err := <-managerErr: - b.Fatalf("starting manager, %v", err) - case <-done: + case <-providers.monitorMessagesProcessed(env.Ctx, eventRecorder, messageCount): + case err = <-managerErr: + b.Fatalf("running manager, %v", err) } + duration := time.Since(start) b.ReportMetric(float64(messageCount), "Messages") b.ReportMetric(duration.Seconds(), "TotalDurationInSeconds") diff --git a/pkg/controllers/interruption/messages/metadata.go b/pkg/controllers/interruption/messages/metadata.go index 726582f35a31..6785b48f8ecb 100644 --- a/pkg/controllers/interruption/messages/metadata.go +++ b/pkg/controllers/interruption/messages/metadata.go @@ -13,18 +13,3 @@ limitations under the License. */ package messages - -import ( - "time" -) - -type Metadata struct { - Account string `json:"account"` - DetailType string `json:"detail-type"` - ID string `json:"id"` - Region string `json:"region"` - Resources []string `json:"resources"` - Source string `json:"source"` - Time time.Time `json:"time"` - Version string `json:"version"` -} diff --git a/pkg/controllers/interruption/messages/types.go b/pkg/controllers/interruption/messages/types.go index 1a1f473a3786..40f4e8c31601 100644 --- a/pkg/controllers/interruption/messages/types.go +++ b/pkg/controllers/interruption/messages/types.go @@ -14,7 +14,10 @@ limitations under the License. 
package messages -import "fmt" +import ( + "fmt" + "time" +) type Parser interface { Parse(string) (Interface, error) @@ -57,3 +60,14 @@ func (k Kind) String() string { return fmt.Sprintf("Unsupported Kind %d", k) } } + +type Metadata struct { + Account string `json:"account"` + DetailType string `json:"detail-type"` + ID string `json:"id"` + Region string `json:"region"` + Resources []string `json:"resources"` + Source string `json:"source"` + Time time.Time `json:"time"` + Version string `json:"version"` +} diff --git a/pkg/controllers/interruption/metrics.go b/pkg/controllers/interruption/metrics.go index 2aa347d73f0e..5a39ce4db7a3 100644 --- a/pkg/controllers/interruption/metrics.go +++ b/pkg/controllers/interruption/metrics.go @@ -39,6 +39,14 @@ var ( Buckets: metrics.DurationBuckets(), }, ) + active = prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: metrics.Namespace, + Subsystem: subsystem, + Name: "active", + Help: "Whether the message polling is currently active.", + }, + ) receivedMessages = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.Namespace, diff --git a/pkg/controllers/interruption/suite_test.go b/pkg/controllers/interruption/suite_test.go index ade87e040971..18aa120a52a2 100644 --- a/pkg/controllers/interruption/suite_test.go +++ b/pkg/controllers/interruption/suite_test.go @@ -38,14 +38,6 @@ import ( _ "knative.dev/pkg/system/testing" "sigs.k8s.io/controller-runtime/pkg/client" - "github.com/aws/karpenter-core/pkg/apis/config/settings" - "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter-core/pkg/cloudprovider/fake" - "github.com/aws/karpenter-core/pkg/controllers/state" - "github.com/aws/karpenter-core/pkg/operator/injection" - "github.com/aws/karpenter-core/pkg/operator/options" - "github.com/aws/karpenter-core/pkg/test" - . "github.com/aws/karpenter-core/pkg/test/expectations" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" awscache "github.com/aws/karpenter/pkg/cache" awscontext "github.com/aws/karpenter/pkg/context" @@ -58,6 +50,15 @@ import ( "github.com/aws/karpenter/pkg/errors" "github.com/aws/karpenter/pkg/events" awsfake "github.com/aws/karpenter/pkg/fake" + + "github.com/aws/karpenter-core/pkg/apis/config/settings" + "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter-core/pkg/cloudprovider/fake" + "github.com/aws/karpenter-core/pkg/controllers/state" + "github.com/aws/karpenter-core/pkg/operator/injection" + "github.com/aws/karpenter-core/pkg/operator/options" + "github.com/aws/karpenter-core/pkg/test" + . 
"github.com/aws/karpenter-core/pkg/test/expectations" ) const ( diff --git a/pkg/controllers/nodetemplate/controller.go b/pkg/controllers/nodetemplate/controller.go index e3d0d5ad3507..2cfafd5cce41 100644 --- a/pkg/controllers/nodetemplate/controller.go +++ b/pkg/controllers/nodetemplate/controller.go @@ -17,9 +17,10 @@ package nodetemplate import ( "context" "net/http" + "time" + "github.com/samber/lo" "k8s.io/apimachinery/pkg/api/errors" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" "knative.dev/pkg/logging" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" @@ -27,11 +28,12 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/aws/karpenter-core/pkg/apis/config/settings" operatorcontroller "github.com/aws/karpenter-core/pkg/operator/controller" "github.com/aws/karpenter-core/pkg/operator/scheme" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - awsapis "github.com/aws/karpenter/pkg/apis" + "github.com/aws/karpenter/pkg/apis" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" "github.com/aws/karpenter/pkg/controllers/providers" ) @@ -39,7 +41,7 @@ import ( const Name = "nodetemplate" func init() { - utilruntime.Must(awsapis.AddToScheme(scheme.Scheme)) + lo.Must0(apis.AddToScheme(scheme.Scheme)) } // Controller is the AWSNodeTemplate Controller @@ -48,6 +50,8 @@ func init() { type Controller struct { kubeClient client.Client provider *providers.Infrastructure + + lastInfrastructureReconcile time.Time } func NewController(kubeClient client.Client, sqsProvider *providers.SQS, eventBridgeProvider *providers.EventBridge) *Controller { @@ -94,13 +98,19 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco return reconcile.Result{}, err } active.Set(1) - if err := c.provider.Create(ctx); err != nil { - healthy.Set(0) - return reconcile.Result{}, err + if settings.FromContext(ctx).EnableInterruptionHandling && + c.lastInfrastructureReconcile.Add(time.Hour).Before(time.Now()) { + + if err := c.provider.Create(ctx); err != nil { + healthy.Set(0) + return reconcile.Result{}, err + } + c.lastInfrastructureReconcile = time.Now() + healthy.Set(1) } - healthy.Set(1) } - return reconcile.Result{}, nil + // TODO: Implement an alerting mechanism for settings updates; until then, just poll + return reconcile.Result{RequeueAfter: time.Second * 10}, nil } func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontroller.Builder { diff --git a/pkg/controllers/nodetemplate/metrics.go b/pkg/controllers/nodetemplate/metrics.go index 9852d30e3b01..546d97bd2aff 100644 --- a/pkg/controllers/nodetemplate/metrics.go +++ b/pkg/controllers/nodetemplate/metrics.go @@ -38,7 +38,7 @@ var ( Namespace: metrics.Namespace, Subsystem: subsystem, Name: "active", - Help: "Whether the infrastructure reconciliation is currently active. 
This is based on AWSNodeTemplate reconciliation and us ref-counting more than 1 AWSNodeTemplate.", + Help: "Whether the infrastructure reconciliation is currently active.", }, ) ) From 061df9a9b6e3e84ae2ad32e7c91ccaedefaf744f Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 28 Oct 2022 14:58:25 -0500 Subject: [PATCH 49/55] Create sub-reconcilers for AWSNodeTemplate --- pkg/controllers/interruption/suite_test.go | 40 ++++++--- pkg/controllers/nodetemplate/controller.go | 87 ++++++++----------- pkg/controllers/nodetemplate/finalizer.go | 38 ++++++++ .../nodetemplate/infrastructure.go | 66 ++++++++++++++ pkg/controllers/nodetemplate/suite_test.go | 13 ++- 5 files changed, 177 insertions(+), 67 deletions(-) create mode 100644 pkg/controllers/nodetemplate/finalizer.go create mode 100644 pkg/controllers/nodetemplate/infrastructure.go diff --git a/pkg/controllers/interruption/suite_test.go b/pkg/controllers/interruption/suite_test.go index ace9c719e884..69e3bc4e1dd4 100644 --- a/pkg/controllers/interruption/suite_test.go +++ b/pkg/controllers/interruption/suite_test.go @@ -19,6 +19,8 @@ import ( "encoding/json" "fmt" "math/rand" + "path/filepath" + "runtime" "testing" "time" @@ -39,23 +41,23 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" + awssettings "github.com/aws/karpenter/pkg/apis/config/settings" awscache "github.com/aws/karpenter/pkg/cache" awscontext "github.com/aws/karpenter/pkg/context" "github.com/aws/karpenter/pkg/controllers/interruption" - event2 "github.com/aws/karpenter/pkg/controllers/interruption/messages" + "github.com/aws/karpenter/pkg/controllers/interruption/messages" "github.com/aws/karpenter/pkg/controllers/interruption/messages/scheduledchange" "github.com/aws/karpenter/pkg/controllers/interruption/messages/spotinterruption" "github.com/aws/karpenter/pkg/controllers/interruption/messages/statechange" "github.com/aws/karpenter/pkg/controllers/providers" "github.com/aws/karpenter/pkg/errors" awsfake "github.com/aws/karpenter/pkg/fake" + awstest "github.com/aws/karpenter/pkg/test" "github.com/aws/karpenter-core/pkg/apis/config/settings" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/cloudprovider/fake" "github.com/aws/karpenter-core/pkg/controllers/state" - "github.com/aws/karpenter-core/pkg/operator/injection" - "github.com/aws/karpenter-core/pkg/operator/options" "github.com/aws/karpenter-core/pkg/test" . 
"github.com/aws/karpenter-core/pkg/test/expectations" ) @@ -70,6 +72,7 @@ const ( var ctx context.Context var env *test.Environment +var nodeTemplate *v1alpha1.AWSNodeTemplate var cluster *state.Cluster var sqsapi *awsfake.SQSAPI var eventbridgeapi *awsfake.EventBridgeAPI @@ -89,14 +92,20 @@ func TestAPIs(t *testing.T) { } var _ = BeforeEach(func() { - opts := options.Options{ - AWSIsolatedVPC: true, + settingsStore := test.SettingsStore{ + settings.ContextKey: test.Settings(), + awssettings.ContextKey: awssettings.Settings{ + EnableInterruptionHandling: true, + }, } - ctx = injection.WithOptions(ctx, opts) - ctx = settings.ToContext(ctx, test.Settings()) + ctx = settingsStore.InjectSettings(ctx) env = test.NewEnvironment(ctx, func(e *test.Environment) { fakeClock = &clock.FakeClock{} cloudProvider = &fake.CloudProvider{} + + nodeTemplate = awstest.AWSNodeTemplate() + ExpectApplied(ctx, e.Client, nodeTemplate) + cluster = state.NewCluster(ctx, fakeClock, env.Client, cloudProvider) recorder = test.NewEventRecorder() nodeStateController = state.NewNodeController(env.Client, cluster) @@ -109,11 +118,13 @@ var _ = BeforeEach(func() { controller = interruption.NewController(env.Client, fakeClock, recorder, cluster, sqsProvider, unavailableOfferingsCache) }) + env.CRDDirectoryPaths = append(env.CRDDirectoryPaths, relativeToRoot("charts/karpenter/crds")) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) var _ = AfterEach(func() { ExpectCleanedUp(ctx, env.Client) + ExpectDeleted(ctx, env.Client, nodeTemplate) Expect(env.Stop()).To(Succeed(), "Failed to stop environment") }) @@ -310,7 +321,7 @@ func awsErrWithCode(code string) awserr.Error { func spotInterruptionMessage(involvedInstanceID string) spotinterruption.Event { return spotinterruption.Event{ - Metadata: event2.Metadata{ + Metadata: messages.Metadata{ Version: "0", Account: defaultAccountID, DetailType: "EC2 Spot Instance Interruption Warning", @@ -331,7 +342,7 @@ func spotInterruptionMessage(involvedInstanceID string) spotinterruption.Event { func stateChangeMessage(involvedInstanceID, state string) statechange.Event { return statechange.Event{ - Metadata: event2.Metadata{ + Metadata: messages.Metadata{ Version: "0", Account: defaultAccountID, DetailType: "EC2 Instance State-change Notification", @@ -350,10 +361,9 @@ func stateChangeMessage(involvedInstanceID, state string) statechange.Event { } } -// TODO: Update the scheduled change message to accurately reflect a real health event func scheduledChangeMessage(involvedInstanceID string) scheduledchange.Event { return scheduledchange.Event{ - Metadata: event2.Metadata{ + Metadata: messages.Metadata{ Version: "0", Account: defaultAccountID, DetailType: "AWS Health Event", @@ -377,7 +387,7 @@ func scheduledChangeMessage(involvedInstanceID string) scheduledchange.Event { } } -func NewWrappedMessage(evt event2.Interface) *sqs.Message { +func NewWrappedMessage(evt messages.Interface) *sqs.Message { return &sqs.Message{ Body: aws.String(string(lo.Must(json.Marshal(evt)))), MessageId: aws.String(string(uuid.NewUUID())), @@ -402,3 +412,9 @@ func randStringRunes(n int) string { func makeInstanceID() string { return fmt.Sprintf("i-%s", randStringRunes(17)) } + +func relativeToRoot(path string) string { + _, file, _, _ := runtime.Caller(0) + manifestsRoot := filepath.Join(filepath.Dir(file), "..", "..", "..") + return filepath.Join(manifestsRoot, path) +} diff --git a/pkg/controllers/nodetemplate/controller.go b/pkg/controllers/nodetemplate/controller.go index 
1ddb77dd6aaf..21f0a524b302 100644 --- a/pkg/controllers/nodetemplate/controller.go +++ b/pkg/controllers/nodetemplate/controller.go @@ -16,23 +16,22 @@ package nodetemplate import ( "context" + "fmt" "net/http" - "time" "github.com/samber/lo" + "go.uber.org/multierr" + "k8s.io/apimachinery/pkg/api/equality" "k8s.io/apimachinery/pkg/api/errors" "knative.dev/pkg/logging" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" operatorcontroller "github.com/aws/karpenter-core/pkg/operator/controller" "github.com/aws/karpenter-core/pkg/operator/scheme" - awssettings "github.com/aws/karpenter/pkg/apis/config/settings" - - "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter-core/pkg/utils/result" "github.com/aws/karpenter/pkg/apis" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" "github.com/aws/karpenter/pkg/controllers/providers" @@ -48,71 +47,53 @@ func init() { // It sub-reconciles by checking if there are any AWSNodeTemplates and provisions infrastructure // if there is. If there are no templates, then it de-provisions the infrastructure. type Controller struct { - kubeClient client.Client - provider *providers.Infrastructure - - lastInfrastructureReconcile time.Time + kubeClient client.Client + finalizer *Finalizer + infrastructure *Infrastructure } func NewController(kubeClient client.Client, sqsProvider *providers.SQS, eventBridgeProvider *providers.EventBridge) *Controller { return &Controller{ - kubeClient: kubeClient, - provider: providers.NewInfrastructure(sqsProvider, eventBridgeProvider), + kubeClient: kubeClient, + finalizer: &Finalizer{}, + infrastructure: &Infrastructure{kubeClient: kubeClient, provider: providers.NewInfrastructure(sqsProvider, eventBridgeProvider)}, } } -// Reconcile reconciles the SQS queue and the EventBridge rules with the expected -// configuration prescribed by Karpenter -// -//nolint:gocyclo +// Reconcile reconciles the AWSNodeTemplate with its sub-reconcilers func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).Named(Name)) - nt := &v1alpha1.AWSNodeTemplate{} - if err := c.kubeClient.Get(ctx, req.NamespacedName, nt); err != nil { + stored := &v1alpha1.AWSNodeTemplate{} + if err := c.kubeClient.Get(ctx, req.NamespacedName, stored); err != nil { if errors.IsNotFound(err) { return reconcile.Result{}, nil } return reconcile.Result{}, err } - list := &v1alpha1.AWSNodeTemplateList{} - if err := c.kubeClient.List(ctx, list); err != nil { - return reconcile.Result{}, err - } - // Handle removing the finalizer and also cleaning up the infrastructure on the last AWSNodeTemplate deletion - if !nt.DeletionTimestamp.IsZero() { - if len(list.Items) == 1 { - if err := c.provider.Delete(ctx); err != nil { - return reconcile.Result{}, err - } - } - mergeFrom := client.MergeFrom(nt.DeepCopy()) - controllerutil.RemoveFinalizer(nt, v1alpha5.TerminationFinalizer) - if err := c.kubeClient.Patch(ctx, nt, mergeFrom); err != nil { - return reconcile.Result{}, err - } - infrastructureActive.Set(0) - return reconcile.Result{}, nil - } else if len(list.Items) >= 1 { - infrastructureActive.Set(1) - if awssettings.FromContext(ctx).EnableInterruptionHandling && - c.lastInfrastructureReconcile.Add(time.Hour).Before(time.Now()) { - - if err 
:= c.provider.Create(ctx); err != nil { - infrastructureHealthy.Set(0) - return reconcile.Result{}, err - } - c.lastInfrastructureReconcile = time.Now() - infrastructureHealthy.Set(1) - } + nodeTemplate := stored.DeepCopy() + var results []reconcile.Result + var errs error + for _, r := range []interface { + Reconcile(context.Context, *v1alpha1.AWSNodeTemplate) (reconcile.Result, error) + }{ + c.infrastructure, + c.finalizer, + } { + res, err := r.Reconcile(ctx, nodeTemplate) + errs = multierr.Append(errs, err) + results = append(results, res) } - mergeFrom := client.MergeFrom(nt.DeepCopy()) - controllerutil.AddFinalizer(nt, v1alpha5.TerminationFinalizer) - if err := c.kubeClient.Patch(ctx, nt, mergeFrom); err != nil { - return reconcile.Result{}, err + // If there are any errors, we shouldn't apply the changes, we should requeue + if errs != nil { + return reconcile.Result{}, errs + } + if !equality.Semantic.DeepEqual(nodeTemplate, stored) { + if err := c.kubeClient.Patch(ctx, nodeTemplate, client.MergeFrom(stored)); err != nil { + return reconcile.Result{}, fmt.Errorf("patching AWSNodeTemplate, %w", err) + } } - // TODO: Implement an alerting mechanism for settings updates; until then, just poll - return reconcile.Result{RequeueAfter: time.Second * 10}, nil + return result.Min(results...), nil } func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontroller.Builder { diff --git a/pkg/controllers/nodetemplate/finalizer.go b/pkg/controllers/nodetemplate/finalizer.go new file mode 100644 index 000000000000..2a65ec2bcf10 --- /dev/null +++ b/pkg/controllers/nodetemplate/finalizer.go @@ -0,0 +1,38 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodetemplate + +import ( + "context" + + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" +) + +type Finalizer struct{} + +// Reconcile adds the finalizer if the nodeTemplate doesn't have it or removes the finalizer +// if the nodeTemplate is being deleted +func (r *Finalizer) Reconcile(_ context.Context, nodeTemplate *v1alpha1.AWSNodeTemplate) (reconcile.Result, error) { + if !nodeTemplate.DeletionTimestamp.IsZero() { + controllerutil.RemoveFinalizer(nodeTemplate, v1alpha5.TerminationFinalizer) + return reconcile.Result{}, nil + } + controllerutil.AddFinalizer(nodeTemplate, v1alpha5.TerminationFinalizer) + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/nodetemplate/infrastructure.go b/pkg/controllers/nodetemplate/infrastructure.go new file mode 100644 index 000000000000..49d420dad94b --- /dev/null +++ b/pkg/controllers/nodetemplate/infrastructure.go @@ -0,0 +1,66 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodetemplate + +import ( + "context" + "time" + + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" + awssettings "github.com/aws/karpenter/pkg/apis/config/settings" + "github.com/aws/karpenter/pkg/controllers/providers" +) + +type Infrastructure struct { + kubeClient client.Client + provider *providers.Infrastructure + + lastInfrastructureReconcile time.Time +} + +// Reconcile reconciles the infrastructure based on whether interruption handling is enabled and deletes +// the infrastructure by ref-counting when the last AWSNodeTemplate is removed +func (i *Infrastructure) Reconcile(ctx context.Context, nodeTemplate *v1alpha1.AWSNodeTemplate) (reconcile.Result, error) { + if awssettings.FromContext(ctx).EnableInterruptionHandling { + list := &v1alpha1.AWSNodeTemplateList{} + if err := i.kubeClient.List(ctx, list); err != nil { + return reconcile.Result{}, err + } + if !nodeTemplate.DeletionTimestamp.IsZero() && len(list.Items) == 1 { + if err := i.provider.Delete(ctx); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil + } else if len(list.Items) >= 1 { + infrastructureActive.Set(1) + if i.lastInfrastructureReconcile.Add(time.Hour).Before(time.Now()) { + if err := i.provider.Create(ctx); err != nil { + infrastructureHealthy.Set(0) + return reconcile.Result{}, err + } + i.lastInfrastructureReconcile = time.Now() + infrastructureHealthy.Set(1) + } + // TODO: Implement an alerting mechanism for settings updates; until then, just poll + return reconcile.Result{RequeueAfter: time.Second * 10}, nil + } + } + infrastructureActive.Set(0) + infrastructureHealthy.Set(0) + return reconcile.Result{}, nil +} diff --git a/pkg/controllers/nodetemplate/suite_test.go b/pkg/controllers/nodetemplate/suite_test.go index 4efe81da8f0d..0e5caf8d4994 100644 --- a/pkg/controllers/nodetemplate/suite_test.go +++ b/pkg/controllers/nodetemplate/suite_test.go @@ -30,10 +30,12 @@ import ( _ "knative.dev/pkg/system/testing" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/aws/karpenter-core/pkg/apis/config/settings" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/operator/options" . 
"github.com/aws/karpenter-core/pkg/test/expectations" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" + awssettings "github.com/aws/karpenter/pkg/apis/config/settings" "github.com/aws/karpenter/pkg/controllers/providers" "github.com/aws/karpenter/pkg/errors" @@ -68,6 +70,13 @@ func TestAPIs(t *testing.T) { } var _ = BeforeEach(func() { + settingsStore := test.SettingsStore{ + settings.ContextKey: test.Settings(), + awssettings.ContextKey: awssettings.Settings{ + EnableInterruptionHandling: true, + }, + } + ctx = settingsStore.InjectSettings(ctx) env = test.NewEnvironment(ctx, func(e *test.Environment) { opts = defaultOpts Expect(opts.Validate()).To(Succeed(), "Failed to validate options") @@ -80,7 +89,7 @@ var _ = BeforeEach(func() { controller = nodetemplate.NewController(e.Client, sqsProvider, eventBridgeProvider) }) - env.CRDDirectoryPaths = append(env.CRDDirectoryPaths, RelativeToRoot("charts/karpenter/crds")) + env.CRDDirectoryPaths = append(env.CRDDirectoryPaths, relativeToRoot("charts/karpenter/crds")) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) @@ -237,7 +246,7 @@ func awsErrWithCode(code string) awserr.Error { return awserr.New(code, "", fmt.Errorf("")) } -func RelativeToRoot(path string) string { +func relativeToRoot(path string) string { _, file, _, _ := runtime.Caller(0) manifestsRoot := filepath.Join(filepath.Dir(file), "..", "..", "..") return filepath.Join(manifestsRoot, path) From 060495d5d6d6acb78f3b6b206f4056eb15a03efd Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 28 Oct 2022 17:13:17 -0500 Subject: [PATCH 50/55] Add global tags to infrastructure --- pkg/controllers/interruption/controller.go | 26 +++++----- .../interruption/messages/metadata.go | 15 ------ .../interruption/messages/noop/model.go | 6 +-- .../messages/rebalancerecommendation/model.go | 10 ++-- .../rebalancerecommendation/parser.go | 8 +-- .../messages/scheduledchange/model.go | 12 ++--- .../messages/scheduledchange/parser.go | 12 ++--- .../messages/spotinterruption/model.go | 20 ++------ .../messages/spotinterruption/parser.go | 8 +-- .../messages/statechange/model.go | 24 ++------- .../messages/statechange/parser.go | 10 ++-- .../interruption/messages/types.go | 4 +- pkg/controllers/interruption/metrics.go | 2 +- pkg/controllers/interruption/parser.go | 12 ++--- pkg/controllers/interruption/suite_test.go | 19 +++---- pkg/controllers/providers/eventbridge.go | 49 ++++++++++++------- pkg/controllers/providers/infrastructure.go | 19 +++---- pkg/controllers/providers/sqs.go | 24 ++++++--- pkg/errors/errors.go | 18 ------- test/suites/interruption/suite_test.go | 4 +- 20 files changed, 130 insertions(+), 172 deletions(-) delete mode 100644 pkg/controllers/interruption/messages/metadata.go diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go index 5f7a5351c09d..13500d804bda 100644 --- a/pkg/controllers/interruption/controller.go +++ b/pkg/controllers/interruption/controller.go @@ -43,6 +43,7 @@ import ( "github.com/aws/karpenter/pkg/controllers/interruption/messages" "github.com/aws/karpenter/pkg/controllers/interruption/messages/statechange" "github.com/aws/karpenter/pkg/controllers/providers" + "github.com/aws/karpenter/pkg/errors" "github.com/aws/karpenter/pkg/utils" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" @@ -176,20 +177,20 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string return nil } -func (c *Controller) handleNode(ctx context.Context, evt 
messages.Interface, node *v1.Node) error { +func (c *Controller) handleNode(ctx context.Context, msg messages.Message, node *v1.Node) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) - action := actionForEvent(evt) + action := actionForMessage(msg) // Record metric and event for this action - c.notifyForEvent(evt, node) + c.notifyForMessage(msg, node) actionsPerformed.WithLabelValues(action.String()).Inc() // Mark the offering as unavailable in the ICE cache since we got a spot interruption warning - if evt.Kind() == messages.SpotInterruptionKind { + if msg.Kind() == messages.SpotInterruptionKind { zone := node.Labels[v1.LabelTopologyZone] instanceType := node.Labels[v1.LabelInstanceTypeStable] if zone != "" && instanceType != "" { - c.unavailableOfferingsCache.MarkUnavailable(ctx, evt.Kind().String(), instanceType, zone, v1alpha1.CapacityTypeSpot) + c.unavailableOfferingsCache.MarkUnavailable(ctx, msg.Kind().String(), instanceType, zone, v1alpha1.CapacityTypeSpot) } } if action != NoAction { @@ -200,6 +201,9 @@ func (c *Controller) handleNode(ctx context.Context, evt messages.Interface, nod func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { if err := c.kubeClient.Delete(ctx, node); err != nil { + if errors.IsNotFound(err) { + return nil + } return fmt.Errorf("deleting the node on notification, %w", err) } c.recorder.Publish(interruptionevents.NodeTerminatingOnInterruption(node)) @@ -207,8 +211,8 @@ func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { return nil } -func (c *Controller) notifyForEvent(evt messages.Interface, n *v1.Node) { - switch evt.Kind() { +func (c *Controller) notifyForMessage(msg messages.Message, n *v1.Node) { + switch msg.Kind() { case messages.RebalanceRecommendationKind: c.recorder.Publish(interruptionevents.InstanceRebalanceRecommendation(n)) @@ -219,8 +223,8 @@ func (c *Controller) notifyForEvent(evt messages.Interface, n *v1.Node) { c.recorder.Publish(interruptionevents.InstanceSpotInterrupted(n)) case messages.StateChangeKind: - typed := evt.(statechange.Event) - if lo.Contains([]string{"stopping", "stopped"}, typed.State()) { + typed := msg.(statechange.Message) + if lo.Contains([]string{"stopping", "stopped"}, typed.Detail.State) { c.recorder.Publish(interruptionevents.InstanceStopping(n)) } else { c.recorder.Publish(interruptionevents.InstanceTerminating(n)) @@ -249,8 +253,8 @@ func (c *Controller) makeInstanceIDMap() map[string]*v1.Node { return m } -func actionForEvent(evt messages.Interface) Action { - switch evt.Kind() { +func actionForMessage(msg messages.Message) Action { + switch msg.Kind() { case messages.ScheduledChangeKind, messages.SpotInterruptionKind, messages.StateChangeKind: return CordonAndDrain default: diff --git a/pkg/controllers/interruption/messages/metadata.go b/pkg/controllers/interruption/messages/metadata.go deleted file mode 100644 index 6785b48f8ecb..000000000000 --- a/pkg/controllers/interruption/messages/metadata.go +++ /dev/null @@ -1,15 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. -*/ - -package messages diff --git a/pkg/controllers/interruption/messages/noop/model.go b/pkg/controllers/interruption/messages/noop/model.go index 9eca028cc2c9..2afb6d86a93c 100644 --- a/pkg/controllers/interruption/messages/noop/model.go +++ b/pkg/controllers/interruption/messages/noop/model.go @@ -18,14 +18,14 @@ import ( "github.com/aws/karpenter/pkg/controllers/interruption/messages" ) -type Event struct { +type Message struct { messages.Metadata } -func (Event) EC2InstanceIDs() []string { +func (Message) EC2InstanceIDs() []string { return []string{} } -func (Event) Kind() messages.Kind { +func (Message) Kind() messages.Kind { return messages.NoOpKind } diff --git a/pkg/controllers/interruption/messages/rebalancerecommendation/model.go b/pkg/controllers/interruption/messages/rebalancerecommendation/model.go index 48e43c25865f..b68e9d0cf4d0 100644 --- a/pkg/controllers/interruption/messages/rebalancerecommendation/model.go +++ b/pkg/controllers/interruption/messages/rebalancerecommendation/model.go @@ -18,9 +18,9 @@ import ( "github.com/aws/karpenter/pkg/controllers/interruption/messages" ) -// Event contains the properties defined by +// Message contains the properties defined by // https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/rebalance-recommendations.html#monitor-rebalance-recommendations -type Event struct { +type Message struct { messages.Metadata Detail Detail `json:"detail"` @@ -30,10 +30,10 @@ type Detail struct { InstanceID string `json:"instance-id"` } -func (e Event) EC2InstanceIDs() []string { - return []string{e.Detail.InstanceID} +func (m Message) EC2InstanceIDs() []string { + return []string{m.Detail.InstanceID} } -func (Event) Kind() messages.Kind { +func (Message) Kind() messages.Kind { return messages.RebalanceRecommendationKind } diff --git a/pkg/controllers/interruption/messages/rebalancerecommendation/parser.go b/pkg/controllers/interruption/messages/rebalancerecommendation/parser.go index 500aad7ffc13..75fd8f4561a1 100644 --- a/pkg/controllers/interruption/messages/rebalancerecommendation/parser.go +++ b/pkg/controllers/interruption/messages/rebalancerecommendation/parser.go @@ -23,12 +23,12 @@ import ( type Parser struct{} -func (p Parser) Parse(msg string) (messages.Interface, error) { - evt := Event{} - if err := json.Unmarshal([]byte(msg), &evt); err != nil { +func (p Parser) Parse(raw string) (messages.Message, error) { + msg := Message{} + if err := json.Unmarshal([]byte(raw), &msg); err != nil { return nil, fmt.Errorf("unmarhsalling the message as EC2InstanceRebalanceRecommendation, %w", err) } - return evt, nil + return msg, nil } func (p Parser) Version() string { diff --git a/pkg/controllers/interruption/messages/scheduledchange/model.go b/pkg/controllers/interruption/messages/scheduledchange/model.go index 0029c7faa276..2b726bd3bf4f 100644 --- a/pkg/controllers/interruption/messages/scheduledchange/model.go +++ b/pkg/controllers/interruption/messages/scheduledchange/model.go @@ -18,23 +18,23 @@ import ( "github.com/aws/karpenter/pkg/controllers/interruption/messages" ) -// Event contains the properties defined in AWS EventBridge schema +// Message contains the properties defined in AWS EventBridge schema // aws.health@AWSHealthEvent v0. 
-type Event struct { +type Message struct { messages.Metadata Detail Detail `json:"detail"` } -func (e Event) EC2InstanceIDs() []string { - ids := make([]string, len(e.Detail.AffectedEntities)) - for i, entity := range e.Detail.AffectedEntities { +func (m Message) EC2InstanceIDs() []string { + ids := make([]string, len(m.Detail.AffectedEntities)) + for i, entity := range m.Detail.AffectedEntities { ids[i] = entity.EntityValue } return ids } -func (Event) Kind() messages.Kind { +func (Message) Kind() messages.Kind { return messages.ScheduledChangeKind } diff --git a/pkg/controllers/interruption/messages/scheduledchange/parser.go b/pkg/controllers/interruption/messages/scheduledchange/parser.go index 5393d6e86c72..9d64451a4465 100644 --- a/pkg/controllers/interruption/messages/scheduledchange/parser.go +++ b/pkg/controllers/interruption/messages/scheduledchange/parser.go @@ -28,18 +28,18 @@ const ( type Parser struct{} -func (p Parser) Parse(msg string) (messages.Interface, error) { - evt := Event{} - if err := json.Unmarshal([]byte(msg), &evt); err != nil { +func (p Parser) Parse(raw string) (messages.Message, error) { + msg := Message{} + if err := json.Unmarshal([]byte(raw), &msg); err != nil { return nil, fmt.Errorf("unmarhsalling the message as AWSHealthEvent, %w", err) } // We ignore services and event categories that we don't watch - if evt.Detail.Service != acceptedService || - evt.Detail.EventTypeCategory != acceptedEventTypeCategory { + if msg.Detail.Service != acceptedService || + msg.Detail.EventTypeCategory != acceptedEventTypeCategory { return nil, nil } - return evt, nil + return msg, nil } func (p Parser) Version() string { diff --git a/pkg/controllers/interruption/messages/spotinterruption/model.go b/pkg/controllers/interruption/messages/spotinterruption/model.go index dc798e41c8a0..d0af8572c0b4 100644 --- a/pkg/controllers/interruption/messages/spotinterruption/model.go +++ b/pkg/controllers/interruption/messages/spotinterruption/model.go @@ -15,14 +15,12 @@ limitations under the License. package spotinterruption import ( - "time" - "github.com/aws/karpenter/pkg/controllers/interruption/messages" ) -// Event contains the properties defined in AWS EventBridge schema +// Message contains the properties defined in AWS EventBridge schema // aws.ec2@EC2SpotInstanceInterruptionWarning v0. 
-type Event struct { +type Message struct { messages.Metadata Detail Detail `json:"detail"` @@ -33,18 +31,10 @@ type Detail struct { InstanceAction string `json:"instance-action"` } -func (e Event) EventID() string { - return e.ID -} - -func (e Event) EC2InstanceIDs() []string { - return []string{e.Detail.InstanceID} +func (m Message) EC2InstanceIDs() []string { + return []string{m.Detail.InstanceID} } -func (Event) Kind() messages.Kind { +func (Message) Kind() messages.Kind { return messages.SpotInterruptionKind } - -func (e Event) StartTime() time.Time { - return e.Time -} diff --git a/pkg/controllers/interruption/messages/spotinterruption/parser.go b/pkg/controllers/interruption/messages/spotinterruption/parser.go index 18883c29b7ae..cc5a8172ee7a 100644 --- a/pkg/controllers/interruption/messages/spotinterruption/parser.go +++ b/pkg/controllers/interruption/messages/spotinterruption/parser.go @@ -23,12 +23,12 @@ import ( type Parser struct{} -func (p Parser) Parse(msg string) (messages.Interface, error) { - evt := Event{} - if err := json.Unmarshal([]byte(msg), &evt); err != nil { +func (p Parser) Parse(raw string) (messages.Message, error) { + msg := Message{} + if err := json.Unmarshal([]byte(raw), &msg); err != nil { return nil, fmt.Errorf("unmarhsalling the message as EC2SpotInstanceInterruptionWarning, %w", err) } - return evt, nil + return msg, nil } func (p Parser) Version() string { diff --git a/pkg/controllers/interruption/messages/statechange/model.go b/pkg/controllers/interruption/messages/statechange/model.go index fc54a83067de..99bac01bdb64 100644 --- a/pkg/controllers/interruption/messages/statechange/model.go +++ b/pkg/controllers/interruption/messages/statechange/model.go @@ -15,14 +15,12 @@ limitations under the License. package statechange import ( - "time" - "github.com/aws/karpenter/pkg/controllers/interruption/messages" ) -// Event contains the properties defined in AWS EventBridge schema +// Message contains the properties defined in AWS EventBridge schema // aws.ec2@EC2InstanceStateChangeNotification v1. 
-type Event struct { +type Message struct { messages.Metadata Detail Detail `json:"detail"` @@ -33,22 +31,10 @@ type Detail struct { State string `json:"state"` } -func (e Event) EventID() string { - return e.ID -} - -func (e Event) EC2InstanceIDs() []string { - return []string{e.Detail.InstanceID} +func (m Message) EC2InstanceIDs() []string { + return []string{m.Detail.InstanceID} } -func (e Event) State() string { - return e.Detail.State -} - -func (Event) Kind() messages.Kind { +func (Message) Kind() messages.Kind { return messages.StateChangeKind } - -func (e Event) StartTime() time.Time { - return e.Time -} diff --git a/pkg/controllers/interruption/messages/statechange/parser.go b/pkg/controllers/interruption/messages/statechange/parser.go index 17391b2f0e4f..b5b5ebc0b932 100644 --- a/pkg/controllers/interruption/messages/statechange/parser.go +++ b/pkg/controllers/interruption/messages/statechange/parser.go @@ -28,17 +28,17 @@ var acceptedStates = sets.NewString("stopping", "stopped", "shutting-down", "ter type Parser struct{} -func (p Parser) Parse(msg string) (messages.Interface, error) { - evt := Event{} - if err := json.Unmarshal([]byte(msg), &evt); err != nil { +func (p Parser) Parse(raw string) (messages.Message, error) { + msg := Message{} + if err := json.Unmarshal([]byte(raw), &msg); err != nil { return nil, fmt.Errorf("unmarhsalling the message as EC2InstanceStateChangeNotification, %w", err) } // We ignore states that are not in the set of states we can react to - if !acceptedStates.Has(strings.ToLower(evt.Detail.State)) { + if !acceptedStates.Has(strings.ToLower(msg.Detail.State)) { return nil, nil } - return evt, nil + return msg, nil } func (p Parser) Version() string { diff --git a/pkg/controllers/interruption/messages/types.go b/pkg/controllers/interruption/messages/types.go index 40f4e8c31601..ab871c70e716 100644 --- a/pkg/controllers/interruption/messages/types.go +++ b/pkg/controllers/interruption/messages/types.go @@ -20,14 +20,14 @@ import ( ) type Parser interface { - Parse(string) (Interface, error) + Parse(string) (Message, error) Version() string Source() string DetailType() string } -type Interface interface { +type Message interface { EC2InstanceIDs() []string Kind() Kind } diff --git a/pkg/controllers/interruption/metrics.go b/pkg/controllers/interruption/metrics.go index 5a39ce4db7a3..070ca18689b1 100644 --- a/pkg/controllers/interruption/metrics.go +++ b/pkg/controllers/interruption/metrics.go @@ -22,7 +22,7 @@ import ( ) const ( - subsystem = "aws_notification_controller" + subsystem = "aws_interruption_controller" messageTypeLabel = "message_type" actionableTypeLabel = "actionable" actionTypeLabel = "action_type" diff --git a/pkg/controllers/interruption/parser.go b/pkg/controllers/interruption/parser.go index 93a8babc21e8..e1775c5d1370 100644 --- a/pkg/controllers/interruption/parser.go +++ b/pkg/controllers/interruption/parser.go @@ -71,23 +71,23 @@ func NewEventParser(parsers ...messages.Parser) *EventParser { } } -func (p EventParser) Parse(msg string) (messages.Interface, error) { +func (p EventParser) Parse(msg string) (messages.Message, error) { if msg == "" { - return noop.Event{}, nil + return noop.Message{}, nil } md := messages.Metadata{} if err := json.Unmarshal([]byte(msg), &md); err != nil { - return noop.Event{}, fmt.Errorf("unmarshalling the message as Metadata, %w", err) + return noop.Message{}, fmt.Errorf("unmarshalling the message as Metadata, %w", err) } if parser, ok := p.parserMap[newParserKey(md)]; ok { evt, err := 
parser.Parse(msg) if err != nil { - return noop.Event{}, fmt.Errorf("parsing event message, %w", err) + return noop.Message{}, fmt.Errorf("parsing event message, %w", err) } if evt == nil { - return noop.Event{}, nil + return noop.Message{}, nil } return evt, nil } - return noop.Event{Metadata: md}, nil + return noop.Message{Metadata: md}, nil } diff --git a/pkg/controllers/interruption/suite_test.go b/pkg/controllers/interruption/suite_test.go index 69e3bc4e1dd4..abbf5b0474dc 100644 --- a/pkg/controllers/interruption/suite_test.go +++ b/pkg/controllers/interruption/suite_test.go @@ -319,8 +319,8 @@ func awsErrWithCode(code string) awserr.Error { return awserr.New(code, "", fmt.Errorf("")) } -func spotInterruptionMessage(involvedInstanceID string) spotinterruption.Event { - return spotinterruption.Event{ +func spotInterruptionMessage(involvedInstanceID string) spotinterruption.Message { + return spotinterruption.Message{ Metadata: messages.Metadata{ Version: "0", Account: defaultAccountID, @@ -340,8 +340,8 @@ func spotInterruptionMessage(involvedInstanceID string) spotinterruption.Event { } } -func stateChangeMessage(involvedInstanceID, state string) statechange.Event { - return statechange.Event{ +func stateChangeMessage(involvedInstanceID, state string) statechange.Message { + return statechange.Message{ Metadata: messages.Metadata{ Version: "0", Account: defaultAccountID, @@ -361,8 +361,8 @@ func stateChangeMessage(involvedInstanceID, state string) statechange.Event { } } -func scheduledChangeMessage(involvedInstanceID string) scheduledchange.Event { - return scheduledchange.Event{ +func scheduledChangeMessage(involvedInstanceID string) scheduledchange.Message { + return scheduledchange.Message{ Metadata: messages.Metadata{ Version: "0", Account: defaultAccountID, @@ -387,13 +387,6 @@ func scheduledChangeMessage(involvedInstanceID string) scheduledchange.Event { } } -func NewWrappedMessage(evt messages.Interface) *sqs.Message { - return &sqs.Message{ - Body: aws.String(string(lo.Must(json.Marshal(evt)))), - MessageId: aws.String(string(uuid.NewUUID())), - } -} - func makeProviderID(instanceID string) string { return fmt.Sprintf("aws:///%s/%s", defaultRegion, instanceID) } diff --git a/pkg/controllers/providers/eventbridge.go b/pkg/controllers/providers/eventbridge.go index f64a41667706..f8bd829efb25 100644 --- a/pkg/controllers/providers/eventbridge.go +++ b/pkg/controllers/providers/eventbridge.go @@ -28,6 +28,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/operator/injection" + awssettings "github.com/aws/karpenter/pkg/apis/config/settings" awserrors "github.com/aws/karpenter/pkg/errors" "github.com/aws/karpenter/pkg/utils" ) @@ -40,6 +41,14 @@ type rule struct { Target *target } +func (er rule) addQueueTarget(queueARN string) rule { + er.Target = &target{ + ID: QueueTargetID, + ARN: queueARN, + } + return er +} + type target struct { ID string ARN string @@ -66,27 +75,18 @@ func NewEventBridge(eb eventbridgeiface.EventBridgeAPI, sqsProvider *SQS) *Event } } -func (eb *EventBridge) CreateEC2NotificationRules(ctx context.Context) error { +func (eb *EventBridge) CreateEC2EventRules(ctx context.Context) error { queueARN, err := eb.sqsProvider.queueARN.TryGet(ctx) if err != nil { return fmt.Errorf("resolving queue arn, %w", err) } - rules := lo.Map(eb.getEC2NotificationEventRules(ctx), func(r rule, _ int) rule { return r.AddQueueTarget(queueARN) }) + rules := lo.Map(eb.getEC2NotificationEventRules(ctx), func(r rule, _ int) 
rule { return r.addQueueTarget(queueARN) }) errs := make([]error, len(rules)) workqueue.ParallelizeUntil(ctx, len(rules), len(rules), func(i int) { _, err := eb.client.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ Name: aws.String(rules[i].Name), EventPattern: aws.String(string(rules[i].Pattern.Serialize())), - Tags: []*eventbridge.Tag{ - { - Key: aws.String(v1alpha5.DiscoveryTagKey), - Value: aws.String(injection.GetOptions(ctx).ClusterName), - }, - { - Key: aws.String(v1alpha5.ManagedByTagKey), - Value: aws.String(injection.GetOptions(ctx).ClusterName), - }, - }, + Tags: eb.getTags(ctx), }) if err != nil { errs[i] = multierr.Append(errs[i], err) @@ -164,10 +164,23 @@ func (eb *EventBridge) getEC2NotificationEventRules(ctx context.Context) []rule } } -func (er rule) AddQueueTarget(queueARN string) rule { - er.Target = &target{ - ID: QueueTargetID, - ARN: queueARN, - } - return er +func (eb *EventBridge) getTags(ctx context.Context) []*eventbridge.Tag { + return append( + []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(injection.GetOptions(ctx).ClusterName), + }, + { + Key: aws.String(v1alpha5.ManagedByTagKey), + Value: aws.String(injection.GetOptions(ctx).ClusterName), + }, + }, + lo.MapToSlice(awssettings.FromContext(ctx).Tags, func(k, v string) *eventbridge.Tag { + return &eventbridge.Tag{ + Key: aws.String(k), + Value: aws.String(v), + } + })..., + ) } diff --git a/pkg/controllers/providers/infrastructure.go b/pkg/controllers/providers/infrastructure.go index 271a98182451..e63a49ba027d 100644 --- a/pkg/controllers/providers/infrastructure.go +++ b/pkg/controllers/providers/infrastructure.go @@ -45,7 +45,7 @@ func (p *Infrastructure) Create(ctx context.Context) error { if err := p.ensureEventBridge(ctx); err != nil { return fmt.Errorf("ensuring eventBridge rules and targets, %w", err) } - logging.FromContext(ctx).Infof("Successfully completed reconciliation of infrastructure") + logging.FromContext(ctx).Infof("Completed reconciliation of infrastructure") return nil } @@ -92,29 +92,24 @@ func (p *Infrastructure) ensureQueue(ctx context.Context) error { return fmt.Errorf("creating sqs queue with policy, %w", err) } logging.FromContext(ctx).Debugf("Successfully created the SQS notification queue") - case errors.IsAccessDenied(err): - return fmt.Errorf("failed obtaining permission to discover sqs queue url, %w", err) default: - return fmt.Errorf("failed discovering sqs queue url, %w", err) + return fmt.Errorf("discovering sqs queue url, %w", err) } } // Always attempt to set the queue attributes, even after creation to help set the queue policy if err := p.sqsProvider.SetQueueAttributes(ctx, nil); err != nil { return fmt.Errorf("setting queue attributes for queue, %w", err) } + logging.FromContext(ctx).Debugf("Successfully reconciled SQS queue") return nil } // ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter func (p *Infrastructure) ensureEventBridge(ctx context.Context) error { - logging.FromContext(ctx).Debugf("Reconciling the EventBridge notification rules...") - if err := p.eventBridgeProvider.CreateEC2NotificationRules(ctx); err != nil { - switch { - case errors.IsAccessDenied(err): - return fmt.Errorf("obtaining permission to eventbridge, %w", err) - default: - return fmt.Errorf("creating event bridge notification rules, %w", err) - } + logging.FromContext(ctx).Debugf("Reconciling the EventBridge event rules...") + if err := p.eventBridgeProvider.CreateEC2EventRules(ctx); err != nil { + 
return fmt.Errorf("creating EventBridge event rules, %w", err) } + logging.FromContext(ctx).Debugf("Successfully reconciled EventBridge event rules") return nil } diff --git a/pkg/controllers/providers/sqs.go b/pkg/controllers/providers/sqs.go index fb95ca1cf197..e1858d4fa13b 100644 --- a/pkg/controllers/providers/sqs.go +++ b/pkg/controllers/providers/sqs.go @@ -27,6 +27,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/utils/atomic" + awssettings "github.com/aws/karpenter/pkg/apis/config/settings" awserrors "github.com/aws/karpenter/pkg/errors" "github.com/aws/karpenter/pkg/utils" ) @@ -58,9 +59,9 @@ type SQS struct { func NewSQS(ctx context.Context, client sqsiface.SQSAPI) *SQS { provider := &SQS{ - client: client, - queueName: getQueueName(ctx), + client: client, } + provider.queueName = provider.getQueueName(ctx) provider.queueURL.Resolve = func(ctx context.Context) (string, error) { input := &sqs.GetQueueUrlInput{ QueueName: aws.String(provider.queueName), @@ -95,10 +96,7 @@ func (s *SQS) QueueName() string { func (s *SQS) CreateQueue(ctx context.Context) error { input := &sqs.CreateQueueInput{ QueueName: aws.String(s.queueName), - Tags: map[string]*string{ - v1alpha5.DiscoveryTagKey: aws.String(injection.GetOptions(ctx).ClusterName), - v1alpha5.ManagedByTagKey: aws.String(injection.GetOptions(ctx).ClusterName), - }, + Tags: s.getTags(ctx), } result, err := s.client.CreateQueueWithContext(ctx, input) if err != nil { @@ -260,9 +258,21 @@ func (s *SQS) getQueuePolicy(ctx context.Context) (*queuePolicy, error) { }, nil } +func (s *SQS) getTags(ctx context.Context) map[string]*string { + return lo.Assign( + lo.MapEntries(awssettings.FromContext(ctx).Tags, func(k, v string) (string, *string) { + return k, lo.ToPtr(v) + }), + map[string]*string{ + v1alpha5.DiscoveryTagKey: aws.String(injection.GetOptions(ctx).ClusterName), + v1alpha5.ManagedByTagKey: aws.String(injection.GetOptions(ctx).ClusterName), + }, + ) +} + // getQueueName generates a sufficiently random name for the queue name from the cluster name // This is used because the max-len for a queue name is 80 characters but the maximum cluster name // length is 100 -func getQueueName(ctx context.Context) string { +func (s *SQS) getQueueName(ctx context.Context) string { return fmt.Sprintf("Karpenter-EventQueue-%s", utils.GetClusterNameHash(ctx, 20)) } diff --git a/pkg/errors/errors.go b/pkg/errors/errors.go index a1d1d27fac43..221c5a44934c 100644 --- a/pkg/errors/errors.go +++ b/pkg/errors/errors.go @@ -46,10 +46,6 @@ var ( "UnfulfillableCapacity", "Unsupported", ) - accessDeniedErrorCodes = sets.NewString( - AccessDeniedCode, - AccessDeniedExceptionCode, - ) ) type InstanceTerminatedError struct { @@ -82,20 +78,6 @@ func IsNotFound(err error) bool { return false } -// IsAccessDenied returns true if the err is an AWS error (even if it's -// wrapped) and is a known to mean "access denied" (as opposed to a more -// serious or unexpected error) -func IsAccessDenied(err error) bool { - if err == nil { - return false - } - var awsError awserr.Error - if errors.As(err, &awsError) { - return accessDeniedErrorCodes.Has(awsError.Code()) - } - return false -} - // IsUnfulfillableCapacity returns true if the Fleet err means // capacity is temporarily unavailable for launching. // This could be due to account limits, insufficient ec2 capacity, etc. 
diff --git a/test/suites/interruption/suite_test.go b/test/suites/interruption/suite_test.go index f7b823640c3c..d3e29dda1782 100644 --- a/test/suites/interruption/suite_test.go +++ b/test/suites/interruption/suite_test.go @@ -247,8 +247,8 @@ var _ = Describe("Interruption", Label("AWS"), func() { }) }) -func scheduledChangeMessage(region, accountID, involvedInstanceID string) scheduledchange.Event { - return scheduledchange.Event{ +func scheduledChangeMessage(region, accountID, involvedInstanceID string) scheduledchange.Message { + return scheduledchange.Message{ Metadata: messages.Metadata{ Version: "0", Account: accountID, From 6d04394b91eed1ee89fdb440bff352f2d2e1c308 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 31 Oct 2022 09:59:58 -0700 Subject: [PATCH 51/55] Fix scale testing --- pkg/controllers/interruption/controller.go | 5 +-- .../interruption_benchmark_test.go | 29 ++++++++++++----- pkg/fake/recorder.go | 31 ++++++------------- 3 files changed, 35 insertions(+), 30 deletions(-) diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go index 13500d804bda..4a886549148a 100644 --- a/pkg/controllers/interruption/controller.go +++ b/pkg/controllers/interruption/controller.go @@ -33,8 +33,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/aws/karpenter-core/pkg/events" - "github.com/aws/karpenter-core/pkg/operator/scheme" "github.com/aws/karpenter/pkg/apis" "github.com/aws/karpenter/pkg/apis/awsnodetemplate/v1alpha1" awssettings "github.com/aws/karpenter/pkg/apis/config/settings" @@ -46,6 +44,9 @@ import ( "github.com/aws/karpenter/pkg/errors" "github.com/aws/karpenter/pkg/utils" + "github.com/aws/karpenter-core/pkg/events" + "github.com/aws/karpenter-core/pkg/operator/scheme" + "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/metrics" diff --git a/pkg/controllers/interruption/interruption_benchmark_test.go b/pkg/controllers/interruption/interruption_benchmark_test.go index 1a4a5359af30..77366c4e9304 100644 --- a/pkg/controllers/interruption/interruption_benchmark_test.go +++ b/pkg/controllers/interruption/interruption_benchmark_test.go @@ -44,15 +44,17 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter-core/pkg/cloudprovider/fake" + awssettings "github.com/aws/karpenter/pkg/apis/config/settings" awscache "github.com/aws/karpenter/pkg/cache" awscontext "github.com/aws/karpenter/pkg/context" "github.com/aws/karpenter/pkg/controllers/interruption" "github.com/aws/karpenter/pkg/controllers/providers" - awsfake "github.com/aws/karpenter/pkg/fake" + awstest "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter-core/pkg/apis/config/settings" + "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter-core/pkg/cloudprovider/fake" "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/operator/options" @@ -84,11 +86,20 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { ClusterName: "karpenter-notification-benchmarking", } fakeClock = &clock.FakeClock{} - ctx = injection.WithOptions(context.Background(), opts) + settingsStore := test.SettingsStore{ + settings.ContextKey: 
test.Settings(), + awssettings.ContextKey: awssettings.Settings{ + EnableInterruptionHandling: true, + }, + } + ctx = settingsStore.InjectSettings(context.Background()) + ctx = injection.WithOptions(ctx, opts) env = test.NewEnvironment(ctx, func(e *test.Environment) {}) + env.CRDDirectoryPaths = append(env.CRDDirectoryPaths, relativeToRoot("charts/karpenter/crds")) if err := env.Start(); err != nil { b.Fatalf("Starting envirionment, %v", err) } + // Stop the test environment after the test completes defer func() { if err := retry.Do(func() error { @@ -112,13 +123,17 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { }() // Load all the fundamental components before setting up the controllers - eventRecorder := awsfake.NewEventRecorder() - recorder = test.NewEventRecorder() + recorder := awsfake.NewEventRecorder() cluster = state.NewCluster(ctx, fakeClock, env.Client, cloudProvider) cloudProvider = &fake.CloudProvider{} unavailableOfferingsCache = awscache.NewUnavailableOfferings(cache.New(awscache.UnavailableOfferingsTTL, awscontext.CacheCleanupInterval)) + // Provision a single AWS Node Template to allow interruption reconciliation + if err := env.Client.Create(ctx, awstest.AWSNodeTemplate()); err != nil { + b.Fatalf("creating AWS node template, %v", err) + } + // Set-up the controllers nodeStateController = state.NewNodeController(env.Client, cluster) interruptionController := interruption.NewController(env.Client, fakeClock, recorder, cluster, providers.sqsProvider, unavailableOfferingsCache) @@ -161,7 +176,7 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { }() select { - case <-providers.monitorMessagesProcessed(env.Ctx, eventRecorder, messageCount): + case <-providers.monitorMessagesProcessed(env.Ctx, recorder, messageCount): case err = <-managerErr: b.Fatalf("running manager, %v", err) } diff --git a/pkg/fake/recorder.go b/pkg/fake/recorder.go index 68980eaa852f..8abe05719bfe 100644 --- a/pkg/fake/recorder.go +++ b/pkg/fake/recorder.go @@ -17,10 +17,8 @@ package fake import ( "sync/atomic" - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/runtime" - + "github.com/aws/karpenter-core/pkg/events" + "github.com/aws/karpenter-core/pkg/test" interruptionevents "github.com/aws/karpenter/pkg/controllers/interruption/events" ) @@ -38,28 +36,19 @@ func NewEventRecorder() *EventRecorder { return &EventRecorder{} } -func (e *EventRecorder) Event(_ runtime.Object, _, reason, _ string) { - fakeNode := &v1.Node{ObjectMeta: metav1.ObjectMeta{Name: "fake"}} - switch reason { - case interruptionevents.InstanceSpotInterrupted(fakeNode).Reason: +func (e *EventRecorder) Publish(evt events.Event) { + switch evt.Reason { + case interruptionevents.InstanceSpotInterrupted(test.Node()).Reason: e.InstanceSpotInterruptedCalled.Add(1) - case interruptionevents.InstanceRebalanceRecommendation(fakeNode).Reason: + case interruptionevents.InstanceRebalanceRecommendation(test.Node()).Reason: e.InstanceRebalanceRecommendationCalled.Add(1) - case interruptionevents.InstanceUnhealthy(fakeNode).Reason: + case interruptionevents.InstanceUnhealthy(test.Node()).Reason: e.InstanceUnhealthyCalled.Add(1) - case interruptionevents.InstanceTerminating(fakeNode).Reason: + case interruptionevents.InstanceTerminating(test.Node()).Reason: e.InstanceTerminatingCalled.Add(1) - case interruptionevents.InstanceStopping(fakeNode).Reason: + case interruptionevents.InstanceStopping(test.Node()).Reason: e.InstanceStoppingCalled.Add(1) - case 
interruptionevents.NodeTerminatingOnInterruption(fakeNode).Reason: + case interruptionevents.NodeTerminatingOnInterruption(test.Node()).Reason: e.NodeTerminatingOnInterruptionCalled.Add(1) } } - -func (e *EventRecorder) Eventf(object runtime.Object, eventtype, reason, messageFmt string, _ ...interface{}) { - e.Event(object, eventtype, reason, messageFmt) -} - -func (e *EventRecorder) AnnotatedEventf(object runtime.Object, _ map[string]string, eventtype, reason, messageFmt string, _ ...interface{}) { - e.Event(object, eventtype, reason, messageFmt) -} From 542fe07ff8f0b75d2dd08996107e9f08168a580d Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 31 Oct 2022 12:46:46 -0700 Subject: [PATCH 52/55] PR comments --- charts/karpenter/templates/clusterrole.yaml | 2 +- cmd/controller/main.go | 4 +- go.sum | 5 +- pkg/controllers/controllers.go | 5 +- pkg/controllers/interruption/controller.go | 168 +++++++++--------- .../interruption_benchmark_test.go | 35 ++-- pkg/controllers/interruption/metrics.go | 2 +- pkg/controllers/interruption/suite_test.go | 22 +-- pkg/controllers/nodetemplate/controller.go | 4 +- pkg/controllers/providers/infrastructure.go | 21 +-- pkg/controllers/providers/sqs.go | 26 ++- pkg/fake/recorder.go | 54 ------ test/go.mod | 4 +- test/go.sum | 8 +- test/pkg/environment/aws/expectations.go | 3 +- 15 files changed, 137 insertions(+), 226 deletions(-) delete mode 100644 pkg/fake/recorder.go diff --git a/charts/karpenter/templates/clusterrole.yaml b/charts/karpenter/templates/clusterrole.yaml index e4dec34ac962..520a00d50360 100644 --- a/charts/karpenter/templates/clusterrole.yaml +++ b/charts/karpenter/templates/clusterrole.yaml @@ -31,7 +31,7 @@ rules: # Read - apiGroups: ["karpenter.k8s.aws"] resources: ["awsnodetemplates"] - verbs: ["get", "list", "watch"] + verbs: ["get", "list", "watch", "patch"] - apiGroups: ["admissionregistration.k8s.io"] resources: ["validatingwebhookconfigurations"] verbs: ["update"] diff --git a/cmd/controller/main.go b/cmd/controller/main.go index eeefdd58cc3d..a5c4b79e0df6 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -46,14 +46,13 @@ func main() { lo.Must0(operator.AddHealthzCheck("cloud-provider", awsCloudProvider.LivenessProbe)) cloudProvider := metrics.Decorate(awsCloudProvider) - clusterState := state.NewCluster(operator.SettingsStore.InjectSettings(ctx), operator.Clock, operator.GetClient(), cloudProvider) operator. WithControllers(ctx, corecontrollers.NewControllers( ctx, clock.RealClock{}, operator.GetClient(), operator.KubernetesInterface, - clusterState, + state.NewCluster(operator.SettingsStore.InjectSettings(ctx), operator.Clock, operator.GetClient(), cloudProvider), operator.EventRecorder, operator.SettingsStore, cloudProvider, @@ -61,7 +60,6 @@ func main() { WithWebhooks(corewebhooks.NewWebhooks()...). WithControllers(ctx, controllers.NewControllers( awsCtx, - clusterState, )...). WithWebhooks(webhooks.NewWebhooks()...). 
Start(ctx) diff --git a/go.sum b/go.sum index 526a959b9efd..bd9ab68e3f57 100644 --- a/go.sum +++ b/go.sum @@ -66,8 +66,6 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.44.114 h1:plIkWc/RsHr3DXBj4MEw9sEW4CcL/e2ryokc+CKyq1I= github.com/aws/aws-sdk-go v1.44.114/go.mod h1:y4AeaBuwd2Lk+GepC1E9v0qOiTws0MIWAX4oIKwKHZo= -github.com/aws/karpenter-core v0.0.2-0.20221031163135-0c39b59ed935 h1:HEd6DTBd8QcVEv9Ow7jPQWQ7LLp2QcipgaDYmJfazOI= -github.com/aws/karpenter-core v0.0.2-0.20221031163135-0c39b59ed935/go.mod h1:b9EJPH/E/rPxwSshkn4wCY2Tw5AaDBPKhT8r3AIVlHo= github.com/aws/karpenter-core v0.0.2-0.20221031185530-c9e0b5b6b603 h1:ULI8LXSgJQARW9EDO1EpTqp3ZYsabXq7V9NzURo9XxE= github.com/aws/karpenter-core v0.0.2-0.20221031185530-c9e0b5b6b603/go.mod h1:AC7JRJN2p/Lcq7gPWeZOSd/XsgSUQHomkHW4mk3Xg24= github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8= @@ -876,8 +874,7 @@ k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkI k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU= k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4= k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= -knative.dev/pkg v0.0.0-20221011175852-714b7630a836 h1:0N7Zo/O+xeUUebJPm9keBaGclrUoEbljr3J1MsqtaIM= -knative.dev/pkg v0.0.0-20221011175852-714b7630a836/go.mod h1:DMTRDJ5WRxf/DrlOPzohzfhSuJggscLZ8EavOq9O/x8= +knative.dev/pkg v0.0.0-20221031132215-6eb8f1845a9d h1:BRSonjQw4u63gaWeoE5i2724FF3K7teFf8DcmwjGdAQ= knative.dev/pkg v0.0.0-20221031132215-6eb8f1845a9d/go.mod h1:j5kO7gKmWGj2DJpefCEiPbItToiYf+2bCtI+A6REkQo= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index d7b687c013fc..e05ef04f2d8b 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -18,7 +18,6 @@ import ( "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/aws/aws-sdk-go/service/sqs" - "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/operator/controller" awscontext "github.com/aws/karpenter/pkg/context" "github.com/aws/karpenter/pkg/controllers/interruption" @@ -26,12 +25,12 @@ import ( "github.com/aws/karpenter/pkg/controllers/providers" ) -func NewControllers(ctx awscontext.Context, cluster *state.Cluster) []controller.Controller { +func NewControllers(ctx awscontext.Context) []controller.Controller { sqsProvider := providers.NewSQS(ctx, sqs.New(ctx.Session)) eventBridgeProvider := providers.NewEventBridge(eventbridge.New(ctx.Session), sqsProvider) return []controller.Controller{ nodetemplate.NewController(ctx.KubeClient, sqsProvider, eventBridgeProvider), - interruption.NewController(ctx.KubeClient, ctx.Clock, ctx.EventRecorder, cluster, sqsProvider, ctx.UnavailableOfferingsCache), + interruption.NewController(ctx.KubeClient, ctx.Clock, ctx.EventRecorder, sqsProvider, ctx.UnavailableOfferingsCache), } } diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go index 949f0d6904e7..6953588fd76e 100644 --- a/pkg/controllers/interruption/controller.go +++ b/pkg/controllers/interruption/controller.go @@ 
-34,9 +34,9 @@ import ( "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/aws/karpenter/pkg/apis" - awssettings "github.com/aws/karpenter/pkg/apis/config/settings" + "github.com/aws/karpenter/pkg/apis/config/settings" "github.com/aws/karpenter/pkg/apis/v1alpha1" - awscache "github.com/aws/karpenter/pkg/cache" + "github.com/aws/karpenter/pkg/cache" interruptionevents "github.com/aws/karpenter/pkg/controllers/interruption/events" "github.com/aws/karpenter/pkg/controllers/interruption/messages" "github.com/aws/karpenter/pkg/controllers/interruption/messages/statechange" @@ -48,9 +48,8 @@ import ( "github.com/aws/karpenter-core/pkg/operator/scheme" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/metrics" - operatorcontroller "github.com/aws/karpenter-core/pkg/operator/controller" + corecontroller "github.com/aws/karpenter-core/pkg/operator/controller" ) func init() { @@ -63,46 +62,57 @@ func init() { type Controller struct { kubeClient client.Client clk clock.Clock - cluster *state.Cluster recorder events.Recorder - provider *providers.SQS - unavailableOfferingsCache *awscache.UnavailableOfferings + sqsProvider *providers.SQS + unavailableOfferingsCache *cache.UnavailableOfferings parser *EventParser } -func NewController(kubeClient client.Client, clk clock.Clock, recorder events.Recorder, cluster *state.Cluster, - sqsProvider *providers.SQS, unavailableOfferingsCache *awscache.UnavailableOfferings) *Controller { +func NewController(kubeClient client.Client, clk clock.Clock, recorder events.Recorder, + sqsProvider *providers.SQS, unavailableOfferingsCache *cache.UnavailableOfferings) *Controller { return &Controller{ kubeClient: kubeClient, clk: clk, - cluster: cluster, recorder: recorder, - provider: sqsProvider, + sqsProvider: sqsProvider, unavailableOfferingsCache: unavailableOfferingsCache, parser: NewEventParser(DefaultParsers...), } } func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { - list := &v1alpha1.AWSNodeTemplateList{} - if err := c.kubeClient.List(ctx, list); err != nil { - return reconcile.Result{}, fmt.Errorf("listing node templates, %w", err) + queueExists, err := c.sqsProvider.QueueExists(ctx) + if err != nil { + return reconcile.Result{}, fmt.Errorf("checking queue existence, %w", err) } - - if awssettings.FromContext(ctx).EnableInterruptionHandling && len(list.Items) > 0 { + if settings.FromContext(ctx).EnableInterruptionHandling && queueExists { active.Set(1) - sqsMessages, err := c.provider.GetSQSMessages(ctx) + sqsMessages, err := c.sqsProvider.GetSQSMessages(ctx) if err != nil { return reconcile.Result{}, fmt.Errorf("getting messages from queue, %w", err) } if len(sqsMessages) == 0 { return reconcile.Result{}, nil } - instanceIDMap := c.makeInstanceIDMap() + instanceIDMap, err := c.makeInstanceIDMap(ctx) + if err != nil { + return reconcile.Result{}, fmt.Errorf("making instance id map, %w", err) + } errs := make([]error, len(sqsMessages)) workqueue.ParallelizeUntil(ctx, 10, len(sqsMessages), func(i int) { - errs[i] = c.handleMessage(ctx, instanceIDMap, sqsMessages[i]) + msg, e := c.parseMessage(sqsMessages[i]) + if e != nil { + // If we fail to parse, then we should delete the message but still log the error + logging.FromContext(ctx).Errorf("parsing message, %v", e) + errs[i] = c.deleteMessage(ctx, sqsMessages[i]) + return + } + if e = c.handleMessage(ctx, instanceIDMap, msg); e != nil { + errs[i] = 
fmt.Errorf("handling message, %w", e) + return + } + errs[i] = c.deleteMessage(ctx, sqsMessages[i]) }) return reconcile.Result{}, multierr.Combine(errs...) } @@ -110,8 +120,8 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc return reconcile.Result{RequeueAfter: time.Second * 10}, nil } -func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontroller.Builder { - return operatorcontroller.NewSingletonManagedBy(m). +func (c *Controller) Builder(_ context.Context, m manager.Manager) corecontroller.Builder { + return corecontroller.NewSingletonManagedBy(m). Named("interruption") } @@ -119,67 +129,58 @@ func (c *Controller) LivenessProbe(_ *http.Request) error { return nil } -// handleMessage gets the node names of the instances involved in the queue message and takes the -// assigned action on the instances based on the message event -func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, raw *sqsapi.Message) error { +// parseMessage parses the passed SQS message into an internal Message interface +func (c *Controller) parseMessage(raw *sqsapi.Message) (messages.Message, error) { // No message to parse in this case if raw == nil || raw.Body == nil { - return nil + return nil, fmt.Errorf("") } msg, err := c.parser.Parse(*raw.Body) if err != nil { - // In the scenario where we can't parse the message, we log that we have an error and then are - // forced to just delete the message from the queue - logging.FromContext(ctx).Errorf("parsing sqs message, %v", err) - err = c.provider.DeleteSQSMessage(ctx, raw) - if err != nil { - return fmt.Errorf("failed to delete message from queue, %w", err) - } deletedMessages.Inc() - return nil + return nil, fmt.Errorf("parsing sqs message, %w", err) } - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("event", msg.Kind())) + return msg, nil +} - nodes := getInvolvedNodes(msg.EC2InstanceIDs(), instanceIDMap) - // There's no action to take here since the event doesn't pertain to any of our instances - if len(nodes) == 0 { - receivedMessages.WithLabelValues(msg.Kind().String(), "false").Inc() +// handleMessage takes an action against every node involved in the message that is owned by a Provisioner +func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string]*v1.Node, msg messages.Message) (err error) { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("messageKind", msg.Kind())) + receivedMessages.WithLabelValues(msg.Kind().String()).Inc() - // Since there's no action, just delete the message - err = c.provider.DeleteSQSMessage(ctx, raw) - if err != nil { - return fmt.Errorf("failed to delete message from queue, %w", err) + var failedNodeNames []string + for _, instanceID := range msg.EC2InstanceIDs() { + node, ok := instanceIDMap[instanceID] + if !ok { + continue + } + if e := c.handleNode(ctx, msg, node); e != nil { + failedNodeNames = append(failedNodeNames, node.Name) + err = multierr.Append(err, e) } - deletedMessages.Inc() - return nil - } - receivedMessages.WithLabelValues(msg.Kind().String(), "true").Inc() - - nodeNames := lo.Map(nodes, func(n *v1.Node, _ int) string { return n.Name }) - logging.FromContext(ctx).Infof("Received actionable event from SQS queue for node(s) [%s%s]", - strings.Join(lo.Slice(nodeNames, 0, 3), ","), - lo.Ternary(len(nodeNames) > 3, "...", "")) - - for i := range nodes { - node := nodes[i] - err = multierr.Append(err, c.handleNode(ctx, msg, node)) } if err != nil { return fmt.Errorf("failed to 
act on nodes [%s%s], %w", - strings.Join(lo.Slice(nodeNames, 0, 3), ","), - lo.Ternary(len(nodeNames) > 3, "...", ""), err) + strings.Join(lo.Slice(failedNodeNames, 0, 3), ","), + lo.Ternary(len(failedNodeNames) > 3, "...", ""), err) } - err = c.provider.DeleteSQSMessage(ctx, raw) - if err != nil { - return fmt.Errorf("failed to delete message from queue, %w", err) + return nil +} + +// deleteMessage removes the passed SQS message from the queue and fires a metric for the deletion +func (c *Controller) deleteMessage(ctx context.Context, msg *sqsapi.Message) error { + if err := c.sqsProvider.DeleteSQSMessage(ctx, msg); err != nil { + return fmt.Errorf("deleting sqs message, %w", err) } deletedMessages.Inc() return nil } +// handleNode retrieves the action for the message and then performs the appropriate action against the node func (c *Controller) handleNode(ctx context.Context, msg messages.Message, node *v1.Node) error { - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) action := actionForMessage(msg) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("node", node.Name)) + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("action", action.String())) // Record metric and event for this action c.notifyForMessage(msg, node) @@ -194,23 +195,26 @@ func (c *Controller) handleNode(ctx context.Context, msg messages.Message, node } } if action != NoAction { - return c.deleteInstance(ctx, node) + return c.deleteNode(ctx, node) } return nil } -func (c *Controller) deleteInstance(ctx context.Context, node *v1.Node) error { +// deleteNode removes the node from the api-server +func (c *Controller) deleteNode(ctx context.Context, node *v1.Node) error { if err := c.kubeClient.Delete(ctx, node); err != nil { if errors.IsNotFound(err) { return nil } - return fmt.Errorf("deleting the node on notification, %w", err) + return fmt.Errorf("deleting the node on interruption message, %w", err) } + logging.FromContext(ctx).Infof("Deleted node from interruption message") c.recorder.Publish(interruptionevents.NodeTerminatingOnInterruption(node)) metrics.NodesTerminatedCounter.WithLabelValues(terminationReasonLabel).Inc() return nil } +// notifyForMessage publishes the relevant alert based on the message kind func (c *Controller) notifyForMessage(msg messages.Message, n *v1.Node) { switch msg.Kind() { case messages.RebalanceRecommendationKind: @@ -236,21 +240,25 @@ func (c *Controller) notifyForMessage(msg messages.Message, n *v1.Node) { // makeInstanceIDMap builds a map between the instance id that is stored in the // node .spec.providerID and the node name stored on the host -func (c *Controller) makeInstanceIDMap() map[string]*v1.Node { +func (c *Controller) makeInstanceIDMap(ctx context.Context) (map[string]*v1.Node, error) { m := map[string]*v1.Node{} - c.cluster.ForEachNode(func(n *state.Node) bool { + nodeList := &v1.NodeList{} + if err := c.kubeClient.List(ctx, nodeList); err != nil { + return nil, fmt.Errorf("listing nodes, %w", err) + } + for i := range nodeList.Items { + node := nodeList.Items[i] // If this node isn't owned by a provisioner, we shouldn't handle it - if _, ok := n.Node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok { - return true + if _, ok := node.Labels[v1alpha5.ProvisionerNameLabelKey]; !ok { + continue } - id, err := utils.ParseInstanceID(n.Node) + id, err := utils.ParseInstanceID(&node) if err != nil || id == nil { - return true + continue } - m[ptr.StringValue(id)] = n.Node - return true - }) - return m + m[ptr.StringValue(id)] = 
&node + } + return m, nil } func actionForMessage(msg messages.Message) Action { @@ -261,15 +269,3 @@ func actionForMessage(msg messages.Message) Action { return NoAction } } - -// getInvolvedNodes gets all the nodes that are involved in an event based -// on the instanceIDs passed in from the event -func getInvolvedNodes(instanceIDs []string, instanceIDMap map[string]*v1.Node) []*v1.Node { - var nodes []*v1.Node - for _, id := range instanceIDs { - if node, ok := instanceIDMap[id]; ok { - nodes = append(nodes, node) - } - } - return nodes -} diff --git a/pkg/controllers/interruption/interruption_benchmark_test.go b/pkg/controllers/interruption/interruption_benchmark_test.go index 77366c4e9304..cf62a43614f8 100644 --- a/pkg/controllers/interruption/interruption_benchmark_test.go +++ b/pkg/controllers/interruption/interruption_benchmark_test.go @@ -42,20 +42,18 @@ import ( "knative.dev/pkg/logging" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/reconcile" awssettings "github.com/aws/karpenter/pkg/apis/config/settings" awscache "github.com/aws/karpenter/pkg/cache" awscontext "github.com/aws/karpenter/pkg/context" "github.com/aws/karpenter/pkg/controllers/interruption" + "github.com/aws/karpenter/pkg/controllers/interruption/events" "github.com/aws/karpenter/pkg/controllers/providers" - awsfake "github.com/aws/karpenter/pkg/fake" awstest "github.com/aws/karpenter/pkg/test" "github.com/aws/karpenter-core/pkg/apis/config/settings" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/cloudprovider/fake" - "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/operator/options" "github.com/aws/karpenter-core/pkg/test" @@ -123,8 +121,7 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { }() // Load all the fundamental components before setting up the controllers - recorder := awsfake.NewEventRecorder() - cluster = state.NewCluster(ctx, fakeClock, env.Client, cloudProvider) + recorder = test.NewEventRecorder() cloudProvider = &fake.CloudProvider{} unavailableOfferingsCache = awscache.NewUnavailableOfferings(cache.New(awscache.UnavailableOfferingsTTL, awscontext.CacheCleanupInterval)) @@ -135,13 +132,12 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { } // Set-up the controllers - nodeStateController = state.NewNodeController(env.Client, cluster) - interruptionController := interruption.NewController(env.Client, fakeClock, recorder, cluster, providers.sqsProvider, unavailableOfferingsCache) + interruptionController := interruption.NewController(env.Client, fakeClock, recorder, providers.sqsProvider, unavailableOfferingsCache) messages, nodes := makeDiverseMessagesAndNodes(messageCount) logging.FromContext(env.Ctx).Infof("Provisioning %d nodes", messageCount) - if err := provisionNodes(env.Ctx, env.Client, nodes, nodeStateController); err != nil { + if err := provisionNodes(env.Ctx, env.Client, nodes); err != nil { b.Fatalf("provisioning nodes, %v", err) } logging.FromContext(env.Ctx).Infof("Completed provisioning %d nodes", messageCount) @@ -163,9 +159,6 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { if err = interruptionController.Builder(ctx, m).Complete(interruptionController); err != nil { b.Fatalf("registering interruption controller, %v", err) } - if err = nodeStateController.Builder(ctx, 
m).Complete(nodeStateController); err != nil { - b.Fatalf("registering nodeState controller, %v", err) - } b.ResetTimer() start := time.Now() @@ -234,16 +227,16 @@ func (p *providerSet) provisionMessages(ctx context.Context, messages ...interfa return multierr.Combine(errs...) } -func (p *providerSet) monitorMessagesProcessed(ctx context.Context, eventRecorder *awsfake.EventRecorder, expectedProcessed int) <-chan struct{} { +func (p *providerSet) monitorMessagesProcessed(ctx context.Context, eventRecorder *test.EventRecorder, expectedProcessed int) <-chan struct{} { done := make(chan struct{}) totalProcessed := 0 go func() { for totalProcessed < expectedProcessed { - totalProcessed = int(eventRecorder.InstanceStoppingCalled.Load()) + - int(eventRecorder.InstanceTerminatingCalled.Load()) + - int(eventRecorder.InstanceUnhealthyCalled.Load()) + - int(eventRecorder.InstanceRebalanceRecommendationCalled.Load()) + - int(eventRecorder.InstanceSpotInterruptedCalled.Load()) + totalProcessed = eventRecorder.Calls(events.InstanceStopping(test.Node()).Reason) + + eventRecorder.Calls(events.InstanceTerminating(test.Node()).Reason) + + eventRecorder.Calls(events.InstanceUnhealthy(test.Node()).Reason) + + eventRecorder.Calls(events.InstanceRebalanceRecommendation(test.Node()).Reason) + + eventRecorder.Calls(events.InstanceSpotInterrupted(test.Node()).Reason) logging.FromContext(ctx).Infof("Processed %d messages from the queue", totalProcessed) time.Sleep(time.Second) } @@ -252,7 +245,7 @@ func (p *providerSet) monitorMessagesProcessed(ctx context.Context, eventRecorde return done } -func provisionNodes(ctx context.Context, kubeClient client.Client, nodes []*v1.Node, nodeController *state.NodeController) error { +func provisionNodes(ctx context.Context, kubeClient client.Client, nodes []*v1.Node) error { errs := make([]error, len(nodes)) workqueue.ParallelizeUntil(ctx, 20, len(nodes), func(i int) { if err := retry.Do(func() error { @@ -260,12 +253,6 @@ func provisionNodes(ctx context.Context, kubeClient client.Client, nodes []*v1.N }); err != nil { errs[i] = fmt.Errorf("provisioning node, %w", err) } - if err := retry.Do(func() error { - _, err := nodeController.Reconcile(ctx, reconcile.Request{NamespacedName: client.ObjectKeyFromObject(nodes[i])}) - return err - }); err != nil { - errs[i] = fmt.Errorf("reconciling node, %w", err) - } }) return multierr.Combine(errs...) } diff --git a/pkg/controllers/interruption/metrics.go b/pkg/controllers/interruption/metrics.go index 91c0f9d035c9..140e8a028128 100644 --- a/pkg/controllers/interruption/metrics.go +++ b/pkg/controllers/interruption/metrics.go @@ -45,7 +45,7 @@ var ( Name: "received_messages", Help: "Count of messages received from the SQS queue. Broken down by message type and whether the message was actionable.", }, - []string{messageTypeLabel, actionableTypeLabel}, + []string{messageTypeLabel}, ) deletedMessages = prometheus.NewCounter( prometheus.CounterOpts{ diff --git a/pkg/controllers/interruption/suite_test.go b/pkg/controllers/interruption/suite_test.go index 4a3d951bc019..a4cf75a625d0 100644 --- a/pkg/controllers/interruption/suite_test.go +++ b/pkg/controllers/interruption/suite_test.go @@ -57,7 +57,6 @@ import ( "github.com/aws/karpenter-core/pkg/apis/config/settings" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/cloudprovider/fake" - "github.com/aws/karpenter-core/pkg/controllers/state" "github.com/aws/karpenter-core/pkg/test" . 
"github.com/aws/karpenter-core/pkg/test/expectations" ) @@ -73,7 +72,6 @@ const ( var ctx context.Context var env *test.Environment var nodeTemplate *v1alpha1.AWSNodeTemplate -var cluster *state.Cluster var sqsapi *awsfake.SQSAPI var eventbridgeapi *awsfake.EventBridgeAPI var cloudProvider *fake.CloudProvider @@ -83,7 +81,6 @@ var unavailableOfferingsCache *awscache.UnavailableOfferings var recorder *test.EventRecorder var fakeClock *clock.FakeClock var controller *interruption.Controller -var nodeStateController *state.NodeController func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) @@ -106,9 +103,7 @@ var _ = BeforeEach(func() { nodeTemplate = awstest.AWSNodeTemplate() ExpectApplied(ctx, e.Client, nodeTemplate) - cluster = state.NewCluster(ctx, fakeClock, env.Client, cloudProvider) recorder = test.NewEventRecorder() - nodeStateController = state.NewNodeController(env.Client, cluster) unavailableOfferingsCache = awscache.NewUnavailableOfferings(cache.New(awscache.UnavailableOfferingsTTL, awscontext.CacheCleanupInterval)) sqsapi = &awsfake.SQSAPI{} @@ -116,7 +111,7 @@ var _ = BeforeEach(func() { eventbridgeapi = &awsfake.EventBridgeAPI{} eventBridgeProvider = providers.NewEventBridge(eventbridgeapi, sqsProvider) - controller = interruption.NewController(env.Client, fakeClock, recorder, cluster, sqsProvider, unavailableOfferingsCache) + controller = interruption.NewController(env.Client, fakeClock, recorder, sqsProvider, unavailableOfferingsCache) }) env.CRDDirectoryPaths = append(env.CRDDirectoryPaths, relativeToRoot("charts/karpenter/crds")) Expect(env.Start()).To(Succeed(), "Failed to start environment") @@ -140,7 +135,6 @@ var _ = Describe("Processing Messages", func() { }) ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) - ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) @@ -157,7 +151,6 @@ var _ = Describe("Processing Messages", func() { }) ExpectMessagesCreated(scheduledChangeMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) - ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) @@ -181,11 +174,6 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(messages...) ExpectApplied(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) - // Wait for the nodes to reconcile with the cluster state - for _, node := range nodes { - ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - } - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(4)) @@ -213,11 +201,6 @@ var _ = Describe("Processing Messages", func() { ExpectMessagesCreated(messages...) ExpectApplied(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) 
- // Wait for the nodes to reconcile with the cluster state - for _, node := range nodes { - ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) - } - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(100)) @@ -228,7 +211,6 @@ var _ = Describe("Processing Messages", func() { }) ExpectMessagesCreated(spotInterruptionMessage(node.Spec.ProviderID)) ExpectApplied(env.Ctx, env.Client, node) - ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNodeExists(env.Ctx, env.Client, node.Name) @@ -259,7 +241,6 @@ var _ = Describe("Processing Messages", func() { }) ExpectMessagesCreated(stateChangeMessage(defaultInstanceID, "creating")) ExpectApplied(env.Ctx, env.Client, node) - ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNodeExists(env.Ctx, env.Client, node.Name) @@ -279,7 +260,6 @@ var _ = Describe("Processing Messages", func() { }) ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) ExpectApplied(env.Ctx, env.Client, node) - ExpectReconcileSucceeded(env.Ctx, nodeStateController, client.ObjectKeyFromObject(node)) ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) ExpectNotFound(env.Ctx, env.Client, node) diff --git a/pkg/controllers/nodetemplate/controller.go b/pkg/controllers/nodetemplate/controller.go index 20823d53a326..3427b30369c0 100644 --- a/pkg/controllers/nodetemplate/controller.go +++ b/pkg/controllers/nodetemplate/controller.go @@ -29,7 +29,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" - operatorcontroller "github.com/aws/karpenter-core/pkg/operator/controller" + corecontroller "github.com/aws/karpenter-core/pkg/operator/controller" "github.com/aws/karpenter-core/pkg/operator/scheme" "github.com/aws/karpenter-core/pkg/utils/result" "github.com/aws/karpenter/pkg/apis" @@ -96,7 +96,7 @@ func (c *Controller) Reconcile(ctx context.Context, req reconcile.Request) (reco return result.Min(results...), nil } -func (c *Controller) Builder(_ context.Context, m manager.Manager) operatorcontroller.Builder { +func (c *Controller) Builder(_ context.Context, m manager.Manager) corecontroller.Builder { return controllerruntime. NewControllerManagedBy(m). Named(Name). diff --git a/pkg/controllers/providers/infrastructure.go b/pkg/controllers/providers/infrastructure.go index e63a49ba027d..a9d76cee0beb 100644 --- a/pkg/controllers/providers/infrastructure.go +++ b/pkg/controllers/providers/infrastructure.go @@ -21,8 +21,6 @@ import ( "go.uber.org/multierr" "k8s.io/client-go/util/workqueue" "knative.dev/pkg/logging" - - "github.com/aws/karpenter/pkg/errors" ) type Infrastructure struct { @@ -84,17 +82,16 @@ func (p *Infrastructure) ensureQueue(ctx context.Context) error { // Attempt to find the queue. 
If we can't find it, assume it isn't created and try to create it // If we did find it, then just set the queue attributes on the existing queue logging.FromContext(ctx).Debugf("Reconciling the SQS notification queue...") - if _, err := p.sqsProvider.DiscoverQueueURL(ctx, true); err != nil { - switch { - case errors.IsNotFound(err): - logging.FromContext(ctx).Debugf("Queue not found, creating the SQS notification queue...") - if err := p.sqsProvider.CreateQueue(ctx); err != nil { - return fmt.Errorf("creating sqs queue with policy, %w", err) - } - logging.FromContext(ctx).Debugf("Successfully created the SQS notification queue") - default: - return fmt.Errorf("discovering sqs queue url, %w", err) + queueExists, err := p.sqsProvider.QueueExists(ctx) + if err != nil { + return fmt.Errorf("checking queue existence, %w", err) + } + if !queueExists { + logging.FromContext(ctx).Debugf("Queue not found, creating the SQS notification queue...") + if err := p.sqsProvider.CreateQueue(ctx); err != nil { + return fmt.Errorf("creating sqs queue with policy, %w", err) } + logging.FromContext(ctx).Debugf("Successfully created the SQS notification queue") } // Always attempt to set the queue attributes, even after creation to help set the queue policy if err := p.sqsProvider.SetQueueAttributes(ctx, nil); err != nil { diff --git a/pkg/controllers/providers/sqs.go b/pkg/controllers/providers/sqs.go index e1858d4fa13b..de5b244aa992 100644 --- a/pkg/controllers/providers/sqs.go +++ b/pkg/controllers/providers/sqs.go @@ -107,7 +107,7 @@ func (s *SQS) CreateQueue(ctx context.Context) error { } func (s *SQS) SetQueueAttributes(ctx context.Context, attributeOverrides map[string]*string) error { - queueURL, err := s.DiscoverQueueURL(ctx, false) + queueURL, err := s.DiscoverQueueURL(ctx) if err != nil { return fmt.Errorf("fetching queue url, %w", err) } @@ -129,9 +129,19 @@ func (s *SQS) SetQueueAttributes(ctx context.Context, attributeOverrides map[str return nil } -func (s *SQS) DiscoverQueueURL(ctx context.Context, ignoreCache bool) (string, error) { - opts := lo.Ternary(ignoreCache, atomic.IgnoreCacheOption, nil) - return s.queueURL.TryGet(ctx, opts) +func (s *SQS) QueueExists(ctx context.Context) (bool, error) { + _, err := s.queueURL.TryGet(ctx, atomic.IgnoreCacheOption) + if err != nil { + if awserrors.IsNotFound(err) { + return false, nil + } + return false, err + } + return true, nil +} + +func (s *SQS) DiscoverQueueURL(ctx context.Context) (string, error) { + return s.queueURL.TryGet(ctx) } func (s *SQS) DiscoverQueueARN(ctx context.Context) (string, error) { @@ -139,7 +149,7 @@ func (s *SQS) DiscoverQueueARN(ctx context.Context) (string, error) { } func (s *SQS) GetSQSMessages(ctx context.Context) ([]*sqs.Message, error) { - queueURL, err := s.DiscoverQueueURL(ctx, false) + queueURL, err := s.DiscoverQueueURL(ctx) if err != nil { return nil, fmt.Errorf("fetching queue url, %w", err) } @@ -170,7 +180,7 @@ func (s *SQS) SendMessage(ctx context.Context, body interface{}) (string, error) if err != nil { return "", fmt.Errorf("marshaling the passed body as json, %w", err) } - queueURL, err := s.DiscoverQueueURL(ctx, false) + queueURL, err := s.DiscoverQueueURL(ctx) if err != nil { return "", fmt.Errorf("fetching queue url, %w", err) } @@ -186,7 +196,7 @@ func (s *SQS) SendMessage(ctx context.Context, body interface{}) (string, error) } func (s *SQS) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { - queueURL, err := s.DiscoverQueueURL(ctx, false) + queueURL, err := s.DiscoverQueueURL(ctx) if 
err != nil { return fmt.Errorf("failed fetching queue url, %w", err) } @@ -204,7 +214,7 @@ func (s *SQS) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { } func (s *SQS) DeleteQueue(ctx context.Context) error { - queueURL, err := s.DiscoverQueueURL(ctx, false) + queueURL, err := s.DiscoverQueueURL(ctx) if err != nil { if awserrors.IsNotFound(err) { return nil diff --git a/pkg/fake/recorder.go b/pkg/fake/recorder.go deleted file mode 100644 index 8abe05719bfe..000000000000 --- a/pkg/fake/recorder.go +++ /dev/null @@ -1,54 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package fake - -import ( - "sync/atomic" - - "github.com/aws/karpenter-core/pkg/events" - "github.com/aws/karpenter-core/pkg/test" - interruptionevents "github.com/aws/karpenter/pkg/controllers/interruption/events" -) - -// EventRecorder is a mock event recorder that is used to facilitate testing. -type EventRecorder struct { - InstanceSpotInterruptedCalled atomic.Int64 - InstanceRebalanceRecommendationCalled atomic.Int64 - InstanceUnhealthyCalled atomic.Int64 - InstanceTerminatingCalled atomic.Int64 - InstanceStoppingCalled atomic.Int64 - NodeTerminatingOnInterruptionCalled atomic.Int64 -} - -func NewEventRecorder() *EventRecorder { - return &EventRecorder{} -} - -func (e *EventRecorder) Publish(evt events.Event) { - switch evt.Reason { - case interruptionevents.InstanceSpotInterrupted(test.Node()).Reason: - e.InstanceSpotInterruptedCalled.Add(1) - case interruptionevents.InstanceRebalanceRecommendation(test.Node()).Reason: - e.InstanceRebalanceRecommendationCalled.Add(1) - case interruptionevents.InstanceUnhealthy(test.Node()).Reason: - e.InstanceUnhealthyCalled.Add(1) - case interruptionevents.InstanceTerminating(test.Node()).Reason: - e.InstanceTerminatingCalled.Add(1) - case interruptionevents.InstanceStopping(test.Node()).Reason: - e.InstanceStoppingCalled.Add(1) - case interruptionevents.NodeTerminatingOnInterruption(test.Node()).Reason: - e.NodeTerminatingOnInterruptionCalled.Add(1) - } -} diff --git a/test/go.mod b/test/go.mod index 660429500b5c..5c0e752e8ce7 100644 --- a/test/go.mod +++ b/test/go.mod @@ -7,7 +7,7 @@ require ( github.com/aws/aws-sdk-go v1.44.114 github.com/aws/aws-sdk-go-v2/config v1.17.8 github.com/aws/karpenter v0.18.0 - github.com/aws/karpenter-core v0.0.2-0.20221031163135-0c39b59ed935 + github.com/aws/karpenter-core v0.0.2-0.20221031185530-c9e0b5b6b603 github.com/onsi/ginkgo/v2 v2.2.0 github.com/onsi/gomega v1.21.1 github.com/samber/lo v1.32.0 @@ -15,7 +15,7 @@ require ( k8s.io/api v0.25.2 k8s.io/apimachinery v0.25.2 k8s.io/client-go v0.25.2 - knative.dev/pkg v0.0.0-20221014164553-b812affa3893 + knative.dev/pkg v0.0.0-20221031132215-6eb8f1845a9d sigs.k8s.io/controller-runtime v0.13.0 ) diff --git a/test/go.sum b/test/go.sum index ccb213c47304..a9b2c30db2d0 100644 --- a/test/go.sum +++ b/test/go.sum @@ -83,8 +83,8 @@ github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.6 h1:OwhhKc1P9ElfWbMKPIbMMZBV github.com/aws/aws-sdk-go-v2/service/ssooidc v1.13.6/go.mod 
h1:csZuQY65DAdFBt1oIjO5hhBR49kQqop4+lcuCjf2arA= github.com/aws/aws-sdk-go-v2/service/sts v1.16.19 h1:9pPi0PsFNAGILFfPCk8Y0iyEBGc6lu6OQ97U7hmdesg= github.com/aws/aws-sdk-go-v2/service/sts v1.16.19/go.mod h1:h4J3oPZQbxLhzGnk+j9dfYHi5qIOVJ5kczZd658/ydM= -github.com/aws/karpenter-core v0.0.2-0.20221031163135-0c39b59ed935 h1:HEd6DTBd8QcVEv9Ow7jPQWQ7LLp2QcipgaDYmJfazOI= -github.com/aws/karpenter-core v0.0.2-0.20221031163135-0c39b59ed935/go.mod h1:b9EJPH/E/rPxwSshkn4wCY2Tw5AaDBPKhT8r3AIVlHo= +github.com/aws/karpenter-core v0.0.2-0.20221031185530-c9e0b5b6b603 h1:ULI8LXSgJQARW9EDO1EpTqp3ZYsabXq7V9NzURo9XxE= +github.com/aws/karpenter-core v0.0.2-0.20221031185530-c9e0b5b6b603/go.mod h1:AC7JRJN2p/Lcq7gPWeZOSd/XsgSUQHomkHW4mk3Xg24= github.com/aws/smithy-go v1.11.2/go.mod h1:3xHYmszWVx2c0kIwQeEVf9uSm4fYZt67FBJnwub1bgM= github.com/aws/smithy-go v1.13.3 h1:l7LYxGuzK6/K+NzJ2mC+VvLUbae0sL3bXU//04MkmnA= github.com/aws/smithy-go v1.13.3/go.mod h1:Tg+OJXh4MB2R/uN61Ko2f6hTZwB/ZYGOtib8J3gBHzA= @@ -697,8 +697,8 @@ k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1 h1:MQ8BAZPZlWk3S9K4a9NCkI k8s.io/kube-openapi v0.0.0-20220803162953-67bda5d908f1/go.mod h1:C/N6wCaBHeBHkHUesQOQy2/MZqGgMAFPqGsGQLdbZBU= k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed h1:jAne/RjBTyawwAy0utX5eqigAwz/lQhTmy+Hr/Cpue4= k8s.io/utils v0.0.0-20220728103510-ee6ede2d64ed/go.mod h1:jPW/WVKK9YHAvNhRxK0md/EJ228hCsBRufyofKtW8HA= -knative.dev/pkg v0.0.0-20221014164553-b812affa3893 h1:RU6XnkYE017AWfKgN+eQLyDcT99eJKgz+ZWOvb+3fhc= -knative.dev/pkg v0.0.0-20221014164553-b812affa3893/go.mod h1:DMTRDJ5WRxf/DrlOPzohzfhSuJggscLZ8EavOq9O/x8= +knative.dev/pkg v0.0.0-20221031132215-6eb8f1845a9d h1:BRSonjQw4u63gaWeoE5i2724FF3K7teFf8DcmwjGdAQ= +knative.dev/pkg v0.0.0-20221031132215-6eb8f1845a9d/go.mod h1:j5kO7gKmWGj2DJpefCEiPbItToiYf+2bCtI+A6REkQo= rsc.io/binaryregexp v0.2.0/go.mod h1:qTv7/COck+e2FymRvadv62gMdZztPaShugOCi3I+8D8= rsc.io/quote/v3 v3.1.0/go.mod h1:yEA65RcK8LyAZtP9Kv3t0HmxON59tX3rD+tICJqUlj0= rsc.io/sampler v1.3.0/go.mod h1:T1hPZKmBbMNahiBKFy5HrXp6adAjACjK9JXDnKaTXpA= diff --git a/test/pkg/environment/aws/expectations.go b/test/pkg/environment/aws/expectations.go index 1264952341a2..6cd8a568c939 100644 --- a/test/pkg/environment/aws/expectations.go +++ b/test/pkg/environment/aws/expectations.go @@ -85,8 +85,9 @@ func (env *Environment) GetVolume(volumeID *string) ec2.Volume { func (env *Environment) EventuallyExpectQueueCreated() { EventuallyWithOffset(1, func(g Gomega) { - _, err := env.SQSProvider.DiscoverQueueURL(env.Context, true) + exists, err := env.SQSProvider.QueueExists(env.Context) g.Expect(err).ToNot(HaveOccurred()) + g.Expect(exists).To(BeTrue()) }).Should(Succeed()) } From 325f48a298e64796378adedafbac6e10ef39a8d5 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 1 Nov 2022 15:01:15 -0700 Subject: [PATCH 53/55] Tag-based discovery for eventbridge rules --- pkg/controllers/controllers.go | 2 +- pkg/controllers/interruption/suite_test.go | 2 +- .../nodetemplate/infrastructure.go | 1 + pkg/controllers/nodetemplate/suite_test.go | 142 +++++++++++++-- pkg/controllers/providers/eventbridge.go | 172 +++++++++++++----- pkg/controllers/providers/infrastructure.go | 4 +- pkg/controllers/providers/sqs.go | 30 ++- pkg/errors/errors.go | 18 ++ pkg/fake/eventbridgeapi.go | 19 +- pkg/utils/utils.go | 13 -- test/pkg/environment/aws/environment.go | 2 +- .../cloudformation.yaml | 14 +- .../scripts/step01-config.sh | 1 - .../scripts/step03-iam-cloud-formation.sh | 3 +- 14 files changed, 315 insertions(+), 108 deletions(-) diff --git 
a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go index e05ef04f2d8b..e8f55e5b7ce9 100644 --- a/pkg/controllers/controllers.go +++ b/pkg/controllers/controllers.go @@ -26,7 +26,7 @@ import ( ) func NewControllers(ctx awscontext.Context) []controller.Controller { - sqsProvider := providers.NewSQS(ctx, sqs.New(ctx.Session)) + sqsProvider := providers.NewSQS(sqs.New(ctx.Session)) eventBridgeProvider := providers.NewEventBridge(eventbridge.New(ctx.Session), sqsProvider) return []controller.Controller{ diff --git a/pkg/controllers/interruption/suite_test.go b/pkg/controllers/interruption/suite_test.go index a4cf75a625d0..86b0950dcf79 100644 --- a/pkg/controllers/interruption/suite_test.go +++ b/pkg/controllers/interruption/suite_test.go @@ -107,7 +107,7 @@ var _ = BeforeEach(func() { unavailableOfferingsCache = awscache.NewUnavailableOfferings(cache.New(awscache.UnavailableOfferingsTTL, awscontext.CacheCleanupInterval)) sqsapi = &awsfake.SQSAPI{} - sqsProvider = providers.NewSQS(ctx, sqsapi) + sqsProvider = providers.NewSQS(sqsapi) eventbridgeapi = &awsfake.EventBridgeAPI{} eventBridgeProvider = providers.NewEventBridge(eventbridgeapi, sqsProvider) diff --git a/pkg/controllers/nodetemplate/infrastructure.go b/pkg/controllers/nodetemplate/infrastructure.go index 6df750bf8f32..48516c6421d5 100644 --- a/pkg/controllers/nodetemplate/infrastructure.go +++ b/pkg/controllers/nodetemplate/infrastructure.go @@ -45,6 +45,7 @@ func (i *Infrastructure) Reconcile(ctx context.Context, nodeTemplate *v1alpha1.A if err := i.provider.Delete(ctx); err != nil { return reconcile.Result{}, err } + i.lastInfrastructureReconcile = time.Time{} return reconcile.Result{}, nil } else if len(list.Items) >= 1 { infrastructureActive.Set(1) diff --git a/pkg/controllers/nodetemplate/suite_test.go b/pkg/controllers/nodetemplate/suite_test.go index 614448bc12dc..c0818713057d 100644 --- a/pkg/controllers/nodetemplate/suite_test.go +++ b/pkg/controllers/nodetemplate/suite_test.go @@ -21,6 +21,7 @@ import ( "runtime" "testing" + "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/aws/aws-sdk-go/service/sqs" @@ -31,6 +32,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter-core/pkg/apis/config/settings" + "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/operator/options" . 
"github.com/aws/karpenter-core/pkg/test/expectations" @@ -77,6 +79,7 @@ var _ = BeforeEach(func() { }, } ctx = settingsStore.InjectSettings(ctx) + ctx = injection.WithOptions(ctx, defaultOpts) env = test.NewEnvironment(ctx, func(e *test.Environment) { opts = defaultOpts Expect(opts.Validate()).To(Succeed(), "Failed to validate options") @@ -84,7 +87,7 @@ var _ = BeforeEach(func() { sqsapi = &awsfake.SQSAPI{} eventbridgeapi = &awsfake.EventBridgeAPI{} - sqsProvider = providers.NewSQS(e.Ctx, sqsapi) + sqsProvider = providers.NewSQS(sqsapi) eventBridgeProvider = providers.NewEventBridge(eventbridgeapi, sqsProvider) controller = nodetemplate.NewController(e.Client, sqsProvider, eventBridgeProvider) @@ -151,12 +154,42 @@ var _ = Describe("Infrastructure", func() { sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(1)) // This mocks the queue not existing ExpectApplied(ctx, env.Client, provider) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + // Set the output of ListRules to mock rule creation + eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ + Rules: []*eventbridge.Rule{ + { + Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), + Arn: aws.String("test-arn1"), + }, + { + Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), + Arn: aws.String("test-arn2"), + }, + { + Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), + Arn: aws.String("test-arn3"), + }, + { + Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), + Arn: aws.String("test-arn4"), + }, + }, + }) + eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(defaultOpts.ClusterName), + }, + }, + }) + // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure Expect(env.Client.Delete(ctx, provider)).To(Succeed()) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) @@ -172,6 +205,36 @@ var _ = Describe("Infrastructure", func() { sqsapi.DeleteQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) + // Set the output of ListRules to mock rule creation + eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ + Rules: []*eventbridge.Rule{ + { + Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), + Arn: aws.String("test-arn1"), + }, + { + Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), + Arn: aws.String("test-arn2"), + }, + { + Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), + Arn: aws.String("test-arn3"), + }, + { + Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), + Arn: aws.String("test-arn4"), + }, + }, + }) + eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(defaultOpts.ClusterName), + }, + }, + }) + // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure Expect(env.Client.Delete(ctx, provider)).To(Succeed()) 
ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) @@ -180,23 +243,42 @@ var _ = Describe("Infrastructure", func() { Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) - It("should cleanup when a single rule is already deleted", func() { + It("should cleanup with a success when a few rules aren't in list call", func() { provider := awstest.AWSNodeTemplate() ExpectApplied(ctx, env.Client, provider) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code())) - eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code())) + // Set the output of ListRules to mock rule creation + eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ + Rules: []*eventbridge.Rule{ + { + Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), + Arn: aws.String("test-arn1"), + }, + { + Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), + Arn: aws.String("test-arn2"), + }, + }, + }) + eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(defaultOpts.ClusterName), + }, + }, + }) // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure Expect(env.Client.Delete(ctx, provider)).To(Succeed()) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(3)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(3)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(2)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(2)) }) - It("should cleanup when a single rule is already deleted", func() { + It("should cleanup with a success when getting not found errors", func() { provider := awstest.AWSNodeTemplate() ExpectApplied(ctx, env.Client, provider) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) @@ -213,17 +295,17 @@ var _ = Describe("Infrastructure", func() { Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(0)) }) It("should only attempt to delete the infrastructure when the last node template is removed", func() { - var providers []*v1alpha1.AWSNodeTemplate + var nodeTemplates []*v1alpha1.AWSNodeTemplate for i := 0; i < 10; i++ { p := awstest.AWSNodeTemplate() - providers = append(providers, p) + nodeTemplates = append(nodeTemplates, p) ExpectApplied(ctx, env.Client, p) ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(p)) } - for i := 0; i < len(providers)-1; i++ { - Expect(env.Client.Delete(ctx, providers[i])).To(Succeed()) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(providers[i])) + for i := 0; i < len(nodeTemplates)-1; i++ { + Expect(env.Client.Delete(ctx, nodeTemplates[i])).To(Succeed()) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(nodeTemplates[i])) } // It shouldn't attempt to delete at this point @@ -231,9 +313,39 @@ var _ = Describe("Infrastructure", func() { Expect(eventbridgeapi.RemoveTargetsBehavior.Calls()).To(Equal(0)) 
Expect(eventbridgeapi.DeleteRuleBehavior.Calls()).To(Equal(0)) + // Set the output of ListRules to mock rule creation + eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ + Rules: []*eventbridge.Rule{ + { + Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), + Arn: aws.String("test-arn1"), + }, + { + Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), + Arn: aws.String("test-arn2"), + }, + { + Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), + Arn: aws.String("test-arn3"), + }, + { + Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), + Arn: aws.String("test-arn4"), + }, + }, + }) + eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(defaultOpts.ClusterName), + }, + }, + }) + // Last AWSNodeTemplate, so now it should delete it - Expect(env.Client.Delete(ctx, providers[len(providers)-1])).To(Succeed()) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(providers[len(providers)-1])) + Expect(env.Client.Delete(ctx, nodeTemplates[len(nodeTemplates)-1])).To(Succeed()) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(nodeTemplates[len(nodeTemplates)-1])) Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) diff --git a/pkg/controllers/providers/eventbridge.go b/pkg/controllers/providers/eventbridge.go index f8bd829efb25..6e3d0a312c1f 100644 --- a/pkg/controllers/providers/eventbridge.go +++ b/pkg/controllers/providers/eventbridge.go @@ -18,48 +18,92 @@ import ( "context" "encoding/json" "fmt" + "regexp" + "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/eventbridge" "github.com/aws/aws-sdk-go/service/eventbridge/eventbridgeiface" "github.com/samber/lo" "go.uber.org/multierr" + "k8s.io/apimachinery/pkg/util/rand" "k8s.io/client-go/util/workqueue" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/operator/injection" awssettings "github.com/aws/karpenter/pkg/apis/config/settings" awserrors "github.com/aws/karpenter/pkg/errors" - "github.com/aws/karpenter/pkg/utils" ) -const QueueTargetID = "KarpenterEventQueue" +func init() { + rand.Seed(time.Now().Unix()) +} + +const ( + ScheduledChangedRule = "ScheduledChangeRule" + SpotTerminationRule = "SpotTerminationRule" + RebalanceRule = "RebalanceRule" + StateChangeRule = "StateChangeRule" +) -type rule struct { +var DefaultRules = map[string]Rule{ + ScheduledChangedRule: { + Name: fmt.Sprintf("Karpenter-%s-%s", ScheduledChangedRule, rand.String(64-len(ScheduledChangedRule))), + Pattern: Pattern{ + Source: []string{"aws.health"}, + DetailType: []string{"AWS Health Event"}, + }, + }, + SpotTerminationRule: { + Name: fmt.Sprintf("Karpenter-%s-%s", SpotTerminationRule, rand.String(64-len(SpotTerminationRule))), + Pattern: Pattern{ + Source: []string{"aws.ec2"}, + DetailType: []string{"EC2 Spot Instance Interruption Warning"}, + }, + }, + RebalanceRule: { + Name: fmt.Sprintf("Karpenter-%s-%s", RebalanceRule, rand.String(64-len(RebalanceRule))), + Pattern: Pattern{ + Source: []string{"aws.ec2"}, + DetailType: []string{"EC2 Instance Rebalance Recommendation"}, + }, + }, + StateChangeRule: { + Name: fmt.Sprintf("Karpenter-%s-%s", StateChangeRule, rand.String(64-len(StateChangeRule))), + Pattern: Pattern{ + 
Source: []string{"aws.ec2"}, + DetailType: []string{"EC2 Instance State-change Notification"}, + }, + }, +} + +type Rule struct { Name string - Pattern *pattern - Target *target + Pattern Pattern + Target Target } -func (er rule) addQueueTarget(queueARN string) rule { - er.Target = &target{ +const QueueTargetID = "KarpenterEventQueue" + +func (er Rule) addQueueTarget(queueARN string) Rule { + er.Target = Target{ ID: QueueTargetID, ARN: queueARN, } return er } -type target struct { +type Target struct { ID string ARN string } -type pattern struct { +type Pattern struct { Source []string `json:"source,omitempty"` DetailType []string `json:"detail-type,omitempty"` } -func (ep *pattern) Serialize() []byte { +func (ep Pattern) Serialize() []byte { return lo.Must(json.Marshal(ep)) } @@ -75,12 +119,18 @@ func NewEventBridge(eb eventbridgeiface.EventBridgeAPI, sqsProvider *SQS) *Event } } -func (eb *EventBridge) CreateEC2EventRules(ctx context.Context) error { +func (eb *EventBridge) CreateRules(ctx context.Context) error { queueARN, err := eb.sqsProvider.queueARN.TryGet(ctx) if err != nil { return fmt.Errorf("resolving queue arn, %w", err) } - rules := lo.Map(eb.getEC2NotificationEventRules(ctx), func(r rule, _ int) rule { return r.addQueueTarget(queueARN) }) + existingRules, err := eb.DiscoverRules(ctx) + if err != nil { + return fmt.Errorf("discovering existing rules, %w", err) + } + rules := lo.MapToSlice(eb.mergeRules(existingRules), func(_ string, r Rule) Rule { + return r.addQueueTarget(queueARN) + }) errs := make([]error, len(rules)) workqueue.ParallelizeUntil(ctx, len(rules), len(rules), func(i int) { _, err := eb.client.PutRuleWithContext(ctx, &eventbridge.PutRuleInput{ @@ -107,8 +157,46 @@ func (eb *EventBridge) CreateEC2EventRules(ctx context.Context) error { return multierr.Combine(errs...) 
} -func (eb *EventBridge) DeleteEC2NotificationRules(ctx context.Context) error { - rules := eb.getEC2NotificationEventRules(ctx) +func (eb *EventBridge) DiscoverRules(ctx context.Context) (map[string]Rule, error) { + m := map[string]Rule{} + output, err := eb.client.ListRulesWithContext(ctx, &eventbridge.ListRulesInput{ + NamePrefix: aws.String("Karpenter-"), + }) + if err != nil { + return nil, fmt.Errorf("listing rules, %w", err) + } + for _, rule := range output.Rules { + out, err := eb.client.ListTagsForResourceWithContext(ctx, &eventbridge.ListTagsForResourceInput{ + ResourceARN: rule.Arn, + }) + // If we get access denied, that means the tag-based policy didn't allow us to get the tags from the rule + // which means it isn't a rule that we created for this cluster anyways + if err != nil && !awserrors.IsAccessDenied(err) { + return nil, fmt.Errorf("describing rules, %w", err) + } + for _, tag := range out.Tags { + if aws.StringValue(tag.Key) == v1alpha5.DiscoveryTagKey && + aws.StringValue(tag.Value) == injection.GetOptions(ctx).ClusterName { + + // If we succeed to parse the rule name, we should store it by its rule type + t, err := parseRuleName(aws.StringValue(rule.Name)) + if err == nil { + m[t] = Rule{ + Name: aws.StringValue(rule.Name), + } + } + } + } + } + return m, nil +} + +func (eb *EventBridge) DeleteRules(ctx context.Context) error { + out, err := eb.DiscoverRules(ctx) + if err != nil { + return fmt.Errorf("discovering existing rules, %w", err) + } + rules := lo.Values(out) errs := make([]error, len(rules)) workqueue.ParallelizeUntil(ctx, len(rules), len(rules), func(i int) { targetInput := &eventbridge.RemoveTargetsInput{ @@ -131,37 +219,17 @@ func (eb *EventBridge) DeleteEC2NotificationRules(ctx context.Context) error { return multierr.Combine(errs...) 
 }
 
-func (eb *EventBridge) getEC2NotificationEventRules(ctx context.Context) []rule {
-	return []rule{
-		{
-			Name: fmt.Sprintf("Karpenter-%s-ScheduledChangeRule", utils.GetClusterNameHash(ctx, 20)),
-			Pattern: &pattern{
-				Source:     []string{"aws.health"},
-				DetailType: []string{"AWS Health Event"},
-			},
-		},
-		{
-			Name: fmt.Sprintf("Karpenter-%s-SpotTerminationRule", utils.GetClusterNameHash(ctx, 20)),
-			Pattern: &pattern{
-				Source:     []string{"aws.ec2"},
-				DetailType: []string{"EC2 Spot Instance Interruption Warning"},
-			},
-		},
-		{
-			Name: fmt.Sprintf("Karpenter-%s-RebalanceRule", utils.GetClusterNameHash(ctx, 20)),
-			Pattern: &pattern{
-				Source:     []string{"aws.ec2"},
-				DetailType: []string{"EC2 Instance Rebalance Recommendation"},
-			},
-		},
-		{
-			Name: fmt.Sprintf("Karpenter-%s-InstanceStateChangeRule", utils.GetClusterNameHash(ctx, 20)),
-			Pattern: &pattern{
-				Source:     []string{"aws.ec2"},
-				DetailType: []string{"EC2 Instance State-change Notification"},
-			},
-		},
+func (eb *EventBridge) mergeRules(existing map[string]Rule) map[string]Rule {
+	m := map[string]Rule{}
+	for k, rule := range DefaultRules {
+		if existingRule, ok := existing[k]; ok {
+			existingRule.Pattern = rule.Pattern
+			m[k] = existingRule
+		} else {
+			m[k] = rule
+		}
 	}
+	return m
 }
 
 func (eb *EventBridge) getTags(ctx context.Context) []*eventbridge.Tag {
@@ -184,3 +252,19 @@ func (eb *EventBridge) getTags(ctx context.Context) []*eventbridge.Tag {
 		})...,
 	)
 }
+
+// parseRuleName parses out the rule type based on the expected naming convention for rules
+// provisioned by Karpenter
+func parseRuleName(raw string) (string, error) {
+	r := regexp.MustCompile(`Karpenter-(?P<RuleType>.*)-.*`)
+	matches := r.FindStringSubmatch(raw)
+	if matches == nil {
+		return "", fmt.Errorf("parsing rule name, %s", raw)
+	}
+	for i, name := range r.SubexpNames() {
+		if name == "RuleType" {
+			return matches[i], nil
+		}
+	}
+	return "", fmt.Errorf("parsing rule name, %s", raw)
+}
diff --git a/pkg/controllers/providers/infrastructure.go b/pkg/controllers/providers/infrastructure.go
index a9d76cee0beb..2adcfe989121 100644
--- a/pkg/controllers/providers/infrastructure.go
+++ b/pkg/controllers/providers/infrastructure.go
@@ -58,7 +58,7 @@ func (p *Infrastructure) Delete(ctx context.Context) error {
 	}
 	deleteEventBridgeRulesFunc := func() error {
 		logging.FromContext(ctx).Debugf("Deleting the EventBridge notification rules...")
-		return p.eventBridgeProvider.DeleteEC2NotificationRules(ctx)
+		return p.eventBridgeProvider.DeleteRules(ctx)
 	}
 	funcs := []func() error{
 		deleteQueueFunc,
@@ -104,7 +104,7 @@ func (p *Infrastructure) ensureQueue(ctx context.Context) error {
 // ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter
 func (p *Infrastructure) ensureEventBridge(ctx context.Context) error {
 	logging.FromContext(ctx).Debugf("Reconciling the EventBridge event rules...")
-	if err := p.eventBridgeProvider.CreateEC2EventRules(ctx); err != nil {
+	if err := p.eventBridgeProvider.CreateRules(ctx); err != nil {
 		return fmt.Errorf("creating EventBridge event rules, %w", err)
 	}
 	logging.FromContext(ctx).Debugf("Successfully reconciled EventBridge event rules")
diff --git a/pkg/controllers/providers/sqs.go b/pkg/controllers/providers/sqs.go
index de5b244aa992..007b879dc13d 100644
--- a/pkg/controllers/providers/sqs.go
+++ b/pkg/controllers/providers/sqs.go
@@ -29,7 +29,6 @@ import (
 	"github.com/aws/karpenter-core/pkg/utils/atomic"
 	awssettings "github.com/aws/karpenter/pkg/apis/config/settings"
 	awserrors "github.com/aws/karpenter/pkg/errors"
-	"github.com/aws/karpenter/pkg/utils"
 )
 
 type queuePolicy struct {
@@ -52,19 +51,17 @@ type principal struct {
 
 type SQS struct {
 	client sqsiface.SQSAPI
-	queueURL  atomic.Lazy[string]
-	queueARN  atomic.Lazy[string]
-	queueName string
+	queueURL atomic.Lazy[string]
+	queueARN atomic.Lazy[string]
 }
 
-func NewSQS(ctx context.Context, client sqsiface.SQSAPI) *SQS {
+func NewSQS(client sqsiface.SQSAPI) *SQS {
 	provider := &SQS{
 		client: client,
 	}
-	provider.queueName = provider.getQueueName(ctx)
 	provider.queueURL.Resolve = func(ctx context.Context) (string, error) {
 		input := &sqs.GetQueueUrlInput{
-			QueueName: aws.String(provider.queueName),
+			QueueName: aws.String(provider.QueueName(ctx)),
 		}
 		ret, err := provider.client.GetQueueUrlWithContext(ctx, input)
 		if err != nil {
@@ -73,9 +70,13 @@ func NewSQS(ctx context.Context, client sqsiface.SQSAPI) *SQS {
 		return aws.StringValue(ret.QueueUrl), nil
 	}
 	provider.queueARN.Resolve = func(ctx context.Context) (string, error) {
+		queueURL, err := provider.queueURL.TryGet(ctx)
+		if err != nil {
+			return "", fmt.Errorf("discovering queue url, %w", err)
+		}
 		input := &sqs.GetQueueAttributesInput{
 			AttributeNames: aws.StringSlice([]string{sqs.QueueAttributeNameQueueArn}),
-			QueueUrl:       aws.String(provider.queueName),
+			QueueUrl:       aws.String(queueURL),
 		}
 		ret, err := provider.client.GetQueueAttributesWithContext(ctx, input)
 		if err != nil {
@@ -89,13 +90,13 @@ func NewSQS(ctx context.Context, client sqsiface.SQSAPI) *SQS {
 	return provider
 }
 
-func (s *SQS) QueueName() string {
-	return s.queueName
+func (s *SQS) QueueName(ctx context.Context) string {
+	return lo.Substring(injection.GetOptions(ctx).ClusterName, 0, 80)
 }
 
 func (s *SQS) CreateQueue(ctx context.Context) error {
 	input := &sqs.CreateQueueInput{
-		QueueName: aws.String(s.queueName),
+		QueueName: aws.String(s.QueueName(ctx)),
 		Tags:      s.getTags(ctx),
 	}
 	result, err := s.client.CreateQueueWithContext(ctx, input)
@@ -279,10 +280,3 @@ func (s *SQS) getTags(ctx context.Context) map[string]*string {
 		},
 	)
 }
-
-// getQueueName generates a sufficiently random name for the queue name from the cluster name
-// This is used because the max-len for a queue name is 80 characters but the maximum cluster name
-// length is 100
-func (s *SQS) getQueueName(ctx context.Context) string {
-	return fmt.Sprintf("Karpenter-EventQueue-%s", utils.GetClusterNameHash(ctx, 20))
-}
diff --git a/pkg/errors/errors.go b/pkg/errors/errors.go
index 221c5a44934c..79c891c4defc 100644
--- a/pkg/errors/errors.go
+++ b/pkg/errors/errors.go
@@ -46,6 +46,10 @@ var (
 		"UnfulfillableCapacity",
 		"Unsupported",
 	)
+	accessDeniedErrorCodes = sets.NewString(
+		AccessDeniedCode,
+		AccessDeniedExceptionCode,
+	)
 )
 
 type InstanceTerminatedError struct {
@@ -78,6 +82,20 @@ func IsNotFound(err error) bool {
 	return false
 }
 
+// IsAccessDenied returns true if the error is an AWS error (even if it's
+// wrapped) and is known to mean "access denied" (as opposed to a more
+// serious or unexpected error)
+func IsAccessDenied(err error) bool {
+	if err == nil {
+		return false
+	}
+	var awsError awserr.Error
+	if errors.As(err, &awsError) {
+		return accessDeniedErrorCodes.Has(awsError.Code())
+	}
+	return false
+}
+
 // IsUnfulfillableCapacity returns true if the Fleet err means
 // capacity is temporarily unavailable for launching.
 // This could be due to account limits, insufficient ec2 capacity, etc.
diff --git a/pkg/fake/eventbridgeapi.go b/pkg/fake/eventbridgeapi.go index e0a4c981771b..c225b91d1b66 100644 --- a/pkg/fake/eventbridgeapi.go +++ b/pkg/fake/eventbridgeapi.go @@ -25,10 +25,12 @@ import ( // EventBridgeBehavior must be reset between tests otherwise tests will // pollute each other. type EventBridgeBehavior struct { - PutRuleBehavior MockedFunction[eventbridge.PutRuleInput, eventbridge.PutRuleOutput] - PutTargetsBehavior MockedFunction[eventbridge.PutTargetsInput, eventbridge.PutTargetsOutput] - DeleteRuleBehavior MockedFunction[eventbridge.DeleteRuleInput, eventbridge.DeleteRuleOutput] - RemoveTargetsBehavior MockedFunction[eventbridge.RemoveTargetsInput, eventbridge.RemoveTargetsOutput] + PutRuleBehavior MockedFunction[eventbridge.PutRuleInput, eventbridge.PutRuleOutput] + PutTargetsBehavior MockedFunction[eventbridge.PutTargetsInput, eventbridge.PutTargetsOutput] + ListRulesBehavior MockedFunction[eventbridge.ListRulesInput, eventbridge.ListRulesOutput] + ListTagsForResourceBehavior MockedFunction[eventbridge.ListTagsForResourceInput, eventbridge.ListTagsForResourceOutput] + DeleteRuleBehavior MockedFunction[eventbridge.DeleteRuleInput, eventbridge.DeleteRuleOutput] + RemoveTargetsBehavior MockedFunction[eventbridge.RemoveTargetsInput, eventbridge.RemoveTargetsOutput] } type EventBridgeAPI struct { @@ -41,6 +43,7 @@ type EventBridgeAPI struct { func (eb *EventBridgeAPI) Reset() { eb.PutRuleBehavior.Reset() eb.PutTargetsBehavior.Reset() + eb.ListRulesBehavior.Reset() eb.DeleteRuleBehavior.Reset() eb.RemoveTargetsBehavior.Reset() } @@ -55,6 +58,14 @@ func (eb *EventBridgeAPI) PutTargetsWithContext(_ context.Context, input *eventb return eb.PutTargetsBehavior.Invoke(input) } +func (eb *EventBridgeAPI) ListRulesWithContext(_ context.Context, input *eventbridge.ListRulesInput, _ ...request.Option) (*eventbridge.ListRulesOutput, error) { + return eb.ListRulesBehavior.Invoke(input) +} + +func (eb *EventBridgeAPI) ListTagsForResourceWithContext(_ context.Context, input *eventbridge.ListTagsForResourceInput, _ ...request.Option) (*eventbridge.ListTagsForResourceOutput, error) { + return eb.ListTagsForResourceBehavior.Invoke(input) +} + func (eb *EventBridgeAPI) DeleteRuleWithContext(_ context.Context, input *eventbridge.DeleteRuleInput, _ ...request.Option) (*eventbridge.DeleteRuleOutput, error) { return eb.DeleteRuleBehavior.Invoke(input) } diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go index 1e1bacadd8e4..34c7d1bf59a0 100644 --- a/pkg/utils/utils.go +++ b/pkg/utils/utils.go @@ -15,16 +15,11 @@ limitations under the License. 
package utils import ( - "context" - "crypto/sha256" - "encoding/hex" "fmt" "regexp" v1 "k8s.io/api/core/v1" "knative.dev/pkg/ptr" - - "github.com/aws/karpenter-core/pkg/operator/injection" ) // ParseInstanceID parses the provider ID stored on the node to get the instance ID @@ -42,11 +37,3 @@ func ParseInstanceID(node *v1.Node) (*string, error) { } return nil, fmt.Errorf("parsing instance id %s", node.Spec.ProviderID) } - -// GetClusterNameHash gets the SHA256 hex-encoded checksum of the cluster name, truncated at the passed truncatedAt -func GetClusterNameHash(ctx context.Context, truncateAt int) string { - h := sha256.New() - h.Write([]byte(injection.GetOptions(ctx).ClusterName)) - checkSum := h.Sum([]byte{}) - return hex.EncodeToString(checkSum)[:truncateAt] -} diff --git a/test/pkg/environment/aws/environment.go b/test/pkg/environment/aws/environment.go index 41bf00248387..c231946e1d2e 100644 --- a/test/pkg/environment/aws/environment.go +++ b/test/pkg/environment/aws/environment.go @@ -58,6 +58,6 @@ func NewEnvironment(t *testing.T) (*Environment, error) { SSMAPI: *ssm.New(session), IAMAPI: *iam.New(session), InterruptionAPI: itn.New(lo.Must(config.LoadDefaultConfig(env.Context))), - SQSProvider: providers.NewSQS(env.Context, sqs.New(session)), + SQSProvider: providers.NewSQS(sqs.New(session)), }, nil } diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml index 7374b3487b6a..dab54b975386 100644 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/cloudformation.yaml @@ -4,9 +4,6 @@ Parameters: ClusterName: Type: String Description: "EKS cluster name" - ClusterNameSHA: - Type: String - Description: "Truncated checksum of EKS cluster name" Resources: KarpenterNodeInstanceProfile: Type: "AWS::IAM::InstanceProfile" @@ -75,7 +72,7 @@ Resources: Version: "2012-10-17" Statement: - Effect: Allow - Resource: !Sub "arn:${AWS::Partition}:sqs:${AWS::Region}:${AWS::AccountId}:Karpenter-EventQueue-${ClusterNameSHA}" + Resource: !Sub "arn:${AWS::Partition}:sqs:${AWS::Region}:${AWS::AccountId}:${ClusterName}" Action: # Write Operations - sqs:CreateQueue @@ -88,11 +85,16 @@ Resources: - sqs:GetQueueAttributes - sqs:ReceiveMessage - Effect: Allow - Resource: !Sub "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/Karpenter-${ClusterNameSHA}-*" + Resource: !Sub "arn:${AWS::Partition}:events:${AWS::Region}:${AWS::AccountId}:rule/Karpenter-*" Action: # Write Operations - events:PutRule - events:TagResource - events:PutTargets - events:DeleteRule - - events:RemoveTargets \ No newline at end of file + - events:RemoveTargets + - events:ListTagsForResource + - Effect: Allow + Resource: "*" + Action: + - events:ListRules \ No newline at end of file diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step01-config.sh b/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step01-config.sh index 709dcf7ff491..5792c574690e 100755 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step01-config.sh +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step01-config.sh @@ -1,4 +1,3 @@ export CLUSTER_NAME="${USER}-karpenter-demo" -export CLUSTER_NAME_SHA=$(echo -n "${CLUSTER_NAME}" | tr -d '"' | sha256sum | cut 
-c -20) export AWS_DEFAULT_REGION="us-west-2" export AWS_ACCOUNT_ID="$(aws sts get-caller-identity --query Account --output text)" diff --git a/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step03-iam-cloud-formation.sh b/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step03-iam-cloud-formation.sh index 1b11b83783d9..e07c4220ae38 100755 --- a/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step03-iam-cloud-formation.sh +++ b/website/content/en/preview/getting-started/getting-started-with-eksctl/scripts/step03-iam-cloud-formation.sh @@ -5,5 +5,4 @@ curl -fsSL https://karpenter.sh/"${KARPENTER_VERSION}"/getting-started/getting-s --stack-name "Karpenter-${CLUSTER_NAME}" \ --template-file "${TEMPOUT}" \ --capabilities CAPABILITY_NAMED_IAM \ - --parameter-overrides "ClusterName=${CLUSTER_NAME}" - --parameter-overrides "ClusterNameSHA=${CLUSTER_NAME_SHA}" + --parameter-overrides "ClusterName=${CLUSTER_NAME}" \ No newline at end of file From 95ae341974a5df825c7fd998ce659d69aaf5c8aa Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 2 Nov 2022 11:46:46 -0700 Subject: [PATCH 54/55] PR comments --- pkg/controllers/interruption/controller.go | 21 ++-- .../interruption/messages/types.go | 5 + pkg/controllers/interruption/metrics.go | 18 ++- pkg/controllers/nodetemplate/controller.go | 2 +- .../nodetemplate/infrastructure.go | 110 +++++++++++++++-- pkg/controllers/nodetemplate/metrics.go | 37 ++++-- pkg/controllers/providers/infrastructure.go | 112 ------------------ pkg/controllers/providers/sqs.go | 2 +- 8 files changed, 161 insertions(+), 146 deletions(-) delete mode 100644 pkg/controllers/providers/infrastructure.go diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go index 6953588fd76e..020f385e75fc 100644 --- a/pkg/controllers/interruption/controller.go +++ b/pkg/controllers/interruption/controller.go @@ -57,7 +57,7 @@ func init() { } // Controller is an AWS interruption controller. -// It continually polls an provisioned SQS queue for events from aws.ec2 and aws.health that +// It continually polls an SQS queue for events from aws.ec2 and aws.health that // trigger node health events or node spot interruption/rebalance events. type Controller struct { kubeClient client.Client @@ -82,12 +82,16 @@ func NewController(kubeClient client.Client, clk clock.Clock, recorder events.Re } func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { - queueExists, err := c.sqsProvider.QueueExists(ctx) - if err != nil { - return reconcile.Result{}, fmt.Errorf("checking queue existence, %w", err) - } - if settings.FromContext(ctx).EnableInterruptionHandling && queueExists { - active.Set(1) + if settings.FromContext(ctx).EnableInterruptionHandling { + queueExists, err := c.sqsProvider.QueueExists(ctx) + if err != nil { + return reconcile.Result{}, fmt.Errorf("checking queue existence, %w", err) + } + if !queueExists { + enabled.Set(0) + return reconcile.Result{RequeueAfter: time.Second * 10}, nil + } + enabled.Set(1) sqsMessages, err := c.sqsProvider.GetSQSMessages(ctx) if err != nil { return reconcile.Result{}, fmt.Errorf("getting messages from queue, %w", err) @@ -116,7 +120,7 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc }) return reconcile.Result{}, multierr.Combine(errs...) 
} - active.Set(0) + enabled.Set(0) return reconcile.Result{RequeueAfter: time.Second * 10}, nil } @@ -159,6 +163,7 @@ func (c *Controller) handleMessage(ctx context.Context, instanceIDMap map[string err = multierr.Append(err, e) } } + messageLatency.Observe(time.Since(msg.StartTime()).Seconds()) if err != nil { return fmt.Errorf("failed to act on nodes [%s%s], %w", strings.Join(lo.Slice(failedNodeNames, 0, 3), ","), diff --git a/pkg/controllers/interruption/messages/types.go b/pkg/controllers/interruption/messages/types.go index ab871c70e716..e424d7e1bcec 100644 --- a/pkg/controllers/interruption/messages/types.go +++ b/pkg/controllers/interruption/messages/types.go @@ -30,6 +30,7 @@ type Parser interface { type Message interface { EC2InstanceIDs() []string Kind() Kind + StartTime() time.Time } type Kind byte @@ -71,3 +72,7 @@ type Metadata struct { Time time.Time `json:"time"` Version string `json:"version"` } + +func (m Metadata) StartTime() time.Time { + return m.Time +} diff --git a/pkg/controllers/interruption/metrics.go b/pkg/controllers/interruption/metrics.go index 140e8a028128..8132536d3a52 100644 --- a/pkg/controllers/interruption/metrics.go +++ b/pkg/controllers/interruption/metrics.go @@ -24,18 +24,17 @@ import ( const ( subsystem = "aws_interruption_controller" messageTypeLabel = "message_type" - actionableTypeLabel = "actionable" actionTypeLabel = "action_type" terminationReasonLabel = "interruption" ) var ( - active = prometheus.NewGauge( + enabled = prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: metrics.Namespace, Subsystem: subsystem, - Name: "active", - Help: "Whether the message polling is currently active.", + Name: "enabled", + Help: "Whether the message polling is currently enabled.", }, ) receivedMessages = prometheus.NewCounterVec( @@ -55,6 +54,15 @@ var ( Help: "Count of messages deleted from the SQS queue.", }, ) + messageLatency = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: subsystem, + Name: "message_latency_time_seconds", + Help: "Length of time between message creation in queue and an action taken on the message by the controller.", + Buckets: metrics.DurationBuckets(), + }, + ) actionsPerformed = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: metrics.Namespace, @@ -67,5 +75,5 @@ var ( ) func init() { - crmetrics.Registry.MustRegister(receivedMessages, deletedMessages, actionsPerformed) + crmetrics.Registry.MustRegister(enabled, receivedMessages, deletedMessages, messageLatency, actionsPerformed) } diff --git a/pkg/controllers/nodetemplate/controller.go b/pkg/controllers/nodetemplate/controller.go index 3427b30369c0..7b8babc95d89 100644 --- a/pkg/controllers/nodetemplate/controller.go +++ b/pkg/controllers/nodetemplate/controller.go @@ -56,7 +56,7 @@ func NewController(kubeClient client.Client, sqsProvider *providers.SQS, eventBr return &Controller{ kubeClient: kubeClient, finalizer: &Finalizer{}, - infrastructure: &Infrastructure{kubeClient: kubeClient, provider: providers.NewInfrastructure(sqsProvider, eventBridgeProvider)}, + infrastructure: &Infrastructure{kubeClient: kubeClient, sqsProvider: sqsProvider, eventBridgeProvider: eventBridgeProvider}, } } diff --git a/pkg/controllers/nodetemplate/infrastructure.go b/pkg/controllers/nodetemplate/infrastructure.go index 48516c6421d5..149fd109ef6e 100644 --- a/pkg/controllers/nodetemplate/infrastructure.go +++ b/pkg/controllers/nodetemplate/infrastructure.go @@ -16,21 +16,27 @@ package nodetemplate import ( "context" + "fmt" "time" + 
"go.uber.org/multierr" + "k8s.io/client-go/util/workqueue" + "knative.dev/pkg/logging" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/aws/karpenter-core/pkg/metrics" awssettings "github.com/aws/karpenter/pkg/apis/config/settings" "github.com/aws/karpenter/pkg/apis/v1alpha1" "github.com/aws/karpenter/pkg/controllers/providers" ) type Infrastructure struct { - kubeClient client.Client - provider *providers.Infrastructure + kubeClient client.Client + sqsProvider *providers.SQS + eventBridgeProvider *providers.EventBridge - lastInfrastructureReconcile time.Time + lastInfrastructureReconcile time.Time // Keeps track of the last reconcile time for infra, so we don't keep calling APIs } // Reconcile reconciles the infrastructure based on whether interruption handling is enabled and deletes @@ -42,26 +48,108 @@ func (i *Infrastructure) Reconcile(ctx context.Context, nodeTemplate *v1alpha1.A return reconcile.Result{}, err } if !nodeTemplate.DeletionTimestamp.IsZero() && len(list.Items) == 1 { - if err := i.provider.Delete(ctx); err != nil { + if err := i.DeleteInfrastructure(ctx); err != nil { return reconcile.Result{}, err } i.lastInfrastructureReconcile = time.Time{} return reconcile.Result{}, nil } else if len(list.Items) >= 1 { - infrastructureActive.Set(1) + infrastructureEnabled.Set(1) if i.lastInfrastructureReconcile.Add(time.Hour).Before(time.Now()) { - if err := i.provider.Create(ctx); err != nil { + if err := i.CreateInfrastructure(ctx); err != nil { infrastructureHealthy.Set(0) return reconcile.Result{}, err } i.lastInfrastructureReconcile = time.Now() infrastructureHealthy.Set(1) } - // TODO: Implement an alerting mechanism for settings updates; until then, just poll - return reconcile.Result{RequeueAfter: time.Second * 10}, nil } + } else { + infrastructureEnabled.Set(0) + infrastructureHealthy.Set(0) } - infrastructureActive.Set(0) - infrastructureHealthy.Set(0) - return reconcile.Result{}, nil + + // TODO: Implement an alerting mechanism for settings updates; until then, just poll + return reconcile.Result{RequeueAfter: time.Second * 10}, nil +} + +// CreateInfrastructure provisions an SQS queue and EventBridge rules to enable interruption handling +func (i *Infrastructure) CreateInfrastructure(ctx context.Context) error { + defer metrics.Measure(infrastructureCreateDuration)() + if err := i.ensureQueue(ctx); err != nil { + return fmt.Errorf("ensuring queue, %w", err) + } + if err := i.ensureEventBridge(ctx); err != nil { + return fmt.Errorf("ensuring eventBridge rules and targets, %w", err) + } + logging.FromContext(ctx).Infof("Completed reconciliation of infrastructure") + return nil +} + +// DeleteInfrastructure removes the infrastructure that was stood up and reconciled +// by the infrastructure controller for SQS message polling +func (i *Infrastructure) DeleteInfrastructure(ctx context.Context) error { + defer metrics.Measure(infrastructureDeleteDuration)() + logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") + funcs := []func(context.Context) error{ + i.deleteQueue, + i.deleteEventBridge, + } + errs := make([]error, len(funcs)) + workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { + errs[i] = funcs[i](ctx) + }) + + err := multierr.Combine(errs...) 
+ if err != nil { + return err + } + logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") + return nil +} + +// ensureQueue reconciles the SQS queue with the configuration prescribed by Karpenter +func (i *Infrastructure) ensureQueue(ctx context.Context) error { + // Attempt to find the queue. If we can't find it, assume it isn't created and try to create it + // If we did find it, then just set the queue attributes on the existing queue + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("queueName", i.sqsProvider.QueueName(ctx))) + logging.FromContext(ctx).Debugf("Reconciling the SQS interruption queue...") + queueExists, err := i.sqsProvider.QueueExists(ctx) + if err != nil { + return fmt.Errorf("checking queue existence, %w", err) + } + if !queueExists { + logging.FromContext(ctx).Debugf("Queue not found, creating the SQS interruption queue...") + if err := i.sqsProvider.CreateQueue(ctx); err != nil { + return fmt.Errorf("creating sqs queue with policy, %w", err) + } + logging.FromContext(ctx).Debugf("Successfully created the SQS interruption queue") + } + // Always attempt to set the queue attributes, even after creation to help set the queue policy + if err := i.sqsProvider.SetQueueAttributes(ctx, nil); err != nil { + return fmt.Errorf("setting queue attributes for queue, %w", err) + } + logging.FromContext(ctx).Debugf("Successfully reconciled SQS queue") + return nil +} + +func (i *Infrastructure) deleteQueue(ctx context.Context) error { + ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("queueName", i.sqsProvider.QueueName(ctx))) + logging.FromContext(ctx).Debugf("Deleting the SQS interruption queue...") + return i.sqsProvider.DeleteQueue(ctx) +} + +// ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter +func (i *Infrastructure) ensureEventBridge(ctx context.Context) error { + logging.FromContext(ctx).Debugf("Reconciling the EventBridge event rules...") + if err := i.eventBridgeProvider.CreateRules(ctx); err != nil { + return fmt.Errorf("creating EventBridge event rules, %w", err) + } + logging.FromContext(ctx).Debugf("Successfully reconciled EventBridge event rules") + return nil +} + +func (i *Infrastructure) deleteEventBridge(ctx context.Context) error { + logging.FromContext(ctx).Debugf("Deleting the EventBridge interruption rules...") + return i.eventBridgeProvider.DeleteRules(ctx) } diff --git a/pkg/controllers/nodetemplate/metrics.go b/pkg/controllers/nodetemplate/metrics.go index 4715dcfa9f8f..3956266e6d41 100644 --- a/pkg/controllers/nodetemplate/metrics.go +++ b/pkg/controllers/nodetemplate/metrics.go @@ -16,29 +16,50 @@ package nodetemplate import ( "github.com/prometheus/client_golang/prometheus" + crmetrics "sigs.k8s.io/controller-runtime/pkg/metrics" "github.com/aws/karpenter-core/pkg/metrics" ) -const ( - subsystem = "aws_notification_controller" -) +const subSystem = "nodetemplate_infrastructure" var ( infrastructureHealthy = prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: metrics.Namespace, - Subsystem: subsystem, - Name: "infrastructure_healthy", + Subsystem: subSystem, + Name: "healthy", Help: "Whether the infrastructure provisioned by the controller is healthy.", }, ) - infrastructureActive = prometheus.NewGauge( + infrastructureEnabled = prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: metrics.Namespace, - Subsystem: subsystem, - Name: "infrastructure_active", + Subsystem: subSystem, + Name: "enabled", Help: "Whether the infrastructure reconciliation is 
currently active.", }, ) + infrastructureCreateDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: subSystem, + Name: "create_time_seconds", + Help: "Length of time to create infrastructure.", + Buckets: metrics.DurationBuckets(), + }, + ) + infrastructureDeleteDuration = prometheus.NewHistogram( + prometheus.HistogramOpts{ + Namespace: metrics.Namespace, + Subsystem: subSystem, + Name: "delete_time_seconds", + Help: "Length of time to delete infrastructure.", + Buckets: metrics.DurationBuckets(), + }, + ) ) + +func init() { + crmetrics.Registry.MustRegister(infrastructureHealthy, infrastructureEnabled, infrastructureCreateDuration, infrastructureDeleteDuration) +} diff --git a/pkg/controllers/providers/infrastructure.go b/pkg/controllers/providers/infrastructure.go deleted file mode 100644 index 2adcfe989121..000000000000 --- a/pkg/controllers/providers/infrastructure.go +++ /dev/null @@ -1,112 +0,0 @@ -/* -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package providers - -import ( - "context" - "fmt" - - "go.uber.org/multierr" - "k8s.io/client-go/util/workqueue" - "knative.dev/pkg/logging" -) - -type Infrastructure struct { - sqsProvider *SQS - eventBridgeProvider *EventBridge -} - -func NewInfrastructure(sqsProvider *SQS, eventBridgeProvider *EventBridge) *Infrastructure { - return &Infrastructure{ - sqsProvider: sqsProvider, - eventBridgeProvider: eventBridgeProvider, - } -} - -// Create provisions an SQS queue and EventBridge rules to enable interruption handling -func (p *Infrastructure) Create(ctx context.Context) error { - if err := p.ensureQueue(ctx); err != nil { - return fmt.Errorf("ensuring queue, %w", err) - } - if err := p.ensureEventBridge(ctx); err != nil { - return fmt.Errorf("ensuring eventBridge rules and targets, %w", err) - } - logging.FromContext(ctx).Infof("Completed reconciliation of infrastructure") - return nil -} - -// Delete removes the infrastructure that was stood up and reconciled -// by the infrastructure controller for SQS message polling -func (p *Infrastructure) Delete(ctx context.Context) error { - logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") - - deleteQueueFunc := func() error { - logging.FromContext(ctx).Debugf("Deleting the SQS notification queue...") - return p.sqsProvider.DeleteQueue(ctx) - } - deleteEventBridgeRulesFunc := func() error { - logging.FromContext(ctx).Debugf("Deleting the EventBridge notification rules...") - return p.eventBridgeProvider.DeleteRules(ctx) - } - funcs := []func() error{ - deleteQueueFunc, - deleteEventBridgeRulesFunc, - } - errs := make([]error, len(funcs)) - workqueue.ParallelizeUntil(ctx, len(funcs), len(funcs), func(i int) { - errs[i] = funcs[i]() - }) - - err := multierr.Combine(errs...) 
- if err != nil { - return err - } - logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") - return nil -} - -// ensureQueue reconciles the SQS queue with the configuration prescribed by Karpenter -func (p *Infrastructure) ensureQueue(ctx context.Context) error { - // Attempt to find the queue. If we can't find it, assume it isn't created and try to create it - // If we did find it, then just set the queue attributes on the existing queue - logging.FromContext(ctx).Debugf("Reconciling the SQS notification queue...") - queueExists, err := p.sqsProvider.QueueExists(ctx) - if err != nil { - return fmt.Errorf("checking queue existence, %w", err) - } - if !queueExists { - logging.FromContext(ctx).Debugf("Queue not found, creating the SQS notification queue...") - if err := p.sqsProvider.CreateQueue(ctx); err != nil { - return fmt.Errorf("creating sqs queue with policy, %w", err) - } - logging.FromContext(ctx).Debugf("Successfully created the SQS notification queue") - } - // Always attempt to set the queue attributes, even after creation to help set the queue policy - if err := p.sqsProvider.SetQueueAttributes(ctx, nil); err != nil { - return fmt.Errorf("setting queue attributes for queue, %w", err) - } - logging.FromContext(ctx).Debugf("Successfully reconciled SQS queue") - return nil -} - -// ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter -func (p *Infrastructure) ensureEventBridge(ctx context.Context) error { - logging.FromContext(ctx).Debugf("Reconciling the EventBridge event rules...") - if err := p.eventBridgeProvider.CreateRules(ctx); err != nil { - return fmt.Errorf("creating EventBridge event rules, %w", err) - } - logging.FromContext(ctx).Debugf("Successfully reconciled EventBridge event rules") - return nil -} diff --git a/pkg/controllers/providers/sqs.go b/pkg/controllers/providers/sqs.go index 007b879dc13d..c22ae98afa65 100644 --- a/pkg/controllers/providers/sqs.go +++ b/pkg/controllers/providers/sqs.go @@ -217,7 +217,7 @@ func (s *SQS) DeleteSQSMessage(ctx context.Context, msg *sqs.Message) error { func (s *SQS) DeleteQueue(ctx context.Context) error { queueURL, err := s.DiscoverQueueURL(ctx) if err != nil { - if awserrors.IsNotFound(err) { + if awserrors.IsNotFound(err) || awserrors.IsAccessDenied(err) { return nil } return fmt.Errorf("fetching queue url, %w", err) From 563aa44f9d54dfe5e7d063ba2ea04cf38426f296 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Thu, 3 Nov 2022 11:08:28 -0700 Subject: [PATCH 55/55] PR comments --- cmd/controller/main.go | 3 +- pkg/apis/v1alpha1/register.go | 2 + .../interruption_benchmark_test.go | 16 +- pkg/controllers/interruption/suite_test.go | 326 ++++++------ pkg/controllers/nodetemplate/controller.go | 8 +- pkg/controllers/nodetemplate/finalizer.go | 13 +- .../nodetemplate/infrastructure.go | 97 ++-- pkg/controllers/nodetemplate/metrics.go | 10 +- pkg/controllers/nodetemplate/suite_test.go | 493 +++++++++--------- pkg/controllers/providers/eventbridge.go | 5 - pkg/fake/atomic.go | 2 + .../pipeline-trigger-cron.yaml | 2 +- test/pkg/environment/aws/setup.go | 2 +- test/suites/interruption/suite_test.go | 13 +- 14 files changed, 505 insertions(+), 487 deletions(-) diff --git a/cmd/controller/main.go b/cmd/controller/main.go index a5c4b79e0df6..76f498af8c45 100644 --- a/cmd/controller/main.go +++ b/cmd/controller/main.go @@ -16,7 +16,6 @@ package main import ( "github.com/samber/lo" - "k8s.io/utils/clock" awscloudprovider "github.com/aws/karpenter/pkg/cloudprovider" 
"github.com/aws/karpenter/pkg/context" @@ -49,7 +48,7 @@ func main() { operator. WithControllers(ctx, corecontrollers.NewControllers( ctx, - clock.RealClock{}, + operator.Clock, operator.GetClient(), operator.KubernetesInterface, state.NewCluster(operator.SettingsStore.InjectSettings(ctx), operator.Clock, operator.GetClient(), cloudProvider), diff --git a/pkg/apis/v1alpha1/register.go b/pkg/apis/v1alpha1/register.go index b2e5a6496ab0..f499b6175035 100644 --- a/pkg/apis/v1alpha1/register.go +++ b/pkg/apis/v1alpha1/register.go @@ -72,6 +72,8 @@ var ( LabelInstanceGPUCount = LabelDomain + "/instance-gpu-count" LabelInstanceGPUMemory = LabelDomain + "/instance-gpu-memory" LabelInstanceAMIID = LabelDomain + "/instance-ami-id" + + InterruptionInfrastructureFinalizer = Group + "/interruption-infrastructure" ) var ( diff --git a/pkg/controllers/interruption/interruption_benchmark_test.go b/pkg/controllers/interruption/interruption_benchmark_test.go index cf62a43614f8..1781adfa1f20 100644 --- a/pkg/controllers/interruption/interruption_benchmark_test.go +++ b/pkg/controllers/interruption/interruption_benchmark_test.go @@ -48,6 +48,7 @@ import ( awscontext "github.com/aws/karpenter/pkg/context" "github.com/aws/karpenter/pkg/controllers/interruption" "github.com/aws/karpenter/pkg/controllers/interruption/events" + "github.com/aws/karpenter/pkg/controllers/nodetemplate" "github.com/aws/karpenter/pkg/controllers/providers" awstest "github.com/aws/karpenter/pkg/test" @@ -181,6 +182,7 @@ func benchmarkNotificationController(b *testing.B, messageCount int) { } type providerSet struct { + kubeClient client.Client sqsProvider *providers.SQS eventBridgeProvider *providers.EventBridge } @@ -192,7 +194,7 @@ func newProviders(ctx context.Context) providerSet { awsclient.DefaultRetryer{NumMaxRetries: awsclient.DefaultRetryerMaxNumRetries}, ), )) - sqsProvider = providers.NewSQS(ctx, sqs.New(sess)) + sqsProvider = providers.NewSQS(sqs.New(sess)) eventBridgeProvider = providers.NewEventBridge(eventbridge.New(sess), sqsProvider) return providerSet{ sqsProvider: sqsProvider, @@ -201,10 +203,11 @@ func newProviders(ctx context.Context) providerSet { } func (p *providerSet) makeInfrastructure(ctx context.Context) error { - infraProvider := providers.NewInfrastructure(p.sqsProvider, p.eventBridgeProvider) - if err := infraProvider.Create(ctx); err != nil { + infraReconciler := nodetemplate.NewInfrastructureReconciler(p.kubeClient, p.sqsProvider, p.eventBridgeProvider) + if err := infraReconciler.CreateInfrastructure(ctx); err != nil { return fmt.Errorf("creating infrastructure, %w", err) } + if err := p.sqsProvider.SetQueueAttributes(ctx, map[string]*string{ sqs.QueueAttributeNameMessageRetentionPeriod: aws.String("1200"), // 20 minutes for this test }); err != nil { @@ -214,8 +217,11 @@ func (p *providerSet) makeInfrastructure(ctx context.Context) error { } func (p *providerSet) cleanupInfrastructure(ctx context.Context) error { - infraProvider := providers.NewInfrastructure(p.sqsProvider, p.eventBridgeProvider) - return infraProvider.Delete(ctx) + infraReconciler := nodetemplate.NewInfrastructureReconciler(p.kubeClient, p.sqsProvider, p.eventBridgeProvider) + if err := infraReconciler.DeleteInfrastructure(ctx); err != nil { + return fmt.Errorf("deleting infrastructure, %w", err) + } + return nil } func (p *providerSet) provisionMessages(ctx context.Context, messages ...interface{}) error { diff --git a/pkg/controllers/interruption/suite_test.go b/pkg/controllers/interruption/suite_test.go index 
86b0950dcf79..b7a0be3fe732 100644 --- a/pkg/controllers/interruption/suite_test.go +++ b/pkg/controllers/interruption/suite_test.go @@ -18,12 +18,12 @@ import ( "context" "encoding/json" "fmt" - "math/rand" "path/filepath" "runtime" "testing" "time" + "github.com/Pallinder/go-randomdata" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/service/sqs" @@ -40,6 +40,11 @@ import ( _ "knative.dev/pkg/system/testing" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/aws/karpenter-core/pkg/apis/config/settings" + "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" + "github.com/aws/karpenter-core/pkg/cloudprovider/fake" + "github.com/aws/karpenter-core/pkg/test" + . "github.com/aws/karpenter-core/pkg/test/expectations" awssettings "github.com/aws/karpenter/pkg/apis/config/settings" "github.com/aws/karpenter/pkg/apis/v1alpha1" awscache "github.com/aws/karpenter/pkg/cache" @@ -53,12 +58,6 @@ import ( "github.com/aws/karpenter/pkg/errors" awsfake "github.com/aws/karpenter/pkg/fake" awstest "github.com/aws/karpenter/pkg/test" - - "github.com/aws/karpenter-core/pkg/apis/config/settings" - "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" - "github.com/aws/karpenter-core/pkg/cloudprovider/fake" - "github.com/aws/karpenter-core/pkg/test" - . "github.com/aws/karpenter-core/pkg/test/expectations" ) const ( @@ -85,17 +84,10 @@ var controller *interruption.Controller func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) - RunSpecs(t, "AWS Notification") + RunSpecs(t, "AWSInterruption") } -var _ = BeforeEach(func() { - settingsStore := test.SettingsStore{ - settings.ContextKey: test.Settings(), - awssettings.ContextKey: awssettings.Settings{ - EnableInterruptionHandling: true, - }, - } - ctx = settingsStore.InjectSettings(ctx) +var _ = BeforeSuite(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { fakeClock = &clock.FakeClock{} cloudProvider = &fake.CloudProvider{} @@ -110,174 +102,189 @@ var _ = BeforeEach(func() { sqsProvider = providers.NewSQS(sqsapi) eventbridgeapi = &awsfake.EventBridgeAPI{} eventBridgeProvider = providers.NewEventBridge(eventbridgeapi, sqsProvider) - - controller = interruption.NewController(env.Client, fakeClock, recorder, sqsProvider, unavailableOfferingsCache) }) env.CRDDirectoryPaths = append(env.CRDDirectoryPaths, relativeToRoot("charts/karpenter/crds")) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) +var _ = AfterSuite(func() { + Expect(env.Stop()).To(Succeed(), "Failed to stop environment") +}) + +var _ = BeforeEach(func() { + controller = interruption.NewController(env.Client, fakeClock, recorder, sqsProvider, unavailableOfferingsCache) + settingsStore := test.SettingsStore{ + settings.ContextKey: test.Settings(), + awssettings.ContextKey: awssettings.Settings{ + EnableInterruptionHandling: true, + }, + } + ctx = settingsStore.InjectSettings(ctx) +}) + var _ = AfterEach(func() { + sqsapi.Reset() + eventbridgeapi.Reset() ExpectCleanedUp(ctx, env.Client) ExpectDeleted(ctx, env.Client, nodeTemplate) - Expect(env.Stop()).To(Succeed(), "Failed to stop environment") }) -var _ = Describe("Processing Messages", func() { - It("should delete the node when receiving a spot interruption warning", func() { - node := test.Node(test.NodeOptions{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - v1alpha5.ProvisionerNameLabelKey: "default", +var _ = Describe("AWSInterruption", func() { + Context("Processing Messages", func() { + 
It("should delete the node when receiving a spot interruption warning", func() { + node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, }, - }, - ProviderID: makeProviderID(defaultInstanceID), + ProviderID: makeProviderID(defaultInstanceID), + }) + ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) + ExpectApplied(ctx, env.Client, node) + + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + ExpectNotFound(ctx, env.Client, node) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) - ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) - ExpectApplied(env.Ctx, env.Client, node) - - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - ExpectNotFound(env.Ctx, env.Client, node) - Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) - }) - It("should delete the node when receiving a scheduled change message", func() { - node := test.Node(test.NodeOptions{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - v1alpha5.ProvisionerNameLabelKey: "default", + It("should delete the node when receiving a scheduled change message", func() { + node := test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, }, - }, - ProviderID: makeProviderID(defaultInstanceID), + ProviderID: makeProviderID(defaultInstanceID), + }) + ExpectMessagesCreated(scheduledChangeMessage(defaultInstanceID)) + ExpectApplied(ctx, env.Client, node) + + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + ExpectNotFound(ctx, env.Client, node) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) - ExpectMessagesCreated(scheduledChangeMessage(defaultInstanceID)) - ExpectApplied(env.Ctx, env.Client, node) + It("should delete the node when receiving a state change message", func() { + var nodes []*v1.Node + var messages []interface{} + for _, state := range []string{"terminated", "stopped", "stopping", "shutting-down"} { + instanceID := makeInstanceID() + nodes = append(nodes, test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(instanceID), + })) + messages = append(messages, stateChangeMessage(instanceID, state)) + } + ExpectMessagesCreated(messages...) + ExpectApplied(ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) + + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + ExpectNotFound(ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) 
+ Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(4)) + }) + It("should handle multiple messages that cause node deletion", func() { + var nodes []*v1.Node + var instanceIDs []string + for i := 0; i < 100; i++ { + instanceIDs = append(instanceIDs, makeInstanceID()) + nodes = append(nodes, test.Node(test.NodeOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + v1alpha5.ProvisionerNameLabelKey: "default", + }, + }, + ProviderID: makeProviderID(instanceIDs[len(instanceIDs)-1]), + })) - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - ExpectNotFound(env.Ctx, env.Client, node) - Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) - }) - It("should delete the node when receiving a state change message", func() { - var nodes []*v1.Node - var messages []interface{} - for _, state := range []string{"terminated", "stopped", "stopping", "shutting-down"} { - instanceID := makeInstanceID() - nodes = append(nodes, test.Node(test.NodeOptions{ + } + + var messages []interface{} + for _, id := range instanceIDs { + messages = append(messages, spotInterruptionMessage(id)) + } + ExpectMessagesCreated(messages...) + ExpectApplied(ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) + + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + ExpectNotFound(ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(100)) + }) + It("should not delete a node when not owned by provisioner", func() { + node := test.Node(test.NodeOptions{ + ProviderID: makeProviderID(string(uuid.NewUUID())), + }) + ExpectMessagesCreated(spotInterruptionMessage(node.Spec.ProviderID)) + ExpectApplied(ctx, env.Client, node) + + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + ExpectNodeExists(ctx, env.Client, node.Name) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + }) + It("should delete a message when the message can't be parsed", func() { + badMessage := &sqs.Message{ + Body: aws.String(string(lo.Must(json.Marshal(map[string]string{ + "field1": "value1", + "field2": "value2", + })))), + MessageId: aws.String(string(uuid.NewUUID())), + } + + ExpectMessagesCreated(badMessage) + + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + }) + It("should delete a state change message when the state isn't in accepted states", func() { + node := test.Node(test.NodeOptions{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ v1alpha5.ProvisionerNameLabelKey: "default", }, }, - ProviderID: makeProviderID(instanceID), - })) - messages = append(messages, stateChangeMessage(instanceID, state)) - } - ExpectMessagesCreated(messages...) - ExpectApplied(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) - - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) 
- Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(4)) - }) - It("should handle multiple messages that cause node deletion", func() { - var nodes []*v1.Node - var instanceIDs []string - for i := 0; i < 100; i++ { - instanceIDs = append(instanceIDs, makeInstanceID()) - nodes = append(nodes, test.Node(test.NodeOptions{ + ProviderID: makeProviderID(defaultInstanceID), + }) + ExpectMessagesCreated(stateChangeMessage(defaultInstanceID, "creating")) + ExpectApplied(ctx, env.Client, node) + + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + ExpectNodeExists(ctx, env.Client, node.Name) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) + }) + It("should mark the ICE cache for the offering when getting a spot interruption warning", func() { + node := test.Node(test.NodeOptions{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ v1alpha5.ProvisionerNameLabelKey: "default", + v1.LabelTopologyZone: "test-zone-1a", + v1.LabelInstanceTypeStable: "t3.large", + v1alpha5.LabelCapacityType: v1alpha1.CapacityTypeSpot, }, }, - ProviderID: makeProviderID(instanceIDs[len(instanceIDs)-1]), - })) + ProviderID: makeProviderID(defaultInstanceID), + }) + ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) + ExpectApplied(ctx, env.Client, node) - } - - var messages []interface{} - for _, id := range instanceIDs { - messages = append(messages, spotInterruptionMessage(id)) - } - ExpectMessagesCreated(messages...) - ExpectApplied(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) + ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) + ExpectNotFound(ctx, env.Client, node) + Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - ExpectNotFound(env.Ctx, env.Client, lo.Map(nodes, func(n *v1.Node, _ int) client.Object { return n })...) 
- Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(100)) - }) - It("should not delete a node when not owned by provisioner", func() { - node := test.Node(test.NodeOptions{ - ProviderID: makeProviderID(string(uuid.NewUUID())), + // Expect a t3.large in test-zone-1a to be added to the ICE cache + Expect(unavailableOfferingsCache.IsUnavailable("t3.large", "test-zone-1a", v1alpha1.CapacityTypeSpot)).To(BeTrue()) }) - ExpectMessagesCreated(spotInterruptionMessage(node.Spec.ProviderID)) - ExpectApplied(env.Ctx, env.Client, node) - - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - ExpectNodeExists(env.Ctx, env.Client, node.Name) - Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) }) - It("should delete a message when the message can't be parsed", func() { - badMessage := &sqs.Message{ - Body: aws.String(string(lo.Must(json.Marshal(map[string]string{ - "field1": "value1", - "field2": "value2", - })))), - MessageId: aws.String(string(uuid.NewUUID())), - } - - ExpectMessagesCreated(badMessage) - - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) - }) - It("should delete a state change message when the state isn't in accepted states", func() { - node := test.Node(test.NodeOptions{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - v1alpha5.ProvisionerNameLabelKey: "default", - }, - }, - ProviderID: makeProviderID(defaultInstanceID), + Context("Error Handling", func() { + It("should send an error on polling when AccessDenied", func() { + sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(errors.AccessDeniedCode), awsfake.MaxCalls(0)) + ExpectReconcileFailed(ctx, controller, types.NamespacedName{}) }) - ExpectMessagesCreated(stateChangeMessage(defaultInstanceID, "creating")) - ExpectApplied(env.Ctx, env.Client, node) - - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - ExpectNodeExists(env.Ctx, env.Client, node.Name) - Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) - }) - It("should mark the ICE cache for the offering when getting a spot interruption warning", func() { - node := test.Node(test.NodeOptions{ - ObjectMeta: metav1.ObjectMeta{ - Labels: map[string]string{ - v1alpha5.ProvisionerNameLabelKey: "default", - v1.LabelTopologyZone: "test-zone-1a", - v1.LabelInstanceTypeStable: "t3.large", - v1alpha5.LabelCapacityType: v1alpha1.CapacityTypeSpot, - }, - }, - ProviderID: makeProviderID(defaultInstanceID), + It("should send an error on polling when QueueDeletedRecently", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDeletedRecently), awsfake.MaxCalls(0)) + ExpectReconcileFailed(ctx, controller, types.NamespacedName{}) }) - ExpectMessagesCreated(spotInterruptionMessage(defaultInstanceID)) - ExpectApplied(env.Ctx, env.Client, node) - - ExpectReconcileSucceeded(env.Ctx, controller, types.NamespacedName{}) - ExpectNotFound(env.Ctx, env.Client, node) - Expect(sqsapi.DeleteMessageBehavior.SuccessfulCalls()).To(Equal(1)) - - // Expect a t3.large in test-zone-1a to be added to the ICE cache - Expect(unavailableOfferingsCache.IsUnavailable("t3.large", "test-zone-1a", v1alpha1.CapacityTypeSpot)).To(BeTrue()) - }) -}) - -var _ = Describe("Error Handling", func() { - It("should send an error on polling when AccessDenied", func() { - sqsapi.ReceiveMessageBehavior.Error.Set(awsErrWithCode(errors.AccessDeniedCode), awsfake.MaxCalls(0)) - ExpectReconcileFailed(ctx, controller, 
types.NamespacedName{}) - }) - It("should send an error on polling when QueueDeletedRecently", func() { - sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDeletedRecently), awsfake.MaxCalls(0)) - ExpectReconcileFailed(ctx, controller, types.NamespacedName{}) }) }) @@ -371,19 +378,8 @@ func makeProviderID(instanceID string) string { return fmt.Sprintf("aws:///%s/%s", defaultRegion, instanceID) } -var runes = []rune("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789") - -// nolint:gosec -func randStringRunes(n int) string { - b := make([]rune, n) - for i := range b { - b[i] = runes[rand.Intn(len(runes))] - } - return string(b) -} - func makeInstanceID() string { - return fmt.Sprintf("i-%s", randStringRunes(17)) + return fmt.Sprintf("i-%s", randomdata.Alphanumeric(17)) } func relativeToRoot(path string) string { diff --git a/pkg/controllers/nodetemplate/controller.go b/pkg/controllers/nodetemplate/controller.go index 7b8babc95d89..533ba093f4e7 100644 --- a/pkg/controllers/nodetemplate/controller.go +++ b/pkg/controllers/nodetemplate/controller.go @@ -48,15 +48,15 @@ func init() { // if there is. If there are no templates, then it de-provisions the infrastructure. type Controller struct { kubeClient client.Client - finalizer *Finalizer - infrastructure *Infrastructure + finalizer *FinalizerReconciler + infrastructure *InfrastructureReconciler } func NewController(kubeClient client.Client, sqsProvider *providers.SQS, eventBridgeProvider *providers.EventBridge) *Controller { return &Controller{ kubeClient: kubeClient, - finalizer: &Finalizer{}, - infrastructure: &Infrastructure{kubeClient: kubeClient, sqsProvider: sqsProvider, eventBridgeProvider: eventBridgeProvider}, + finalizer: NewFinalizerReconciler(), + infrastructure: NewInfrastructureReconciler(kubeClient, sqsProvider, eventBridgeProvider), } } diff --git a/pkg/controllers/nodetemplate/finalizer.go b/pkg/controllers/nodetemplate/finalizer.go index 5f39de31205b..29fec3cbcae0 100644 --- a/pkg/controllers/nodetemplate/finalizer.go +++ b/pkg/controllers/nodetemplate/finalizer.go @@ -20,19 +20,22 @@ import ( "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" "sigs.k8s.io/controller-runtime/pkg/reconcile" - "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter/pkg/apis/v1alpha1" ) -type Finalizer struct{} +type FinalizerReconciler struct{} + +func NewFinalizerReconciler() *FinalizerReconciler { + return &FinalizerReconciler{} +} // Reconcile adds the finalizer if the nodeTemplate doesn't have it or removes the finalizer // if the nodeTemplate is being deleted -func (r *Finalizer) Reconcile(_ context.Context, nodeTemplate *v1alpha1.AWSNodeTemplate) (reconcile.Result, error) { +func (r *FinalizerReconciler) Reconcile(_ context.Context, nodeTemplate *v1alpha1.AWSNodeTemplate) (reconcile.Result, error) { if !nodeTemplate.DeletionTimestamp.IsZero() { - controllerutil.RemoveFinalizer(nodeTemplate, v1alpha5.TerminationFinalizer) + controllerutil.RemoveFinalizer(nodeTemplate, v1alpha1.InterruptionInfrastructureFinalizer) return reconcile.Result{}, nil } - controllerutil.AddFinalizer(nodeTemplate, v1alpha5.TerminationFinalizer) + controllerutil.AddFinalizer(nodeTemplate, v1alpha1.InterruptionInfrastructureFinalizer) return reconcile.Result{}, nil } diff --git a/pkg/controllers/nodetemplate/infrastructure.go b/pkg/controllers/nodetemplate/infrastructure.go index 9839e6eb7149..a5db9b5cb9de 100644 --- a/pkg/controllers/nodetemplate/infrastructure.go +++ 
b/pkg/controllers/nodetemplate/infrastructure.go @@ -31,7 +31,7 @@ import ( "github.com/aws/karpenter/pkg/controllers/providers" ) -type Infrastructure struct { +type InfrastructureReconciler struct { kubeClient client.Client sqsProvider *providers.SQS eventBridgeProvider *providers.EventBridge @@ -39,40 +39,45 @@ type Infrastructure struct { lastInfrastructureReconcile time.Time // Keeps track of the last reconcile time for infra, so we don't keep calling APIs } +func NewInfrastructureReconciler(kubeClient client.Client, sqsProvider *providers.SQS, eventBridgeProvider *providers.EventBridge) *InfrastructureReconciler { + return &InfrastructureReconciler{ + kubeClient: kubeClient, + sqsProvider: sqsProvider, + eventBridgeProvider: eventBridgeProvider, + } +} + // Reconcile reconciles the infrastructure based on whether interruption handling is enabled and deletes // the infrastructure by ref-counting when the last AWSNodeTemplate is removed -func (i *Infrastructure) Reconcile(ctx context.Context, nodeTemplate *v1alpha1.AWSNodeTemplate) (reconcile.Result, error) { - if awssettings.FromContext(ctx).EnableInterruptionHandling { - list := &v1alpha1.AWSNodeTemplateList{} - if err := i.kubeClient.List(ctx, list); err != nil { +func (i *InfrastructureReconciler) Reconcile(ctx context.Context, nodeTemplate *v1alpha1.AWSNodeTemplate) (reconcile.Result, error) { + if !awssettings.FromContext(ctx).EnableInterruptionHandling { + // TODO: Implement an alerting mechanism for settings updates; until then, just poll + return reconcile.Result{RequeueAfter: time.Second * 10}, nil + } + list := &v1alpha1.AWSNodeTemplateList{} + if err := i.kubeClient.List(ctx, list); err != nil { + return reconcile.Result{}, err + } + if !nodeTemplate.DeletionTimestamp.IsZero() && len(list.Items) == 1 { + if err := i.DeleteInfrastructure(ctx); err != nil { return reconcile.Result{}, err } - if !nodeTemplate.DeletionTimestamp.IsZero() && len(list.Items) == 1 { - if err := i.DeleteInfrastructure(ctx); err != nil { + i.lastInfrastructureReconcile = time.Time{} + return reconcile.Result{}, nil + } else if len(list.Items) >= 1 { + if i.lastInfrastructureReconcile.Add(time.Minute * 5).Before(time.Now()) { + if err := i.CreateInfrastructure(ctx); err != nil { return reconcile.Result{}, err } - i.lastInfrastructureReconcile = time.Time{} - return reconcile.Result{}, nil - } else if len(list.Items) >= 1 { - if i.lastInfrastructureReconcile.Add(time.Hour).Before(time.Now()) { - if err := i.CreateInfrastructure(ctx); err != nil { - infrastructureHealthy.Set(0) - return reconcile.Result{}, err - } - i.lastInfrastructureReconcile = time.Now() - infrastructureHealthy.Set(1) - } + i.lastInfrastructureReconcile = time.Now() } - } else { - infrastructureHealthy.Set(0) } - // TODO: Implement an alerting mechanism for settings updates; until then, just poll return reconcile.Result{RequeueAfter: time.Second * 10}, nil } // CreateInfrastructure provisions an SQS queue and EventBridge rules to enable interruption handling -func (i *Infrastructure) CreateInfrastructure(ctx context.Context) error { +func (i *InfrastructureReconciler) CreateInfrastructure(ctx context.Context) error { defer metrics.Measure(infrastructureCreateDuration)() if err := i.ensureQueue(ctx); err != nil { return fmt.Errorf("ensuring queue, %w", err) @@ -80,15 +85,14 @@ func (i *Infrastructure) CreateInfrastructure(ctx context.Context) error { if err := i.ensureEventBridge(ctx); err != nil { return fmt.Errorf("ensuring eventBridge rules and targets, %w", err) } - 
logging.FromContext(ctx).Infof("Completed reconciliation of infrastructure") + logging.FromContext(ctx).Infof("Ensured existence of interruption-handling infrastructure") return nil } // DeleteInfrastructure removes the infrastructure that was stood up and reconciled // by the infrastructure controller for SQS message polling -func (i *Infrastructure) DeleteInfrastructure(ctx context.Context) error { +func (i *InfrastructureReconciler) DeleteInfrastructure(ctx context.Context) error { defer metrics.Measure(infrastructureDeleteDuration)() - logging.FromContext(ctx).Infof("Deprovisioning the infrastructure...") funcs := []func(context.Context) error{ i.deleteQueue, i.deleteEventBridge, @@ -102,52 +106,55 @@ func (i *Infrastructure) DeleteInfrastructure(ctx context.Context) error { if err != nil { return err } - logging.FromContext(ctx).Infof("Completed deprovisioning the infrastructure") + logging.FromContext(ctx).Infof("Deprovisioned the interruption-handling infrastructure") return nil } // ensureQueue reconciles the SQS queue with the configuration prescribed by Karpenter -func (i *Infrastructure) ensureQueue(ctx context.Context) error { +func (i *InfrastructureReconciler) ensureQueue(ctx context.Context) error { // Attempt to find the queue. If we can't find it, assume it isn't created and try to create it // If we did find it, then just set the queue attributes on the existing queue ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("queueName", i.sqsProvider.QueueName(ctx))) - logging.FromContext(ctx).Debugf("Reconciling the SQS interruption queue...") queueExists, err := i.sqsProvider.QueueExists(ctx) if err != nil { - return fmt.Errorf("checking queue existence, %w", err) + return fmt.Errorf("checking the SQS interruption queue existence, %w", err) } if !queueExists { - logging.FromContext(ctx).Debugf("Queue not found, creating the SQS interruption queue...") + logging.FromContext(ctx).Debugf("Queue not found, creating the SQS interruption queue") if err := i.sqsProvider.CreateQueue(ctx); err != nil { - return fmt.Errorf("creating sqs queue with policy, %w", err) + return fmt.Errorf("creating the SQSS interruption queue with policy, %w", err) } - logging.FromContext(ctx).Debugf("Successfully created the SQS interruption queue") } // Always attempt to set the queue attributes, even after creation to help set the queue policy if err := i.sqsProvider.SetQueueAttributes(ctx, nil); err != nil { - return fmt.Errorf("setting queue attributes for queue, %w", err) + return fmt.Errorf("setting queue attributes for interruption queue, %w", err) } - logging.FromContext(ctx).Debugf("Successfully reconciled SQS queue") + logging.FromContext(ctx).Debugf("Reconciled the SQS interruption queue") return nil } -func (i *Infrastructure) deleteQueue(ctx context.Context) error { +func (i *InfrastructureReconciler) deleteQueue(ctx context.Context) error { ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("queueName", i.sqsProvider.QueueName(ctx))) - logging.FromContext(ctx).Debugf("Deleting the SQS interruption queue...") - return i.sqsProvider.DeleteQueue(ctx) + if err := i.sqsProvider.DeleteQueue(ctx); err != nil { + return fmt.Errorf("deleting the the SQS interruption queue, %w", err) + } + logging.FromContext(ctx).Debugf("Deleted the SQS interruption queue") + return nil } -// ensureEventBridge reconciles the Eventbridge rules with the configuration prescribed by Karpenter -func (i *Infrastructure) ensureEventBridge(ctx context.Context) error { - 
logging.FromContext(ctx).Debugf("Reconciling the EventBridge event rules...") +// ensureEventBridge reconciles the EventBridge rules with the configuration prescribed by Karpenter +func (i *InfrastructureReconciler) ensureEventBridge(ctx context.Context) error { if err := i.eventBridgeProvider.CreateRules(ctx); err != nil { - return fmt.Errorf("creating EventBridge event rules, %w", err) + return fmt.Errorf("creating EventBridge interruption rules, %w", err) } - logging.FromContext(ctx).Debugf("Successfully reconciled EventBridge event rules") + logging.FromContext(ctx).Debugf("Reconciled the EventBridge interruption rules") return nil } -func (i *Infrastructure) deleteEventBridge(ctx context.Context) error { - logging.FromContext(ctx).Debugf("Deleting the EventBridge interruption rules...") - return i.eventBridgeProvider.DeleteRules(ctx) +func (i *InfrastructureReconciler) deleteEventBridge(ctx context.Context) error { + if err := i.eventBridgeProvider.DeleteRules(ctx); err != nil { + return fmt.Errorf("deleting the EventBridge interruption rules, %w", err) + } + logging.FromContext(ctx).Debugf("Deleted the EventBridge interruption rules") + return nil } diff --git a/pkg/controllers/nodetemplate/metrics.go b/pkg/controllers/nodetemplate/metrics.go index 6d41a1c4d73a..d59bb1589d5d 100644 --- a/pkg/controllers/nodetemplate/metrics.go +++ b/pkg/controllers/nodetemplate/metrics.go @@ -24,14 +24,6 @@ import ( const subSystem = "nodetemplate_infrastructure" var ( - infrastructureHealthy = prometheus.NewGauge( - prometheus.GaugeOpts{ - Namespace: metrics.Namespace, - Subsystem: subSystem, - Name: "healthy", - Help: "Whether the infrastructure provisioned by the controller is healthy.", - }, - ) infrastructureCreateDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ Namespace: metrics.Namespace, @@ -53,5 +45,5 @@ var ( ) func init() { - crmetrics.Registry.MustRegister(infrastructureHealthy, infrastructureCreateDuration, infrastructureDeleteDuration) + crmetrics.Registry.MustRegister(infrastructureCreateDuration, infrastructureDeleteDuration) } diff --git a/pkg/controllers/nodetemplate/suite_test.go b/pkg/controllers/nodetemplate/suite_test.go index c0818713057d..82c53d8f8fd4 100644 --- a/pkg/controllers/nodetemplate/suite_test.go +++ b/pkg/controllers/nodetemplate/suite_test.go @@ -29,20 +29,20 @@ import ( . "github.com/onsi/gomega" . "knative.dev/pkg/logging/testing" _ "knative.dev/pkg/system/testing" + "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter-core/pkg/apis/config/settings" "github.com/aws/karpenter-core/pkg/apis/provisioning/v1alpha5" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/operator/options" + "github.com/aws/karpenter-core/pkg/test" . 
"github.com/aws/karpenter-core/pkg/test/expectations" awssettings "github.com/aws/karpenter/pkg/apis/config/settings" "github.com/aws/karpenter/pkg/apis/v1alpha1" + "github.com/aws/karpenter/pkg/controllers/nodetemplate" "github.com/aws/karpenter/pkg/controllers/providers" "github.com/aws/karpenter/pkg/errors" - - "github.com/aws/karpenter-core/pkg/test" - "github.com/aws/karpenter/pkg/controllers/nodetemplate" awsfake "github.com/aws/karpenter/pkg/fake" awstest "github.com/aws/karpenter/pkg/test" ) @@ -68,18 +68,10 @@ var defaultOpts = options.Options{ func TestAPIs(t *testing.T) { ctx = TestContextWithLogger(t) RegisterFailHandler(Fail) - RunSpecs(t, "AWS Node Template") + RunSpecs(t, "AWSNodeTemplate") } -var _ = BeforeEach(func() { - settingsStore := test.SettingsStore{ - settings.ContextKey: test.Settings(), - awssettings.ContextKey: awssettings.Settings{ - EnableInterruptionHandling: true, - }, - } - ctx = settingsStore.InjectSettings(ctx) - ctx = injection.WithOptions(ctx, defaultOpts) +var _ = BeforeSuite(func() { env = test.NewEnvironment(ctx, func(e *test.Environment) { opts = defaultOpts Expect(opts.Validate()).To(Succeed(), "Failed to validate options") @@ -89,267 +81,284 @@ var _ = BeforeEach(func() { eventbridgeapi = &awsfake.EventBridgeAPI{} sqsProvider = providers.NewSQS(sqsapi) eventBridgeProvider = providers.NewEventBridge(eventbridgeapi, sqsProvider) - - controller = nodetemplate.NewController(e.Client, sqsProvider, eventBridgeProvider) }) env.CRDDirectoryPaths = append(env.CRDDirectoryPaths, relativeToRoot("charts/karpenter/crds")) Expect(env.Start()).To(Succeed(), "Failed to start environment") }) -var _ = AfterEach(func() { - ExpectCleanedUp(ctx, env.Client) +var _ = AfterSuite(func() { Expect(env.Stop()).To(Succeed(), "Failed to stop environment") }) -var _ = Describe("Infrastructure", func() { - Context("Creation", func() { - var provider *v1alpha1.AWSNodeTemplate - BeforeEach(func() { - provider = awstest.AWSNodeTemplate() - ExpectApplied(env.Ctx, env.Client, provider) - }) - AfterEach(func() { - ExpectFinalizersRemoved(env.Ctx, env.Client, provider) - ExpectDeleted(env.Ctx, env.Client, provider) - }) - It("should reconcile the queue and the eventbridge rules on start", func() { - sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(1)) // This mocks the queue not existing - - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) +var _ = BeforeEach(func() { + controller = nodetemplate.NewController(env.Client, sqsProvider, eventBridgeProvider) + settingsStore := test.SettingsStore{ + settings.ContextKey: test.Settings(), + awssettings.ContextKey: awssettings.Settings{ + EnableInterruptionHandling: true, + }, + } + ctx = settingsStore.InjectSettings(ctx) + ctx = injection.WithOptions(ctx, defaultOpts) +}) - Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - }) - It("should throw an error but wait with backoff if we get AccessDenied", func() { - sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing - sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(errors.AccessDeniedCode), awsfake.MaxCalls(0)) - eventbridgeapi.PutRuleBehavior.Error.Set(awsErrWithCode(errors.AccessDeniedExceptionCode), awsfake.MaxCalls(0)) - 
eventbridgeapi.PutTargetsBehavior.Error.Set(awsErrWithCode(errors.AccessDeniedExceptionCode), awsfake.MaxCalls(0)) - - ExpectReconcileFailed(ctx, controller, client.ObjectKeyFromObject(provider)) - Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) - - // Simulating AccessDenied being resolved - sqsapi.CreateQueueBehavior.Reset() - eventbridgeapi.PutRuleBehavior.Reset() - eventbridgeapi.PutTargetsBehavior.Reset() - - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - }) - It("should throw an error and wait with backoff if we get QueueDeletedRecently", func() { - sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing - sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDeletedRecently), awsfake.MaxCalls(0)) +var _ = AfterEach(func() { + sqsapi.Reset() + eventbridgeapi.Reset() + ExpectCleanedUp(ctx, env.Client) +}) - ExpectReconcileFailed(ctx, controller, client.ObjectKeyFromObject(provider)) - Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) - }) - }) - Context("Deletion", func() { - It("should cleanup the infrastructure when the last AWSNodeTemplate is removed", func() { - provider := awstest.AWSNodeTemplate() - sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(1)) // This mocks the queue not existing - - ExpectApplied(ctx, env.Client, provider) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - - Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - - // Set the output of ListRules to mock rule creation - eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ - Rules: []*eventbridge.Rule{ - { - Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), - Arn: aws.String("test-arn1"), - }, - { - Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), - Arn: aws.String("test-arn2"), - }, - { - Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), - Arn: aws.String("test-arn3"), - }, - { - Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), - Arn: aws.String("test-arn4"), - }, - }, +var _ = Describe("AWSNodeTemplate", func() { + Context("Infrastructure", func() { + Context("Creation", func() { + var provider *v1alpha1.AWSNodeTemplate + BeforeEach(func() { + provider = awstest.AWSNodeTemplate() + ExpectApplied(ctx, env.Client, provider) }) - eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ - Tags: []*eventbridge.Tag{ - { - Key: aws.String(v1alpha5.DiscoveryTagKey), - Value: aws.String(defaultOpts.ClusterName), - }, - }, + AfterEach(func() { + ExpectFinalizersRemoved(ctx, env.Client, provider) + ExpectDeleted(ctx, env.Client, provider) }) + It("should reconcile the queue and the eventbridge rules on start", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(1)) // This mocks the queue not existing - // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure - 
Expect(env.Client.Delete(ctx, provider)).To(Succeed()) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - }) - It("should cleanup when queue is already deleted", func() { - provider := awstest.AWSNodeTemplate() - ExpectApplied(ctx, env.Client, provider) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - - sqsapi.DeleteQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) - - // Set the output of ListRules to mock rule creation - eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ - Rules: []*eventbridge.Rule{ - { - Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), - Arn: aws.String("test-arn1"), - }, - { - Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), - Arn: aws.String("test-arn2"), - }, - { - Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), - Arn: aws.String("test-arn3"), - }, - { - Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), - Arn: aws.String("test-arn4"), - }, - }, + Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) - eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ - Tags: []*eventbridge.Tag{ - { - Key: aws.String(v1alpha5.DiscoveryTagKey), - Value: aws.String(defaultOpts.ClusterName), - }, - }, + It("should throw an error but wait with backoff if we get AccessDenied", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing + sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(errors.AccessDeniedCode), awsfake.MaxCalls(0)) + eventbridgeapi.PutRuleBehavior.Error.Set(awsErrWithCode(errors.AccessDeniedExceptionCode), awsfake.MaxCalls(0)) + eventbridgeapi.PutTargetsBehavior.Error.Set(awsErrWithCode(errors.AccessDeniedExceptionCode), awsfake.MaxCalls(0)) + + ExpectReconcileFailed(ctx, controller, client.ObjectKeyFromObject(provider)) + Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) + + // Simulating AccessDenied being resolved + sqsapi.CreateQueueBehavior.Reset() + eventbridgeapi.PutRuleBehavior.Reset() + eventbridgeapi.PutTargetsBehavior.Reset() + + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) + It("should throw an error and wait with backoff if we get QueueDeletedRecently", func() { + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) // This mocks the queue not existing + sqsapi.CreateQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDeletedRecently), awsfake.MaxCalls(0)) - // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure - Expect(env.Client.Delete(ctx, provider)).To(Succeed()) - 
ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(0)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + ExpectReconcileFailed(ctx, controller, client.ObjectKeyFromObject(provider)) + Expect(sqsapi.CreateQueueBehavior.FailedCalls()).To(Equal(1)) + }) }) - It("should cleanup with a success when a few rules aren't in list call", func() { - provider := awstest.AWSNodeTemplate() - ExpectApplied(ctx, env.Client, provider) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - - // Set the output of ListRules to mock rule creation - eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ - Rules: []*eventbridge.Rule{ - { - Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), - Arn: aws.String("test-arn1"), + Context("Deletion", func() { + It("should cleanup the infrastructure when the last AWSNodeTemplate is removed", func() { + provider := awstest.AWSNodeTemplate() + sqsapi.GetQueueURLBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(1)) // This mocks the queue not existing + + ExpectApplied(ctx, env.Client, provider) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + + Expect(sqsapi.CreateQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.PutRuleBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.PutTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + + // Set the output of ListRules to mock rule creation + eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ + Rules: []*eventbridge.Rule{ + { + Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), + Arn: aws.String("test-arn1"), + }, + { + Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), + Arn: aws.String("test-arn2"), + }, + { + Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), + Arn: aws.String("test-arn3"), + }, + { + Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), + Arn: aws.String("test-arn4"), + }, }, - { - Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), - Arn: aws.String("test-arn2"), + }) + eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(defaultOpts.ClusterName), + }, }, - }, + }) + + // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure + Expect(env.Client.Delete(ctx, provider)).To(Succeed()) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) - eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ - Tags: []*eventbridge.Tag{ - { - Key: aws.String(v1alpha5.DiscoveryTagKey), - Value: aws.String(defaultOpts.ClusterName), + It("should cleanup when queue is already deleted", func() { + provider := awstest.AWSNodeTemplate() + ExpectApplied(ctx, env.Client, provider) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + + 
sqsapi.DeleteQueueBehavior.Error.Set(awsErrWithCode(sqs.ErrCodeQueueDoesNotExist), awsfake.MaxCalls(0)) + + // Set the output of ListRules to mock rule creation + eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ + Rules: []*eventbridge.Rule{ + { + Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), + Arn: aws.String("test-arn1"), + }, + { + Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), + Arn: aws.String("test-arn2"), + }, + { + Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), + Arn: aws.String("test-arn3"), + }, + { + Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), + Arn: aws.String("test-arn4"), + }, + }, + }) + eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(defaultOpts.ClusterName), + }, }, - }, + }) + + // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure + Expect(env.Client.Delete(ctx, provider)).To(Succeed()) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(0)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) }) + It("should cleanup with a success when a few rules aren't in list call", func() { + provider := awstest.AWSNodeTemplate() + ExpectApplied(ctx, env.Client, provider) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + + // Set the output of ListRules to mock rule creation + eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ + Rules: []*eventbridge.Rule{ + { + Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), + Arn: aws.String("test-arn1"), + }, + { + Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), + Arn: aws.String("test-arn2"), + }, + }, + }) + eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(defaultOpts.ClusterName), + }, + }, + }) - // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure - Expect(env.Client.Delete(ctx, provider)).To(Succeed()) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure + Expect(env.Client.Delete(ctx, provider)).To(Succeed()) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(2)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(2)) - }) - It("should cleanup with a success when getting not found errors", func() { - provider := awstest.AWSNodeTemplate() - ExpectApplied(ctx, env.Client, provider) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(2)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(2)) + }) + It("should cleanup with a success when getting not found errors", 
func() { + provider := awstest.AWSNodeTemplate() + ExpectApplied(ctx, env.Client, provider) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) - eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) + eventbridgeapi.RemoveTargetsBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) + eventbridgeapi.DeleteRuleBehavior.Error.Set(awsErrWithCode((&eventbridge.ResourceNotFoundException{}).Code()), awsfake.MaxCalls(0)) - // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure - Expect(env.Client.Delete(ctx, provider)).To(Succeed()) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) + // Delete the AWSNodeTemplate and then re-reconcile it to delete the infrastructure + Expect(env.Client.Delete(ctx, provider)).To(Succeed()) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(provider)) - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(0)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(0)) - }) - It("should only attempt to delete the infrastructure when the last node template is removed", func() { - var nodeTemplates []*v1alpha1.AWSNodeTemplate - for i := 0; i < 10; i++ { - p := awstest.AWSNodeTemplate() - nodeTemplates = append(nodeTemplates, p) - ExpectApplied(ctx, env.Client, p) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(p)) - } - - for i := 0; i < len(nodeTemplates)-1; i++ { - Expect(env.Client.Delete(ctx, nodeTemplates[i])).To(Succeed()) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(nodeTemplates[i])) - } - - // It shouldn't attempt to delete at this point - Expect(sqsapi.DeleteQueueBehavior.Calls()).To(Equal(0)) - Expect(eventbridgeapi.RemoveTargetsBehavior.Calls()).To(Equal(0)) - Expect(eventbridgeapi.DeleteRuleBehavior.Calls()).To(Equal(0)) - - // Set the output of ListRules to mock rule creation - eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ - Rules: []*eventbridge.Rule{ - { - Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), - Arn: aws.String("test-arn1"), - }, - { - Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), - Arn: aws.String("test-arn2"), - }, - { - Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), - Arn: aws.String("test-arn3"), - }, - { - Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), - Arn: aws.String("test-arn4"), - }, - }, + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(0)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(0)) }) - eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ - Tags: []*eventbridge.Tag{ - { - Key: aws.String(v1alpha5.DiscoveryTagKey), - Value: aws.String(defaultOpts.ClusterName), + It("should only attempt to delete the infrastructure when the last node template is removed", func() { + var nodeTemplates []*v1alpha1.AWSNodeTemplate + for i := 0; i < 10; i++ { + p := awstest.AWSNodeTemplate() + nodeTemplates = append(nodeTemplates, 
p) + ExpectApplied(ctx, env.Client, p) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(p)) + } + + for i := 0; i < len(nodeTemplates)-1; i++ { + Expect(env.Client.Delete(ctx, nodeTemplates[i])).To(Succeed()) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(nodeTemplates[i])) + } + + // It shouldn't attempt to delete at this point + Expect(sqsapi.DeleteQueueBehavior.Calls()).To(Equal(0)) + Expect(eventbridgeapi.RemoveTargetsBehavior.Calls()).To(Equal(0)) + Expect(eventbridgeapi.DeleteRuleBehavior.Calls()).To(Equal(0)) + + // Set the output of ListRules to mock rule creation + eventbridgeapi.ListRulesBehavior.Output.Set(&eventbridge.ListRulesOutput{ + Rules: []*eventbridge.Rule{ + { + Name: aws.String(providers.DefaultRules[providers.ScheduledChangedRule].Name), + Arn: aws.String("test-arn1"), + }, + { + Name: aws.String(providers.DefaultRules[providers.SpotTerminationRule].Name), + Arn: aws.String("test-arn2"), + }, + { + Name: aws.String(providers.DefaultRules[providers.RebalanceRule].Name), + Arn: aws.String("test-arn3"), + }, + { + Name: aws.String(providers.DefaultRules[providers.StateChangeRule].Name), + Arn: aws.String("test-arn4"), + }, }, - }, - }) + }) + eventbridgeapi.ListTagsForResourceBehavior.Output.Set(&eventbridge.ListTagsForResourceOutput{ + Tags: []*eventbridge.Tag{ + { + Key: aws.String(v1alpha5.DiscoveryTagKey), + Value: aws.String(defaultOpts.ClusterName), + }, + }, + }) - // Last AWSNodeTemplate, so now it should delete it - Expect(env.Client.Delete(ctx, nodeTemplates[len(nodeTemplates)-1])).To(Succeed()) - ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(nodeTemplates[len(nodeTemplates)-1])) + // Last AWSNodeTemplate, so now it should delete it + Expect(env.Client.Delete(ctx, nodeTemplates[len(nodeTemplates)-1])).To(Succeed()) + ExpectReconcileSucceeded(ctx, controller, client.ObjectKeyFromObject(nodeTemplates[len(nodeTemplates)-1])) - Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) - Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) - Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(sqsapi.DeleteQueueBehavior.SuccessfulCalls()).To(Equal(1)) + Expect(eventbridgeapi.RemoveTargetsBehavior.SuccessfulCalls()).To(Equal(4)) + Expect(eventbridgeapi.DeleteRuleBehavior.SuccessfulCalls()).To(Equal(4)) + }) }) }) }) diff --git a/pkg/controllers/providers/eventbridge.go b/pkg/controllers/providers/eventbridge.go index 4cb6a2e259ef..5747b4f3cc0d 100644 --- a/pkg/controllers/providers/eventbridge.go +++ b/pkg/controllers/providers/eventbridge.go @@ -19,7 +19,6 @@ import ( "encoding/json" "fmt" "regexp" - "time" "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/service/eventbridge" @@ -35,10 +34,6 @@ import ( awserrors "github.com/aws/karpenter/pkg/errors" ) -func init() { - rand.Seed(time.Now().Unix()) -} - const ( ScheduledChangedRule = "ScheduledChangeRule" SpotTerminationRule = "SpotTerminationRule" diff --git a/pkg/fake/atomic.go b/pkg/fake/atomic.go index 9cd9170d69e1..685496759b91 100644 --- a/pkg/fake/atomic.go +++ b/pkg/fake/atomic.go @@ -81,6 +81,8 @@ func (e *AtomicError) Reset() { e.mu.Lock() defer e.mu.Unlock() e.err = nil + e.calls = 0 + e.maxCalls = 0 } func (e *AtomicError) IsNil() bool { diff --git a/test/infrastructure/clusters/test-infra/karpenter-tests/pipeline-trigger-cron.yaml b/test/infrastructure/clusters/test-infra/karpenter-tests/pipeline-trigger-cron.yaml index 14e46970832b..1161efd6ca1c 100644 --- 
a/test/infrastructure/clusters/test-infra/karpenter-tests/pipeline-trigger-cron.yaml +++ b/test/infrastructure/clusters/test-infra/karpenter-tests/pipeline-trigger-cron.yaml @@ -37,7 +37,7 @@ data: pipelines-trigger.sh: |+ #!/usr/bin/env bash set -euo pipefail - for suite in "Integration" "Consolidation" "Utilization" "Notification"; do + for suite in "Integration" "Consolidation" "Utilization" "Interruption"; do cat <