diff --git a/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml b/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml
index c1657ab40a..3ce3353999 100644
--- a/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml
+++ b/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.16.3
+    controller-gen.kubebuilder.io/version: v0.16.4
   name: kwoknodeclasses.karpenter.kwok.sh
 spec:
   group: karpenter.kwok.sh
diff --git a/kwok/charts/crds/karpenter.sh_nodeclaims.yaml b/kwok/charts/crds/karpenter.sh_nodeclaims.yaml
index 9494280de5..73702f61cc 100644
--- a/kwok/charts/crds/karpenter.sh_nodeclaims.yaml
+++ b/kwok/charts/crds/karpenter.sh_nodeclaims.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.16.3
+    controller-gen.kubebuilder.io/version: v0.16.4
   name: nodeclaims.karpenter.sh
 spec:
   group: karpenter.sh
@@ -275,6 +275,13 @@ spec:
                   If left undefined, the controller will wait indefinitely for pods to be drained.
                 pattern: ^([0-9]+(s|m|h))+$
                 type: string
+              unreachableTimeout:
+                default: Never
+                description: |-
+                  unreachableTimeout is the duration the controller will wait
+                  before terminating a node, measured from when the node is tainted unreachable
+                pattern: ^(([0-9]+(s|m|h))+)|(Never)$
+                type: string
             required:
             - nodeClassRef
             - requirements
diff --git a/kwok/charts/crds/karpenter.sh_nodepools.yaml b/kwok/charts/crds/karpenter.sh_nodepools.yaml
index 8cd3b43747..14ca95eab5 100644
--- a/kwok/charts/crds/karpenter.sh_nodepools.yaml
+++ b/kwok/charts/crds/karpenter.sh_nodepools.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.16.3
+    controller-gen.kubebuilder.io/version: v0.16.4
   name: nodepools.karpenter.sh
 spec:
   group: karpenter.sh
diff --git a/kwok/charts/values.yaml b/kwok/charts/values.yaml
index 237f12db41..1aa06ae834 100644
--- a/kwok/charts/values.yaml
+++ b/kwok/charts/values.yaml
@@ -45,7 +45,7 @@ podDisruptionBudget:
   name: karpenter
   maxUnavailable: 1
 # -- SecurityContext for the pod.
-podSecurityContext:
+podSecurityContext: 
   fsGroup: 65536
 # -- PriorityClass name for the pod.
 priorityClassName: system-cluster-critical
@@ -91,7 +91,7 @@ controller:
     # -- Repository path to the controller image.
     repository: ""
     # -- Tag of the controller image.
-    tag: ""
+    tag: "" 
     # -- SHA256 digest of the controller image.
     digest: ""
   # -- Additional environment variables for the controller pod.
diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml
index e70b6e2af7..7e546225e3 100644
--- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml
+++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.16.3
+    controller-gen.kubebuilder.io/version: v0.16.4
   name: nodeclaims.karpenter.sh
 spec:
   group: karpenter.sh
@@ -273,6 +273,13 @@ spec:
                   If left undefined, the controller will wait indefinitely for pods to be drained.
                 pattern: ^([0-9]+(s|m|h))+$
                 type: string
+              unreachableTimeout:
+                default: Never
+                description: |-
+                  unreachableTimeout is the duration the controller will wait
+                  before terminating a node, measured from when the node is tainted unreachable
+                pattern: ^(([0-9]+(s|m|h))+)|(Never)$
+                type: string
             required:
             - nodeClassRef
             - requirements
diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml
index a22d8befeb..a188513736 100644
--- a/pkg/apis/crds/karpenter.sh_nodepools.yaml
+++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.16.3
+    controller-gen.kubebuilder.io/version: v0.16.4
   name: nodepools.karpenter.sh
 spec:
   group: karpenter.sh
diff --git a/pkg/apis/v1/nodeclaim.go b/pkg/apis/v1/nodeclaim.go
index 1c163858ba..0db8da9982 100644
--- a/pkg/apis/v1/nodeclaim.go
+++ b/pkg/apis/v1/nodeclaim.go
@@ -73,6 +73,14 @@ type NodeClaimSpec struct {
 	// +kubebuilder:validation:Schemaless
 	// +optional
 	ExpireAfter NillableDuration `json:"expireAfter,omitempty"`
+	// unreachableTimeout is the duration the controller will wait
+	// before terminating a node, measured from when the node is tainted unreachable
+	// +kubebuilder:default:="Never"
+	// +kubebuilder:validation:Pattern=`^(([0-9]+(s|m|h))+)|(Never)$`
+	// +kubebuilder:validation:Type="string"
+	// +kubebuilder:validation:Schemaless
+	// +optional
+	UnreachableTimeout NillableDuration `json:"unreachableTimeout,omitempty"`
 }
 
 // A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values
diff --git a/pkg/apis/v1/zz_generated.deepcopy.go b/pkg/apis/v1/zz_generated.deepcopy.go
index 4ccbab50db..e3b4c5237b 100644
--- a/pkg/apis/v1/zz_generated.deepcopy.go
+++ b/pkg/apis/v1/zz_generated.deepcopy.go
@@ -222,6 +222,7 @@ func (in *NodeClaimSpec) DeepCopyInto(out *NodeClaimSpec) {
 		**out = **in
 	}
 	in.ExpireAfter.DeepCopyInto(&out.ExpireAfter)
+	in.UnreachableTimeout.DeepCopyInto(&out.UnreachableTimeout)
 }
 
 // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NodeClaimSpec.
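For illustration only, not part of the patch: a minimal Go sketch of how the new field could be populated on a NodeClaim spec, assuming the v1 types above and the MustParseNillableDuration helper that the test suite below also uses. The standalone main package and the 30m value are hypothetical.

package main

import (
	"fmt"

	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
)

func main() {
	// A NodeClaimSpec fragment that lets the notready controller terminate the
	// backing node 30 minutes after it is tainted node.kubernetes.io/unreachable.
	// The default "Never" leaves Duration nil and disables the behavior.
	spec := v1.NodeClaimSpec{
		UnreachableTimeout: v1.MustParseNillableDuration("30m"),
	}
	if spec.UnreachableTimeout.Duration != nil {
		fmt.Printf("unreachable nodes are terminated after %s\n", *spec.UnreachableTimeout.Duration)
	}
}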
diff --git a/pkg/controllers/controllers.go b/pkg/controllers/controllers.go
index 2b69b26380..2b6a9b1a4c 100644
--- a/pkg/controllers/controllers.go
+++ b/pkg/controllers/controllers.go
@@ -37,6 +37,7 @@ import (
 	"sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/expiration"
 	nodeclaimgarbagecollection "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/garbagecollection"
 	nodeclaimlifecycle "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/lifecycle"
+	"sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/notready"
 	podevents "sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/podevents"
 	nodepoolcounter "sigs.k8s.io/karpenter/pkg/controllers/nodepool/counter"
 	nodepoolhash "sigs.k8s.io/karpenter/pkg/controllers/nodepool/hash"
@@ -68,6 +69,7 @@ func NewControllers(
 		provisioning.NewNodeController(kubeClient, p),
 		nodepoolhash.NewController(kubeClient),
 		expiration.NewController(clock, kubeClient),
+		notready.NewController(kubeClient),
 		informer.NewDaemonSetController(kubeClient, cluster),
 		informer.NewNodeController(kubeClient, cluster),
 		informer.NewPodController(kubeClient, cluster),
diff --git a/pkg/controllers/nodeclaim/notready/controller.go b/pkg/controllers/nodeclaim/notready/controller.go
new file mode 100644
index 0000000000..20105e44c8
--- /dev/null
+++ b/pkg/controllers/nodeclaim/notready/controller.go
@@ -0,0 +1,93 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package notready
+
+import (
+	"context"
+	"time"
+
+	controllerruntime "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/log"
+	"sigs.k8s.io/controller-runtime/pkg/manager"
+	"sigs.k8s.io/controller-runtime/pkg/reconcile"
+
+	nodeclaimutil "sigs.k8s.io/karpenter/pkg/utils/nodeclaim"
+
+	corev1 "k8s.io/api/core/v1"
+
+	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
+)
+
+// Controller deletes NodeClaims whose nodes have carried the unreachable taint for longer than the NodeClaim's unreachableTimeout
+type Controller struct {
+	kubeClient client.Client
+}
+
+// NewController constructs a nodeclaim notready controller
+func NewController(kubeClient client.Client) *Controller {
+	return &Controller{
+		kubeClient: kubeClient,
+	}
+}
+
+func (c *Controller) Reconcile(ctx context.Context, nodeClaim *v1.NodeClaim) (reconcile.Result, error) {
+	if nodeClaim.Spec.UnreachableTimeout.Duration == nil {
+		return reconcile.Result{}, nil
+	}
+
+	node, err := nodeclaimutil.NodeForNodeClaim(ctx, c.kubeClient, nodeClaim)
+	if err != nil {
+		return reconcile.Result{}, nodeclaimutil.IgnoreDuplicateNodeError(nodeclaimutil.IgnoreNodeNotFoundError(err))
+	}
+
+	for _, taint := range node.Spec.Taints {
+		if taint.Key == corev1.TaintNodeUnreachable {
+			if taint.TimeAdded != nil {
+				durationSinceTaint := time.Since(taint.TimeAdded.Time)
+				if durationSinceTaint > *nodeClaim.Spec.UnreachableTimeout.Duration {
+					// The node has been unreachable for longer than the configured timeout, so delete the NodeClaim
+					if err := c.kubeClient.Delete(ctx, nodeClaim); err != nil {
+						log.FromContext(ctx).V(0).Error(err, "Failed to delete nodeclaim", "node", node.Name)
+						return reconcile.Result{}, err
+					}
+					log.FromContext(ctx).V(0).Info("Deleted nodeclaim because the node has been unreachable for more than unreachableTimeout", "node", node.Name)
+					return reconcile.Result{}, nil
+				} else {
+					// The node is unreachable but the timeout has not elapsed yet; requeue for the remaining
+					// time so the NodeClaim is re-evaluated as soon as the timeout expires
+					log.FromContext(ctx).V(1).Info("Node has been unreachable for less than unreachableTimeout, requeueing", "node", node.Name)
+					return reconcile.Result{RequeueAfter: *nodeClaim.Spec.UnreachableTimeout.Duration - durationSinceTaint}, nil
+				}
+			}
+		}
+	}
+
+	return reconcile.Result{}, nil
+}
+
+func (c *Controller) Register(_ context.Context, m manager.Manager) error {
+	builder := controllerruntime.NewControllerManagedBy(m)
+	return builder.
+		Named("nodeclaim.notready").
+		For(&v1.NodeClaim{}).
+		Watches(
+			&corev1.Node{},
+			nodeclaimutil.NodeEventHandler(c.kubeClient),
+		).
+		Complete(reconcile.AsReconciler(m.GetClient(), c))
+}
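The Reconcile loop above keys entirely off the age of the node.kubernetes.io/unreachable taint. Below is a minimal, self-contained sketch of that timing check, included only to make the semantics explicit; the helper name isUnreachablePastTimeout is hypothetical and does not appear in the patch.

package main

import (
	"fmt"
	"time"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// isUnreachablePastTimeout mirrors the controller's decision: the clock starts at the
// taint's TimeAdded, not at the last heartbeat, and a nil TimeAdded never triggers deletion.
func isUnreachablePastTimeout(node *corev1.Node, timeout time.Duration, now time.Time) bool {
	for _, taint := range node.Spec.Taints {
		if taint.Key == corev1.TaintNodeUnreachable && taint.TimeAdded != nil {
			return now.Sub(taint.TimeAdded.Time) > timeout
		}
	}
	return false
}

func main() {
	node := &corev1.Node{}
	node.Spec.Taints = []corev1.Taint{{
		Key:       corev1.TaintNodeUnreachable,
		Effect:    corev1.TaintEffectNoSchedule,
		TimeAdded: &metav1.Time{Time: time.Now().Add(-12 * time.Minute)},
	}}
	// With a 10m timeout and a taint added 12m ago (the first test case below),
	// the check returns true and the controller would delete the NodeClaim.
	fmt.Println(isUnreachablePastTimeout(node, 10*time.Minute, time.Now()))
}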
diff --git a/pkg/controllers/nodeclaim/notready/suite_test.go b/pkg/controllers/nodeclaim/notready/suite_test.go
new file mode 100644
index 0000000000..9326634369
--- /dev/null
+++ b/pkg/controllers/nodeclaim/notready/suite_test.go
@@ -0,0 +1,118 @@
+/*
+Copyright The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package notready_test
+
+import (
+	"context"
+	"testing"
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	clock "k8s.io/utils/clock/testing"
+	"sigs.k8s.io/controller-runtime/pkg/cache"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"sigs.k8s.io/karpenter/pkg/apis"
+	v1 "sigs.k8s.io/karpenter/pkg/apis/v1"
+	"sigs.k8s.io/karpenter/pkg/controllers/nodeclaim/notready"
+	"sigs.k8s.io/karpenter/pkg/metrics"
+	"sigs.k8s.io/karpenter/pkg/operator/options"
+	"sigs.k8s.io/karpenter/pkg/test"
+	. "sigs.k8s.io/karpenter/pkg/test/expectations"
+	"sigs.k8s.io/karpenter/pkg/test/v1alpha1"
+	. "sigs.k8s.io/karpenter/pkg/utils/testing"
+)
+
+var ctx context.Context
+var notReadyController *notready.Controller
+var env *test.Environment
+var fakeClock *clock.FakeClock
+
+func TestAPIs(t *testing.T) {
+	ctx = TestContextWithLogger(t)
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "NotReady Controller Suite")
+}
+
+var _ = BeforeSuite(func() {
+	fakeClock = clock.NewFakeClock(time.Now())
+	env = test.NewEnvironment(test.WithCRDs(apis.CRDs...), test.WithCRDs(v1alpha1.CRDs...), test.WithFieldIndexers(func(c cache.Cache) error {
+		return c.IndexField(ctx, &corev1.Node{}, "spec.providerID", func(obj client.Object) []string {
+			return []string{obj.(*corev1.Node).Spec.ProviderID}
+		})
+	}))
+	ctx = options.ToContext(ctx, test.Options())
+	notReadyController = notready.NewController(env.Client)
+})
+
+var _ = AfterSuite(func() {
+	Expect(env.Stop()).To(Succeed(), "Failed to stop environment")
+})
+
+var _ = BeforeEach(func() {
+	ctx = options.ToContext(ctx, test.Options())
+	fakeClock.SetTime(time.Now())
+})
+
+var _ = AfterEach(func() {
+	ExpectCleanedUp(ctx, env.Client)
+})
+
+var _ = Describe("NotReady", func() {
+	var nodePool *v1.NodePool
+	var nodeClaim *v1.NodeClaim
+	var node *corev1.Node
+	BeforeEach(func() {
+		nodePool = test.NodePool()
+		nodeClaim, node = test.NodeClaimAndNode(v1.NodeClaim{
+			ObjectMeta: metav1.ObjectMeta{
+				Labels: map[string]string{v1.NodePoolLabelKey: nodePool.Name},
+			},
+			Spec: v1.NodeClaimSpec{
+				UnreachableTimeout: v1.MustParseNillableDuration("10m"),
+			},
+		})
+		metrics.NodeClaimsDisruptedTotal.Reset()
+	})
+	It("should remove NodeClaim when the node has an unreachable taint for over the UnreachableTimeout duration", func() {
+		node.Spec.Taints = []corev1.Taint{
+			{
+				Key:       corev1.TaintNodeUnreachable,
+				Effect:    corev1.TaintEffectNoSchedule,
+				TimeAdded: &metav1.Time{Time: fakeClock.Now().Add(-12 * time.Minute)},
+			},
+		}
+		ExpectApplied(ctx, env.Client, nodeClaim, node)
+		ExpectObjectReconciled(ctx, env.Client, notReadyController, nodeClaim)
+		ExpectNotFound(ctx, env.Client, nodeClaim)
+	})
+	It("should not remove NodeClaim if unreachable taint is less than the UnreachableTimeout duration", func() {
+		node.Spec.Taints = []corev1.Taint{
+			{
+				Key:       corev1.TaintNodeUnreachable,
+				Effect:    corev1.TaintEffectNoSchedule,
+				TimeAdded: &metav1.Time{Time: fakeClock.Now().Add(-7 * time.Minute)},
+			},
+		}
+		ExpectApplied(ctx, env.Client, nodeClaim, node)
+		ExpectObjectReconciled(ctx, env.Client, notReadyController, nodeClaim)
+		nodeClaim = ExpectExists(ctx, env.Client, nodeClaim)
+	})
+})
diff --git a/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml b/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml
index b8684d22e0..8858691d41 100644
--- a/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml
+++ b/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml
@@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1
 kind: CustomResourceDefinition
 metadata:
   annotations:
-    controller-gen.kubebuilder.io/version: v0.16.3
+    controller-gen.kubebuilder.io/version: v0.16.4
   name: testnodeclasses.karpenter.test.sh
 spec:
   group: karpenter.test.sh