From ed23a2bb48d4c02321fe0e56890aef90e8299746 Mon Sep 17 00:00:00 2001 From: Brad Davidson Date: Mon, 6 May 2024 19:43:37 +0000 Subject: [PATCH] Fix netpol crash when node remains tainted uninitialized It is conceivable that users might take more than 60 seconds to deploy their own cloud-provider. Instead of exiting, we should wait forever, but with more logging to indicate what's being waited on. Signed-off-by: Brad Davidson --- pkg/agent/netpol/netpol.go | 15 +++++++-------- pkg/etcd/metadata_controller.go | 4 ++-- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/pkg/agent/netpol/netpol.go b/pkg/agent/netpol/netpol.go index 60d2c1f07f45..f09d47d11e5b 100644 --- a/pkg/agent/netpol/netpol.go +++ b/pkg/agent/netpol/netpol.go @@ -67,27 +67,26 @@ func Run(ctx context.Context, nodeConfig *config.Node) error { return err } - // As kube-router netpol requires addresses to be available in the node object - // Wait until the node has ready addresses to avoid race conditions (max 1 minute). + // kube-router netpol requires addresses to be available in the node object. + // Wait until the uninitialized taint has been removed, at which point the addresses should be set. 
// TODO: Replace with non-deprecated PollUntilContextTimeout when our and Kubernetes code migrate to it - if err := wait.PollImmediateWithContext(ctx, 2*time.Second, 60*time.Second, func(ctx context.Context) (bool, error) { + if err := wait.PollImmediateInfiniteWithContext(ctx, 2*time.Second, func(ctx context.Context) (bool, error) { // Get the node object node, err := client.CoreV1().Nodes().Get(ctx, nodeConfig.AgentConfig.NodeName, metav1.GetOptions{}) if err != nil { - logrus.Debugf("Network policy controller waiting to get Node %s: %v", nodeConfig.AgentConfig.NodeName, err) + logrus.Infof("Network policy controller waiting to get Node %s: %v", nodeConfig.AgentConfig.NodeName, err) return false, nil } - // Check for the uninitialized taint that should be removed by cloud-provider - // If there is no cloud-provider, the taint will not be there + // Check for the taint that should be removed by cloud-provider when the node has been initialized. for _, taint := range node.Spec.Taints { if taint.Key == cloudproviderapi.TaintExternalCloudProvider { - logrus.Debugf("Network policy controller waiting for removal of %s taint", cloudproviderapi.TaintExternalCloudProvider) + logrus.Infof("Network policy controller waiting for removal of %s taint", cloudproviderapi.TaintExternalCloudProvider) return false, nil } } return true, nil }); err != nil { - return errors.Wrapf(err, "network policy controller timed out waiting for %s taint to be removed from Node %s", cloudproviderapi.TaintExternalCloudProvider, nodeConfig.AgentConfig.NodeName) + return errors.Wrapf(err, "network policy controller failed to wait for %s taint to be removed from Node %s", cloudproviderapi.TaintExternalCloudProvider, nodeConfig.AgentConfig.NodeName) } krConfig := options.NewKubeRouterConfig() diff --git a/pkg/etcd/metadata_controller.go b/pkg/etcd/metadata_controller.go index 0f4599a78f62..71ec165feb5a 100644 --- a/pkg/etcd/metadata_controller.go +++ b/pkg/etcd/metadata_controller.go @@ -13,7 +13,7 @@ 
import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/client-go/util/retry" - nodeUtil "k8s.io/kubernetes/pkg/controller/util/node" + nodeutil "k8s.io/kubernetes/pkg/controller/util/node" ) func registerMetadataHandlers(ctx context.Context, etcd *ETCD) { @@ -109,7 +109,7 @@ func (m *metadataHandler) handleSelf(node *v1.Node) (*v1.Node, error) { node.Labels = map[string]string{} } - if find, _ := nodeUtil.GetNodeCondition(&node.Status, etcdStatusType); find >= 0 { + if find, _ := nodeutil.GetNodeCondition(&node.Status, etcdStatusType); find >= 0 { node.Status.Conditions = append(node.Status.Conditions[:find], node.Status.Conditions[find+1:]...) }