Skip to content

Commit

Permalink
Fix netpol crash when node remains tained unintialized
Browse files Browse the repository at this point in the history
It is concievable that users might take more than 60 seconds to deploy their own cloud-provider. Instead of exiting, we should wait forever, but with more logging to indicate what's being waited on.

Signed-off-by: Brad Davidson <brad.davidson@rancher.com>
  • Loading branch information
brandond committed May 29, 2024
1 parent f2e7c01 commit ed23a2b
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 10 deletions.
15 changes: 7 additions & 8 deletions pkg/agent/netpol/netpol.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,27 +67,26 @@ func Run(ctx context.Context, nodeConfig *config.Node) error {
return err
}

// As kube-router netpol requires addresses to be available in the node object
// Wait until the node has ready addresses to avoid race conditions (max 1 minute).
// kube-router netpol requires addresses to be available in the node object.
// Wait until the uninitialized taint has been removed, at which point the addresses should be set.
// TODO: Replace with non-deprecated PollUntilContextTimeout when our and Kubernetes code migrate to it
if err := wait.PollImmediateWithContext(ctx, 2*time.Second, 60*time.Second, func(ctx context.Context) (bool, error) {
if err := wait.PollImmediateInfiniteWithContext(ctx, 2*time.Second, func(ctx context.Context) (bool, error) {
// Get the node object
node, err := client.CoreV1().Nodes().Get(ctx, nodeConfig.AgentConfig.NodeName, metav1.GetOptions{})
if err != nil {
logrus.Debugf("Network policy controller waiting to get Node %s: %v", nodeConfig.AgentConfig.NodeName, err)
logrus.Infof("Network policy controller waiting to get Node %s: %v", nodeConfig.AgentConfig.NodeName, err)
return false, nil
}
// Check for the uninitialized taint that should be removed by cloud-provider
// If there is no cloud-provider, the taint will not be there
// Check for the taint that should be removed by cloud-provider when the node has been initialized.
for _, taint := range node.Spec.Taints {
if taint.Key == cloudproviderapi.TaintExternalCloudProvider {
logrus.Debugf("Network policy controller waiting for removal of %s taint", cloudproviderapi.TaintExternalCloudProvider)
logrus.Infof("Network policy controller waiting for removal of %s taint", cloudproviderapi.TaintExternalCloudProvider)
return false, nil
}
}
return true, nil
}); err != nil {
return errors.Wrapf(err, "network policy controller timed out waiting for %s taint to be removed from Node %s", cloudproviderapi.TaintExternalCloudProvider, nodeConfig.AgentConfig.NodeName)
return errors.Wrapf(err, "network policy controller failed to wait for %s taint to be removed from Node %s", cloudproviderapi.TaintExternalCloudProvider, nodeConfig.AgentConfig.NodeName)
}

krConfig := options.NewKubeRouterConfig()
Expand Down
4 changes: 2 additions & 2 deletions pkg/etcd/metadata_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/client-go/util/retry"
nodeUtil "k8s.io/kubernetes/pkg/controller/util/node"
nodeutil "k8s.io/kubernetes/pkg/controller/util/node"
)

func registerMetadataHandlers(ctx context.Context, etcd *ETCD) {
Expand Down Expand Up @@ -109,7 +109,7 @@ func (m *metadataHandler) handleSelf(node *v1.Node) (*v1.Node, error) {
node.Labels = map[string]string{}
}

if find, _ := nodeUtil.GetNodeCondition(&node.Status, etcdStatusType); find >= 0 {
if find, _ := nodeutil.GetNodeCondition(&node.Status, etcdStatusType); find >= 0 {
node.Status.Conditions = append(node.Status.Conditions[:find], node.Status.Conditions[find+1:]...)
}

Expand Down

0 comments on commit ed23a2b

Please sign in to comment.