diff --git a/controllers/node/node_controller.go b/controllers/node/node_controller.go index b31c04d4..9b402239 100644 --- a/controllers/node/node_controller.go +++ b/controllers/node/node_controller.go @@ -226,6 +226,7 @@ func (cnc *CloudNodeController) processNextWorkItem() bool { if err := cnc.syncHandler(key); err != nil { // Put the item back on the workqueue to handle any transient errors. cnc.workqueue.AddRateLimited(key) + klog.Infof("error syncing '%s': %v, requeuing", key, err) return fmt.Errorf("error syncing '%s': %s, requeuing", key, err.Error()) } @@ -424,12 +425,8 @@ func (cnc *CloudNodeController) syncNode(ctx context.Context, nodeName string) e klog.Infof("Initializing node %s with cloud provider", nodeName) copyNode := curNode.DeepCopy() - providerID, err := cnc.getProviderID(ctx, copyNode) - if err != nil { - return fmt.Errorf("failed to get provider ID for node %s at cloudprovider: %v", nodeName, err) - } - instanceMetadata, err := cnc.getInstanceMetadata(ctx, providerID, copyNode) + instanceMetadata, err := cnc.getInstanceMetadata(ctx, copyNode) if err != nil { return fmt.Errorf("failed to get instance metadata for node %s: %v", nodeName, err) } @@ -439,7 +436,7 @@ func (cnc *CloudNodeController) syncNode(ctx context.Context, nodeName string) e return nil } - nodeModifiers, err := cnc.getNodeModifiersFromCloudProvider(ctx, providerID, copyNode, instanceMetadata) + nodeModifiers, err := cnc.getNodeModifiersFromCloudProvider(ctx, copyNode, instanceMetadata) if err != nil { return fmt.Errorf("failed to get node modifiers from cloud provider: %v", err) } @@ -510,16 +507,13 @@ func (cnc *CloudNodeController) syncNode(ctx context.Context, nodeName string) e // loop, meaning they could get called multiple times. func (cnc *CloudNodeController) getNodeModifiersFromCloudProvider( ctx context.Context, - providerID string, node *v1.Node, instanceMeta *cloudprovider.InstanceMetadata, ) ([]nodeModifier, error) { var nodeModifiers []nodeModifier if node.Spec.ProviderID == "" { - if providerID != "" { - nodeModifiers = append(nodeModifiers, func(n *v1.Node) { n.Spec.ProviderID = providerID }) - } else if instanceMeta.ProviderID != "" { + if instanceMeta.ProviderID != "" { nodeModifiers = append(nodeModifiers, func(n *v1.Node) { n.Spec.ProviderID = instanceMeta.ProviderID }) } } @@ -594,35 +588,33 @@ func (cnc *CloudNodeController) getNodeModifiersFromCloudProvider( return nodeModifiers, nil } -func (cnc *CloudNodeController) getProviderID(ctx context.Context, node *v1.Node) (string, error) { - if node.Spec.ProviderID != "" { - return node.Spec.ProviderID, nil - } - - if _, ok := cnc.cloud.InstancesV2(); ok { - // We don't need providerID when we call InstanceMetadata for InstancesV2 - return "", nil - } - - providerID, err := cloudprovider.GetInstanceProviderID(ctx, cnc.cloud, types.NodeName(node.Name)) - if err == cloudprovider.NotImplemented { - // if the cloud provider being used does not support provider IDs, - // we can safely continue since we will attempt to set node - // addresses given the node name in getNodeAddressesByProviderIDOrName - klog.Warningf("cloud provider does not set node provider ID, using node name to discover node %s", node.Name) - return "", nil - } - - // if the cloud provider being used supports provider IDs, we want - // to propagate the error so that we re-try in the future; if we - // do not, the taint will be removed, and this will not be retried - return providerID, err -} - -// getInstanceMetadata get instance type and nodeAddresses, use Instances if InstancesV2 is off. -func (cnc *CloudNodeController) getInstanceMetadata(ctx context.Context, providerID string, node *v1.Node) (*cloudprovider.InstanceMetadata, error) { +// getInstanceMetadata get providerdID, instance type and nodeAddresses, use Instances if InstancesV2 is off. +// ProviderID is expected to be available, but to keep backward compatibility, +// we should handle some scenarios where it can be missing. It returns an error +// if providerID is missing, except when is not implemented by GetInstanceProviderID. +func (cnc *CloudNodeController) getInstanceMetadata(ctx context.Context, node *v1.Node) (*cloudprovider.InstanceMetadata, error) { + // kubelet can set the provider ID using the flag and is inmutable + providerID := node.Spec.ProviderID + // InstancesV2 require ProviderID to be present if instancesV2, ok := cnc.cloud.InstancesV2(); instancesV2 != nil && ok { - return instancesV2.InstanceMetadata(ctx, node) + metadata, err := instancesV2.InstanceMetadata(ctx, node) + if err != nil { + return nil, err + } + // spec.ProviderID is required for multiple controllers, like loadbalancers, so we should not + // untaint the node until is set. Once it is set, the field is immutable, so no need to reconcile. + // We only set this value during initialization and is never reconciled, so if for some reason + // we are not able to set it, the instance will never be able to acquire it. + // Before external cloud providers were enabled by default, the field was set by the kubelet, and the + // node was created with the value. + // xref: https://issues.k8s.io/123024 + if metadata != nil && metadata.ProviderID == "" { + if providerID == "" { + return metadata, fmt.Errorf("cloud provider does not set node provider ID for node %s", node.Name) + } + metadata.ProviderID = providerID + } + return metadata, nil } // If InstancesV2 not implement, use Instances. @@ -630,6 +622,26 @@ func (cnc *CloudNodeController) getInstanceMetadata(ctx context.Context, provide if !ok { return nil, fmt.Errorf("failed to get instances from cloud provider") } + + var err error + if providerID == "" { + providerID, err = cloudprovider.GetInstanceProviderID(ctx, cnc.cloud, types.NodeName(node.Name)) + if err != nil { + // This is the only case where ProviderID can be skipped + if errors.Is(err, cloudprovider.NotImplemented) { + // if the cloud provider being used does not support provider IDs, + // we can safely continue since we will attempt to set node + // addresses given the node name in getNodeAddressesByProviderIDOrName + klog.Warningf("cloud provider does not set node provider ID, using node name to discover node %s", node.Name) + } else { + // if the cloud provider being used supports provider IDs, we want + // to propagate the error so that we re-try in the future; if we + // do not, the taint will be removed, and this will not be retried + return nil, err + } + } + } + nodeAddresses, err := getNodeAddressesByProviderIDOrName(ctx, instances, providerID, node.Name) if err != nil { return nil, err @@ -640,6 +652,7 @@ func (cnc *CloudNodeController) getInstanceMetadata(ctx context.Context, provide } instanceMetadata := &cloudprovider.InstanceMetadata{ + ProviderID: providerID, InstanceType: instanceType, NodeAddresses: nodeAddresses, } diff --git a/controllers/node/node_controller_test.go b/controllers/node/node_controller_test.go index 7f5e95e0..2f0800b6 100644 --- a/controllers/node/node_controller_test.go +++ b/controllers/node/node_controller_test.go @@ -50,6 +50,7 @@ func Test_syncNode(t *testing.T) { fakeCloud *fakecloud.Cloud existingNode *v1.Node updatedNode *v1.Node + expectedErr bool }{ { name: "node initialized with provider ID", @@ -545,7 +546,8 @@ func Test_syncNode(t *testing.T) { }, }, { - name: "provided node IP address is not valid", + name: "provided node IP address is not valid", + expectedErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, Addresses: []v1.NodeAddress{ @@ -643,7 +645,8 @@ func Test_syncNode(t *testing.T) { }, }, { - name: "provided node IP address is not present", + name: "provided node IP address is not present", + expectedErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, Addresses: []v1.NodeAddress{ @@ -835,8 +838,10 @@ func Test_syncNode(t *testing.T) { }, }, }, - { - name: "provider ID not implemented", + { // for backward compatibility the cloud providers that does not implement + // providerID does not block the node initialization + name: "provider ID not implemented", + expectedErr: false, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, InstanceTypes: map[types.NodeName]string{}, @@ -1641,7 +1646,8 @@ func Test_syncNode(t *testing.T) { }, }, { - name: "[instanceV2] provider ID not implemented", + name: "[instanceV2] provider ID not implemented", + expectedErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: true, InstanceTypes: map[types.NodeName]string{}, @@ -1693,12 +1699,19 @@ func Test_syncNode(t *testing.T) { }, }, Spec: v1.NodeSpec{ - Taints: []v1.Taint{}, + Taints: []v1.Taint{ + { + Key: cloudproviderapi.TaintExternalCloudProvider, + Value: "true", + Effect: v1.TaintEffectNoSchedule, + }, + }, }, }, }, { - name: "[instanceV2] error getting InstanceMetadata", + name: "[instanceV2] error getting InstanceMetadata", + expectedErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: true, InstanceTypes: map[types.NodeName]string{}, @@ -1786,7 +1799,10 @@ func Test_syncNode(t *testing.T) { w := eventBroadcaster.StartLogging(klog.Infof) defer w.Stop() - cloudNodeController.syncNode(context.TODO(), test.existingNode.Name) + err := cloudNodeController.syncNode(context.TODO(), test.existingNode.Name) + if (err != nil) != test.expectedErr { + t.Fatalf("error got: %v expected: %v", err, test.expectedErr) + } updatedNode, err := clientset.CoreV1().Nodes().Get(context.TODO(), test.existingNode.Name, metav1.GetOptions{}) if err != nil { @@ -1833,6 +1849,9 @@ func TestGCEConditionV2(t *testing.T) { InstanceTypes: map[types.NodeName]string{ types.NodeName("node0"): "t1.micro", }, + ProviderID: map[types.NodeName]string{ + types.NodeName("node0"): "fake://12334", + }, Addresses: []v1.NodeAddress{ { Type: v1.NodeHostName, @@ -2376,15 +2395,16 @@ func TestNodeAddressesNotUpdate(t *testing.T) { } } -func TestGetProviderID(t *testing.T) { +func TestGetInstanceMetadata(t *testing.T) { tests := []struct { - name string - fakeCloud *fakecloud.Cloud - existingNode *v1.Node - expectedProviderID string + name string + fakeCloud *fakecloud.Cloud + existingNode *v1.Node + expectedMetadata *cloudprovider.InstanceMetadata + expectErr bool }{ { - name: "node initialized with provider ID", + name: "cloud implemented with Instances and provider ID", fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, InstanceTypes: map[types.NodeName]string{ @@ -2393,6 +2413,9 @@ func TestGetProviderID(t *testing.T) { ExtID: map[types.NodeName]string{ types.NodeName("node0"): "12345", }, + ProviderID: map[types.NodeName]string{ + types.NodeName("node0"): "fake://12345", + }, Addresses: []v1.NodeAddress{ { Type: v1.NodeHostName, @@ -2423,21 +2446,27 @@ func TestGetProviderID(t *testing.T) { Effect: v1.TaintEffectNoSchedule, }, }, - ProviderID: "fake://12345", }, }, - expectedProviderID: "fake://12345", + expectedMetadata: &cloudprovider.InstanceMetadata{ + ProviderID: "fake://12345", + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, }, { - name: "cloud implemented with Instances (without providerID)", + name: "cloud implemented with Instances (providerID not implemented)", fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: false, InstanceTypes: map[types.NodeName]string{ types.NodeName("node0"): "t1.micro", types.NodeName("fake://12345"): "t1.micro", }, - ExtID: map[types.NodeName]string{ - types.NodeName("node0"): "12345", + ExtIDErr: map[types.NodeName]error{ + types.NodeName("node0"): cloudprovider.NotImplemented, }, Addresses: []v1.NodeAddress{ { @@ -2461,7 +2490,58 @@ func TestGetProviderID(t *testing.T) { CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), }, }, - expectedProviderID: "fake://12345", + expectedMetadata: &cloudprovider.InstanceMetadata{ + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, + }, + { + name: "cloud implemented with Instances (providerID not implemented) and node with providerID", + fakeCloud: &fakecloud.Cloud{ + EnableInstancesV2: false, + InstanceTypes: map[types.NodeName]string{ + types.NodeName("node0"): "t1.micro", + types.NodeName("fake://12345"): "t1.micro", + }, + ExtIDErr: map[types.NodeName]error{ + types.NodeName("node0"): cloudprovider.NotImplemented, + }, + Addresses: []v1.NodeAddress{ + { + Type: v1.NodeHostName, + Address: "node0.cloud.internal", + }, + { + Type: v1.NodeInternalIP, + Address: "10.0.0.1", + }, + { + Type: v1.NodeExternalIP, + Address: "132.143.154.163", + }, + }, + Err: nil, + }, + existingNode: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Spec: v1.NodeSpec{ + ProviderID: "fake://asdasd", + }, + }, + expectedMetadata: &cloudprovider.InstanceMetadata{ + ProviderID: "fake://asdasd", + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, }, { name: "cloud implemented with InstancesV2 (with providerID)", @@ -2474,6 +2554,9 @@ func TestGetProviderID(t *testing.T) { ExtID: map[types.NodeName]string{ types.NodeName("node0"): "12345", }, + ProviderID: map[types.NodeName]string{ + types.NodeName("node0"): "fake://12345", + }, Addresses: []v1.NodeAddress{ { Type: v1.NodeHostName, @@ -2503,13 +2586,20 @@ func TestGetProviderID(t *testing.T) { Effect: v1.TaintEffectNoSchedule, }, }, - ProviderID: "fake://12345", }, }, - expectedProviderID: "fake://12345", + expectedMetadata: &cloudprovider.InstanceMetadata{ + ProviderID: "fake://12345", + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, }, - { - name: "cloud implemented with InstancesV2 (without providerID)", + { // it will be requeueud later + name: "cloud implemented with InstancesV2 (without providerID)", + expectErr: true, fakeCloud: &fakecloud.Cloud{ EnableInstancesV2: true, InstanceTypes: map[types.NodeName]string{ @@ -2550,7 +2640,59 @@ func TestGetProviderID(t *testing.T) { }, }, }, - expectedProviderID: "", + expectedMetadata: &cloudprovider.InstanceMetadata{ + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, + }, + { + name: "cloud implemented with InstancesV2 (without providerID) and node with providerID", + fakeCloud: &fakecloud.Cloud{ + EnableInstancesV2: true, + InstanceTypes: map[types.NodeName]string{ + types.NodeName("node0"): "t1.micro", + types.NodeName("fake://12345"): "t1.micro", + }, + ExtID: map[types.NodeName]string{ + types.NodeName("node0"): "12345", + }, + Addresses: []v1.NodeAddress{ + { + Type: v1.NodeHostName, + Address: "node0.cloud.internal", + }, + { + Type: v1.NodeInternalIP, + Address: "10.0.0.1", + }, + { + Type: v1.NodeExternalIP, + Address: "132.143.154.163", + }, + }, + Err: nil, + }, + existingNode: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node0", + CreationTimestamp: metav1.Date(2012, 1, 1, 0, 0, 0, 0, time.UTC), + }, + Spec: v1.NodeSpec{ + ProviderID: "fake://12345", + }, + }, + expectedMetadata: &cloudprovider.InstanceMetadata{ + ProviderID: "fake://12345", + InstanceType: "t1.micro", + NodeAddresses: []v1.NodeAddress{ + {Type: "Hostname", Address: "node0.cloud.internal"}, + {Type: "InternalIP", Address: "10.0.0.1"}, + {Type: "ExternalIP", Address: "132.143.154.163"}, + }, + }, }, } @@ -2560,13 +2702,13 @@ func TestGetProviderID(t *testing.T) { cloud: test.fakeCloud, } - providerID, err := cloudNodeController.getProviderID(context.TODO(), test.existingNode) - if err != nil { - t.Fatalf("error getting provider ID: %v", err) + metadata, err := cloudNodeController.getInstanceMetadata(context.TODO(), test.existingNode) + if (err != nil) != test.expectErr { + t.Fatalf("error expected %v got: %v", test.expectErr, err) } - if !cmp.Equal(providerID, test.expectedProviderID) { - t.Errorf("unexpected providerID %s", cmp.Diff(providerID, test.expectedProviderID)) + if !cmp.Equal(metadata, test.expectedMetadata) { + t.Errorf("unexpected metadata %s", cmp.Diff(metadata, test.expectedMetadata)) } }) }