diff --git a/cmd/gcp-controller-manager/BUILD b/cmd/gcp-controller-manager/BUILD index b5e42797ce..11f0de9287 100644 --- a/cmd/gcp-controller-manager/BUILD +++ b/cmd/gcp-controller-manager/BUILD @@ -66,6 +66,7 @@ go_library( "//vendor/k8s.io/apimachinery/pkg/runtime/schema", "//vendor/k8s.io/apimachinery/pkg/runtime/serializer", "//vendor/k8s.io/apimachinery/pkg/types", + "//vendor/k8s.io/apimachinery/pkg/util/errors", "//vendor/k8s.io/apimachinery/pkg/util/runtime", "//vendor/k8s.io/apimachinery/pkg/util/validation/field", "//vendor/k8s.io/apimachinery/pkg/util/wait", diff --git a/cmd/gcp-controller-manager/node_csr_approver.go b/cmd/gcp-controller-manager/node_csr_approver.go index 0d40afcbeb..79aa873d3c 100644 --- a/cmd/gcp-controller-manager/node_csr_approver.go +++ b/cmd/gcp-controller-manager/node_csr_approver.go @@ -44,6 +44,7 @@ import ( v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/cloud-provider-gcp/pkg/csrmetrics" "k8s.io/cloud-provider-gcp/pkg/nodeidentity" "k8s.io/cloud-provider-gcp/pkg/tpmattest" @@ -157,6 +158,7 @@ func (a *nodeApprover) handle(ctx context.Context, csr *capi.CertificateSigningR return a.updateCSR(csr, false, r.denyMsg) } } + klog.Infof("CSR %q validation passed", csr.Name) approved, err := a.authorizeSAR(csr, r.permission) if err != nil { @@ -777,25 +779,37 @@ func isNotFound(err error) bool { func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.CertificateSigningRequest, x509cr *x509.CertificateRequest) error { // TODO: short-circuit if !strings.HasPrefix(x509cr.Subject.CommonName, "system:node:") { - klog.Infof("exit 1 ensureNodeMatchesMetadataOrDelete %v", x509cr.Subject.CommonName) return nil } nodeName := strings.TrimPrefix(x509cr.Subject.CommonName, "system:node:") if len(nodeName) == 0 { - klog.Infof("exit 2 ensureNodeMatchesMetadataOrDelete %v", x509cr.Subject.CommonName) return 
nil } - if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil { - klog.Infof("failed to delete all pods bound to node %v", nodeName) - } - recordMetric := csrmetrics.OutboundRPCStartRecorder("k8s.Nodes.get") node, err := ctx.client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{}) if apierrors.IsNotFound(err) { recordMetric(csrmetrics.OutboundRPCStatusNotFound) - // if there is no existing Node object, return success - klog.Infof("node object is nil, returning from ensureNodeMatchesMetadataOrDelete nodeName: %v", nodeName) + // GCE MIGs currently reuse the instance name on new VMs. For example, + // after GCE Preemptible/Spot VM is preempted, the instance started by + // MIG will have the same instance name. This can result in old "stale" + // pods still bound to the node to exist after a node is preempted. In + // some cases, during preemption, the GCE instance is deleted and cloud + // controller will sync with k8s and the underlying k8s node object will + // be deleted + // (https://github.com/kubernetes/kubernetes/blob/44e403f5bbc71eb5f577da6ac8a2e29875ac1d28/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L147-L153). + // However it is possible that the pods bound to this node will not be + // garbage collected as the orphaned pods + // (https://github.com/kubernetes/kubernetes/blob/1ab40212a4e6cb10b3ae88c2e6c912a9fc1b1605/pkg/controller/podgc/gc_controller.go#L220-L255) + // will only be cleared periodically every 20 seconds. The 20 seconds + // check can race with the time the new node is started. If the new node + // is started before the pod GC will run, the pods from the previous + // node will not be cleared. To avoid this situation, explicitly delete + // all pods bound to the node name, even if the node object does not + // exist. 
+ if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil { + klog.Warningf("Failed to delete all pods bound to node %q: %v", nodeName, err) + } return nil } if err != nil { @@ -807,7 +821,6 @@ func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.Certifi recordMetric(csrmetrics.OutboundRPCStatusOK) delete, err := shouldDeleteNode(ctx, node, getInstanceByName) - if err != nil { // returning an error triggers a retry of this CSR. // if the errors are persistent, the CSR will not be approved and the node bootstrap will hang. @@ -820,6 +833,18 @@ func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.Certifi recordMetric = csrmetrics.OutboundRPCStartRecorder("k8s.Nodes.delete") err = ctx.client.CoreV1().Nodes().Delete(context.TODO(), nodeName, metav1.DeleteOptions{Preconditions: metav1.NewUIDPreconditions(string(node.UID))}) + // Pod deletion is best effort; do not block CSR approval during node + // registration if there was an issue deleting pods on the node object. GCE + // MIGs currently reuse the instance name on new VMs. For example, after GCE + // Preemptible/Spot VM is preempted, the instance started by MIG will have + // the same instance name. This can result in old "stale" pods still bound + // to the node to exist after a node is preempted. In the case that a GCE + // node is preempted and new GCE instance is created with the same name, + // explicitly delete all pods bound to the old node. 
+ if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil { + klog.Warningf("Failed to delete all pods bound to node %q: %v", nodeName, err) + } + if apierrors.IsNotFound(err) { recordMetric(csrmetrics.OutboundRPCStatusNotFound) // If we wanted to delete and the node is gone, this counts as success @@ -838,8 +863,6 @@ func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.Certifi var errInstanceNotFound = errors.New("instance not found") func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*controllerContext, string) (*compute.Instance, error)) (bool, error) { - klog.Infof("enter should delete node", node.Name) - inst, err := getInstance(ctx, node.Name) if err != nil { if err == errInstanceNotFound { @@ -849,16 +872,13 @@ func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*c klog.Errorf("Error retrieving instance %q: %v", node.Name, err) return false, err } - - oldInstanceId := node.ObjectMeta.Annotations[InstanceIDAnnotationKey] - newInstanceId := strconv.FormatUint(inst.Id, 10) - - klog.Infof("Instance %q has old instance id %q new instance id %q", inst.Name, oldInstanceId, newInstanceId) - if oldInstanceId != "" && newInstanceId != "" && oldInstanceId != newInstanceId { - klog.Infof("Instance %q has old instance id %q new instance id %q returning true!", inst.Name, oldInstanceId, newInstanceId) + oldInstanceID := node.ObjectMeta.Annotations[InstanceIDAnnotationKey] + newInstanceID := strconv.FormatUint(inst.Id, 10) + // Even if a GCE Instance reuses the instance name, the underlying GCE instance will change (for example on Preemptible / Spot VMs during preemption). + if oldInstanceID != "" && newInstanceID != "" && oldInstanceID != newInstanceID { + klog.Infof("Detected change in instance ID on node %q - Old Instance ID: %q ; New Instance ID: %q", inst.Name, oldInstanceID, newInstanceID) return true, nil } - // Newly created node might not have pod CIDR allocated yet. 
if node.Spec.PodCIDR == "" { klog.V(2).Infof("Node %q has empty podCIDR.", node.Name) @@ -880,29 +900,30 @@ func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*c } // Instance with no alias range is route based, for which node object deletion is unnecessary. klog.V(2).Infof("Instance %q has no alias range.", inst.Name) - return false, nil } func deleteAllPodsBoundToNode(ctx *controllerContext, nodeName string) error { - klog.V(1).Infof("start delete all pods pod on node %v", nodeName) + errs := []error{} + deletedPods := []string{} podList, err := ctx.client.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), metav1.ListOptions{FieldSelector: "spec.nodeName=" + nodeName}) if err != nil { - klog.Infof("delete pods on node %v failed %v", nodeName, err) + errs = append(errs, err) } for _, pod := range podList.Items { - klog.Infof("start delete pod %v %v on node %v", pod.Namespace, pod.Name, nodeName) err := ctx.client.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, *metav1.NewDeleteOptions(0)) if err != nil { - klog.Infof("delete pod %v %v on node %v failed :%v", pod.Namespace, pod.Name, nodeName, err) + errs = append(errs, err) + continue } - klog.Infof("end delete pod %v %v on node %v", pod.Namespace, pod.Name, nodeName) + deletedPods = append(deletedPods, pod.Name) } - klog.Infof("end delete all pods pod on node %v", nodeName) - - return nil + if len(deletedPods) > 0 { + klog.Infof("Pods %s bound to node %s were deleted.", strings.Join(deletedPods[:], ", "), nodeName) + } + return utilerrors.NewAggregate(errs) } func getInstanceByName(ctx *controllerContext, instanceName string) (*compute.Instance, error) { diff --git a/cmd/gcp-controller-manager/node_csr_approver_test.go b/cmd/gcp-controller-manager/node_csr_approver_test.go index 48ef618f38..6fe947acae 100644 --- a/cmd/gcp-controller-manager/node_csr_approver_test.go +++ b/cmd/gcp-controller-manager/node_csr_approver_test.go @@ -1189,6 +1189,7 @@ func 
TestShouldDeleteNode(t *testing.T) { Name: "node-test", }, }, + instance: &compute.Instance{}, }, { desc: "instance not found", @@ -1216,6 +1217,24 @@ func TestShouldDeleteNode(t *testing.T) { getInstanceErr: testErr, expectedErr: testErr, }, + { + desc: "node with different instance id", + node: &v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node-test", + Annotations: map[string]string{ + InstanceIDAnnotationKey: "1234567890123456789", + }, + }, + Spec: v1.NodeSpec{ + PodCIDR: "10.0.0.1/24", + }, + }, + instance: &compute.Instance{ + Id: 0, + }, + shouldDelete: true, + }, } for _, c := range cases { fakeGetInstance := func(_ *controllerContext, _ string) (*compute.Instance, error) {