Skip to content

Commit

Permalink
Changed order of pod and node deletion to avoid race conditions, added
Browse files Browse the repository at this point in the history
extra pod deletion to avoid old pods when pod gc controller not active.
  • Loading branch information
CoderSherlock committed Sep 7, 2022
1 parent a4c9303 commit 32b5aab
Show file tree
Hide file tree
Showing 2 changed files with 35 additions and 23 deletions.
1 change: 1 addition & 0 deletions cmd/gcp-controller-manager/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/runtime/schema",
"//vendor/k8s.io/apimachinery/pkg/runtime/serializer",
"//vendor/k8s.io/apimachinery/pkg/types",
"//vendor/k8s.io/apimachinery/pkg/util/errors",
"//vendor/k8s.io/apimachinery/pkg/util/runtime",
"//vendor/k8s.io/apimachinery/pkg/util/validation/field",
"//vendor/k8s.io/apimachinery/pkg/util/wait",
Expand Down
57 changes: 34 additions & 23 deletions cmd/gcp-controller-manager/node_csr_approver.go
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ import (
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/cloud-provider-gcp/pkg/csrmetrics"
"k8s.io/cloud-provider-gcp/pkg/nodeidentity"
"k8s.io/cloud-provider-gcp/pkg/tpmattest"
Expand Down Expand Up @@ -777,25 +778,28 @@ func isNotFound(err error) bool {
func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.CertificateSigningRequest, x509cr *x509.CertificateRequest) error {
// TODO: short-circuit
if !strings.HasPrefix(x509cr.Subject.CommonName, "system:node:") {
klog.Infof("exit 1 ensureNodeMatchesMetadataOrDelete %v", x509cr.Subject.CommonName)
return nil
}
nodeName := strings.TrimPrefix(x509cr.Subject.CommonName, "system:node:")
if len(nodeName) == 0 {
klog.Infof("exit 2 ensureNodeMatchesMetadataOrDelete %v", x509cr.Subject.CommonName)
return nil
}

if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil {
klog.Infof("failed to delete all pods bound to node %v", nodeName)
}

recordMetric := csrmetrics.OutboundRPCStartRecorder("k8s.Nodes.get")
node, err := ctx.client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
if apierrors.IsNotFound(err) {
recordMetric(csrmetrics.OutboundRPCStatusNotFound)
// if there is no existing Node object, return success
klog.Infof("node object is nil, returning from ensureNodeMatchesMetadataOrDelete nodeName: %v", nodeName)
/*
GCE MIGs currently reuse the instance name on new VMs. For example, after GCE Preemptible/Spot VM is preempted, the instance started by MIG will have the same instance name.
This can result in old "stale" pods still bound to the node to exist after a node is preempted.
In some cases, during preemption, the GCE instance is deleted and cloud controller will sync with k8s and the underlying k8s node object will be deleted (https://github.com/kubernetes/kubernetes/blob/44e403f5bbc71eb5f577da6ac8a2e29875ac1d28/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L147-L153).
However it is possible that the pods bound to this node will not be garbage collected as the orphaned pods (https://github.com/kubernetes/kubernetes/blob/1ab40212a4e6cb10b3ae88c2e6c912a9fc1b1605/pkg/controller/podgc/gc_controller.go#L220-L255) will only be cleared periodically every 20 seconds.
The 20 seconds check can race with the time the new node is started. If the new node is started before the pod GC will run, the pods from the previous node will not be cleared.
To avoid this situation, explicitly delete all pods bound to the node name, even if the node object does not exist.
*/
if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil {
klog.Infof("failed to delete all pods bound to node %v", nodeName)
}
return nil
}
if err != nil {
Expand All @@ -820,6 +824,16 @@ func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.Certifi

recordMetric = csrmetrics.OutboundRPCStartRecorder("k8s.Nodes.delete")
err = ctx.client.CoreV1().Nodes().Delete(context.TODO(), nodeName, metav1.DeleteOptions{Preconditions: metav1.NewUIDPreconditions(string(node.UID))})
/*
Pod Deletion is best effort, do not block CSR approval during node registration if there was an issue deleting pods on the node object
GCE MIGs currently reuse the instance name on new VMs. For example, after GCE Preemptible/Spot VM is preempted, the instance started by MIG will have the same instance name.
This can result in old "stale" pods still bound to the node to exist after a node is preempted.
In the case that a GCE node is preempted and a new GCE instance is created with the same name, explicitly delete all pods bound to the old node.
*/
if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil {
klog.Infof("failed to delete all pods bound to node %v", nodeName)
}

if apierrors.IsNotFound(err) {
recordMetric(csrmetrics.OutboundRPCStatusNotFound)
// If we wanted to delete and the node is gone, this counts as success
Expand All @@ -838,8 +852,6 @@ func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.Certifi
var errInstanceNotFound = errors.New("instance not found")

func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*controllerContext, string) (*compute.Instance, error)) (bool, error) {
klog.Infof("enter should delete node", node.Name)

inst, err := getInstance(ctx, node.Name)
if err != nil {
if err == errInstanceNotFound {
Expand All @@ -850,13 +862,14 @@ func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*c
return false, err
}

oldInstanceId := node.ObjectMeta.Annotations[InstanceIDAnnotationKey]
newInstanceId := strconv.FormatUint(inst.Id, 10)

klog.Infof("Instance %q has old instance id %q new instance id %q", inst.Name, oldInstanceId, newInstanceId)
if oldInstanceId != "" && newInstanceId != "" && oldInstanceId != newInstanceId {
klog.Infof("Instance %q has old instance id %q new instance id %q returning true!", inst.Name, oldInstanceId, newInstanceId)
return true, nil
if inst != nil {
oldInstanceID := node.ObjectMeta.Annotations[InstanceIDAnnotationKey]
newInstanceID := strconv.FormatUint(inst.Id, 10)
// Even if a GCE Instance reuses the instance name, the underlying GCE instance will change (for example on Preemptible / Spot VMs during preemption).
if oldInstanceID != "" && newInstanceID != "" && oldInstanceID != newInstanceID {
klog.Infof("Detected change in instance ID on node %q - Old Instance ID: %q ; New Instance ID: %q", inst.Name, oldInstanceID, newInstanceID)
return true, nil
}
}

// Newly created node might not have pod CIDR allocated yet.
Expand Down Expand Up @@ -886,23 +899,21 @@ func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*c

// deleteAllPodsBoundToNode force-deletes (grace period 0) every pod scheduled
// onto nodeName, across all namespaces. It is used to clear "stale" pods left
// behind when a GCE MIG reuses an instance name (e.g. after Spot/Preemptible
// preemption) before the pod GC controller has had a chance to run.
//
// Deletion is best effort: per-pod failures are collected and returned as a
// single aggregate error so the caller can log them without aborting early.
func deleteAllPodsBoundToNode(ctx *controllerContext, nodeName string) error {
	klog.V(1).Infof("start delete all pods on node %v", nodeName)

	podList, err := ctx.client.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), metav1.ListOptions{FieldSelector: "spec.nodeName=" + nodeName})
	if err != nil {
		// Without a pod list there is nothing to delete; return the listing
		// error immediately rather than dereferencing a nil podList below.
		klog.Infof("delete pods on node %v failed %v", nodeName, err)
		return err
	}

	errs := []error{}
	for _, pod := range podList.Items {
		klog.Infof("start delete pod %v %v on node %v", pod.Namespace, pod.Name, nodeName)
		// Grace period 0 forces immediate removal: stale pods from the old
		// instance must not linger until the periodic pod GC pass.
		if err := ctx.client.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, *metav1.NewDeleteOptions(0)); err != nil {
			klog.Infof("delete pod %v %v on node %v failed :%v", pod.Namespace, pod.Name, nodeName, err)
			errs = append(errs, err)
		}
		klog.Infof("end delete pod %v %v on node %v", pod.Namespace, pod.Name, nodeName)
	}
	klog.Infof("end delete all pods on node %v", nodeName)

	// nil when every deletion succeeded; otherwise all collected failures.
	return utilerrors.NewAggregate(errs)
}

func getInstanceByName(ctx *controllerContext, instanceName string) (*compute.Instance, error) {
Expand Down

0 comments on commit 32b5aab

Please sign in to comment.