Changed order of pod and node deletion to avoid race conditions
Added an extra pod deletion to avoid stale pods when the pod GC controller has not yet run

Added a test case verifying that a node is deleted when the GCE instance ID changes
CoderSherlock committed Sep 22, 2022
1 parent d5dbc5b commit dea9e67
Showing 3 changed files with 69 additions and 28 deletions.
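
At a glance, the reordered flow is roughly the sketch below. This is a simplified illustration, not the committed code: ensureNodeCleanedUp and deletePodsBoundToNode are placeholder names for the repository's ensureNodeMatchesMetadataOrDelete and deleteAllPodsBoundToNode, the GCE instance-ID / pod-CIDR checks are elided, and client stands in for the controller's Kubernetes clientset.

package sketch

import (
	"context"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	utilerrors "k8s.io/apimachinery/pkg/util/errors"
	"k8s.io/client-go/kubernetes"
	"k8s.io/klog/v2"
)

// Simplified sketch of the reordered cleanup: pods bound to the node name are
// deleted even when the Node object no longer exists, and again right after
// the Node object itself is deleted, so stale pods cannot outlive a preempted
// VM that reuses the same instance name.
func ensureNodeCleanedUp(ctx context.Context, client kubernetes.Interface, nodeName string) error {
	node, err := client.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
	if apierrors.IsNotFound(err) {
		// No Node object: the pod GC controller may not have caught up yet, so
		// explicitly clear any pods still bound to the reused node name.
		return deletePodsBoundToNode(ctx, client, nodeName)
	}
	if err != nil {
		return err
	}

	// ... the real code decides here, via shouldDeleteNode, whether the backing
	// GCE instance was replaced (e.g. the instance ID changed) ...

	err = client.CoreV1().Nodes().Delete(ctx, nodeName,
		metav1.DeleteOptions{Preconditions: metav1.NewUIDPreconditions(string(node.UID))})
	// Best effort: clear pods bound to the old node regardless of the outcome
	// of the node deletion, so CSR approval is not blocked by pod cleanup issues.
	if perr := deletePodsBoundToNode(ctx, client, nodeName); perr != nil {
		klog.Warningf("Failed to delete all pods bound to node %q: %v", nodeName, perr)
	}
	if apierrors.IsNotFound(err) {
		return nil
	}
	return err
}

func deletePodsBoundToNode(ctx context.Context, client kubernetes.Interface, nodeName string) error {
	// Force-delete every pod whose spec.nodeName matches, aggregating per-pod
	// errors instead of stopping at the first failure.
	podList, err := client.CoreV1().Pods(metav1.NamespaceAll).List(ctx,
		metav1.ListOptions{FieldSelector: "spec.nodeName=" + nodeName})
	if err != nil {
		return err
	}
	errs := []error{}
	for _, pod := range podList.Items {
		if err := client.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, *metav1.NewDeleteOptions(0)); err != nil {
			errs = append(errs, err)
		}
	}
	return utilerrors.NewAggregate(errs)
}

The actual diff below additionally records CSR metrics around each API call and logs which pods were removed; the key point of the change is that pod cleanup now happens both when the Node object is already gone and immediately after the Node object is deleted.
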
1 change: 1 addition & 0 deletions cmd/gcp-controller-manager/BUILD
@@ -66,6 +66,7 @@ go_library(
"//vendor/k8s.io/apimachinery/pkg/runtime/schema",
"//vendor/k8s.io/apimachinery/pkg/runtime/serializer",
"//vendor/k8s.io/apimachinery/pkg/types",
"//vendor/k8s.io/apimachinery/pkg/util/errors",
"//vendor/k8s.io/apimachinery/pkg/util/runtime",
"//vendor/k8s.io/apimachinery/pkg/util/validation/field",
"//vendor/k8s.io/apimachinery/pkg/util/wait",
77 changes: 49 additions & 28 deletions cmd/gcp-controller-manager/node_csr_approver.go
@@ -44,6 +44,7 @@ import (
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/cloud-provider-gcp/pkg/csrmetrics"
"k8s.io/cloud-provider-gcp/pkg/nodeidentity"
"k8s.io/cloud-provider-gcp/pkg/tpmattest"
@@ -157,6 +158,7 @@ func (a *nodeApprover) handle(ctx context.Context, csr *capi.CertificateSigningR
return a.updateCSR(csr, false, r.denyMsg)
}
}
klog.Infof("CSR %q validation passed", csr.Name)

approved, err := a.authorizeSAR(csr, r.permission)
if err != nil {
@@ -777,25 +779,37 @@ func isNotFound(err error) bool {
func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.CertificateSigningRequest, x509cr *x509.CertificateRequest) error {
// TODO: short-circuit
if !strings.HasPrefix(x509cr.Subject.CommonName, "system:node:") {
klog.Infof("exit 1 ensureNodeMatchesMetadataOrDelete %v", x509cr.Subject.CommonName)
return nil
}
nodeName := strings.TrimPrefix(x509cr.Subject.CommonName, "system:node:")
if len(nodeName) == 0 {
klog.Infof("exit 2 ensureNodeMatchesMetadataOrDelete %v", x509cr.Subject.CommonName)
return nil
}

if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil {
klog.Infof("failed to delete all pods bound to node %v", nodeName)
}

recordMetric := csrmetrics.OutboundRPCStartRecorder("k8s.Nodes.get")
node, err := ctx.client.CoreV1().Nodes().Get(context.TODO(), nodeName, metav1.GetOptions{})
if apierrors.IsNotFound(err) {
recordMetric(csrmetrics.OutboundRPCStatusNotFound)
// if there is no existing Node object, return success
klog.Infof("node object is nil, returning from ensureNodeMatchesMetadataOrDelete nodeName: %v", nodeName)
// GCE MIGs currently reuse the instance name on new VMs. For example,
// after GCE Preemptible/Spot VM is preempted, the instance started by
// MIG will have the same instance name. This can result in old "stale"
// pods still bound to the node to exist after a node is preempted. In
// some cases, during preemption, the GCE instance is deleted and cloud
// controller will sync with k8s and the underlying k8s node object will
// be deleted
// (https://github.com/kubernetes/kubernetes/blob/44e403f5bbc71eb5f577da6ac8a2e29875ac1d28/staging/src/k8s.io/cloud-provider/controllers/nodelifecycle/node_lifecycle_controller.go#L147-L153).
// However it is possible that the pods bound to this node will not be
// garbage collected as the orphaned pods
// (https://github.com/kubernetes/kubernetes/blob/1ab40212a4e6cb10b3ae88c2e6c912a9fc1b1605/pkg/controller/podgc/gc_controller.go#L220-L255)
// will only be cleared periodically every 20 seconds. The 20 seconds
// check can race with the time the new node is started. If the new node
// is started before the pod GC will run, the pods from the previous
// node will not be cleared. To avoid this situation, explicitly delete
// all pods bound to the node name, even if the node object does not
// exist.
if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil {
klog.Warningf("Failed to delete all pods bound to node %q: %v", nodeName, err)
}
return nil
}
if err != nil {
@@ -807,7 +821,6 @@ func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.Certifi
recordMetric(csrmetrics.OutboundRPCStatusOK)

delete, err := shouldDeleteNode(ctx, node, getInstanceByName)

if err != nil {
// returning an error triggers a retry of this CSR.
// if the errors are persistent, the CSR will not be approved and the node bootstrap will hang.
@@ -820,6 +833,18 @@ func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.Certifi

recordMetric = csrmetrics.OutboundRPCStartRecorder("k8s.Nodes.delete")
err = ctx.client.CoreV1().Nodes().Delete(context.TODO(), nodeName, metav1.DeleteOptions{Preconditions: metav1.NewUIDPreconditions(string(node.UID))})
// Pod deletion is best effort; do not block CSR approval during node
// registration if there was an issue deleting pods on the node object.
// GCE MIGs currently reuse the instance name on new VMs. For example, after a
// GCE Preemptible/Spot VM is preempted, the instance started by the MIG will
// have the same instance name. This can result in old "stale" pods still
// bound to the node existing after the node is preempted. In the case that a
// GCE node is preempted and a new GCE instance is created with the same name,
// explicitly delete all pods bound to the old node.
if err := deleteAllPodsBoundToNode(ctx, nodeName); err != nil {
klog.Warningf("Failed to delete all pods bound to node %q: %v", nodeName, err)
}

if apierrors.IsNotFound(err) {
recordMetric(csrmetrics.OutboundRPCStatusNotFound)
// If we wanted to delete and the node is gone, this counts as success
@@ -838,8 +863,6 @@ func ensureNodeMatchesMetadataOrDelete(ctx *controllerContext, csr *capi.Certifi
var errInstanceNotFound = errors.New("instance not found")

func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*controllerContext, string) (*compute.Instance, error)) (bool, error) {
klog.Infof("enter should delete node", node.Name)

inst, err := getInstance(ctx, node.Name)
if err != nil {
if err == errInstanceNotFound {
@@ -849,16 +872,13 @@ func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*c
klog.Errorf("Error retrieving instance %q: %v", node.Name, err)
return false, err
}

oldInstanceId := node.ObjectMeta.Annotations[InstanceIDAnnotationKey]
newInstanceId := strconv.FormatUint(inst.Id, 10)

klog.Infof("Instance %q has old instance id %q new instance id %q", inst.Name, oldInstanceId, newInstanceId)
if oldInstanceId != "" && newInstanceId != "" && oldInstanceId != newInstanceId {
klog.Infof("Instance %q has old instance id %q new instance id %q returning true!", inst.Name, oldInstanceId, newInstanceId)
oldInstanceID := node.ObjectMeta.Annotations[InstanceIDAnnotationKey]
newInstanceID := strconv.FormatUint(inst.Id, 10)
// Even if a GCE Instance reuses the instance name, the underlying GCE instance will change (for example on Preemptible / Spot VMs during preemption).
if oldInstanceID != "" && newInstanceID != "" && oldInstanceID != newInstanceID {
klog.Infof("Detected change in instance ID on node %q - Old Instance ID: %q ; New Instance ID: %q", inst.Name, oldInstanceID, newInstanceID)
return true, nil
}

// Newly created node might not have pod CIDR allocated yet.
if node.Spec.PodCIDR == "" {
klog.V(2).Infof("Node %q has empty podCIDR.", node.Name)
@@ -880,29 +900,30 @@ func shouldDeleteNode(ctx *controllerContext, node *v1.Node, getInstance func(*c
}
// Instance with no alias range is route based, for which node object deletion is unnecessary.
klog.V(2).Infof("Instance %q has no alias range.", inst.Name)

return false, nil
}

func deleteAllPodsBoundToNode(ctx *controllerContext, nodeName string) error {
klog.V(1).Infof("start delete all pods pod on node %v", nodeName)
errs := []error{}
deletedPods := []string{}

podList, err := ctx.client.CoreV1().Pods(metav1.NamespaceAll).List(context.TODO(), metav1.ListOptions{FieldSelector: "spec.nodeName=" + nodeName})
if err != nil {
klog.Infof("delete pods on node %v failed %v", nodeName, err)
errs = append(errs, err)
}

for _, pod := range podList.Items {
klog.Infof("start delete pod %v %v on node %v", pod.Namespace, pod.Name, nodeName)
err := ctx.client.CoreV1().Pods(pod.Namespace).Delete(context.TODO(), pod.Name, *metav1.NewDeleteOptions(0))
if err != nil {
klog.Infof("delete pod %v %v on node %v failed :%v", pod.Namespace, pod.Name, nodeName, err)
errs = append(errs, err)
continue
}
klog.Infof("end delete pod %v %v on node %v", pod.Namespace, pod.Name, nodeName)
deletedPods = append(deletedPods, pod.Name)
}
klog.Infof("end delete all pods pod on node %v", nodeName)

return nil
if len(deletedPods) > 0 {
klog.Infof("Pods %s bound to node %s were deleted.", strings.Join(deletedPods[:], ", "), nodeName)
}
return utilerrors.NewAggregate(errs)
}

func getInstanceByName(ctx *controllerContext, instanceName string) (*compute.Instance, error) {
19 changes: 19 additions & 0 deletions cmd/gcp-controller-manager/node_csr_approver_test.go
@@ -1189,6 +1189,7 @@ func TestShouldDeleteNode(t *testing.T) {
Name: "node-test",
},
},
instance: &compute.Instance{},
},
{
desc: "instance not found",
@@ -1216,6 +1217,24 @@
getInstanceErr: testErr,
expectedErr: testErr,
},
{
desc: "node with different instance id",
node: &v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "node-test",
Annotations: map[string]string{
InstanceIDAnnotationKey: "1234567890123456789",
},
},
Spec: v1.NodeSpec{
PodCIDR: "10.0.0.1/24",
},
},
instance: &compute.Instance{
Id: 0,
},
shouldDelete: true,
},
}
for _, c := range cases {
fakeGetInstance := func(_ *controllerContext, _ string) (*compute.Instance, error) {