Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consider critical-components-not-ready taint when machine joins first time #778

Merged
merged 7 commits into from
Feb 17, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions pkg/util/provider/machinecontroller/controller_suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,17 @@ func newMachine(
return machine
}

func newHealthyMachine(generateName, nodeLabelKey string, phase v1alpha1.MachinePhase) *v1alpha1.Machine {
return newMachine(
&v1alpha1.MachineTemplateSpec{ObjectMeta: *newObjectMeta(&metav1.ObjectMeta{GenerateName: generateName}, 0)},
&v1alpha1.MachineStatus{
Conditions: nodeConditions(true, false, false, false, false),
CurrentStatus: v1alpha1.CurrentStatus{Phase: phase, LastUpdateTime: metav1.Now()},
},
nil, nil, map[string]string{v1alpha1.NodeLabelKey: nodeLabelKey}, true, metav1.Now(),
)
}

func newMachines(
machineCount int,
specTemplate *v1alpha1.MachineTemplateSpec,
Expand Down
87 changes: 56 additions & 31 deletions pkg/util/provider/machinecontroller/machine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@ var _ = Describe("machine", func() {
var (
fakeMachineClient *fakemachineapi.FakeMachineV1alpha1
c *controller
testMachine v1alpha1.Machine
testNode v1.Node
)

Describe("#isHealthy", func() {
Expand All @@ -58,41 +60,36 @@ var _ = Describe("machine", func() {
controlMachineClient: fakeMachineClient,
nodeConditions: "ReadonlyFilesystem,KernelDeadlock,DiskPressure,NetworkUnavailable",
}
testMachine = v1alpha1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "testmachine",
Namespace: testNamespace,
},
Status: v1alpha1.MachineStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeDiskPressure,
Status: corev1.ConditionFalse,
},
{
Type: corev1.NodeMemoryPressure,
Status: corev1.ConditionFalse,
},
{
Type: corev1.NodeNetworkUnavailable,
Status: corev1.ConditionFalse,
},
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
}
})

testMachine := v1alpha1.Machine{
ObjectMeta: metav1.ObjectMeta{
Name: "testmachine",
Namespace: testNamespace,
},
Status: v1alpha1.MachineStatus{
Conditions: []corev1.NodeCondition{},
},
}
DescribeTable("Checking health of the machine",
func(conditionType corev1.NodeConditionType, conditionStatus corev1.ConditionStatus, expected bool) {
testMachine.Status.Conditions = []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
{
Type: corev1.NodeDiskPressure,
Status: corev1.ConditionFalse,
},
{
Type: corev1.NodeMemoryPressure,
Status: corev1.ConditionFalse,
},
{
Type: corev1.NodeNetworkUnavailable,
Status: corev1.ConditionFalse,
},
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
}
for i, condition := range testMachine.Status.Conditions {
if condition.Type == conditionType {
testMachine.Status.Conditions[i].Status = conditionStatus
Expand Down Expand Up @@ -123,6 +120,34 @@ var _ = Describe("machine", func() {
)
})

Describe("#criticalComponentsNotReadyTaintPresent", func() {
BeforeEach(func() {
c = &controller{
controlMachineClient: fakeMachineClient,
nodeConditions: "ReadonlyFilesystem,KernelDeadlock,DiskPressure,NetworkUnavailable",
}
testNode = v1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "testnode",
},
Spec: v1.NodeSpec{
Taints: []v1.Taint{},
},
}
})

DescribeTable("Checking readiness of the node",
func(nodeTaints []v1.Taint, expected bool) {
testNode.Spec.Taints = nodeTaints
Expect(c.criticalComponentsNotReadyTaintPresent(&testNode)).Should(BeIdenticalTo(expected))
},
Entry("with no taints is False", nil, false),
Entry("with empty taints is False", []v1.Taint{}, false),
Entry("with unrelated taints is False", []v1.Taint{{Key: "unrelated", Effect: corev1.TaintEffectNoSchedule}}, false),
Entry("with critical-components-not-ready taint is True", []v1.Taint{{Key: "node.gardener.cloud/critical-components-not-ready", Effect: corev1.TaintEffectNoSchedule}}, true),
)
})

/*
Describe("##updateMachineConditions", func() {
Describe("Update conditions of a non-existing machine", func() {
Expand Down
30 changes: 29 additions & 1 deletion pkg/util/provider/machinecontroller/machine_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ const (
// TODO: use client library instead when it starts to support update retries
//
// see https://github.com/kubernetes/kubernetes/issues/21479
//type updateMachineFunc func(machine *v1alpha1.Machine) error
// type updateMachineFunc func(machine *v1alpha1.Machine) error

/*
// UpdateMachineWithRetries updates a machine with given applyUpdate function. Note that machine not found error is ignored.
Expand Down Expand Up @@ -645,6 +645,23 @@ func (c *controller) reconcileMachineHealth(ctx context.Context, machine *v1alph
}
cloneDirty = true

} else if c.isHealthy(clone) && clone.Status.CurrentStatus.Phase == v1alpha1.MachinePending {
// when checking if a healthy machine in `Pending` is ready to be `Running`,
// we need to take into account the CriticalNodeComponentsNotReadyTaint.
if !c.criticalComponentsNotReadyTaintPresent(node) {
// Machine is ready and has joined the cluster
clone.Status.LastOperation = v1alpha1.LastOperation{
Description: description,
State: v1alpha1.MachineStateSuccessful,
Type: lastOperationType,
LastUpdateTime: metav1.Now(),
}
clone.Status.CurrentStatus = v1alpha1.CurrentStatus{
Phase: v1alpha1.MachineRunning,
LastUpdateTime: metav1.Now(),
}
cloneDirty = true
}
} else if c.isHealthy(clone) && clone.Status.CurrentStatus.Phase != v1alpha1.MachineRunning {
// If machine is healhy and current machinePhase is not running.
// indicates that the machine is not healthy and status needs to be updated.
Expand Down Expand Up @@ -829,16 +846,27 @@ func (c *controller) isHealthy(machine *v1alpha1.Machine) bool {
// If Kubelet is not ready
return false
}

conditions := strings.Split(*c.getEffectiveNodeConditions(machine), ",")
for _, c := range conditions {
if string(condition.Type) == c && condition.Status != v1.ConditionFalse {
return false
}
}
}

return true
}

func (c *controller) criticalComponentsNotReadyTaintPresent(node *v1.Node) bool {
for _, taint := range node.Spec.Taints {
if taint.Key == machineutils.TaintNodeCriticalComponentsNotReady && taint.Effect == v1.TaintEffectNoSchedule {
return true
}
}
return false
}

/*
SECTION
Delete machine
Expand Down
86 changes: 86 additions & 0 deletions pkg/util/provider/machinecontroller/machine_util_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2309,6 +2309,92 @@ var _ = Describe("machine_util", func() {
expectedPhase: v1alpha1.MachinePending,
},
}),
Entry("pending machine is healthy, but node has `critical-components-not-ready` taint, shouldn't be marked Running", &data{
setup: setup{
machines: []*v1alpha1.Machine{
newHealthyMachine(machineSet1Deploy1, "node-0", v1alpha1.MachinePending),
},
nodes: []*v1.Node{
newNode(
1,
nil,
nil,
&v1.NodeSpec{
Taints: []v1.Taint{
{
Key: "node.gardener.cloud/critical-components-not-ready",
Effect: corev1.TaintEffectNoSchedule,
},
},
},
&v1.NodeStatus{
Phase: corev1.NodeRunning,
Conditions: nodeConditions(true, false, false, false, false),
},
),
},
targetMachineName: machineSet1Deploy1 + "-" + "0",
},
expect: expect{
retryPeriod: machineutils.LongRetry,
err: nil,
expectedPhase: v1alpha1.MachinePending,
},
}),
Entry("pending machine is healthy, and node doesn't have `critical-components-not-ready taint`, should be marked Running", &data{
setup: setup{
machines: []*v1alpha1.Machine{
newHealthyMachine(machineSet1Deploy1, "node-0", v1alpha1.MachinePending),
},
nodes: []*v1.Node{
newNode(
1,
nil,
nil,
&v1.NodeSpec{
Taints: []v1.Taint{},
},
&v1.NodeStatus{
Phase: corev1.NodeRunning,
Conditions: nodeConditions(true, false, false, false, false),
},
),
},
targetMachineName: machineSet1Deploy1 + "-" + "0",
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
err: errors.New("machine creation is successful. Machine State has been UPDATED"),
expectedPhase: v1alpha1.MachineRunning,
},
}),
Entry("unknown machine is healthy, and node doesn't have `critical-components-not-ready` taint, should be marked Running", &data{
himanshu-kun marked this conversation as resolved.
Show resolved Hide resolved
setup: setup{
machines: []*v1alpha1.Machine{
newHealthyMachine(machineSet1Deploy1, "node-0", v1alpha1.MachineUnknown),
},
nodes: []*v1.Node{
newNode(
1,
nil,
nil,
&v1.NodeSpec{
Taints: []v1.Taint{},
},
&v1.NodeStatus{
Phase: corev1.NodeRunning,
Conditions: nodeConditions(true, false, false, false, false),
},
),
},
targetMachineName: machineSet1Deploy1 + "-" + "0",
},
expect: expect{
retryPeriod: machineutils.ShortRetry,
err: errors.New("machine creation is successful. Machine State has been UPDATED"),
expectedPhase: v1alpha1.MachineRunning,
},
}),
)

DescribeTable("##Meltdown scenario when many machines Unknown for over 10min(healthTimeout)", func(data *data) {
Expand Down
4 changes: 4 additions & 0 deletions pkg/util/provider/machineutils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,10 @@ const (

// NodeTerminationCondition describes nodes that are terminating
NodeTerminationCondition v1.NodeConditionType = "Terminating"

// TaintNodeCriticalComponentsNotReady is the name of a gardener taint
// indicating that a node is not yet ready to have user workload scheduled
TaintNodeCriticalComponentsNotReady = "node.gardener.cloud/critical-components-not-ready"
)

// RetryPeriod is an alias for specifying the retry period
Expand Down