Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use remote NodeRef in Machine and MachineSet controllers #1052

Merged
merged 1 commit into from
Jun 25, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/controller/machine/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ go_library(
deps = [
"//pkg/apis/cluster/v1alpha1:go_default_library",
"//pkg/controller/error:go_default_library",
"//pkg/controller/remote:go_default_library",
"//pkg/util:go_default_library",
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
Expand Down
41 changes: 30 additions & 11 deletions pkg/controller/machine/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"k8s.io/klog"
clusterv1 "sigs.k8s.io/cluster-api/pkg/apis/cluster/v1alpha1"
controllerError "sigs.k8s.io/cluster-api/pkg/controller/error"
"sigs.k8s.io/cluster-api/pkg/controller/remote"
"sigs.k8s.io/cluster-api/pkg/util"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/controller"
Expand Down Expand Up @@ -181,8 +182,8 @@ func (r *ReconcileMachine) Reconcile(request reconcile.Request) (reconcile.Resul

if m.Status.NodeRef != nil {
klog.Infof("Deleting node %q for machine %q", m.Status.NodeRef.Name, m.Name)
if err := r.deleteNode(ctx, m.Status.NodeRef.Name); err != nil {
klog.Errorf("Error deleting node %q for machine %q", name, err)
if err := r.deleteNode(ctx, cluster, m.Status.NodeRef.Name); err != nil && !apierrors.IsNotFound(err) {
klog.Errorf("Error deleting node %q for machine %q: %v", m.Status.NodeRef.Name, name, err)
return reconcile.Result{}, err
}
}
Expand Down Expand Up @@ -253,6 +254,9 @@ func (r *ReconcileMachine) getCluster(ctx context.Context, machine *clusterv1.Ma
return cluster, nil
}

// isDeletedAllowed returns false if the Machine we're trying to delete is the
// Machine hosting this controller. This method is meant to be functional
// only when the controllers are running in the workload cluster.
func (r *ReconcileMachine) isDeleteAllowed(machine *clusterv1.Machine) bool {
if r.nodeName == "" || machine.Status.NodeRef == nil {
return true
Expand All @@ -274,15 +278,30 @@ func (r *ReconcileMachine) isDeleteAllowed(machine *clusterv1.Machine) bool {
return node.UID != machine.Status.NodeRef.UID
}

func (r *ReconcileMachine) deleteNode(ctx context.Context, name string) error {
var node corev1.Node
if err := r.Client.Get(ctx, client.ObjectKey{Name: name}, &node); err != nil {
if apierrors.IsNotFound(err) {
klog.V(2).Infof("Node %q not found", name)
return nil
func (r *ReconcileMachine) deleteNode(ctx context.Context, cluster *clusterv1.Cluster, name string) error {
if cluster == nil {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah this was a little bit confusing to read through. could consider a comment here because a nil cluster clearly means something special is happening, but I don't know why a nil cluster implies the management cluster is the workload cluster

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added some comments, in case Cluster is specified, we look for the KubeConfig secret

// Try to retrieve the Node from the local cluster, if no Cluster reference is found.
var node corev1.Node
if err := r.Client.Get(ctx, client.ObjectKey{Name: name}, &node); err != nil {
return err
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it could be return nil / won't retry error. But I'm not sure, what was the logic behind this being an actual error?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed this part, IsNotFound is checked from the caller

}
klog.Errorf("Failed to get node %q: %v", name, err)
return err
return r.Client.Delete(ctx, &node)
}

// Otherwise, proceed to get the remote cluster client and get the Node.
remoteClient, err := remote.NewClusterClient(r.Client, cluster)
if err != nil {
klog.Errorf("Error creating a remote client for cluster %q while deleting Machine %q, won't retry: %v",
cluster.Name, name, err)
return nil
}
return r.Client.Delete(ctx, &node)

corev1Remote, err := remoteClient.CoreV1()
if err != nil {
klog.Errorf("Error creating a remote client for cluster %q while deleting Machine %q, won't retry: %v",
cluster.Name, name, err)
return nil
}

return corev1Remote.Nodes().Delete(name, &metav1.DeleteOptions{})
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This has potential to loop forever if the node doesn't exist already. This should also error check that if the object can't be found then it's a success, not a retryable error. Unless .Delete does this automatically...:rocket:

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The caller does the check at the moment, !apierrors.IsNotFound(err). Is this what you were referring to?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep! thanks

}
1 change: 1 addition & 0 deletions pkg/controller/machineset/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ go_library(
deps = [
"//pkg/apis/cluster/v1alpha1:go_default_library",
"//pkg/controller/noderefutil:go_default_library",
"//pkg/controller/remote:go_default_library",
"//pkg/util:go_default_library",
"//vendor/github.com/pkg/errors:go_default_library",
"//vendor/k8s.io/api/core/v1:go_default_library",
Expand Down
45 changes: 35 additions & 10 deletions pkg/controller/machineset/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,13 @@ import (
"context"
"fmt"

"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/klog"
"sigs.k8s.io/cluster-api/pkg/apis/cluster/v1alpha1"
"sigs.k8s.io/cluster-api/pkg/controller/noderefutil"
"sigs.k8s.io/cluster-api/pkg/controller/remote"
"sigs.k8s.io/controller-runtime/pkg/client"
)

Expand All @@ -37,6 +37,7 @@ const (

func (c *ReconcileMachineSet) calculateStatus(ms *v1alpha1.MachineSet, filteredMachines []*v1alpha1.Machine) v1alpha1.MachineSetStatus {
newStatus := ms.Status

// Count the number of machines that have labels matching the labels of the machine
// template of the replica set, the matching machines may have more
// labels than are in the template. Because the label of machineTemplateSpec is
Expand All @@ -46,15 +47,28 @@ func (c *ReconcileMachineSet) calculateStatus(ms *v1alpha1.MachineSet, filteredM
readyReplicasCount := 0
availableReplicasCount := 0
templateLabel := labels.Set(ms.Spec.Template.Labels).AsSelectorPreValidated()

// Retrieve Cluster, if any.
cluster, _ := c.getCluster(ms)

for _, machine := range filteredMachines {
if templateLabel.Matches(labels.Set(machine.Labels)) {
fullyLabeledReplicasCount++
}
node, err := c.getMachineNode(machine)

if machine.Status.NodeRef == nil {
klog.Warningf("Unable to retrieve Node status for Machine %q in namespace %q: missing NodeRef",
machine.Name, machine.Namespace)
continue
}

node, err := c.getMachineNode(cluster, machine)
if err != nil {
klog.V(4).Infof("Unable to get node for machine %v, %v", machine.Name, err)
klog.Warningf("Unable to retrieve Node status for Machine %q in namespace %q: %v",
machine.Name, machine.Namespace, err)
continue
}

if noderefutil.IsNodeReady(node) {
readyReplicasCount++
if noderefutil.IsNodeAvailable(node, ms.Spec.MinReadySeconds, metav1.Now()) {
Expand Down Expand Up @@ -122,13 +136,24 @@ func updateMachineSetStatus(c client.Client, ms *v1alpha1.MachineSet, newStatus
return nil, updateErr
}

func (c *ReconcileMachineSet) getMachineNode(machine *v1alpha1.Machine) (*corev1.Node, error) {
nodeRef := machine.Status.NodeRef
if nodeRef == nil {
return nil, errors.New("machine has no node ref")
func (c *ReconcileMachineSet) getMachineNode(cluster *v1alpha1.Cluster, machine *v1alpha1.Machine) (*corev1.Node, error) {
if cluster == nil {
// Try to retrieve the Node from the local cluster, if no Cluster reference is found.
node := &corev1.Node{}
err := c.Client.Get(context.Background(), client.ObjectKey{Name: machine.Status.NodeRef.Name}, node)
return node, err
}

// Otherwise, proceed to get the remote cluster client and get the Node.
remoteClient, err := remote.NewClusterClient(c.Client, cluster)
if err != nil {
return nil, err
}

corev1Remote, err := remoteClient.CoreV1()
if err != nil {
return nil, err
}

node := &corev1.Node{}
err := c.Client.Get(context.Background(), client.ObjectKey{Name: nodeRef.Name}, node)
return node, err
return corev1Remote.Nodes().Get(machine.Status.NodeRef.Name, metav1.GetOptions{})
}