From 4bd204d3e851eb2fa6cb80460fcb5b82ab0c96dc Mon Sep 17 00:00:00 2001
From: Colin Walters
Date: Thu, 27 Aug 2020 23:03:24 +0000
Subject: [PATCH] server: Target the spec configuration if we have at least one node

The CI cluster hit an issue where a pull secret was broken, and
then we hit a deadlock because the MCO failed to drain nodes on
the old config, because other nodes on the old config couldn't
schedule the pod.

It just generally makes sense for new nodes to use the new config;
do so as long as at least one node has successfully joined the
cluster at that config. This way we still avoid breaking
the cluster (and scaleup) with a bad config.
---
 pkg/server/cluster_server.go | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/pkg/server/cluster_server.go b/pkg/server/cluster_server.go
index 3be2d4237a..3492d037d2 100644
--- a/pkg/server/cluster_server.go
+++ b/pkg/server/cluster_server.go
@@ -65,7 +65,16 @@ func (cs *clusterServer) GetConfig(cr poolRequest) (*runtime.RawExtension, error
 		return nil, fmt.Errorf("could not fetch pool. err: %v", err)
 	}
 
-	currConf := mp.Status.Configuration.Name
+	// For new nodes, we roll out the latest if at least one node has successfully updated.
+	// This avoids deadlocks in situations where the old configuration broke somehow
+	// (e.g. pull secret expired)
+	// and also avoids provisioning a new node, only to update it not long thereafter.
+	var currConf string
+	if mp.Status.UpdatedMachineCount > 0 {
+		currConf = mp.Spec.Configuration.Name
+	} else {
+		currConf = mp.Status.Configuration.Name
+	}
 
 	mc, err := cs.machineClient.MachineConfigs().Get(context.TODO(), currConf, metav1.GetOptions{})
 	if err != nil {