From f7167b5be0645c42ff5eb23231a8ad4cda9ecdfc Mon Sep 17 00:00:00 2001 From: Benjamin Schimke Date: Wed, 31 Jul 2024 09:21:42 +0200 Subject: [PATCH] Improve join logic to handle unreachable nodes (#560) --------- Co-authored-by: Mateo Florido <32885896+mateoflorido@users.noreply.github.com> Co-authored-by: Louise K. Schmidtgen --- src/k8s/pkg/k8sd/app/hooks_bootstrap.go | 17 ++++++++--- .../integration/tests/test_clustering_race.py | 28 +++++++++++++++++++ 2 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 tests/integration/tests/test_clustering_race.py diff --git a/src/k8s/pkg/k8sd/app/hooks_bootstrap.go b/src/k8s/pkg/k8sd/app/hooks_bootstrap.go index f8a27b03d..78c4d1e92 100644 --- a/src/k8s/pkg/k8sd/app/hooks_bootstrap.go +++ b/src/k8s/pkg/k8sd/app/hooks_bootstrap.go @@ -70,8 +70,17 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT } // TODO(neoaggelos): figure out how to use the microcluster client instead - // Get remote certificate from the cluster member - cert, err := utils.GetRemoteCertificate(token.JoinAddresses[0]) + // Get remote certificate from the cluster member. We only need one node to be reachable for this. + // One might fail because the node is not part of the cluster anymore but was at the time the token was created. + var cert *x509.Certificate + var address string + var err error + for _, address = range token.JoinAddresses { + cert, err = utils.GetRemoteCertificate(address) + if err == nil { + break + } + } if err != nil { return fmt.Errorf("failed to get certificate of cluster member: %w", err) } @@ -79,7 +88,7 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT // verify that the fingerprint of the certificate matches the fingerprint of the token fingerprint := utils.CertFingerprint(cert) if fingerprint != token.Fingerprint { - return fmt.Errorf("fingerprint from token (%q) does not match fingerprint of node %q (%q)", token.Fingerprint, token.JoinAddresses[0], fingerprint) + return fmt.Errorf("fingerprint from token (%q) does not match fingerprint of node %q (%q)", token.Fingerprint, address, fingerprint) } // Create the http client with trusted certificate @@ -104,7 +113,7 @@ func (a *App) onBootstrapWorkerNode(ctx context.Context, s state.State, encodedT return fmt.Errorf("failed to prepare worker info request: %w", err) } - httpRequest, err := http.NewRequest("POST", fmt.Sprintf("https://%s/1.0/k8sd/worker/info", token.JoinAddresses[0]), bytes.NewBuffer(requestBody)) + httpRequest, err := http.NewRequest("POST", fmt.Sprintf("https://%s/1.0/k8sd/worker/info", address), bytes.NewBuffer(requestBody)) if err != nil { return fmt.Errorf("failed to prepare HTTP request: %w", err) } diff --git a/tests/integration/tests/test_clustering_race.py b/tests/integration/tests/test_clustering_race.py new file mode 100644 index 000000000..31fe62fa1 --- /dev/null +++ b/tests/integration/tests/test_clustering_race.py @@ -0,0 +1,28 @@ +# +# Copyright 2024 Canonical, Ltd. +# +from typing import List + +import pytest +from test_util import harness, util + + +@pytest.mark.node_count(3) +def test_wrong_token_race(instances: List[harness.Instance]): + cluster_node = instances[0] + + join_token = util.get_join_token(cluster_node, instances[1]) + util.join_cluster(instances[1], join_token) + + new_join_token = util.get_join_token(cluster_node, instances[2]) + + cluster_node.exec(["k8s", "remove-node", instances[1].id]) + + another_join_token = util.get_join_token(cluster_node, instances[2]) + + # The join token should have changed after the node was removed as + # it contains the ip addresses of all cluster nodes. + assert ( + new_join_token != another_join_token + ), "join token is not updated after node removal" + util.join_cluster(instances[2], new_join_token)