diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/60wgboot b/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/60wgboot index fd7feefe1..f9f998aa3 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/60wgboot +++ b/core/imageroot/var/lib/nethserver/cluster/actions/create-cluster/60wgboot @@ -33,7 +33,7 @@ read -r network < <(jq -r '.network') # 0. Allow connections to the wireguard UDP port from # the default firewall zone. # 1. Trust connections from the wireguard VPN -firewall-cmd --permanent --new-service=ns-wireguard +firewall-cmd --permanent --new-service=ns-wireguard || : firewall-cmd --permanent --add-service=ns-wireguard firewall-cmd --permanent --service=ns-wireguard --add-port=55820/udp firewall-cmd --permanent --zone=trusted --add-source="${network}" diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/join-cluster/00validate_cluster b/core/imageroot/var/lib/nethserver/cluster/actions/join-cluster/00validate_cluster index feef492fa..df056f7ad 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/join-cluster/00validate_cluster +++ b/core/imageroot/var/lib/nethserver/cluster/actions/join-cluster/00validate_cluster @@ -9,20 +9,21 @@ import json import sys import agent import agent.tasks +import requests from aiohttp import ClientConnectorCertificateError, ClientResponseError request = json.load(sys.stdin) - +endpoint_url = request['url'].rstrip('/') + '/cluster-admin' rdb = agent.redis_connect() - network = rdb.get('cluster/network') if network: agent.set_status('validation-failed') - json.dump([{'field':'url', 'parameter':'url','value': request['url'], 'error':'cluster_network_already_set'}], fp=sys.stdout) + json.dump([{'field':'url', 'parameter':'url','value': endpoint_url, 'error':'cluster_network_already_set'}], fp=sys.stdout) sys.exit(2) try: + requests.head(endpoint_url, verify=request['tls_verify'], timeout=8.0) validate_auth = agent.tasks.run( agent_id='cluster', action='list-actions', @@ -32,15 +33,23 @@ try: "isNotificationHidden": True, }, parent='', # Our AGENT_TASK_ID is useless in another cluster: force parent to empty string - endpoint=request['url'].strip('/') + '/cluster-admin', + endpoint=endpoint_url, tls_verify=request['tls_verify'], auth_token=request['jwt'], + retry_attempts=1, ) -except ClientConnectorCertificateError: +except (requests.exceptions.SSLError, ClientConnectorCertificateError) as ex: agent.set_status('validation-failed') + print(agent.SD_ERR, ex, file=sys.stderr) json.dump([{'field':'tls_verify', 'parameter':'tls_verify','value': request['tls_verify'], 'error':'cluster_tls_verify_error'}], fp=sys.stdout) sys.exit(3) -except ClientResponseError: +except ClientResponseError as ex: agent.set_status('validation-failed') + print(agent.SD_ERR, ex, file=sys.stderr) json.dump([{'field':'jwt', 'parameter':'jwt','value': '***', 'error':'cluster_auth_error'}], fp=sys.stdout) sys.exit(4) +except requests.ConnectionError as ex: + agent.set_status('validation-failed') + print(agent.SD_ERR, ex, file=sys.stderr) + json.dump([{'field':'url', 'parameter':'url','value': endpoint_url, 'error':'cluster_connection_error'}], fp=sys.stdout) + sys.exit(5) diff --git a/core/imageroot/var/lib/nethserver/cluster/actions/join-cluster/50update b/core/imageroot/var/lib/nethserver/cluster/actions/join-cluster/50update index 4a5118935..6c2d99fb1 100755 --- a/core/imageroot/var/lib/nethserver/cluster/actions/join-cluster/50update +++ b/core/imageroot/var/lib/nethserver/cluster/actions/join-cluster/50update @@ -25,6 +25,7 @@ import agent.tasks import hashlib import json import sys +import socket # Action summary: # 1. Send the join request to the remote cluster leader @@ -73,7 +74,9 @@ response = agent.tasks.run( progress_callback=agent.get_progress_callback(20,98), auth_token=jwt, ) -agent.assert_exp(response['exit_code'] == 0) +if response['exit_code'] != 0: + print(agent.SD_ERR + "add_node error:", response["error"], file=sys.stderr) + sys.exit(2) node_id = response['output']['node_id'] agent.assert_exp(node_id > 0) @@ -82,6 +85,15 @@ print(f"Leader response is successful: the new node ID is node/{node_id}!", file join_node_data = response['output'] # add-node response is forwarded to join-node +try: + # Resolve the leader endpoint address to an IP string: + peer_hostname, peer_port = join_node_data['leader_endpoint'].rsplit(':', 1) + socket.getaddrinfo(peer_hostname, peer_port, proto=socket.IPPROTO_UDP)[0][4][0] +except Exception as ex: + print(agent.SD_ERR + "leader_endpoint error:", ex, "DATA", join_node_data, file=sys.stderr) + print(agent.SD_NOTICE + f"After the issue is solved, remove node {node_id} before running a new join attempt.", file=sys.stderr) + raise + # Start the action that switches Redis to replica of the new leader. # join-node has to give us enough time to write our response back to the # UI caller. diff --git a/core/ui/public/i18n/en/translation.json b/core/ui/public/i18n/en/translation.json index 9cbe92338..0c68b0baa 100644 --- a/core/ui/public/i18n/en/translation.json +++ b/core/ui/public/i18n/en/translation.json @@ -202,6 +202,7 @@ "the_join_code_cannot_be_parsed": "Join code cannot be parsed", "cluster_tls_verify_error": "TLS certificate of leader node is not valid", "cluster_auth_error": "Invalid response", + "cluster_connection_error": "Failed to connect to the leader node at address {value}", "redirect_cluster": "Worker Node", "redirect_cluster_description": "This node is a worker of the cluster now. Click the button below to access cluster administration page on the leader node.", "redirect_cluster_link": "Go to cluster administration", diff --git a/core/ui/public/i18n/it/translation.json b/core/ui/public/i18n/it/translation.json index 224c33fc1..2c2bcfbb0 100644 --- a/core/ui/public/i18n/it/translation.json +++ b/core/ui/public/i18n/it/translation.json @@ -995,6 +995,7 @@ "not_a_valid_ipv4_network": "Indirizzo di rete IPv4 non valido", "the_join_code_is_not_correctly_encoded": "Il codice di join non ha il formato corretto", "cluster_auth_error": "Risposta non valida", + "cluster_connection_error": "Connessione al nodo leader all'indirizzo {value} fallita", "the_join_code_cannot_be_parsed": "Il codice di join non può essere interpretato", "cluster_tls_verify_error": "Il certificato TLS del nodo leader non è valido", "the_join_code_can_not_be_decoded": "Il codice di join non può essere decodificato" diff --git a/core/ui/src/views/InitializeCluster.vue b/core/ui/src/views/InitializeCluster.vue index 178e62b43..53f62eb21 100644 --- a/core/ui/src/views/InitializeCluster.vue +++ b/core/ui/src/views/InitializeCluster.vue @@ -1578,7 +1578,7 @@ export default { joinClusterValidationFailed(validationErrors) { console.error("validation failed", validationErrors); this.isJoiningCluster = false; - this.error.joinCode = this.$t("init." + validationErrors[0].error); + this.error.joinCode = this.$t("init." + validationErrors[0].error, { value: validationErrors[0].value }); this.focusElement("joinCode"); }, async onFileUpload(files) {