diff --git a/pkg/kube/Dockerfile b/pkg/kube/Dockerfile index 2922f6bd69..5d7378e9a0 100644 --- a/pkg/kube/Dockerfile +++ b/pkg/kube/Dockerfile @@ -23,6 +23,7 @@ RUN GO111MODULE=on CGO_ENABLED=0 go build -v -ldflags "-s -w" -o /out/usr/bin/ce FROM scratch COPY --from=build /out/ / COPY cluster-init.sh /usr/bin/ +COPY cluster-utils.sh /usr/bin/ COPY cgconfig.conf /etc # k3s diff --git a/pkg/kube/cluster-init.sh b/pkg/kube/cluster-init.sh index 358cfafc15..627d4c118b 100755 --- a/pkg/kube/cluster-init.sh +++ b/pkg/kube/cluster-init.sh @@ -3,139 +3,107 @@ # Copyright (c) 2023-2024 Zededa, Inc. # SPDX-License-Identifier: Apache-2.0 -K3S_VERSION=v1.28.5+k3s1 KUBEVIRT_VERSION=v1.1.0 LONGHORN_VERSION=v1.6.2 CDI_VERSION=v1.54.0 NODE_IP="" -MAX_K3S_RESTARTS=10 RESTART_COUNT=0 -K3S_LOG_DIR="/persist/newlog/kube" +K3S_LOG_DIR="/persist/kubelog" INSTALL_LOG="${K3S_LOG_DIR}/k3s-install.log" CTRD_LOG="${K3S_LOG_DIR}/containerd-user.log" -LOG_SIZE=$((5*1024*1024)) HOSTNAME="" VMICONFIG_FILENAME="/run/zedkube/vmiVNC.run" VNC_RUNNING=false - -logmsg() { - local MSG - local TIME - MSG="$*" - TIME=$(date +"%F %T") - echo "$TIME : $MSG" >> $INSTALL_LOG -} - -setup_cgroup () { - echo "cgroup /sys/fs/cgroup cgroup defaults 0 0" >> /etc/fstab -} - -check_log_file_size() { - currentSize=$(wc -c <"$K3S_LOG_DIR/$1") - if [ "$currentSize" -gt "$LOG_SIZE" ]; then - if [ -f "$K3S_LOG_DIR/$1.2" ]; then - cp "$K3S_LOG_DIR/$1.2" "$K3S_LOG_DIR/$1.3" - fi - if [ -f "$K3S_LOG_DIR/$1.1" ]; then - cp "$K3S_LOG_DIR/$1.1" "$K3S_LOG_DIR/$1.2" - fi - cp "$K3S_LOG_DIR/$1" "$K3S_LOG_DIR/$1.1" - truncate -s 0 "$K3S_LOG_DIR/$1" - logmsg "k3s logfile size $currentSize rotate" - fi -} - -save_crash_log() { - if [ "$RESTART_COUNT" = "1" ]; then - return - fi - fileBaseName=$1 - # This pattern will alias with older crashes, but also a simple way to contain log bloat - crashLogBaseName="${fileBaseName}.restart.${RESTART_COUNT}.gz" - if [ -e "${K3S_LOG_DIR}/${crashLogBaseName}" ]; then - rm "${K3S_LOG_DIR}/${crashLogBaseName}" - fi - gzip -k -9 "${K3S_LOG_DIR}/${fileBaseName}" -c > "${K3S_LOG_DIR}/${crashLogBaseName}" +ClusterPrefixMask="" +config_file="/etc/rancher/k3s/config.yaml" +k3s_config_file="/etc/rancher/k3s/k3s-config.yaml" +clusterStatusPort="12346" +INITIAL_WAIT_TIME=5 +MAX_WAIT_TIME=$((10 * 60)) # 10 minutes in seconds, exponential backoff for k3s restart +current_wait_time=$INITIAL_WAIT_TIME +CLUSTER_WAIT_FILE="/run/kube/cluster-change-wait-ongoing" + +# Source the utility script, Dockerfile copies the script to /usr/bin +# shellcheck source=/dev/null +. /usr/bin/cluster-utils.sh + +# get cluster IP address from the cluster status file +get_cluster_node_ip() { + if [ -z "$1" ]; then + enc_data=$(cat "$enc_status_file") + clusternodeip=$(echo "$enc_data" | jq -r '.ClusterIPPrefix.IP') + echo "$clusternodeip" + else + echo "$1" + fi } -check_network_connection () { - while true; do - ret=$(curl -o /dev/null -w "%{http_code}" -s "https://get.k3s.io") - if [ "$ret" -eq 200 ]; then - logmsg "Network is ready." - break; - else - logmsg "Network is not yet ready" - fi - sleep 5 - done -} +# Function to get the cluster prefix length from the cluster status file +get_cluster_prefix_len() { + enc_data=$(cat "$enc_status_file") + mask=$(echo "$enc_data" | jq -r '.ClusterIPPrefix.Mask') + decoded_mask=$(echo "$mask" | base64 -d | od -An -t u1) + prefixlen=0 + + for byte in $decoded_mask; do + case $byte in + 255) prefixlen=$((prefixlen + 8)) ;; + 254) prefixlen=$((prefixlen + 7)) ;; + 252) prefixlen=$((prefixlen + 6)) ;; + 248) prefixlen=$((prefixlen + 5)) ;; + 240) prefixlen=$((prefixlen + 4)) ;; + 224) prefixlen=$((prefixlen + 3)) ;; + 192) prefixlen=$((prefixlen + 2)) ;; + 128) prefixlen=$((prefixlen + 1)) ;; + 0) break ;; + *) logmsg "get_cluster_prefix_len, Unexpected byte value: $byte"; exit 1 ;; + esac + done -wait_for_default_route() { - while read -r iface dest gw flags refcnt use metric mask mtu window irtt; do - if [ "$dest" = "00000000" ] && [ "$mask" = "00000000" ]; then - logmsg "Default route found" - return 0 - fi - logmsg "waiting for default route $iface $dest $gw $flags $refcnt $use $metric $mask $mtu $window $irtt" - sleep 1 - done < /proc/net/route - return 1 + echo "/$prefixlen" } -# Get IP of the interface with the first default route. -# This will be then used as K3s node IP. -# XXX This is a temporary solution. Eventually, the user will be able to select -# the cluster network interface via EdgeDevConfig. -get_default_intf_IP_prefix() { - logmsg "Trying to obtain Node IP..." - while [ -z "$NODE_IP" ]; do - # Find the default route interface - default_interface="$(ip route show default | head -n 1 | awk '/default/ {print $5}')" - # Get the IP address of the default route interface - NODE_IP="$(ip addr show dev "$default_interface" | awk '/inet / {print $2}' | cut -d "/" -f1)" - [ -z "$NODE_IP" ] && sleep 1 - done - logmsg "Node IP Address: $NODE_IP" - ip_prefix="$NODE_IP/32" - # Fill in the outbound external Interface IP prefix in multus config - awk -v new_ip="$ip_prefix" '{gsub("IPAddressReplaceMe", new_ip)}1' /etc/multus-daemonset.yaml > /tmp/multus-daemonset.yaml -} +# Set the node IP to multus differently for single node and cluster mode +assign_multus_nodeip() { + if [ -f /var/lib/edge-node-cluster-mode ]; then + NODE_IP=$(get_cluster_node_ip "$1") + ClusterPrefixMask=$(get_cluster_prefix_len) + ip_prefix=$(ipcalc -n "$NODE_IP$ClusterPrefixMask" | cut -d "=" -f2) + ip_prefix="$ip_prefix$ClusterPrefixMask" + logmsg "Cluster Node IP prefix to multus: $ip_prefix with node-ip $NODE_IP" + else + while [ -z "$NODE_IP" ]; do + # Find the default route interface + default_interface="$(ip route show default | head -n 1 | awk '/default/ {print $5}')" -# kubernetes's name must be lower case and '-' instead of '_' -convert_to_k8s_compatible() { - echo "$1" | tr '[:upper:]_' '[:lower:]-' -} + # Get the IP address of the default route interface + NODE_IP="$(ip addr show dev "$default_interface" | awk '/inet / {print $2}' | cut -d "/" -f1)" -wait_for_device_name() { - logmsg "Waiting for DeviceName from controller..." - EdgeNodeInfoPath="/persist/status/zedagent/EdgeNodeInfo/global.json" - while [ ! -f $EdgeNodeInfoPath ]; do - sleep 5 - done - dName=$(jq -r '.DeviceName' $EdgeNodeInfoPath) - if [ -n "$dName" ]; then - HOSTNAME=$(convert_to_k8s_compatible "$dName") - fi + [ -z "$NODE_IP" ] && sleep 1 + done - # we should have the uuid since we got the device name - DEVUUID=$(/bin/hostname) + ip_prefix="$NODE_IP/32" + logmsg "Single Node IP prefix to multus: $ip_prefix with node-ip $NODE_IP" + fi - if ! grep -q node-name /etc/rancher/k3s/config.yaml; then - echo "node-name: $HOSTNAME" >> /etc/rancher/k3s/config.yaml - fi - logmsg "Hostname: $HOSTNAME" + logmsg "Assign node-ip for multus with $ip_prefix" + # fill in the outbound external Interface IP prefix in multus config + awk -v new_ip="$ip_prefix" '{gsub("IPAddressReplaceMe", new_ip)}1' /etc/multus-daemonset.yaml > /etc/multus-daemonset-new.yaml } apply_multus_cni() { - get_default_intf_IP_prefix - kubectl create namespace eve-kube-app - logmsg "Apply Multus, Node-IP: $NODE_IP" - if ! kubectl apply -f /tmp/multus-daemonset.yaml; then + # remove get_default_intf_IP_prefix + #get_default_intf_IP_prefix + if ! kubectl get namespace eve-kube-app > /dev/null 2>&1; then + kubectl create namespace eve-kube-app + fi + logmsg "Apply multus-daemonset-new.yaml" + if ! kubectl apply -f /etc/multus-daemonset-new.yaml > /dev/null 2>&1; then + logmsg "Apply Multus, has failed, jump out now" return 1 fi logmsg "Done applying Multus" - ln -s /var/lib/cni/bin/multus /var/lib/rancher/k3s/data/current/bin/multus + link_multus_into_k3s # need to only do this once touch /var/lib/multus_initialized return 0 @@ -182,6 +150,8 @@ setup_prereqs () { #Needed for iscsi tools mkdir -p /run/lock mkdir -p "$K3S_LOG_DIR" + rm -rf /var/log + ln -s "$K3S_LOG_DIR" /var/log /usr/sbin/iscsid start mount --make-rshared / setup_cgroup @@ -195,6 +165,11 @@ setup_prereqs () { } config_cluster_roles() { + # remove the previous k3s-debuguser*.pem files + # in the case of single node to cluster transition, we may not reboot, + # and there could be more than one certs files + rm -f /tmp/k3s-debuguser*.pem + # generate user debugging-user certificates # 10 year expiration for now if ! /usr/bin/cert-gen -l 315360000 --ca-cert /var/lib/rancher/k3s/server/tls/client-ca.crt \ @@ -226,6 +201,26 @@ apply_longhorn_disk_config() { kubectl annotate node "$node" node.longhorn.io/default-disks-config='[ { "path":"/persist/vault/volumes", "allowScheduling":true }]' } +# Apply the node-uuid label to the node, since node name is the EVE device name +apply_node_uuid_lable () { + logmsg "set node label with uuid $DEVUUID" + kubectl label node "$HOSTNAME" node-uuid="$DEVUUID" +} + +reapply_node_labes() { + apply_node_uuid_lable + apply_longhorn_disk_config "$HOSTNAME" + # Check if the node with both labels exists, don't assume above apply worked + node_count=$(kubectl get nodes -l node-uuid="$DEVUUID",node.longhorn.io/create-default-disk=config -o json | jq '.items | length') + + if [ "$node_count" -gt 0 ]; then + logmsg "Node labels re-applied successfully" + touch /var/lib/node-labels-initialized + else + logmsg "Failed to re-apply node labels, on $HOSTNAME, uuid $DEVUUID" + fi +} + check_overwrite_nsmounter() { ### REMOVE ME+ # When https://github.com/longhorn/longhorn/issues/6857 is resolved, remove this 'REMOVE ME' section @@ -255,30 +250,58 @@ longhorn_post_install_config() { } check_start_k3s() { + # the cluster change code is in another task loop, so if the cluster wait is nogoing + # don't go to start k3s in this time. wait also + if [ -f "$CLUSTER_WAIT_FILE" ]; then + logmsg "Cluster wait ongoing, wait for it before starting k3s" + while [ -f "$CLUSTER_WAIT_FILE" ]; do + sleep 5 + done + fi + pgrep -f "k3s server" > /dev/null 2>&1 if [ $? -eq 1 ]; then - if [ $RESTART_COUNT -lt $MAX_K3S_RESTARTS ]; then - ## Must be after reboot, or from k3s restart - RESTART_COUNT=$((RESTART_COUNT+1)) - save_crash_log "k3s.log" - ln -s /var/lib/k3s/bin/* /usr/bin - logmsg "Starting k3s server, restart count: $RESTART_COUNT" - # for now, always copy to get the latest - nohup /usr/bin/k3s server --config /etc/rancher/k3s/config.yaml & - k3s_pid=$! - # Give the embedded etcd in k3s priority over io as its fsync latencies are critical - ionice -c2 -n0 -p $k3s_pid - # Default location where clients will look for config - # There is a very small window where this file is not available - # while k3s is starting up - while [ ! -f /etc/rancher/k3s/k3s.yaml ]; do - sleep 5 - done - ln -s /etc/rancher/k3s/k3s.yaml ~/.kube/config - mkdir -p /run/.kube/k3s - cp /etc/rancher/k3s/k3s.yaml /run/.kube/k3s/k3s.yaml - return 1 - fi + # do exponential backoff for k3s restart, but not more than MAX_WAIT_TIME + RESTART_COUNT=$((RESTART_COUNT+1)) + logmsg "k3s server not running, restart wait time $current_wait_time, restart count: $RESTART_COUNT" + sleep $current_wait_time + current_wait_time=$((current_wait_time * 2)) + if [ $current_wait_time -gt $MAX_WAIT_TIME ]; then + current_wait_time=$MAX_WAIT_TIME + fi + + ## Must be after reboot, or from k3s restart + save_crash_log + ln -s /var/lib/k3s/bin/* /usr/bin + if [ ! -d /var/lib/cni/bin ] || [ ! -d /opt/cni/bin ]; then + copy_cni_plugin_files + fi + # for now, always copy to get the latest + + # start the k3s server now + nohup /usr/bin/k3s server --config "$k3s_config_file" & + + k3s_pid=$! + # Give the embedded etcd in k3s priority over io as its fsync latencies are critical + ionice -c2 -n0 -p $k3s_pid + # Default location where clients will look for config + # There is a very small window where this file is not available + # while k3s is starting up + counter=0 + while [ ! -f /etc/rancher/k3s/k3s.yaml ]; do + sleep 5 + counter=$((counter+1)) + # to prevent infinite looping, k3s could have crashed immediately + if [ $counter -eq 120 ]; then + break + fi + done + mkdir -p /run/.kube/k3s + cp /etc/rancher/k3s/k3s.yaml /run/.kube/k3s/k3s.yaml + return 1 + else + # k3s is running, reset the wait time to initial value + current_wait_time=$INITIAL_WAIT_TIME fi return 0 } @@ -295,11 +318,24 @@ check_start_containerd() { pgrep -f "/var/lib/rancher/k3s/data/current/bin/containerd" > /dev/null 2>&1 if [ $? -eq 1 ]; then mkdir -p /run/containerd-user - nohup /var/lib/rancher/k3s/data/current/bin/containerd --config /etc/containerd/config-k3s.toml > $CTRD_LOG 2>&1 & + nohup /var/lib/rancher/k3s/data/current/bin/containerd --config /etc/containerd/config-k3s.toml >> $CTRD_LOG 2>&1 & containerd_pid=$! logmsg "Started k3s-containerd at pid:$containerd_pid" fi if [ -f /etc/external-boot-image.tar ]; then + # Give containerd a moment to start before importing + attempts=0 + max_attempts=3 + while [ $attempts -lt $max_attempts ]; do + reported_pid=$(/var/lib/k3s/bin/k3s ctr -a /run/containerd-user/containerd.sock info | jq .server.pid) + if [ "$reported_pid" = "$containerd_pid" ]; then + logmsg "containerd online, continue to import" + break + fi + attempts=$((attempts + 1)) + sleep 1 + done + # NOTE: https://kubevirt.io/user-guide/virtual_machines/boot_from_external_source/ # Install external-boot-image image to our eve user containerd registry. # This image contains just kernel and initrd to bootstrap a container image as a VM. @@ -317,40 +353,6 @@ check_start_containerd() { fi fi } -trigger_k3s_selfextraction() { - # Analysis of the k3s source shows nearly any cli command will first self-extract a series of binaries. - # In our case we're looking for the containerd binary. - # k3s check-config appears to be the only cli cmd which doesn't: - # - start a long running process/server - # - timeout connecting to a socket - # - manipulate config/certs - - # When run on the shell this does throw some config errors, its unclear if we need this issues fixed: - # - links: aux/ip6tables should link to iptables-detect.sh (fail) - # - links: aux/ip6tables-restore should link to iptables-detect.sh (fail) - # - links: aux/ip6tables-save should link to iptables-detect.sh (fail) - # - links: aux/iptables should link to iptables-detect.sh (fail) - # - links: aux/iptables-restore should link to iptables-detect.sh (fail) - # - links: aux/iptables-save should link to iptables-detect.sh (fail) - # - apparmor: enabled, but apparmor_parser missing (fail) - /usr/bin/k3s check-config >> $INSTALL_LOG 2>&1 -} - -# wait for debugging flag in /persist/k3s/wait_{flagname} if exist -wait_for_item() { - filename="/persist/k3s/wait_$1" - processname="k3s server" - while [ -e "$filename" ]; do - k3sproc="" - if pgrep -x "$processname" > /dev/null; then - k3sproc="k3s server is running" - else - k3sproc="k3s server is NOT running" - fi - logmsg "Found $filename file. $k3sproc, Waiting for 60 seconds..." - sleep 60 - done -} # Return success if all pods are Running/Succeeded and Ready # Used in install time to control api server load @@ -416,57 +418,347 @@ check_and_run_vnc() { fi } -setup_prereqs +# get the EdgeNodeClusterStatus +enc_status_file="/run/zedkube/EdgeNodeClusterStatus/global.json" +cluster_intf="" +is_bootstrap="" +join_serverIP="" +cluster_token="" +cluster_node_ip="" +# for bootstrap node, after reboot to get neighbor node to join + +# get the EdgeNodeClusterStatus from zedkube publication +get_enc_status() { + # Read the JSON data from the file, return 0 if successful, 1 if not + if [ ! -f "$enc_status_file" ]; then + return 1 + fi + + enc_data=$(cat "$enc_status_file") + cluster_intf=$(echo "$enc_data" | jq -r '.ClusterInterface') + is_bootstrap=$(echo "$enc_data" | jq -r '.BootstrapNode') + join_serverIP=$(echo "$enc_data" | jq -r '.JoinServerIP') + cluster_token=$(echo "$enc_data" | jq -r '.EncryptedClusterToken') + cluster_node_ip=$(echo "$enc_data" | jq -r '.ClusterIPPrefix.IP') + cluster_node_ip_is_ready=$(echo "$enc_data" | jq -r '.ClusterIPIsReady') + if [ -n "$cluster_intf" ] && [ -n "$join_serverIP" ] && [ -n "$cluster_token" ] &&\ + [ -n "$cluster_node_ip" ] && [ "$cluster_node_ip_is_ready" = "true" ] &&\ + { [ "$is_bootstrap" = "true" ] || [ "$is_bootstrap" = "false" ]; }; then + return 0 + else + return 1 + fi +} + +# When transitioning from single node to cluster mode, need change the controller +# provided token for the cluster +change_to_new_token() { + if [ -n "$cluster_token" ]; then + /usr/bin/k3s token rotate --new-token "$cluster_token" + while true; do + if grep -q "server:$cluster_token" /var/lib/rancher/k3s/server/token; then + logmsg "Token change has taken effect." + break + else + logmsg "Token has not taken effect yet. Sleeping for 2 seconds..." + sleep 2 + fi + done + else + # save the content of the token file + current_token=$(cat /var/lib/rancher/k3s/server/token) + + # let k3s generate a new token + /usr/bin/k3s token rotate + logmsg "Rotate Token by k3s." + + # loop to check if the token file has changed + while true; do + if grep -q "$current_token" /var/lib/rancher/k3s/server/token; then + logmsg "Token change has not taken effect yet. Sleeping for 2 seconds..." + sleep 2 + else + logmsg "Token change has taken effect." + break + fi + done + fi +} + +# monitor function to check if the cluster mode has changed, either from single node to cluster +# or from cluster to single node +check_cluster_config_change() { + + # only check the cluster change when it's fully initialized + if [ ! -f /var/lib/all_components_initialized ]; then + return 0 + fi + + if [ ! -f "$enc_status_file" ]; then + #logmsg "EdgeNodeClusterStatus file not found" + if [ ! -f /var/lib/edge-node-cluster-mode ]; then + return 0 + else + # check to see if the persistent config file exists, if yes, then we need to + # wait until zedkube to publish the ENC status file + if [ -f /persist/status/zedagent/EdgeNodeClusterConfig/global.json ]; then + logmsg "EdgeNodeClusterConfig file found, but the EdgeNodeClusterStatus file is missing, wait..." + return 0 + fi + touch /var/lib/convert-to-single-node + reboot + fi + else + # record we have seen this ENC status file + if [ ! -f /var/lib/edge-node-cluster-mode ]; then + logmsg "EdgeNodeClusterStatus file found, but the node does not have edge-node-cluster-mode" + logmsg "*** check_cluster_config_change, before while loop. cluster_node_ip: $cluster_node_ip" # XXX + while true; do + if get_enc_status; then + logmsg "got the EdgeNodeClusterStatus successfully" + # mark it cluster mode before changing the config file + touch /var/lib/edge-node-cluster-mode + + # rotate the token with the new token + if [ "$is_bootstrap" = "true" ]; then + change_to_new_token + fi + + # remove previous multus config + remove_multus_cni + + # redo the multus config file in /etc/multus-daemonset-new.yaml + logmsg "Reapply Multus CNI for clusternodeip: $cluster_node_ip" + assign_multus_nodeip "$cluster_node_ip" + + # need to reapply node labels later + rm /var/lib/node-labels-initialized + + # kill the process and let the loop to restart k3s + terminate_k3s + # romove the /var/lib/rancher/k3s/server/tls directory files + if [ "$is_bootstrap" = "false" ]; then + rm -rf /var/lib/rancher/k3s/server/tls/* + # redo the debugger user role binding since certs are changed + rm /var/lib/debuguser-initialized + fi + + logmsg "provision config file for node to cluster mode" + provision_cluster_config_file true + + logmsg "WARNING: changing the node to cluster mode, done" + break + else + sleep 10 + fi + done + else + return 0 + fi + fi + logmsg "Check cluster config change done" +} + +monitor_cluster_config_change() { + while true; do + check_cluster_config_change + sleep 15 + done +} + +# provision the config.yaml and bootstrap-config.yaml for cluster node, passing $1 as k3s needs initializing +provision_cluster_config_file() { +# prepare the config.yaml and bootstrap-config.yaml on node +bootstrapContent=$(cat <<- EOF +cluster-init: true +token: "${cluster_token}" +tls-san: + - "${join_serverIP}" +flannel-iface: "${cluster_intf}" +node-ip: "${cluster_node_ip}" +node-name: "${HOSTNAME}" +EOF + ) +serverContent=$(cat <<- EOF +server: "https://${join_serverIP}:6443" +token: "${cluster_token}" +flannel-iface: "${cluster_intf}" +node-ip: "${cluster_node_ip}" +node-name: "${HOSTNAME}" +EOF + ) + + # we have 2 conditions, one is we are the bootstrap node or not, the other is we are + # the first time configure k3s cluster or not. If both are true, then we need bootstrap config + # otherwise, we just need normal server config to join the existing cluster + # check if is_bootstrap is true + if [ "$is_bootstrap" = "true" ]; then + #Bootstrap_Node=true + if [ "$1" = "true" ]; then + cp "$config_file" "$k3s_config_file" + echo "$bootstrapContent" >> "$k3s_config_file" + logmsg "bootstrap config.yaml configured with $join_serverIP and $HOSTNAME" + else # if we are in restart case, and we are the bootstrap node, wait for some other nodes to join + # we go here, means we can not find node to join the cluster, we have waited long enough + # but still put in the server config.yaml for now + logmsg "join the cluster, use server content config.yaml" + cp "$config_file" "$k3s_config_file" + #echo "$bootstrapContent" >> "$k3s_config_file" + echo "$serverContent" >> "$k3s_config_file" + fi + else + # non-bootstrap node, decide if we need to wait for the join server to be ready + #Bootstrap_Node=false + cp "$config_file" "$k3s_config_file" + echo "$serverContent" >> "$k3s_config_file" + logmsg "config.yaml configured with Join-ServerIP $join_serverIP and hostname $HOSTNAME" + if [ "$1" = true ]; then + logmsg "Check if the Endpoint https://$join_serverIP:6443 is in cluster mode, and wait if not..." + # Check if the join Server is available by kubernetes, wait here until it is ready + counter=0 + touch "$CLUSTER_WAIT_FILE" + while true; do + if curl --insecure --max-time 2 "https://$join_serverIP:6443" >/dev/null 2>&1; then + counter=$((counter+1)) + #logmsg "curl to Endpoint https://$join_serverIP:6443 ready, check cluster status" + # if we are here, check the bootstrap server is single or cluster mode + if ! status=$(curl --max-time 2 -s "http://$join_serverIP:$clusterStatusPort/status"); then + if [ $((counter % 30)) -eq 1 ]; then + logmsg "Attempt $counter: Failed to connect to the server. Waiting for 10 seconds..." + fi + elif [ "$status" != "cluster" ]; then + if [ $((counter % 30)) -eq 1 ]; then + logmsg "Attempt $counter: Server is not in 'cluster' status. Waiting for 10 seconds..." + fi + else + logmsg "Server is in 'cluster' status. done" + rm "$CLUSTER_WAIT_FILE" + break + fi + fi + sleep 10 + done + else + logmsg "restart case with k3s already installed, no need to wait" + fi + fi +} + DATESTR=$(date) echo "========================== $DATESTR ==========================" >> $INSTALL_LOG -echo "cluster-init.sh start for $HOSTNAME, uuid $DEVUUID" >> $INSTALL_LOG logmsg "Using ZFS persistent storage" +setup_prereqs + + +if [ -f /var/lib/convert-to-single-node ]; then + logmsg "remove /var/lib and copy saved single node /var/lib" + restore_var_lib + # assign node-ip to multus nodeIP for yaml config file + assign_multus_nodeip +fi +# since we can wait for long time, always start the containerd first +check_start_containerd +logmsg "containerd started" + +# task running in the background to check if the cluster config has changed +monitor_cluster_config_change & + +# if this is the first time to run install, we may wait for the +# cluster config and status +if [ ! -f /var/lib/all_components_initialized ]; then + logmsg "First time for k3s install" + + # if we are in edge-node cluster mode prepare the config.yaml and bootstrap-config.yaml + # for single node mode, we basically use the existing config.yaml + if [ -f /var/lib/edge-node-cluster-mode ]; then + provision_cluster_config_file true + else + logmsg "Single node mode prepare config.yaml for $HOSTNAME" + + # append the hostname to the config.yaml and bootstrap-config.yaml + cp "$config_file" "$k3s_config_file" + fi + + # assign node-ip to multus + assign_multus_nodeip "$cluster_node_ip" +else # a restart case, found all_components_initialized + # k3s initialized already and installed, get the config.yaml if not in cluster mode + if [ -f /var/lib/edge-node-cluster-mode ]; then + logmsg "Cluster config case, restarted k3s node, wait for cluster config" + while true; do + if get_enc_status; then + logmsg "got the EdgeNodeClusterStatus successfully" + break + else + sleep 10 + fi + done + # got the cluster config, make the config.ymal now + logmsg "Cluster config status ok, provision config.yaml and bootstrap-config.yaml" + provision_cluster_config_file false + logmsg "provision config.yaml done" + else # single node mode + logmsg "Single node mode, prepare config.yaml for $HOSTNAME" + cp "$config_file" "$k3s_config_file" + # append the hostname to the config.yaml + if ! grep -q node-name "$k3s_config_file"; then + echo "node-name: $HOSTNAME" >> "$k3s_config_file" + fi + fi +fi + +# use part of the /run/eve-release to get the OS-IMAGE string +get_eve_os_release + #Forever loop every 15 secs while true; do if [ ! -f /var/lib/all_components_initialized ]; then - if [ ! -f /var/lib/k3s_initialized ]; then - logmsg "Installing K3S version $K3S_VERSION on $HOSTNAME" - mkdir -p /var/lib/k3s/bin - /usr/bin/curl -sfL https://get.k3s.io | INSTALL_K3S_VERSION=${K3S_VERSION} INSTALL_K3S_SKIP_ENABLE=true INSTALL_K3S_BIN_DIR=/var/lib/k3s/bin sh - - logmsg "Initializing K3S version $K3S_VERSION" - ln -s /var/lib/k3s/bin/* /usr/bin - trigger_k3s_selfextraction - touch /var/lib/k3s_initialized - fi - - # Be kind to the API server - sleep 1 - - check_start_containerd if ! check_start_k3s; then continue fi - this_node_ready=$(kubectl get node "$HOSTNAME" -o json | jq '.status.conditions[] | select(.reason=="KubeletReady") | .status=="True"') - if [ "$this_node_ready" != "true" ]; then + # the k3s just started, may have crashed immediately, we need to continue to retry + # instead of waiting forever + start_time=$(date +%s) + while [ $(($(date +%s) - start_time)) -lt 120 ]; do + node_count_ready=$(kubectl get "node/${HOSTNAME}" | grep -cw Ready ) + if [ "$node_count_ready" -ne 1 ]; then + sleep 10 continue + else + break + fi + done + if [ "$node_count_ready" -ne 1 ]; then + continue fi - node_uuid_len=$(kubectl get nodes -l node-uuid="$DEVUUID" -o json | jq '.items | length') - if [ "$node_uuid_len" -eq 0 ]; then - logmsg "set node label with uuid $DEVUUID" - kubectl label node "$HOSTNAME" node-uuid="$DEVUUID" - fi + + # label the node with device uuid + apply_node_uuid_lable if ! are_all_pods_ready; then continue fi - if [ ! -f /var/lib/cni/bin ]; then - copy_cni_plugin_files - fi - if [ ! -f /var/lib/multus_initialized ]; then + if [ ! -f /etc/multus-daemonset-new.yaml ]; then + assign_multus_nodeip "$cluster_node_ip" + fi apply_multus_cni continue + if [ ! -f /var/lib/multus_initialized ]; then + logmsg "Failed to apply multus cni, wait a while" + sleep 10 + continue + fi fi if ! pidof dhcp; then + # if the dhcp.sock exist, then the daemon can not be restarted + if [ -f /run/cni/dhcp.sock ]; then + rm /run/cni/dhcp.sock + fi # launch CNI dhcp service /opt/cni/bin/dhcp daemon & fi @@ -482,7 +774,10 @@ if [ ! -f /var/lib/all_components_initialized ]; then # This patched version will be removed once the following PR https://github.com/kubevirt/kubevirt/pull/9668 is merged logmsg "Installing patched Kubevirt" kubectl apply -f /etc/kubevirt-operator.yaml + logmsg "Updating replica to 1 for virt-operator and virt-controller" + kubectl patch deployment virt-operator -n kubevirt --patch '{"spec":{"replicas": 1 }}' kubectl apply -f https://github.com/kubevirt/kubevirt/releases/download/${KUBEVIRT_VERSION}/kubevirt-cr.yaml + kubectl patch KubeVirt kubevirt -n kubevirt --patch '{"spec": {"infra": {"replicas": 1}}}' --type='merge' wait_for_item "cdi" #CDI (containerzed data importer) is need to convert qcow2/raw formats to Persistent Volumes and Data volumes @@ -512,25 +807,57 @@ if [ ! -f /var/lib/all_components_initialized ]; then touch /var/lib/longhorn_initialized fi - if [ -f /var/lib/k3s_initialized ] && [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ]; then + if [ -f /var/lib/kubevirt_initialized ] && [ -f /var/lib/longhorn_initialized ]; then logmsg "All components initialized" + touch /var/lib/node-labels-initialized touch /var/lib/all_components_initialized + sleep 5 + logmsg "stop the k3s server and wait for copy /var/lib" + terminate_k3s + sync + sleep 5 + save_var_lib + logmsg "saved the copy of /var/lib, done" fi else - check_start_containerd if ! check_start_k3s; then - while [ "$(kubectl get node "$HOSTNAME" -o json | jq '.status.conditions[] | select(.reason=="KubeletReady") | .status=="True"')" != "true" ]; - do - sleep 5; + start_time=$(date +%s) + while [ $(($(date +%s) - start_time)) -lt 120 ]; do + node_count_ready=$(kubectl get "node/${HOSTNAME}" | grep -cw Ready ) + if [ "$node_count_ready" -ne 1 ]; then + sleep 10 + pgrep -f "k3s server" > /dev/null 2>&1 + if [ $? -eq 1 ]; then + break + fi + continue + else + break + fi done + if [ "$node_count_ready" -ne 1 ]; then + logmsg "Node not ready, continue to to check_start_k3s" + continue + fi + else + if [ ! -f /var/lib/node-labels-initialized ]; then + reapply_node_labes + fi # Initialize CNI after k3s reboot - if [ ! -f /var/lib/cni/bin ]; then + if [ ! -d /var/lib/cni/bin ] || [ ! -d /opt/cni/bin ]; then copy_cni_plugin_files fi if [ ! -f /var/lib/multus_initialized ]; then + if [ ! -f /etc/multus-daemonset-new.yaml ]; then + assign_multus_nodeip "$cluster_node_ip" + fi apply_multus_cni fi if ! pidof dhcp; then + # if the dhcp.sock exist, then the daemon can not be restarted + if [ -f /run/cni/dhcp.sock ]; then + rm /run/cni/dhcp.sock + fi # launch CNI dhcp service /opt/cni/bin/dhcp daemon & fi @@ -538,9 +865,11 @@ else if [ ! -f /var/lib/debuguser-initialized ]; then config_cluster_roles else - cp /var/lib/rancher/k3s/user.yaml /run/.kube/k3s/user.yaml + if [ ! -e /run/.kube/k3s/user.yaml ]; then + cp /var/lib/rancher/k3s/user.yaml /run/.kube/k3s/user.yaml + fi fi - else + if [ -e /var/lib/longhorn_initialized ]; then check_overwrite_nsmounter fi @@ -554,6 +883,8 @@ fi check_log_file_size "k3s-install.log" check_log_file_size "eve-bridge.log" check_log_file_size "containerd-user.log" + check_kubeconfig_yaml_files + check_and_remove_excessive_k3s_logs check_and_run_vnc wait_for_item "wait" sleep 15 diff --git a/pkg/kube/cluster-utils.sh b/pkg/kube/cluster-utils.sh new file mode 100755 index 0000000000..56799fd31c --- /dev/null +++ b/pkg/kube/cluster-utils.sh @@ -0,0 +1,283 @@ +#!/bin/sh +# +# Copyright (c) 2024 Zededa, Inc. +# SPDX-License-Identifier: Apache-2.0 + +LOG_SIZE=$((5*1024*1024)) +K3s_LOG_FILE="k3s.log" +SAVE_KUBE_VAR_LIB_DIR="/persist/kube-save-var-lib" + +logmsg() { + local MSG + local TIME + MSG="$*" + TIME=$(date +"%F %T") + echo "$TIME : $MSG" >> "$INSTALL_LOG" +} + +check_network_connection () { + while true; do + ret=$(curl -o /dev/null -w "%{http_code}" -s "https://get.k3s.io") + if [ "$ret" -eq 200 ]; then + logmsg "Network is ready." + break; + else + logmsg "Network is not yet ready" + fi + sleep 5 + done +} + +setup_cgroup () { + echo "cgroup /sys/fs/cgroup cgroup defaults 0 0" >> /etc/fstab +} + +check_log_file_size() { + currentSize=$(wc -c <"$K3S_LOG_DIR/$1") + if [ "$currentSize" -gt "$LOG_SIZE" ]; then + if [ -f "$K3S_LOG_DIR/$1.2" ]; then + cp -p "$K3S_LOG_DIR/$1.2" "$K3S_LOG_DIR/$1.3" + fi + if [ -f "$K3S_LOG_DIR/$1.1" ]; then + cp -p "$K3S_LOG_DIR/$1.1" "$K3S_LOG_DIR/$1.2" + fi + # keep the original log file's attributes + cp -p "$K3S_LOG_DIR/$1" "$K3S_LOG_DIR/$1.1" + truncate -s 0 "$K3S_LOG_DIR/$1" + logmsg "k3s logfile $1, size $currentSize rotate" + fi +} + +# search and find the last occurrence of the k3s staring string in the file +# and gzip the content from that line to the end of the file +# do the entire file if the string is not found +gzip_last_restart_part() { + fileBaseName=$1 + targetFile=$2 + searchString="Starting k3s $K3S_VERSION" + + # Find the line number of the last occurrence of the search string, or 1 if not found + lastLine=$(grep -n -F "$searchString" "$fileBaseName" | tail -n 1 | cut -d: -f1) + lastLine=${lastLine:-1} + + # Extract the content from the last occurrence of the search string to the end + tail -n +"$lastLine" "$fileBaseName" | gzip -k -9 -c > "$targetFile" +} + +save_crash_log() { + if [ "$RESTART_COUNT" = "1" ]; then + return + fi + + # add timestamp to the filename for clear identification + timestamp=$(date +"%Y%m%d-%H%M%S") + # This pattern will alias with older crashes, but also a simple way to contain log bloat + crashLogBaseName="${K3s_LOG_FILE}.restart.${timestamp}.${RESTART_COUNT}.gz" + + gzip_last_restart_part "${K3S_LOG_DIR}/${K3s_LOG_FILE}" "${K3S_LOG_DIR}/${crashLogBaseName}" + + # Find and list files matching the pattern + matching_files="" + for file in "$K3S_LOG_DIR"/*; do + if echo "$file" | grep -q "${K3s_LOG_FILE}.restart.*.gz"; then + matching_files="$matching_files $file" + fi + done + matching_files=$(echo "$matching_files" | xargs) + file_count=$(echo "$matching_files" | wc -w) + + logmsg "total $file_count crash logs found in dir $K3S_LOG_DIR, file prefix $K3s_LOG_FILE" + if [ "$file_count" -gt 10 ]; then + files_to_delete=$(find "$K3S_LOG_DIR" -type f -name "${K3s_LOG_FILE}.restart.*.gz" -print0 | xargs -0 ls -t | tail -n +11) + echo "$files_to_delete" | while read -r file; do + rm -f "${K3S_LOG_DIR}/${file}" + done + fi +} + +# k3s can generate log files such as this: k3s-2024-07-30T20-29-31.172.log.gz +# they seem to be generated by raft operation warnings +# this check and remove is to prevent the log files from growing indefinitely +# keep the latest 10 log files and delete the rest +check_and_remove_excessive_k3s_logs() { + + # Directory to search in (current directory in this case) + search_dir="$K3S_LOG_DIR" + + # Regular expression pattern for date and time in the format YYYY-MM-DDTHH-MM-SS.mmm + pattern='k3s-[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}-[0-9]{2}-[0-9]{2}\.[0-9]{3}\.log\.gz' + + # Find and list files matching the pattern + matching_files=$(find "$search_dir" -type f -name 'k3s-*.log.gz' | grep -E "$pattern") + file_count=$(echo "$matching_files" | wc -w) + if [ "$file_count" -gt 10 ]; then + files_to_delete=$(echo "$matching_files" | grep ".log.gz" | tail -n +11) + echo "$files_to_delete" | while read -r file; do + rm -f "${K3S_LOG_DIR}/${file}" + done + fi +} + +# kubernetes's name must be lower case and '-' instead of '_' +convert_to_k8s_compatible() { + echo "$1" | tr '[:upper:]_' '[:lower:]-' +} + +# Function to check if a string is a valid UUID +is_valid_uuid() { + local uuid="$1" + if echo "$uuid" | grep -qE '^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$'; then + return 0 # Valid UUID + else + return 1 # Invalid UUID + fi +} + +remove_server_tls_dir() { + if [ -d /var/lib/rancher/k3s/server/tls ]; then + rm /var/lib/rancher/k3s/server/tls/request-header-ca.key + rm /var/lib/rancher/k3s/server/tls/server-ca.key + rm /var/lib/rancher/k3s/server/tls/etcd/peer-ca.key + rm /var/lib/rancher/k3s/server/tls/etcd/server-ca.crt + rm /var/lib/rancher/k3s/server/tls/request-header-ca.crt + rm /var/lib/rancher/k3s/server/tls/etcd/server-ca.key + rm /var/lib/rancher/k3s/server/cred/ipsec.psk + rm /var/lib/rancher/k3s/server/tls/server-ca.crt + rm /var/lib/rancher/k3s/server/tls/service.key + rm /var/lib/rancher/k3s/server/tls/client-ca.crt + rm /var/lib/rancher/k3s/server/tls/client-ca.key + rm /var/lib/rancher/k3s/server/tls/etcd/peer-ca.crt + fi +} + +remove_multus_cni() { + kubectl delete -f /etc/multus-daemonset-new.yaml + rm /etc/multus-daemonset-new.yaml + rm /var/lib/multus_initialized +} + +# save the /var/lib to /persist/kube-save-var-lib +save_var_lib() { + local dest_dir="${SAVE_KUBE_VAR_LIB_DIR}" + # Check if destination directory exists, if not create it + if [ ! -d "$dest_dir" ]; then + mkdir -p "$dest_dir" + fi + + # Remove everything in the destination directory + rm -rf "${dest_dir:?}"/* + + # Copy all contents from /var/lib to destination directory + cp -a /var/lib/. "$dest_dir" +} + +# Function to restore contents from /persist/kube-save-var-lib back to /var/lib +restore_var_lib() { + local source_dir="${SAVE_KUBE_VAR_LIB_DIR}" + # Remove everything under /var/lib + rm -rf /var/lib/* + + # Copy everything from /persist/kube-save-var-lib back to /var/lib + if [ -d "$source_dir" ]; then + cp -a "${source_dir}/." /var/lib + else + ## the saved files are missing, have do install again + Update_CheckNodeComponents + fi +} + +# when transitioning from single node to cluster mode, the k3s.yaml file may need +# to change with new certificates +check_kubeconfig_yaml_files() { + file1="/etc/rancher/k3s/k3s.yaml" + file2="/run/.kube/k3s/k3s.yaml" + + if ! cmp -s "$file1" "$file2"; then + logmsg "k3s.yaml files are different, copying $file1 to $file2" + cp "$file1" "$file2" + fi +} + +# get the OS-IMAGE name from the /run/eve-release +get_eve_os_release() { + # Wait for /run/eve-release to appear + while [ ! -f /run/eve-release ]; do + sleep 1 + done + + # Read the original name from /run/eve-release + eve_image_name=$(cat /run/eve-release) + + logmsg "EVE Release: $eve_image_name, write to /etc/os-release" + # Write the short name to /etc/os-release + echo "PRETTY_NAME=\"$eve_image_name\"" > /etc/os-release +} + +terminate_k3s() { + # Find the process ID of 'k3s server' + pid=$(pgrep -f 'k3s server') + + # If the process exists, kill it + if [ -n "$pid" ]; then + logmsg "Killing 'k3s server' process with PID: $pid" + kill "$pid" + else + logmsg "'k3s server' process not found" + fi +} + +# wait for debugging flag in /persist/k3s/wait_{flagname} if exist +wait_for_item() { + filename="/persist/k3s/wait_$1" + processname="k3s server" + while [ -e "$filename" ]; do + k3sproc="" + if pgrep -x "$processname" > /dev/null; then + k3sproc="k3s server is running" + else + k3sproc="k3s server is NOT running" + fi + logmsg "Found $filename file. $k3sproc, Waiting for 60 seconds..." + sleep 60 + done +} + +wait_for_device_name() { + logmsg "Waiting for DeviceName from controller..." + EdgeNodeInfoPath="/persist/status/zedagent/EdgeNodeInfo/global.json" + while [ ! -f $EdgeNodeInfoPath ]; do + sleep 5 + done + dName=$(jq -r '.DeviceName' $EdgeNodeInfoPath) + if [ -n "$dName" ]; then + HOSTNAME=$(convert_to_k8s_compatible "$dName") + fi + + # we should have the uuid since we got the device name + while true; do + DEVUUID=$(/bin/hostname) + if is_valid_uuid "$DEVUUID"; then + logmsg "got valid Device UUID: $DEVUUID" + break + else + sleep 5 + fi + done + + if ! grep -q node-name /etc/rancher/k3s/config.yaml; then + echo "node-name: $HOSTNAME" >> /etc/rancher/k3s/config.yaml + fi + logmsg "Hostname: $HOSTNAME" +} + +wait_for_default_route() { + while read -r iface dest gw flags refcnt use metric mask mtu window irtt; do + if [ "$dest" = "00000000" ] && [ "$mask" = "00000000" ]; then + logmsg "Default route found" + return 0 + fi + logmsg "waiting for default route $iface $dest $gw $flags $refcnt $use $metric $mask $mtu $window $irtt" + sleep 1 + done < /proc/net/route + return 1 +}