Node Drain Request/Status API #4366

Merged Nov 6, 2024 (1 commit)
pkg/pillar/docs/zedkube.md (+153 lines)
# Clustered eve nodes (zedkube)

## Overview

zedkube hosts the components that let clustered eve-os nodes coordinate node-level Kubernetes operations. This document covers the node drain request/status flow used to safely remove workloads and volume replicas from a node before a disruptive operation.

## Components

### kubenodeop

kubenodeop handles cordoning, uncordoning, and draining of clustered eve-os nodes.
Any given node could be hosting one or more longhorn volume replicas and thus could be the rebuild source for other nodes' replicas.
A drain operation should be performed before any node operation or node command which can cause an extended outage of a node, such as a reboot, shutdown, or reset.
zedkube subscribes to NodeDrainRequest objects; kubenodeop initiates the requested drain and publishes progress as NodeDrainStatus objects.

### kubeapi

1. `kubeapi.GetNodeDrainStatus()` to determine if the system supports drain
    - HV!=kubevirt: NOTSUPPORTED
    - HV=kubevirt will return:
        - NOTSUPPORTED if in single-node mode
        - NOTREQUESTED if in cluster mode
1. `kubeapi.RequestNodeDrain()` to begin a drain
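The support check above can be sketched as a small self-contained function; `drainSupport` and its boolean parameters are hypothetical names for illustration, while the real `kubeapi.GetNodeDrainStatus()` consults build tags and cluster configuration:

```go
package main

import "fmt"

// DrainStatus values mirror the order in pkg/pillar/kubeapi/kubetypes.go;
// only the three needed for the support check are declared here.
type DrainStatus uint8

const (
	UNKNOWN DrainStatus = iota
	NOTSUPPORTED
	NOTREQUESTED
)

// drainSupport models the decision table: drain is only meaningful on
// HV=kubevirt builds that are members of a cluster.
func drainSupport(hvKubevirt, clustered bool) DrainStatus {
	if !hvKubevirt || !clustered {
		return NOTSUPPORTED
	}
	return NOTREQUESTED
}

func main() {
	fmt.Println(drainSupport(true, true) == NOTREQUESTED)  // true
	fmt.Println(drainSupport(true, false) == NOTSUPPORTED) // true
}
```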

### Drain PubSub setup (node reboot/shutdown)

1. zedagent/handlenodedrain.go:`initNodeDrainPubSub()`
    - subscribes to NodeDrainStatus from zedkube
    - creates publication of NodeDrainRequest
1. nodeagent/handlenodedrain.go:`initNodeDrainPubSub()`
    - subscribes to NodeDrainStatus from zedkube

### Drain Request path (node reboot/shutdown)

1. zedagent/parseconfig.go:`scheduleDeviceOperation()`
    - If `shouldDeferForNodeDrain()` is true:
        - set the deferred reboot/shutdown command state in zedagentContext
1. zedagent/handlenodedrain.go:`shouldDeferForNodeDrain()`
    - NodeDrainStatus == (NOTREQUESTED || FAILEDCORDON || FAILEDDRAIN):
        - drain is requested via `kubeapi.RequestNodeDrain()`
        - return Defer
    - NodeDrainStatus == (UNKNOWN || NOTSUPPORTED || COMPLETE):
        - return !Defer
    - NodeDrainStatus == (REQUESTED || STARTING || CORDONED || DRAINRETRYING):
        - return Defer
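The defer decision table above can be sketched as a self-contained function. The enum order mirrors pkg/pillar/kubeapi/kubetypes.go; note that in the real handler, requesting the drain happens as a side effect, which this sketch only notes in a comment:

```go
package main

import "fmt"

type DrainStatus uint8

// Mirrors the DrainStatus declaration order in pkg/pillar/kubeapi/kubetypes.go.
const (
	UNKNOWN DrainStatus = iota
	NOTSUPPORTED
	NOTREQUESTED
	REQUESTED
	STARTING
	CORDONED
	FAILEDCORDON
	DRAINRETRYING
	FAILEDDRAIN
	COMPLETE
)

// shouldDeferForNodeDrain models the table above: states where a drain should
// be kicked off or is already in flight defer the device operation; terminal
// or unsupported states let it proceed.
func shouldDeferForNodeDrain(s DrainStatus) bool {
	switch s {
	case NOTREQUESTED, FAILEDCORDON, FAILEDDRAIN:
		// the real handler also calls kubeapi.RequestNodeDrain() here
		return true
	case UNKNOWN, NOTSUPPORTED, COMPLETE:
		return false
	default: // REQUESTED, STARTING, CORDONED, DRAINRETRYING
		return true
	}
}

func main() {
	fmt.Println(shouldDeferForNodeDrain(NOTREQUESTED), shouldDeferForNodeDrain(COMPLETE))
	// prints: true false
}
```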

### Drain Status Handler (node reboot/shutdown)

1. zedagent/handlenodedrain.go:`handleNodeDrainStatusImpl()`
    - NodeDrainStatus == FAILEDCORDON or FAILEDDRAIN:
        - unpublish NodeDrainRequest
1. nodeagent/handlenodedrain.go:`handleNodeDrainStatusImplNA()`
    - NodeDrainStatus >= REQUESTED and < COMPLETE:
        - republish nodeagentstatus with drainInProgress set
    - NodeDrainStatus == COMPLETE:
        - republish nodeagentstatus with drainInProgress cleared
1. zedagent/zedagent.go:`handleNodeAgentStatusImpl()`
    - If there is a deferred device op and nodeagent status reports drain complete:
        - process the deferred reboot/shutdown

### Drain PubSub setup (node eveimage-update)

1. baseosmgr/handlenodedrain.go:`initNodeDrainPubSub()`
    - subscribes to NodeDrainStatus from zedkube
    - creates publication of NodeDrainRequest

### Drain Request path (node eveimage-update)

1. baseosmgr/handlebaseos.go:`baseOsHandleStatusUpdateUUID()`
    - If the BaseOs download is complete (LOADING||LOADED||INSTALLED), it is not currently Activated, and the new config requests it Activated:
        - check `shouldDeferForNodeDrain()`; if defer is requested, return, as drain completion will later complete this BaseOsStatus update
1. baseosmgr/handlenodedrain.go:`shouldDeferForNodeDrain()`
    - NodeDrainStatus == (NOTREQUESTED || FAILEDCORDON || FAILEDDRAIN):
        - save BaseOsId in baseOsMgrContext.deferredBaseOsID
        - drain is requested via `kubeapi.RequestNodeDrain()`
        - return Defer
    - NodeDrainStatus == (UNKNOWN || NOTSUPPORTED || COMPLETE):
        - return !Defer
    - NodeDrainStatus == (REQUESTED || STARTING || CORDONED || DRAINRETRYING):
        - return Defer

### Drain Status Handler (node eve-image update)

1. baseosmgr/handlenodedrain.go:`handleNodeDrainStatusImpl()`
    - NodeDrainStatus == FAILEDCORDON or FAILEDDRAIN:
        - unpublish NodeDrainRequest
    - NodeDrainStatus == COMPLETE:
        - resume the deferred update by passing baseOsMgrContext.deferredBaseOsID to `baseOsHandleStatusUpdateUUID()`

### General DrainRequest Processing

1. zedkube/zedkube.go:`Run()`
    - subscribes to NodeDrainRequest from zedagent and baseosmgr
    - creates publication of NodeDrainStatus
    - initializes NodeDrainStatus to NOTSUPPORTED
1. zedkube/zedkube.go:`handleEdgeNodeClusterConfigImpl()`
    - system switching to cluster membership: NodeDrainStatus -> NOTREQUESTED
1. zedkube/zedkube.go:`handleEdgeNodeClusterConfigDelete()`
    - system switching to single node: NodeDrainStatus -> NOTSUPPORTED
1. zedkube/handlenodedrain.go:`handleNodeDrainRequestImpl()`
    - NodeDrainStatus -> REQUESTED
1. zedkube/kubenodeop.go:`cordonAndDrainNode()`
    - NodeDrainStatus -> STARTING
    - retry cordon up to 10 times (in case the k8s API reports the object changed)
        - when retries are exhausted: NodeDrainStatus -> FAILEDCORDON
    - NodeDrainStatus -> CORDONED
    - retry drain up to 5 times
        - between tries: NodeDrainStatus -> DRAINRETRYING
        - on failure: NodeDrainStatus -> FAILEDDRAIN
    - NodeDrainStatus -> COMPLETE

## Debugging

### PubSub NodeDrainRequest/NodeDrainStatus

```console
/run/zedagent/NodeDrainRequest/global.json
/run/baseosmgr/NodeDrainRequest/global.json
/run/zedkube/NodeDrainStatus/global.json
```

The current node drain progress is available from the global NodeDrainStatus object:

```console
cat /run/zedkube/NodeDrainStatus/global.json | jq .
```

NodeDrainStatus can be forced by writing the object (in the pillar service container filesystem) to /persist/kube-status/force-NodeDrainStatus-global.json.

e.g. to force disable drain:

```console
echo '{"Status":1,"RequestedBy":1}' > /persist/kube-status/force-NodeDrainStatus-global.json
```

e.g. to force deviceop drain complete:

```console
echo '{"Status":9,"RequestedBy":2}' > /persist/kube-status/force-NodeDrainStatus-global.json
```

e.g. to force baseosmgr drain complete:

```console
echo '{"Status":9,"RequestedBy":3}' > /persist/kube-status/force-NodeDrainStatus-global.json
```

"Cannot evict pod as it would violate the pod's disruption budget":
A drain can get stuck when attempting to drain a node running a pod with an
explicit spec.nodeName set to the node being drained. Delete the pod to continue.
If the workload is a statefulset declaring spec.nodeName and the node is already cordoned, deleting the pod is not sufficient:
the statefulset must be deleted.

### NodeDrainRequest/NodeDrainStatus log strings

- NodeDrainRequest
- NodeDrainStatus
- cordonNode
- cordonAndDrainNode
- scheduleDeviceOperation
- baseOsHandleStatusUpdateUUID
- nodedrain-step
- kubevirt_node_drain_completion_time_seconds
```console
zgrep 'kubevirt_node_drain_completion_time_seconds' /persist/newlog/keepSentQueue/dev.log.1725511530990.gz | jq -r .content | jq -r .msg | cut -d ':' -f 2
s34.559219
```
pkg/pillar/kubeapi/kubetypes.go (+75 lines)
// Copyright (c) 2024 Zededa, Inc.
// SPDX-License-Identifier: Apache-2.0

package kubeapi

import "time"

// DrainStatus tracks progress of draining a node of replica disks and workloads
type DrainStatus uint8

const (
	UNKNOWN       DrainStatus = iota // UNKNOWN Unable to determine
	NOTSUPPORTED                     // NOTSUPPORTED System not (HV=kubevirt and clustered)
	NOTREQUESTED                     // NOTREQUESTED Not yet requested
	REQUESTED                        // REQUESTED From zedagent device operation or baseosmgr new update
	STARTING                         // STARTING Zedkube go routine started, not yet cordoned
	CORDONED                         // CORDONED Node Unschedulable set
	FAILEDCORDON                     // FAILEDCORDON Node modification unable to apply
	DRAINRETRYING                    // DRAINRETRYING Drain retry in progress, could be retried replica rebuild
	FAILEDDRAIN                      // FAILEDDRAIN Could be retried replica rebuild
	COMPLETE                         // COMPLETE All node workloads removed from system
)

func (status DrainStatus) String() string {
	switch status {
	case UNKNOWN:
		return "Unknown"
	case NOTSUPPORTED:
		return "Not Supported"
	case NOTREQUESTED:
		return "Not Requested"
	case REQUESTED:
		return "Requested"
	case STARTING:
		return "Starting"
	case CORDONED:
		return "Cordoned"
	case FAILEDCORDON:
		return "Failed Cordon"
	case DRAINRETRYING:
		return "Drain Retrying"
	case FAILEDDRAIN:
		return "Failed Drain"
	case COMPLETE:
		return "Complete"
	default:
		return "Unknown"
	}
}

// DrainRequester is a user initiated edge-node operation from a pillar microservice
type DrainRequester uint8

const (
	NONE     DrainRequester = iota + 1 // NONE - The default value
	DEVICEOP                           // DEVICEOP - Node Reboot or shutdown
	UPDATE                             // UPDATE - baseos update
)

// NodeDrainRequest is the trigger to NodeDrainStatus
//
// Used by Reboots, Prepare-Shutdown, baseos updates
type NodeDrainRequest struct {
	RequestedAt time.Time
	RequestedBy DrainRequester
	Context     string
}

// NodeDrainStatus is a response to NodeDrainRequest
//
// Subscribe to updates to continue NodeDrainRequest operations.
type NodeDrainStatus struct {
	Status      DrainStatus
	RequestedBy DrainRequester
}
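The DrainStatus declaration order matters: the "NodeDrainStatus >= REQUESTED and < COMPLETE" check that nodeagent uses to set drainInProgress relies on it. A self-contained sketch (with the enum redeclared locally, and `drainInProgress` as a hypothetical helper name):

```go
package main

import "fmt"

type DrainStatus uint8

// Mirrors the declaration order in pkg/pillar/kubeapi/kubetypes.go.
const (
	UNKNOWN DrainStatus = iota
	NOTSUPPORTED
	NOTREQUESTED
	REQUESTED
	STARTING
	CORDONED
	FAILEDCORDON
	DRAINRETRYING
	FAILEDDRAIN
	COMPLETE
)

// drainInProgress treats everything from REQUESTED up to (but excluding)
// COMPLETE as an in-flight drain; note this includes the FAILEDCORDON and
// FAILEDDRAIN states, which sit inside that range by declaration order.
func drainInProgress(s DrainStatus) bool {
	return s >= REQUESTED && s < COMPLETE
}

func main() {
	fmt.Println(drainInProgress(CORDONED), drainInProgress(COMPLETE))
	// prints: true false
}
```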
pkg/pillar/kubeapi/nodedrain.go (+100 lines)
// Copyright (c) 2024 Zededa, Inc.
// SPDX-License-Identifier: Apache-2.0

//go:build kubevirt

package kubeapi

import (
	"encoding/json"
	"fmt"
	"os"
	"time"

	"github.com/lf-edge/eve/pkg/pillar/base"
	"github.com/lf-edge/eve/pkg/pillar/pubsub"
)

// An alternate path to force a drain status in the event of a drain issue.
const forceNodeDrainPath string = "/persist/kube-status/force-NodeDrainStatus-global.json"

// RequestNodeDrain generates the NodeDrainRequest object and publishes it
func RequestNodeDrain(pubNodeDrainRequest pubsub.Publication, requester DrainRequester, context string) error {
	drainReq := NodeDrainRequest{
		RequestedAt: time.Now(),
		RequestedBy: requester,
		Context:     context,
	}
	err := pubNodeDrainRequest.Publish("global", drainReq)
	if err != nil {
		return fmt.Errorf("RequestNodeDrain: error publishing drain request: %v", err)
	}
	return nil
}

// GetDrainStatusOverride : an alternate way to set drain status for debug
func GetDrainStatusOverride(log *base.LogObject) *NodeDrainStatus {
	if _, err := os.Stat(forceNodeDrainPath); err != nil {
		return nil
	}
	b, err := os.ReadFile(forceNodeDrainPath)
	if err != nil {
		log.Warnf("Unable to read %s: %v", forceNodeDrainPath, err)
		return nil
	}
	cfg := NodeDrainStatus{}
	err = json.Unmarshal(b, &cfg)
	if err != nil {
		log.Warnf("Unable to Unmarshal %s to NodeDrainStatus: %v", forceNodeDrainPath, err)
		return nil
	}
	if cfg.Status == COMPLETE {
		err = os.Remove(forceNodeDrainPath)
		if err != nil {
			log.Warnf("could not remove %s: %v", forceNodeDrainPath, err)
		}
	}
	return &cfg
}

// CleanupDrainStatusOverride is used at microservice startup to cleanup
// a previously user-written override file
func CleanupDrainStatusOverride(log *base.LogObject) {
	if _, err := os.Stat(forceNodeDrainPath); err != nil {
		return
	}
	if err := os.Remove(forceNodeDrainPath); err != nil {
		log.Warnf("CleanupDrainStatusOverride could not remove %s: %v", forceNodeDrainPath, err)
	}
}

// DrainStatusFaultInjectionWait reports whether the drain status goroutine
// should wait: it waits for as long as this file exists.
func DrainStatusFaultInjectionWait() bool {
	injectFaultPath := "/tmp/DrainStatus_FaultInjection_Wait"
	_, err := os.Stat(injectFaultPath)
	return err == nil
}

// GetNodeDrainStatus is a wrapper to either return the latest NodeDrainStatus
//
// or return a forced status from /persist/kube-status/force-NodeDrainStatus-global.json
func GetNodeDrainStatus(subNodeDrainStatus pubsub.Subscription, log *base.LogObject) *NodeDrainStatus {
	override := GetDrainStatusOverride(log)
	if override != nil {
		return override
	}

	items := subNodeDrainStatus.GetAll()
	glbStatus, ok := items["global"].(NodeDrainStatus)
	if !ok {
		// This should only be expected on an HV=kubevirt build
		// and only very early in boot (before zedkube starts)
		return &NodeDrainStatus{Status: UNKNOWN, RequestedBy: NONE}
	}
	return &glbStatus
}
pkg/pillar/kubeapi/nokube.go (+13 lines)
package kubeapi

import (
	"fmt"
	"time"

	"github.com/lf-edge/eve/pkg/pillar/base"
...
func GetPVCList(*base.LogObject) ([]string, error) {
	panic("GetPVCList is not built")
}

// RequestNodeDrain is a stub for non-kubevirt builds; drain should never be
// requested here, so the error flags an unexpected call.
func RequestNodeDrain(pubsub.Publication, DrainRequester, string) error {
	return fmt.Errorf("nokube requested drain, should not get here")
}

// GetNodeDrainStatus is a stub for non-kubevirt builds; drain is never
// supported here, so no in-progress operations need to be queried.
func GetNodeDrainStatus(pubsub.Subscription, *base.LogObject) *NodeDrainStatus {
	return &NodeDrainStatus{Status: NOTSUPPORTED}
}