From 25ca0577a913cd38e114fc198a1428114a4ef1fb Mon Sep 17 00:00:00 2001 From: Eric Weber Date: Fri, 30 Aug 2024 16:28:51 -0500 Subject: [PATCH] fix: attempt to delete unknown replica instances on cleanup Longhorn 6552 Signed-off-by: Eric Weber --- controller/replica_controller.go | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/controller/replica_controller.go b/controller/replica_controller.go index 8ed954971b..2930827080 100644 --- a/controller/replica_controller.go +++ b/controller/replica_controller.go @@ -543,7 +543,8 @@ func (rc *ReplicaController) DeleteInstance(obj interface{}) (err error) { } } - if im.Status.CurrentState != longhorn.InstanceManagerStateRunning { + if shouldSkip, skipReason := shouldSkipReplicaDeletion(im.Status.CurrentState); shouldSkip { + log.Infof("Skipping deleting replica %v since %s", r.Name, skipReason) return nil } @@ -555,14 +556,15 @@ func (rc *ReplicaController) DeleteInstance(obj interface{}) (err error) { defer func() { if err != nil { log.WithError(err).Warnf("Failed to delete replica process %v", r.Name) - if isDelinquent { - log.Warnf("Ignored the failure of deleting replica process %v because the RWX volume is currently delinquent", r.Name) + if canIgnore, ignoreReason := canIgnoreReplicaDeletionFailure(im.Status.CurrentState, + isDelinquent); canIgnore { + log.Warnf("Ignored the failure to delete replica process %v because %s", r.Name, ignoreReason) err = nil } } }() - c, err := engineapi.NewInstanceManagerClient(im, false) + c, err := engineapi.NewInstanceManagerClient(im, true) if err != nil { return err } @@ -888,3 +890,25 @@ func hasMatchingReplica(replica *longhorn.Replica, replicas map[string]*longhorn } return false } + +func shouldSkipReplicaDeletion(imState longhorn.InstanceManagerState) (canSkip bool, reason string) { + // If the instance manager is in an unknown state, we should at least attempt instance deletion. + if imState == longhorn.InstanceManagerStateRunning || imState == longhorn.InstanceManagerStateUnknown { + return false, "" + } + + return true, fmt.Sprintf("instance manager is in %v state", imState) +} + +func canIgnoreReplicaDeletionFailure(imState longhorn.InstanceManagerState, isDelinquent bool) (canIgnore bool, reason string) { + // Instance deletion is always best effort for an unknown instance manager. + if imState == longhorn.InstanceManagerStateUnknown { + return true, fmt.Sprintf("instance manager is in %v state", imState) + } + + if isDelinquent { + return true, "the RWX volume is delinquent" + } + + return false, "" +}