fix: Clean up pods of fulfilled nodes when workflow manual retry. Fix… #12105

Open · wants to merge 2 commits into base: main
@@ -29,6 +29,7 @@ rules:
- list
- watch
- delete
- patch

@agilgur5 (Member) commented on Oct 30, 2023:
this is a pretty minor permission given the Server already has delete, but I was thinking this logic may make more sense on the Controller actually.

When the Controller detects a retry, it can check the Workflow's child Pods.

Need to think a bit more about how that would work, but that would preserve the existing separation of duties between Server and Controller, where the Server is just a simple intermediary for users that can be bypassed with correct RBAC.
The Server primarily reads and listens to changes, and its modifications are limited to signaling the Controller to perform an action (via a label, for instance). The Controller logic is actually responsible for performing the actions themselves (such as retrying).
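
As a rough sketch of that split (illustrative only, not what this PR implements): the Server would only patch a signal label onto the Workflow, and the Controller would own all Pod changes. The `workflows.argoproj.io/retry-requested` label and both function names below are hypothetical; the clientset and patch calls are the standard generated ones.

```go
// Server side: request a retry by labeling the Workflow; no Pod RBAC needed.
func requestRetry(ctx context.Context, wfClient versioned.Interface, namespace, name string) error {
	patch := []byte(`{"metadata": {"labels": {"workflows.argoproj.io/retry-requested": "true"}}}`)
	_, err := wfClient.ArgoprojV1alpha1().Workflows(namespace).Patch(
		ctx, name, types.MergePatchType, patch, metav1.PatchOptions{})
	return err
}

// Controller side (e.g. early in operate()): detect the signal, perform the
// Pod deletes/resets and node status resets itself, then clear the label so
// the signal is only handled once.
func (woc *wfOperationCtx) processRetrySignal(ctx context.Context) error {
	if woc.wf.Labels["workflows.argoproj.io/retry-requested"] != "true" {
		return nil
	}
	// ...delete Pods of failed nodes, relabel Pods of fulfilled nodes,
	// reset node statuses, then remove the signal label...
	return nil
}
```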

@agilgur5 (Member) commented on Oct 30, 2023:
Hmmm I see that podsToDelete already broke the separation a bit... we honestly may want to refactor that too...

@terrytangyuan do you have any thoughts on this? Specifically, the current behavior with the Server having Pod modification logic actually invalidates your answer in #12027 (comment). I think it ideally should behave the way you described there (and how I described above), if possible.

A Member commented:

You are right. RBAC modifications are required for my answer in #12027 (comment).

A Member commented:

@terrytangyuan I was actually looking for your thoughts on the approach. I think we should refactor this logic so the Server performs nothing but a label modification, and the Controller then actually deletes, resets, etc. child Pods as needed (as that is the Controller's responsibility, not the Server's).

@sarabala1979 (Member) commented on Nov 27, 2023:

@terrytangyuan I agree with @agilgur5. The Server should not update or delete anything other than Argo Workflow CRDs. If a user updates the Workflow spec directly, this will not work. The Controller should handle cleaning up the workflow's Pods based on the GC strategy.
I remember @ishitasequeira proposed deleting all of a workflow's Pods label-based (by workflow name), like the deletion that happens when the workflow completes.

A Member commented:

Julie agreed in #12419 (comment) as well, so I filed #12538 as a tracking issue for this refactor

A Member commented:

SGTM

- apiGroups:
- ""
resources:
@@ -29,6 +29,7 @@ rules:
- list
- watch
- delete
- patch
- apiGroups:
- ""
resources:
1 change: 1 addition & 0 deletions manifests/quick-start-minimal.yaml


1 change: 1 addition & 0 deletions manifests/quick-start-mysql.yaml


1 change: 1 addition & 0 deletions manifests/quick-start-postgres.yaml


16 changes: 15 additions & 1 deletion server/workflow/workflow_server.go
@@ -389,7 +389,7 @@ func (s *workflowServer) RetryWorkflow(ctx context.Context, req *workflowpkg.Wor
return nil, sutils.ToStatusError(err, codes.Internal)
}

wf, podsToDelete, err := util.FormulateRetryWorkflow(ctx, wf, req.RestartSuccessful, req.NodeFieldSelector, req.Parameters)
wf, podsToDelete, podsToReset, err := util.FormulateRetryWorkflow(ctx, wf, req.RestartSuccessful, req.NodeFieldSelector, req.Parameters)
if err != nil {
return nil, sutils.ToStatusError(err, codes.Internal)
}
@@ -402,6 +402,20 @@ }
}
}

for _, podName := range podsToReset {
log.WithFields(log.Fields{"podReset": podName}).Info("Resetting pod")
_, err := kubeClient.CoreV1().Pods(wf.Namespace).Patch(
ctx,
podName,
types.MergePatchType,
[]byte(`{"metadata": {"labels": {"workflows.argoproj.io/completed": "false"}}}`),
metav1.PatchOptions{},
)
if err != nil && !apierr.IsNotFound(err) {
return nil, sutils.ToStatusError(err, codes.Internal)
}
}

err = s.hydrator.Dehydrate(wf)
if err != nil {
return nil, sutils.ToStatusError(err, codes.Internal)
17 changes: 16 additions & 1 deletion server/workflowarchive/archived_workflow_server.go
@@ -17,6 +17,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/apimachinery/pkg/types"

"github.com/argoproj/argo-workflows/v3/persist/sqldb"
workflowarchivepkg "github.com/argoproj/argo-workflows/v3/pkg/apiclient/workflowarchive"
@@ -286,7 +287,7 @@ func (w *archivedWorkflowServer) RetryArchivedWorkflow(ctx context.Context, req
_, err = wfClient.ArgoprojV1alpha1().Workflows(req.Namespace).Get(ctx, wf.Name, metav1.GetOptions{})
if apierr.IsNotFound(err) {

wf, podsToDelete, err := util.FormulateRetryWorkflow(ctx, wf, req.RestartSuccessful, req.NodeFieldSelector, req.Parameters)
wf, podsToDelete, podsToReset, err := util.FormulateRetryWorkflow(ctx, wf, req.RestartSuccessful, req.NodeFieldSelector, req.Parameters)
if err != nil {
return nil, sutils.ToStatusError(err, codes.Internal)
}
@@ -299,6 +300,20 @@ func (w *archivedWorkflowServer) RetryArchivedWorkflow(ctx context.Context, req
}
}

for _, podName := range podsToReset {

@agilgur5 (Member) commented on Oct 30, 2023:
I was gonna say that this may make sense to place into a helper function, but I see that #7988 (comment) explicitly moved the kubeClient logic out of the helper function (when retry was introduced for archived workflows in #7988).

So this matches existing behavior with podsToDelete above. If we refactored these into the Controller (per above comment), this would go away though 🤔
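
For reference, a hypothetical shared helper along those lines; the name and signature are illustrative, not part of this PR, and assume the `kubeClient` can be passed in:

```go
// resetPodsCompletedLabel relabels the given Pods as not-completed so the
// Controller reconciles them again on retry; already-deleted Pods are ignored.
func resetPodsCompletedLabel(ctx context.Context, kubeClient kubernetes.Interface, namespace string, podNames []string) error {
	patch := []byte(`{"metadata": {"labels": {"workflows.argoproj.io/completed": "false"}}}`)
	for _, podName := range podNames {
		_, err := kubeClient.CoreV1().Pods(namespace).Patch(ctx, podName, types.MergePatchType, patch, metav1.PatchOptions{})
		if err != nil && !apierr.IsNotFound(err) {
			return err
		}
	}
	return nil
}
```

Both RetryWorkflow and RetryArchivedWorkflow could then call it in place of the inlined loops.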

log.WithFields(log.Fields{"podReset": podName}).Info("Resetting pod")
_, err := kubeClient.CoreV1().Pods(wf.Namespace).Patch(
ctx,
podName,
types.MergePatchType,
[]byte(`{"metadata": {"labels": {"workflows.argoproj.io/completed": "false"}}}`),
metav1.PatchOptions{},
)
if err != nil && !apierr.IsNotFound(err) {
return nil, sutils.ToStatusError(err, codes.Internal)
}
}

wf.ObjectMeta.ResourceVersion = ""
wf.ObjectMeta.UID = ""
result, err := wfClient.ArgoprojV1alpha1().Workflows(req.Namespace).Create(ctx, wf, metav1.CreateOptions{})
3 changes: 2 additions & 1 deletion workflow/controller/operator.go
@@ -1099,7 +1099,8 @@ func (woc *wfOperationCtx) podReconciliation(ctx context.Context) error {
wfNodesLock.Lock()
defer wfNodesLock.Unlock()
node, err := woc.wf.Status.Nodes.Get(nodeID)
if err == nil {
// Pods of fulfilled nodes may have been relabeled completed=false by a manual workflow retry.
if err == nil && !node.Phase.Fulfilled() {
if newState := woc.assessNodeStatus(pod, node); newState != nil {
woc.addOutputsToGlobalScope(newState.Outputs)
if newState.MemoizationStatus != nil {
2 changes: 1 addition & 1 deletion workflow/controller/operator_test.go
@@ -5677,7 +5677,7 @@ status:
name: my-wf
phase: Failed
`)
wf, _, err := util.FormulateRetryWorkflow(context.Background(), wf, false, "", []string{"message=modified"})
wf, _, _, err := util.FormulateRetryWorkflow(context.Background(), wf, false, "", []string{"message=modified"})
if assert.NoError(t, err) {
cancel, controller := newController(wf)
defer cancel()
25 changes: 16 additions & 9 deletions workflow/util/util.go
@@ -831,15 +831,15 @@ func resetConnectedParentGroupNodes(oldWF *wfv1.Workflow, newWF *wfv1.Workflow,
}

// FormulateRetryWorkflow formulates a previous workflow to be retried, deleting all failed steps as well as the onExit node (and children)
func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSuccessful bool, nodeFieldSelector string, parameters []string) (*wfv1.Workflow, []string, error) {
func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSuccessful bool, nodeFieldSelector string, parameters []string) (*wfv1.Workflow, []string, []string, error) {
switch wf.Status.Phase {
case wfv1.WorkflowFailed, wfv1.WorkflowError:
case wfv1.WorkflowSucceeded:
if !(restartSuccessful && len(nodeFieldSelector) > 0) {
return nil, nil, errors.Errorf(errors.CodeBadRequest, "To retry a succeeded workflow, set the options restartSuccessful and nodeFieldSelector")
return nil, nil, nil, errors.Errorf(errors.CodeBadRequest, "To retry a succeeded workflow, set the options restartSuccessful and nodeFieldSelector")
}
default:
return nil, nil, errors.Errorf(errors.CodeBadRequest, "Cannot retry a workflow in phase %s", wf.Status.Phase)
return nil, nil, nil, errors.Errorf(errors.CodeBadRequest, "Cannot retry a workflow in phase %s", wf.Status.Phase)
}

newWF := wf.DeepCopy()
@@ -870,21 +870,22 @@ func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSucce
}
err := overrideParameters(newWF, parameters)
if err != nil {
return nil, nil, err
return nil, nil, nil, err
}
}

onExitNodeName := wf.ObjectMeta.Name + ".onExit"
// Get all children of nodes that match filter
nodeIDsToReset, err := getNodeIDsToReset(restartSuccessful, nodeFieldSelector, wf.Status.Nodes)
if err != nil {
return nil, nil, err
return nil, nil, nil, err
}

// Iterate the previous nodes. If it was successful Pod carry it forward
deletedNodes := make(map[string]bool)
deletedPods := make(map[string]bool)
var podsToDelete []string
var podsToReset []string
var resetParentGroupNodes []string
for _, node := range wf.Status.Nodes {
doForceResetNode := false
@@ -906,7 +907,7 @@ func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSucce
childNode, err := wf.Status.Nodes.Get(child)
if err != nil {
log.Fatalf("was unable to obtain node for %s due to %s", child, err)
return nil, nil, fmt.Errorf("Was unable to obtain node for %s due to %s", child, err)
return nil, nil, nil, fmt.Errorf("Was unable to obtain node for %s due to %s", child, err)
}
if _, present := nodeIDsToReset[child]; present {
log.Debugf("Group node %s needs to reset since its child %s is in the force reset path", node.Name, childNode.Name)
@@ -936,7 +937,7 @@ func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSucce
descendantNode, err := wf.Status.Nodes.Get(descendantNodeID)
if err != nil {
log.Fatalf("Was unable to obtain node for %s due to %s", descendantNodeID, err)
return nil, nil, fmt.Errorf("Was unable to obtain node for %s due to %s", descendantNodeID, err)
return nil, nil, nil, fmt.Errorf("Was unable to obtain node for %s due to %s", descendantNodeID, err)
}
if descendantNode.Type == wfv1.NodeTypePod {
newWF, resetParentGroupNodes = resetConnectedParentGroupNodes(wf, newWF, node, resetParentGroupNodes)
@@ -952,6 +953,12 @@ func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSucce
}
} else {
if !containsNode(resetParentGroupNodes, node.ID) {
if node.Type == wfv1.NodeTypePod {
templateName := GetTemplateFromNode(node)
version := GetWorkflowPodNameVersion(wf)
podName := GeneratePodName(wf.Name, node.Name, templateName, node.ID, version)
podsToReset = append(podsToReset, podName)
}
log.Debugf("Node %s remains as is", node.Name)
newWF.Status.Nodes.Set(node.ID, node)
}
@@ -971,7 +978,7 @@ func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSucce
// do not add this status to the node. pretend as if this node never existed.
default:
// Do not allow retry of workflows with pods in Running/Pending phase
return nil, nil, errors.InternalErrorf("Workflow cannot be retried with node %s in %s phase", node.Name, node.Phase)
return nil, nil, nil, errors.InternalErrorf("Workflow cannot be retried with node %s in %s phase", node.Name, node.Phase)
}

if node.Name == wf.ObjectMeta.Name {
@@ -1015,7 +1022,7 @@ func FormulateRetryWorkflow(ctx context.Context, wf *wfv1.Workflow, restartSucce
newWF.Status.StoredTemplates[id] = tmpl
}

return newWF, podsToDelete, nil
return newWF, podsToDelete, podsToReset, nil
}

func resetNode(node wfv1.NodeStatus) wfv1.NodeStatus {