Skip to content

Commit

Permalink
handle all restart policies (#1649)
Browse files Browse the repository at this point in the history
* handle all restart policies

* fix lint error

* gofmt whitespace diff

Co-authored-by: abinthomasonline <abinthomasonline@gmail.com>
  • Loading branch information
abin-thomas-by and abinthomasonline authored Dec 30, 2022
1 parent e13d262 commit 84961f2
Showing 1 changed file with 18 additions and 17 deletions.
35 changes: 18 additions & 17 deletions pkg/controller.v1/tensorflow/tfjob_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -784,25 +784,26 @@ func (r *TFJobReconciler) ReconcilePods(
}
}
// Check if the pod is retryable.
if spec.RestartPolicy == commonv1.RestartPolicyExitCode {
if pod.Status.Phase == v1.PodFailed && train_util.IsRetryableExitCode(exitCode) {
logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name)
if err := r.PodControl.DeletePod(pod.Namespace, pod.Name, tfJob); err != nil {
return err
}
if pod.Status.Phase == v1.PodFailed &&
(spec.RestartPolicy == commonv1.RestartPolicyExitCode && train_util.IsRetryableExitCode(exitCode) ||
spec.RestartPolicy == commonv1.RestartPolicyOnFailure ||
spec.RestartPolicy == commonv1.RestartPolicyAlways) {
logger.Infof("Need to restart the pod: %v.%v", pod.Namespace, pod.Name)
if err := r.PodControl.DeletePod(pod.Namespace, pod.Name, tfJob); err != nil {
return err
}

// with common library framework, we have to handle restart status here
// or we won't know which replica has been restarted in updateJobStatus after reconciling all replicas
msg := fmt.Sprintf("TFJob %s is restarting because %s replica(s) failed.",
tfJob.Name, rtype)
r.Recorder.Event(tfJob, corev1.EventTypeWarning, tfJobRestartingReason, msg)
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, tfJobRestartingReason, msg)
if err != nil {
commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
return err
}
trainingoperatorcommon.RestartedJobsCounterInc(tfJob.Namespace, kubeflowv1.TFJobFrameworkName)
// with common library framework, we have to handle restart status here
// or we won't know which replica has been restarted in updateJobStatus after reconciling all replicas
msg := fmt.Sprintf("TFJob %s is restarting because %s replica(s) failed.",
tfJob.Name, rtype)
r.Recorder.Event(tfJob, corev1.EventTypeWarning, tfJobRestartingReason, msg)
err := commonutil.UpdateJobConditions(jobStatus, commonv1.JobRestarting, tfJobRestartingReason, msg)
if err != nil {
commonutil.LoggerForJob(tfJob).Infof("Append tfjob condition error: %v", err)
return err
}
trainingoperatorcommon.RestartedJobsCounterInc(tfJob.Namespace, kubeflowv1.TFJobFrameworkName)
}

updateJobReplicaStatuses(jobStatus, rtype, pod)
Expand Down

0 comments on commit 84961f2

Please sign in to comment.