From 42c501bb80b2ee55c52efa232bd0ea5b33787cdb Mon Sep 17 00:00:00 2001 From: yowenter Date: Fri, 21 Apr 2023 16:57:39 +0800 Subject: [PATCH] fix job active count; count terminating pod as failed Signed-off-by: yowenter --- pkg/controller.v1/common/status_test.go | 15 ++++++++++++--- pkg/core/status.go | 9 ++++++++- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/pkg/controller.v1/common/status_test.go b/pkg/controller.v1/common/status_test.go index 80680a24..30d93f01 100644 --- a/pkg/controller.v1/common/status_test.go +++ b/pkg/controller.v1/common/status_test.go @@ -2,10 +2,12 @@ package common import ( "testing" + "time" apiv1 "github.com/kubeflow/common/pkg/apis/common/v1" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" + metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) func TestUpdateJobReplicaStatuses(t *testing.T) { @@ -14,13 +16,14 @@ func TestUpdateJobReplicaStatuses(t *testing.T) { _, ok := jobStatus.ReplicaStatuses["worker"] // assert ReplicaStatus for "worker" exists assert.True(t, ok) - setStatusForTest(&jobStatus, "worker", 2, 3, 1) - assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(2)) + setStatusForTest(&jobStatus, "worker", 2, 3, 1, 1) + // terminating pod should count as failed. + assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(3)) assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Succeeded, int32(3)) assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Active, int32(1)) } -func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active int32) { +func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active, terminating int32) { pod := corev1.Pod{ Status: corev1.PodStatus{}, } @@ -37,4 +40,10 @@ func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, faile pod.Status.Phase = corev1.PodRunning updateJobReplicaStatuses(jobStatus, rtype, &pod) } + for i = 0; i < terminating; i++ { + pod.Status.Phase = corev1.PodRunning + deletionTimestamp := metaV1.NewTime(time.Now()) + pod.DeletionTimestamp = &deletionTimestamp + updateJobReplicaStatuses(jobStatus, rtype, &pod) + } } diff --git a/pkg/core/status.go b/pkg/core/status.go index 4c449244..bb09e3fa 100644 --- a/pkg/core/status.go +++ b/pkg/core/status.go @@ -34,7 +34,14 @@ func InitializeReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaTy func UpdateJobReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, pod *corev1.Pod) { switch pod.Status.Phase { case corev1.PodRunning: - jobStatus.ReplicaStatuses[rtype].Active++ + if pod.DeletionTimestamp != nil { + // when node is not ready, the pod will be in terminating state. + // Count deleted Pods as failures to account for orphan Pods that + // never have a chance to reach the Failed phase. + jobStatus.ReplicaStatuses[rtype].Failed++ + } else { + jobStatus.ReplicaStatuses[rtype].Active++ + } case corev1.PodSucceeded: jobStatus.ReplicaStatuses[rtype].Succeeded++ case corev1.PodFailed: