Skip to content
This repository has been archived by the owner on Sep 12, 2023. It is now read-only.

Commit

Permalink
fix job active count; count terminating pod as failed (#214)
Browse files Browse the repository at this point in the history
Signed-off-by: yowenter <wenter.wu@gmail.com>
  • Loading branch information
yowenter authored Apr 25, 2023
1 parent 83259a0 commit fdb9739
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
15 changes: 12 additions & 3 deletions pkg/controller.v1/common/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ package common

import (
"testing"
"time"

apiv1 "github.com/kubeflow/common/pkg/apis/common/v1"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestUpdateJobReplicaStatuses(t *testing.T) {
Expand All @@ -14,13 +16,14 @@ func TestUpdateJobReplicaStatuses(t *testing.T) {
_, ok := jobStatus.ReplicaStatuses["worker"]
// assert ReplicaStatus for "worker" exists
assert.True(t, ok)
setStatusForTest(&jobStatus, "worker", 2, 3, 1)
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(2))
setStatusForTest(&jobStatus, "worker", 2, 3, 1, 1)
// terminating pod should count as failed.
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(3))
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Succeeded, int32(3))
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Active, int32(1))
}

func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active int32) {
func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active, terminating int32) {
pod := corev1.Pod{
Status: corev1.PodStatus{},
}
Expand All @@ -37,4 +40,10 @@ func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, faile
pod.Status.Phase = corev1.PodRunning
updateJobReplicaStatuses(jobStatus, rtype, &pod)
}
for i = 0; i < terminating; i++ {
pod.Status.Phase = corev1.PodRunning
deletionTimestamp := metaV1.NewTime(time.Now())
pod.DeletionTimestamp = &deletionTimestamp
updateJobReplicaStatuses(jobStatus, rtype, &pod)
}
}
9 changes: 8 additions & 1 deletion pkg/core/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,14 @@ func InitializeReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaTy
func UpdateJobReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, pod *corev1.Pod) {
switch pod.Status.Phase {
case corev1.PodRunning:
jobStatus.ReplicaStatuses[rtype].Active++
if pod.DeletionTimestamp != nil {
// when node is not ready, the pod will be in terminating state.
// Count deleted Pods as failures to account for orphan Pods that
// never have a chance to reach the Failed phase.
jobStatus.ReplicaStatuses[rtype].Failed++
} else {
jobStatus.ReplicaStatuses[rtype].Active++
}
case corev1.PodSucceeded:
jobStatus.ReplicaStatuses[rtype].Succeeded++
case corev1.PodFailed:
Expand Down

0 comments on commit fdb9739

Please sign in to comment.