Skip to content
This repository has been archived by the owner on Sep 12, 2023. It is now read-only.

fix job active count; count terminating pod as failed #214

Merged
merged 1 commit into from
Apr 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 12 additions & 3 deletions pkg/controller.v1/common/status_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,12 @@ package common

import (
"testing"
"time"

apiv1 "github.com/kubeflow/common/pkg/apis/common/v1"
"github.com/stretchr/testify/assert"
corev1 "k8s.io/api/core/v1"
metaV1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestUpdateJobReplicaStatuses(t *testing.T) {
Expand All @@ -14,13 +16,14 @@ func TestUpdateJobReplicaStatuses(t *testing.T) {
_, ok := jobStatus.ReplicaStatuses["worker"]
// assert ReplicaStatus for "worker" exists
assert.True(t, ok)
setStatusForTest(&jobStatus, "worker", 2, 3, 1)
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(2))
setStatusForTest(&jobStatus, "worker", 2, 3, 1, 1)
// terminating pod should count as failed.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good comment :)

assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Failed, int32(3))
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Succeeded, int32(3))
assert.Equal(t, jobStatus.ReplicaStatuses["worker"].Active, int32(1))
}

func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active int32) {
func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, failed, succeeded, active, terminating int32) {
pod := corev1.Pod{
Status: corev1.PodStatus{},
}
Expand All @@ -37,4 +40,10 @@ func setStatusForTest(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, faile
pod.Status.Phase = corev1.PodRunning
updateJobReplicaStatuses(jobStatus, rtype, &pod)
}
for i = 0; i < terminating; i++ {
pod.Status.Phase = corev1.PodRunning
deletionTimestamp := metaV1.NewTime(time.Now())
pod.DeletionTimestamp = &deletionTimestamp
updateJobReplicaStatuses(jobStatus, rtype, &pod)
}
}
9 changes: 8 additions & 1 deletion pkg/core/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,14 @@ func InitializeReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaTy
func UpdateJobReplicaStatuses(jobStatus *apiv1.JobStatus, rtype apiv1.ReplicaType, pod *corev1.Pod) {
switch pod.Status.Phase {
case corev1.PodRunning:
jobStatus.ReplicaStatuses[rtype].Active++
if pod.DeletionTimestamp != nil {
// when node is not ready, the pod will be in terminating state.
// Count deleted Pods as failures to account for orphan Pods that
// never have a chance to reach the Failed phase.
jobStatus.ReplicaStatuses[rtype].Failed++
} else {
jobStatus.ReplicaStatuses[rtype].Active++
}
yowenter marked this conversation as resolved.
Show resolved Hide resolved
case corev1.PodSucceeded:
jobStatus.ReplicaStatuses[rtype].Succeeded++
case corev1.PodFailed:
Expand Down