Skip to content

Commit b48277f

Browse files
committed
using pod phase
1 parent 04e92f5 commit b48277f

File tree

2 files changed

+71
-13
lines changed

2 files changed

+71
-13
lines changed

kubeflow/trainer/backends/kubernetes/backend.py

Lines changed: 62 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import re
2121
import string
2222
import time
23-
from typing import Optional, Union
23+
from typing import List, Optional, Union
2424
import uuid
2525

2626
from kubeflow_trainer_api import models
@@ -57,6 +57,54 @@ def __init__(
5757

5858
self.namespace = cfg.namespace
5959

60+
def _select_best_pod_for_role(
    self, pods: List[models.IoK8sApiCoreV1Pod]
) -> Optional[models.IoK8sApiCoreV1Pod]:
    """
    Select the Pod that best represents a role.

    Pods are ranked by phase first and by creation timestamp second.

    Phase priority (highest first):
    1. Running
    2. Succeeded
    3. Failed
    4. Pending
    5. Unknown (also used for Pods with no status or an unrecognized phase)

    Among Pods sharing the winning phase, the most recently created one is
    returned. Pods without a creation timestamp rank as the oldest, and ties
    keep the input order.

    Args:
        pods: Candidate Pods that belong to the same role.

    Returns:
        The selected Pod, or None when `pods` is empty.
    """
    if not pods:
        return None

    # Pod phase priority (higher number = higher priority).
    status_priority = {
        constants.POD_RUNNING: 4,
        constants.POD_SUCCEEDED: 3,
        constants.POD_FAILED: 2,
        constants.POD_PENDING: 1,
        constants.POD_UNKNOWN: 0,
    }

    def rank(pod):
        phase = pod.status.phase if pod.status else constants.POD_UNKNOWN
        created = pod.metadata.creation_timestamp if pod.metadata else None
        # The boolean in the middle keeps a missing timestamp from ever being
        # compared against a real one: tuples that differ there are ordered
        # by the boolean alone, so None is never compared with a timestamp.
        return (status_priority.get(phase, 0), created is not None, created)

    # max() returns the first maximal element, so Pods with identical rank
    # resolve to the one that appears earliest in `pods`.
    return max(pods, key=rank)
107+
60108
def list_runtimes(self) -> list[types.Runtime]:
61109
result = []
62110
try:
@@ -522,19 +570,20 @@ def __get_trainjob_from_crd(
522570
int(pod.metadata.labels[constants.JOB_INDEX_LABEL])
523571
)
524572

525-
# Keep only the most recently created Pod for each role
573+
# Collect all Pods for this role
526574
if role_key not in pods_by_role:
527-
pods_by_role[role_key] = pod
528-
else:
529-
# Compare creation timestamps to keep the most recent
530-
current_pod = pods_by_role[role_key]
531-
if (pod.metadata.creation_timestamp and
532-
current_pod.metadata.creation_timestamp and
533-
pod.metadata.creation_timestamp > current_pod.metadata.creation_timestamp):
534-
pods_by_role[role_key] = pod
535-
536-
# Process only the most recent Pod for each role
537-
for role_key, pod in pods_by_role.items():
575+
pods_by_role[role_key] = []
576+
pods_by_role[role_key].append(pod)
577+
578+
# Select the best Pod for each role using status-priority logic
579+
selected_pods = {}
580+
for role_key, pods in pods_by_role.items():
581+
best_pod = self._select_best_pod_for_role(pods)
582+
if best_pod:
583+
selected_pods[role_key] = best_pod
584+
585+
# Process only the selected Pod for each role
586+
for role_key, pod in selected_pods.items():
538587
replicated_job_name, job_index = role_key
539588

540589
# Get the Initializer step.

kubeflow/trainer/constants/constants.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,15 @@
5454
# The succeeded phase of the Pod.
5555
POD_SUCCEEDED = "Succeeded"
5656

57+
# The running phase of the Pod.
58+
POD_RUNNING = "Running"
59+
60+
# The failed phase of the Pod.
61+
POD_FAILED = "Failed"
62+
63+
# The unknown phase of the Pod.
64+
POD_UNKNOWN = "Unknown"
65+
5766
# The label key to identify the relationship between TrainJob and Pod template in the runtime.
5867
# For example, what PodTemplate must be overridden by TrainJob's .spec.trainer APIs.
5968
TRAINJOB_ANCESTOR_LABEL = "trainer.kubeflow.org/trainjob-ancestor-step"

0 commit comments

Comments
 (0)