Skip to content

Commit 7ab03e1

Browse files
committed
Implement TrainerClient Backends & Local Process
1 parent d04f157 commit 7ab03e1

File tree

13 files changed

+1290
-559
lines changed

13 files changed

+1290
-559
lines changed

python/kubeflow/trainer/api/trainer_client.py

Lines changed: 46 additions & 558 deletions
Large diffs are not rendered by default.
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# Copyright 2025 The Kubeflow Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from kubeflow.trainer.backends.k8s import K8SBackend
16+
from kubeflow.trainer.backends.local_process import LocalProcessBackend
17+
from kubeflow.trainer.types.backends import K8SBackendConfig, LocalProcessBackendConfig
18+
19+
# Registry keyed by backend name. Each entry pairs the backend implementation
# class ("backend_cls") with the configuration class ("config_cls") that is
# registered alongside it.
TRAINER_BACKEND_REGISTRY = {
    "kubernetes": dict(
        backend_cls=K8SBackend,
        config_cls=K8SBackendConfig,
    ),
    "local": dict(
        backend_cls=LocalProcessBackend,
        config_cls=LocalProcessBackendConfig,
    ),
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# Copyright 2025 The Kubeflow Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import abc
16+
17+
from typing import Dict, List, Optional
18+
from kubeflow.trainer.constants import constants
19+
from kubeflow.trainer.types import types
20+
21+
22+
class TrainingBackend(abc.ABC):
    """Abstract base class declaring the operations of a training backend.

    Every method is abstract; the bodies here only raise
    ``NotImplementedError`` and carry no behavior of their own.
    """

    @abc.abstractmethod
    def list_runtimes(self) -> List[types.Runtime]:
        """Return a list of ``types.Runtime`` objects."""
        raise NotImplementedError()

    @abc.abstractmethod
    def get_runtime(self, name: str) -> Optional[types.Runtime]:
        """Look up a single runtime by ``name``.

        The return type is optional, so implementations may return ``None``
        (presumably when no runtime matches -- confirm against subclasses).
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def train(
        self,
        train_job_name: str,
        runtime: types.Runtime,
        initializer: Optional[types.Initializer] = None,
        trainer: Optional[types.Trainer] = None,
    ) -> str:
        """Start a training job.

        Args:
            train_job_name: Name for the training job.
            runtime: Runtime to use for the job.
            initializer: Optional initializer settings; defaults to ``None``.
            trainer: Optional trainer settings; defaults to ``None``.

        Returns:
            A string. NOTE(review): presumably the job name or identifier --
            confirm against the concrete backends.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def list_jobs(
        self,
        runtime: Optional[types.Runtime] = None,
    ) -> List[types.TrainJob]:
        """Return a list of ``types.TrainJob`` objects.

        Args:
            runtime: Optional runtime; defaults to ``None``.
                NOTE(review): looks like a filter on the returned jobs --
                confirm against the concrete backends.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def get_job(self, name: str) -> Optional[types.TrainJob]:
        """Look up a single training job by ``name``.

        The return type is optional, so implementations may return ``None``.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def get_job_logs(
        self,
        name: str,
        follow: Optional[bool] = False,
        step: str = constants.NODE,
        node_rank: int = 0,
    ) -> Dict[str, str]:
        """Fetch logs for the training job called ``name``.

        Args:
            name: Name of the training job.
            follow: Defaults to ``False``. NOTE(review): presumably streams
                logs while the job runs -- confirm against subclasses.
            step: Defaults to ``constants.NODE``.
            node_rank: Defaults to ``0``.

        Returns:
            A ``str`` to ``str`` mapping; the meaning of its keys and values
            is defined by the implementing backend.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def delete_job(self, name: str) -> None:
        """Delete the training job called ``name``."""
        raise NotImplementedError()

0 commit comments

Comments
 (0)