Skip to content

Commit 57d2052

Browse files
feat: Add ContainerBackend with Docker and Podman (#119)
* Add docker backend Signed-off-by: Brian Gallagher <briangal@gmail.com> * Add podman backend Signed-off-by: Fiona Waters <fiwaters6@gmail.com> * Implementing ContainerBackend Signed-off-by: Fiona Waters <fiwaters6@gmail.com> * Use ip address for Podman Signed-off-by: Fiona Waters <fiwaters6@gmail.com> * Updating to rely on container runtime rather than storing job info in memory Signed-off-by: Fiona Waters <fiwaters6@gmail.com> * Addressing feedback Signed-off-by: Fiona Waters <fiwaters6@gmail.com> * Checking common sockets before failing Signed-off-by: Fiona Waters <fiwaters6@gmail.com> --------- Signed-off-by: Brian Gallagher <briangal@gmail.com> Signed-off-by: Fiona Waters <fiwaters6@gmail.com> Co-authored-by: Brian Gallagher <briangal@gmail.com>
1 parent c74a8b5 commit 57d2052

File tree

15 files changed

+3744
-6
lines changed

15 files changed

+3744
-6
lines changed

README.md

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,29 @@ TrainerClient().wait_for_job_status(job_id)
7171
print("\n".join(TrainerClient().get_job_logs(name=job_id)))
7272
```
7373

74+
## Local Development
75+
76+
The Kubeflow Trainer client supports local development without needing a Kubernetes cluster.
77+
78+
### Available Backends
79+
80+
- **KubernetesBackend** (default) - Production training on Kubernetes
81+
- **ContainerBackend** - Local development with Docker/Podman isolation
82+
- **LocalProcessBackend** - Quick prototyping with Python subprocesses
83+
84+
**Quick Start:**
85+
Install container support: `pip install kubeflow[docker]` or `pip install kubeflow[podman]`
86+
87+
```python
88+
from kubeflow.trainer import TrainerClient, ContainerBackendConfig, CustomTrainer
89+
90+
# Switch to local container execution
91+
client = TrainerClient(backend_config=ContainerBackendConfig())
92+
93+
# Your training runs locally in isolated containers
94+
job_id = client.train(trainer=CustomTrainer(func=train_fn))
95+
```
96+
7497
## Supported Kubeflow Projects
7598

7699
| Project | Status | Version Support | Description |

kubeflow/trainer/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,10 @@
1818

1919
# Import the Kubeflow Trainer client.
2020
from kubeflow.trainer.api.trainer_client import TrainerClient
21+
from kubeflow.trainer.backends.container.types import (
22+
ContainerBackendConfig,
23+
TrainingRuntimeSource,
24+
)
2125
from kubeflow.trainer.backends.localprocess.types import LocalProcessBackendConfig
2226

2327
# Import the Kubeflow Trainer constants.
@@ -64,5 +68,7 @@
6468
"TrainerClient",
6569
"TrainerType",
6670
"LocalProcessBackendConfig",
71+
"ContainerBackendConfig",
6772
"KubernetesBackendConfig",
73+
"TrainingRuntimeSource",
6874
]

kubeflow/trainer/api/trainer_client.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
from typing import Optional, Union
1818

1919
from kubeflow.common.types import KubernetesBackendConfig
20+
from kubeflow.trainer.backends.container.backend import ContainerBackend
21+
from kubeflow.trainer.backends.container.types import ContainerBackendConfig
2022
from kubeflow.trainer.backends.kubernetes.backend import KubernetesBackend
2123
from kubeflow.trainer.backends.localprocess.backend import (
2224
LocalProcessBackend,
@@ -31,14 +33,19 @@
3133
class TrainerClient:
3234
def __init__(
3335
self,
34-
backend_config: Optional[Union[KubernetesBackendConfig, LocalProcessBackendConfig]] = None,
36+
backend_config: Union[
37+
KubernetesBackendConfig,
38+
LocalProcessBackendConfig,
39+
ContainerBackendConfig,
40+
] = None,
3541
):
3642
"""Initialize a Kubeflow Trainer client.
3743
3844
Args:
39-
backend_config: Backend configuration. Either KubernetesBackendConfig or
40-
LocalProcessBackendConfig, or None to use the backend's
41-
default config class. Defaults to KubernetesBackendConfig.
45+
backend_config: Backend configuration. Either KubernetesBackendConfig,
46+
LocalProcessBackendConfig, ContainerBackendConfig,
47+
or None to use the backend's default config class.
48+
Defaults to KubernetesBackendConfig.
4249
4350
Raises:
4451
ValueError: Invalid backend configuration.
@@ -52,6 +59,8 @@ def __init__(
5259
self.backend = KubernetesBackend(backend_config)
5360
elif isinstance(backend_config, LocalProcessBackendConfig):
5461
self.backend = LocalProcessBackend(backend_config)
62+
elif isinstance(backend_config, ContainerBackendConfig):
63+
self.backend = ContainerBackend(backend_config)
5564
else:
5665
raise ValueError(f"Invalid backend config '{backend_config}'")
5766

Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
# Copyright 2025 The Kubeflow Authors.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
Container client adapters for Docker and Podman.
17+
18+
This module implements the adapter pattern to abstract away differences between
19+
Docker and Podman APIs, allowing the backend to work with either runtime through
20+
a common interface.
21+
"""
22+
23+
from __future__ import annotations
24+
25+
import abc
26+
from collections.abc import Iterator
27+
from typing import Optional
28+
29+
30+
class BaseContainerClientAdapter(abc.ABC):
    """
    Abstract adapter interface for container clients.

    This adapter abstracts the container runtime API, allowing the backend
    to work with Docker and Podman through a unified interface. Every method
    is abstract: a concrete adapter must override all of them before it can
    be instantiated.
    """

    @abc.abstractmethod
    def ping(self):
        """Test the connection to the container runtime."""
        raise NotImplementedError()

    @abc.abstractmethod
    def create_network(
        self,
        name: str,
        labels: dict[str, str],
    ) -> str:
        """
        Create a container network.

        Args:
            name: Network name
            labels: Labels to attach to the network

        Returns:
            Network ID or name
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def delete_network(self, network_id: str):
        """Delete a network."""
        raise NotImplementedError()

    @abc.abstractmethod
    def create_and_start_container(
        self,
        image: str,
        command: list[str],
        name: str,
        network_id: str,
        environment: dict[str, str],
        labels: dict[str, str],
        volumes: dict[str, dict[str, str]],
        working_dir: str,
    ) -> str:
        """
        Create and start a container.

        Args:
            image: Container image
            command: Command to run
            name: Container name
            network_id: Network to attach to
            environment: Environment variables
            labels: Container labels
            volumes: Volume mounts
            working_dir: Working directory

        Returns:
            Container ID
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def get_container(self, container_id: str):
        """Get container object by ID."""
        raise NotImplementedError()

    @abc.abstractmethod
    def container_logs(self, container_id: str, follow: bool) -> Iterator[str]:
        """
        Stream logs from a container.

        Args:
            container_id: Container ID
            follow: Follow flag for the log stream (presumably keeps the
                stream open for new output; confirm against the adapters)

        Returns:
            Iterator over log output strings
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def stop_container(self, container_id: str, timeout: int = 10):
        """
        Stop a container.

        Args:
            container_id: Container ID
            timeout: Stop timeout passed to the runtime (defaults to 10)
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def remove_container(self, container_id: str, force: bool = True):
        """
        Remove a container.

        Args:
            container_id: Container ID
            force: Force flag passed to the runtime (defaults to True)
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def pull_image(self, image: str):
        """Pull an image."""
        raise NotImplementedError()

    @abc.abstractmethod
    def image_exists(self, image: str) -> bool:
        """Check if an image exists locally."""
        raise NotImplementedError()

    @abc.abstractmethod
    def run_oneoff_container(self, image: str, command: list[str]) -> str:
        """
        Run a short-lived container and return its output.

        Args:
            image: Container image
            command: Command to run

        Returns:
            Container output as string
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def container_status(self, container_id: str) -> tuple[str, Optional[int]]:
        """
        Get container status.

        Returns:
            Tuple of (status_string, exit_code)
            Status strings: "running", "created", "exited", etc.
            Exit code is None if container hasn't exited
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def get_container_ip(self, container_id: str, network_id: str) -> Optional[str]:
        """
        Get container's IP address on a specific network.

        Args:
            container_id: Container ID
            network_id: Network name or ID

        Returns:
            IP address string or None if not found
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def list_containers(self, filters: Optional[dict[str, list[str]]] = None) -> list[dict]:
        """
        List containers, optionally filtered by labels.

        Args:
            filters: Dictionary of filters (e.g., {"label": ["key=value"]})

        Returns:
            List of container info dictionaries with keys:
            - id: Container ID
            - name: Container name
            - labels: Dictionary of labels
            - status: Container status
            - created: Creation timestamp
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def get_network(self, network_id: str) -> Optional[dict]:
        """
        Get network information by ID or name.

        Args:
            network_id: Network ID or name

        Returns:
            Dictionary with network info including labels, or None if not found
        """
        raise NotImplementedError()

0 commit comments

Comments
 (0)