Skip to content

Commit aaff6bc

Browse files
fix: wip
1 parent c3f0d37 commit aaff6bc

File tree

5 files changed

+243
-18
lines changed

5 files changed

+243
-18
lines changed
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package dynamo
7+
8+
import (
9+
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
10+
corev1 "k8s.io/api/core/v1"
11+
)
12+
13+
// BaseComponentDefaults provides common defaults shared by all components
14+
type BaseComponentDefaults struct{}
15+
16+
func (b *BaseComponentDefaults) GetBaseContainer(backendFramework BackendFramework) (corev1.Container, error) {
17+
return b.getCommonContainer(), nil
18+
}
19+
20+
func (b *BaseComponentDefaults) getCommonContainer() corev1.Container {
21+
return corev1.Container{
22+
Name: "main",
23+
Ports: []corev1.ContainerPort{
24+
{
25+
Protocol: corev1.ProtocolTCP,
26+
Name: commonconsts.DynamoContainerPortName,
27+
ContainerPort: int32(commonconsts.DynamoServicePort),
28+
},
29+
{
30+
Protocol: corev1.ProtocolTCP,
31+
Name: commonconsts.DynamoHealthPortName,
32+
ContainerPort: int32(commonconsts.DynamoHealthPort),
33+
},
34+
},
35+
}
36+
}
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package dynamo
7+
8+
import (
9+
commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
10+
corev1 "k8s.io/api/core/v1"
11+
)
12+
13+
// ComponentDefaults interface defines how defaults should be provided
14+
type ComponentDefaults interface {
15+
// GetBaseContainer returns the base container configuration for this component type
16+
// The backendFramework parameter may be empty for components that don't need backend-specific config
17+
GetBaseContainer(backendFramework BackendFramework) (corev1.Container, error)
18+
}
19+
20+
// ComponentDefaultsFactory creates appropriate defaults based on component type
21+
func ComponentDefaultsFactory(componentType string) ComponentDefaults {
22+
switch componentType {
23+
case commonconsts.ComponentTypeMain:
24+
return NewFrontendDefaults()
25+
case commonconsts.ComponentTypeWorker:
26+
return NewWorkerDefaults()
27+
default:
28+
return &BaseComponentDefaults{}
29+
}
30+
}
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package dynamo
7+
8+
import (
9+
corev1 "k8s.io/api/core/v1"
10+
"k8s.io/apimachinery/pkg/api/resource"
11+
"k8s.io/apimachinery/pkg/util/intstr"
12+
)
13+
14+
// FrontendDefaults implements ComponentDefaults for Frontend components
15+
type FrontendDefaults struct {
16+
*BaseComponentDefaults
17+
}
18+
19+
func NewFrontendDefaults() *FrontendDefaults {
20+
return &FrontendDefaults{&BaseComponentDefaults{}}
21+
}
22+
23+
func (f *FrontendDefaults) GetBaseContainer(backendFramework BackendFramework) (corev1.Container, error) {
24+
// Frontend doesn't need backend-specific config
25+
container := f.getCommonContainer()
26+
27+
// Add frontend-specific defaults
28+
container.LivenessProbe = &corev1.Probe{
29+
ProbeHandler: corev1.ProbeHandler{
30+
HTTPGet: &corev1.HTTPGetAction{
31+
Path: "/health",
32+
Port: intstr.FromInt(8000),
33+
},
34+
},
35+
InitialDelaySeconds: 60,
36+
PeriodSeconds: 60,
37+
TimeoutSeconds: 30,
38+
FailureThreshold: 10,
39+
}
40+
41+
container.ReadinessProbe = &corev1.Probe{
42+
ProbeHandler: corev1.ProbeHandler{
43+
Exec: &corev1.ExecAction{
44+
Command: []string{
45+
"/bin/sh",
46+
"-c",
47+
"curl -s http://localhost:8000/health | jq -e \".status == \\\"healthy\\\"\"",
48+
},
49+
},
50+
},
51+
InitialDelaySeconds: 60,
52+
PeriodSeconds: 60,
53+
TimeoutSeconds: 30,
54+
FailureThreshold: 10,
55+
}
56+
57+
container.Resources = corev1.ResourceRequirements{
58+
Requests: corev1.ResourceList{
59+
corev1.ResourceCPU: resource.MustParse("1"),
60+
corev1.ResourceMemory: resource.MustParse("2Gi"),
61+
},
62+
Limits: corev1.ResourceList{
63+
corev1.ResourceCPU: resource.MustParse("1"),
64+
corev1.ResourceMemory: resource.MustParse("2Gi"),
65+
},
66+
}
67+
68+
container.Env = []corev1.EnvVar{
69+
{
70+
Name: "DYN_SYSTEM_PORT",
71+
Value: "8000",
72+
},
73+
}
74+
75+
return container, nil
76+
}

deploy/cloud/operator/internal/dynamo/graph.go

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -685,25 +685,16 @@ func GenerateBasePodSpec(
685685
multinodeDeploymentType commonconsts.MultinodeDeploymentType,
686686
serviceName string,
687687
) (corev1.PodSpec, error) {
688-
container := corev1.Container{
689-
Name: "main",
690-
LivenessProbe: component.LivenessProbe,
691-
ReadinessProbe: component.ReadinessProbe,
692-
Env: component.Envs,
693-
Ports: []corev1.ContainerPort{
694-
{
695-
Protocol: corev1.ProtocolTCP,
696-
Name: commonconsts.DynamoContainerPortName,
697-
ContainerPort: int32(commonconsts.DynamoServicePort),
698-
},
699-
{
700-
Protocol: corev1.ProtocolTCP,
701-
Name: commonconsts.DynamoHealthPortName,
702-
ContainerPort: int32(commonconsts.DynamoHealthPort),
703-
},
704-
},
688+
// Get component defaults
689+
componentDefaults := ComponentDefaultsFactory(component.ComponentType)
690+
691+
// Get base container further parameterized by the backend engine
692+
container, err := componentDefaults.GetBaseContainer(backendFramework)
693+
if err != nil {
694+
return corev1.PodSpec{}, fmt.Errorf("failed to get base container: %w", err)
705695
}
706-
// First merge the mainContainer from extraPodSpec to get the base command and args
696+
697+
// merge main container spec
707698
if component.ExtraPodSpec != nil && component.ExtraPodSpec.MainContainer != nil {
708699
main := component.ExtraPodSpec.MainContainer.DeepCopy()
709700
if main != nil {
@@ -716,13 +707,15 @@ func GenerateBasePodSpec(
716707
}
717708
}
718709

710+
// merge resources
719711
resourcesConfig, err := controller_common.GetResourcesConfig(component.Resources)
720712
if err != nil {
721713
return corev1.PodSpec{}, fmt.Errorf("failed to get resources config: %w", err)
722714
}
723715
if resourcesConfig != nil {
724716
container.Resources = *resourcesConfig
725717
}
718+
726719
imagePullSecrets := []corev1.LocalObjectReference{}
727720
if secretsRetriever != nil && component.ExtraPodSpec != nil && component.ExtraPodSpec.MainContainer != nil && component.ExtraPodSpec.MainContainer.Image != "" {
728721
secretsName, err := secretsRetriever.GetSecrets(namespace, component.ExtraPodSpec.MainContainer.Image)
@@ -740,6 +733,7 @@ func GenerateBasePodSpec(
740733
})
741734
}
742735

736+
// Add standard environment variables
743737
addStandardEnvVars(&container, controllerConfig)
744738

745739
var volumes []corev1.Volume
@@ -757,9 +751,12 @@ func GenerateBasePodSpec(
757751
MountPath: *component.PVC.MountPoint,
758752
})
759753
}
754+
760755
shmVolume, shmVolumeMount := generateSharedMemoryVolumeAndMount(&container.Resources)
761756
volumes = append(volumes, shmVolume)
762757
container.VolumeMounts = append(container.VolumeMounts, shmVolumeMount)
758+
759+
// allow
763760
// Apply backend-specific container modifications
764761
backend := BackendFactory(backendFramework)
765762
if backend == nil {
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
/*
2+
* SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package dynamo
7+
8+
import (
9+
"fmt"
10+
11+
corev1 "k8s.io/api/core/v1"
12+
"k8s.io/apimachinery/pkg/api/resource"
13+
"k8s.io/apimachinery/pkg/util/intstr"
14+
)
15+
16+
// WorkerDefaults implements ComponentDefaults for Worker components
17+
type WorkerDefaults struct {
18+
*BaseComponentDefaults
19+
}
20+
21+
func NewWorkerDefaults() *WorkerDefaults {
22+
return &WorkerDefaults{&BaseComponentDefaults{}}
23+
}
24+
25+
func (w *WorkerDefaults) GetBaseContainer(backendFramework BackendFramework) (corev1.Container, error) {
26+
if backendFramework == "" {
27+
return corev1.Container{}, fmt.Errorf("worker components require a backend framework")
28+
}
29+
30+
container := w.getCommonContainer()
31+
32+
// Add worker base defaults
33+
container.LivenessProbe = &corev1.Probe{
34+
ProbeHandler: corev1.ProbeHandler{
35+
HTTPGet: &corev1.HTTPGetAction{
36+
Path: "/live",
37+
Port: intstr.FromInt(9090),
38+
},
39+
},
40+
PeriodSeconds: 5,
41+
TimeoutSeconds: 30,
42+
FailureThreshold: 1,
43+
}
44+
45+
container.ReadinessProbe = &corev1.Probe{
46+
ProbeHandler: corev1.ProbeHandler{
47+
HTTPGet: &corev1.HTTPGetAction{
48+
Path: "/health",
49+
Port: intstr.FromInt(9090),
50+
},
51+
},
52+
PeriodSeconds: 10,
53+
TimeoutSeconds: 30,
54+
FailureThreshold: 60,
55+
}
56+
57+
container.Resources = corev1.ResourceRequirements{
58+
Requests: corev1.ResourceList{
59+
corev1.ResourceCPU: resource.MustParse("10"),
60+
corev1.ResourceMemory: resource.MustParse("20Gi"),
61+
"nvidia.com/gpu": resource.MustParse("1"),
62+
},
63+
Limits: corev1.ResourceList{
64+
corev1.ResourceCPU: resource.MustParse("10"),
65+
corev1.ResourceMemory: resource.MustParse("20Gi"),
66+
"nvidia.com/gpu": resource.MustParse("1"),
67+
},
68+
}
69+
70+
container.Env = []corev1.EnvVar{
71+
{
72+
Name: "DYN_SYSTEM_ENABLED",
73+
Value: "true",
74+
},
75+
{
76+
Name: "DYN_SYSTEM_USE_ENDPOINT_HEALTH_STATUS",
77+
Value: "[\"generate\"]",
78+
},
79+
{
80+
Name: "DYN_SYSTEM_PORT",
81+
Value: "9090",
82+
},
83+
}
84+
85+
return container, nil
86+
}

0 commit comments

Comments
 (0)