Skip to content

Commit 2f380aa

Browse files
committed
feat: add grove multinode support
1 parent dbb4caa commit 2f380aa

21 files changed

+4768
-521
lines changed

deploy/cloud/helm/crds/Chart.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,5 +16,5 @@ apiVersion: v2
1616
name: dynamo-crds
1717
description: A Helm chart for dynamo CRDs
1818
type: application
19-
version: 0.4.0
19+
version: 0.4.1
2020
dependencies: []

deploy/cloud/helm/crds/templates/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,10 +404,36 @@ spec:
404404
minReplicas:
405405
type: integer
406406
type: object
407+
backendFramework:
408+
default: vllm
409+
enum:
410+
- sglang
411+
- vllm
412+
type: string
407413
componentType:
408414
type: string
409415
dynamoComponent:
410416
type: string
417+
dynamoConfig:
418+
properties:
419+
dataParallelSize:
420+
format: int32
421+
type: integer
422+
extraArgs:
423+
items:
424+
type: string
425+
type: array
426+
flagOverrides:
427+
additionalProperties:
428+
type: string
429+
type: object
430+
numberOfNodes:
431+
format: int32
432+
type: integer
433+
tensorParallelSize:
434+
format: int32
435+
type: integer
436+
type: object
411437
dynamoNamespace:
412438
type: string
413439
dynamoTag:

deploy/cloud/helm/crds/templates/nvidia.com_dynamographdeployments.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ spec:
4444
type: object
4545
spec:
4646
properties:
47+
backendFramework:
48+
default: vllm
49+
enum:
50+
- sglang
51+
- vllm
52+
type: string
4753
dynamoGraph:
4854
type: string
4955
envs:
@@ -465,6 +471,26 @@ spec:
465471
type: object
466472
componentType:
467473
type: string
474+
dynamoConfig:
475+
properties:
476+
dataParallelSize:
477+
format: int32
478+
type: integer
479+
extraArgs:
480+
items:
481+
type: string
482+
type: array
483+
flagOverrides:
484+
additionalProperties:
485+
type: string
486+
type: object
487+
numberOfNodes:
488+
format: int32
489+
type: integer
490+
tensorParallelSize:
491+
format: int32
492+
type: integer
493+
type: object
468494
dynamoNamespace:
469495
type: string
470496
envFromSecret:

deploy/cloud/operator/api/v1alpha1/dynamocomponentdeployment_types.go

Lines changed: 37 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ type DynamoComponentDeploymentSpec struct {
4242
// contains the tag of the DynamoComponent: for example, "my_package:MyService"
4343
DynamoTag string `json:"dynamoTag,omitempty"`
4444

45+
// BackendFramework specifies the backend framework; must be "sglang" or "vllm" (defaults to "vllm").
46+
// +kubebuilder:validation:Enum=sglang;vllm
47+
// +kubebuilder:default=vllm
48+
BackendFramework string `json:"backendFramework,omitempty"`
49+
4550
DynamoComponentDeploymentSharedSpec `json:",inline"`
4651
}
4752

@@ -72,6 +77,11 @@ type DynamoComponentDeploymentSharedSpec struct {
7277
RunMode *RunMode `json:"runMode,omitempty"`
7378
ExternalServices map[string]ExternalService `json:"externalServices,omitempty"`
7479

80+
// DynamoConfig provides an alternative way to generate container specifications
81+
// without manually specifying command/args. Can be used for both single-node and multinode deployments.
82+
// If present, the operator will generate command/args and translate to extraPodSpec for backward compatibility.
83+
DynamoConfig *DynamoConfig `json:"dynamoConfig,omitempty"`
84+
7585
Ingress *IngressSpec `json:"ingress,omitempty"`
7686

7787
// +optional
@@ -88,6 +98,33 @@ type RunMode struct {
8898
Standalone *bool `json:"standalone,omitempty"`
8999
}
90100

101+
// DynamoConfig provides a declarative way to generate container specifications.
102+
// It can be used for both single-node and multinode deployments as an alternative to a manual extraPodSpec.
103+
type DynamoConfig struct {
104+
// NumberOfNodes determines the deployment type for this service:
105+
// - 1: single-node deployment (simple pod)
106+
// - >1: multinode deployment (leader + workers with Grove). NOTE(review): no minimum is declared on this field and the meaning of an unset (nil) value is not stated here — confirm.
107+
NumberOfNodes *int32 `json:"numberOfNodes,omitempty"`
108+
// TensorParallelSize defines the tensor parallel size for this role.
109+
TensorParallelSize *int32 `json:"tensorParallelSize,omitempty"`
110+
// DataParallelSize defines the data parallel size for this role.
111+
DataParallelSize *int32 `json:"dataParallelSize,omitempty"`
112+
113+
// FlagOverrides allows overriding specific default flags with custom values
114+
// as part of the flexible flag-based configuration system.
115+
// Key is the flag name (without --), value is the flag value.
116+
// Set value to null to completely remove a default flag. NOTE(review): the generated CRD schema declares these values as plain `type: string` with no `nullable: true`, so null entries may not survive API-server pruning — confirm before relying on this.
117+
// Example: {"mem-fraction-static": "0.9", "enable-deepep-moe": "true", "decode-log-interval": null}
118+
FlagOverrides map[string]*string `json:"flagOverrides,omitempty"`
119+
120+
// ExtraArgs contains additional arguments to append to the command.
121+
// These are added after all default flags and overrides.
122+
ExtraArgs []string `json:"extraArgs,omitempty"`
123+
}
124+
125+
// Note: the complex, backend-specific SGLangConfig and VLLMConfig structs were removed in favor of the
126+
// flexible flag-based configuration provided by FlagOverrides and ExtraArgs, for maximum flexibility.
127+
91128
type ExternalService struct {
92129
DeploymentSelectorKey string `json:"deploymentSelectorKey,omitempty"`
93130
DeploymentSelectorValue string `json:"deploymentSelectorValue,omitempty"`
@@ -195,11 +232,3 @@ func (s *DynamoComponentDeployment) SetDynamoDeploymentConfig(config []byte) {
195232
Value: string(config),
196233
})
197234
}
198-
199-
// GetImage returns the docker image of the DynamoComponent
200-
func (s *DynamoComponentDeployment) GetImage() string {
201-
if s.Spec.ExtraPodSpec != nil && s.Spec.ExtraPodSpec.MainContainer != nil {
202-
return s.Spec.ExtraPodSpec.MainContainer.Image
203-
}
204-
return ""
205-
}

deploy/cloud/operator/api/v1alpha1/dynamographdeployment_types.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ type DynamoGraphDeploymentSpec struct {
4040
// Environment variables to be set in the deployment
4141
// +kubebuilder:validation:Optional
4242
Envs []corev1.EnvVar `json:"envs,omitempty"`
43+
// BackendFramework specifies the backend framework; must be "sglang" or "vllm" (defaults to "vllm").
44+
// +kubebuilder:validation:Enum=sglang;vllm
45+
// +kubebuilder:default=vllm
46+
BackendFramework string `json:"backendFramework,omitempty"`
4347
}
4448

4549
// DynamoGraphDeploymentStatus defines the observed state of DynamoGraphDeployment.

deploy/cloud/operator/api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 56 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamocomponentdeployments.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,10 +404,36 @@ spec:
404404
minReplicas:
405405
type: integer
406406
type: object
407+
backendFramework:
408+
default: vllm
409+
enum:
410+
- sglang
411+
- vllm
412+
type: string
407413
componentType:
408414
type: string
409415
dynamoComponent:
410416
type: string
417+
dynamoConfig:
418+
properties:
419+
dataParallelSize:
420+
format: int32
421+
type: integer
422+
extraArgs:
423+
items:
424+
type: string
425+
type: array
426+
flagOverrides:
427+
additionalProperties:
428+
type: string
429+
type: object
430+
numberOfNodes:
431+
format: int32
432+
type: integer
433+
tensorParallelSize:
434+
format: int32
435+
type: integer
436+
type: object
411437
dynamoNamespace:
412438
type: string
413439
dynamoTag:

deploy/cloud/operator/config/crd/bases/nvidia.com_dynamographdeployments.yaml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,12 @@ spec:
4444
type: object
4545
spec:
4646
properties:
47+
backendFramework:
48+
default: vllm
49+
enum:
50+
- sglang
51+
- vllm
52+
type: string
4753
dynamoGraph:
4854
type: string
4955
envs:
@@ -465,6 +471,26 @@ spec:
465471
type: object
466472
componentType:
467473
type: string
474+
dynamoConfig:
475+
properties:
476+
dataParallelSize:
477+
format: int32
478+
type: integer
479+
extraArgs:
480+
items:
481+
type: string
482+
type: array
483+
flagOverrides:
484+
additionalProperties:
485+
type: string
486+
type: object
487+
numberOfNodes:
488+
format: int32
489+
type: integer
490+
tensorParallelSize:
491+
format: int32
492+
type: integer
493+
type: object
468494
dynamoNamespace:
469495
type: string
470496
envFromSecret:

deploy/cloud/operator/go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ toolchain go1.24.3
66

77
require (
88
emperror.dev/errors v0.8.1
9-
github.com/NVIDIA/grove/operator/api v0.0.0-20250717114148-daac6e53774f
9+
github.com/NVIDIA/grove/operator/api v0.0.0-20250801123021-8b42bac59ef2
1010
github.com/bsm/gomega v1.27.10
1111
github.com/google/go-cmp v0.7.0
1212
github.com/imdario/mergo v0.3.6

deploy/cloud/operator/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
22
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
3-
github.com/NVIDIA/grove/operator/api v0.0.0-20250717114148-daac6e53774f h1:2ePSNDm7/Tep8F99yCQVH8/vmn86L1cUzTbVlyNopmQ=
4-
github.com/NVIDIA/grove/operator/api v0.0.0-20250717114148-daac6e53774f/go.mod h1:nJL33lsBe+9xCcZLYkNYg1wucE4hJfa4ZfHm1zamuG0=
3+
github.com/NVIDIA/grove/operator/api v0.0.0-20250801123021-8b42bac59ef2 h1:JLOj0GiubP3VlR0okIbuqljvl+e2Vccnu6LX6wL34G0=
4+
github.com/NVIDIA/grove/operator/api v0.0.0-20250801123021-8b42bac59ef2/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
55
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
66
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
77
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=

0 commit comments

Comments
 (0)