Skip to content

Commit c382005

Browse files
feat: use official Grove 0.1.0-alpha release (#3030)
Signed-off-by: Julien Mancuso <jmancuso@nvidia.com>
1 parent 2a61e29 commit c382005

File tree

12 files changed

+49
-49
lines changed

12 files changed

+49
-49
lines changed

deploy/cloud/helm/platform/Chart.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,11 @@ dependencies:
3535
repository: "https://charts.bitnami.com/bitnami"
3636
condition: etcd.enabled
3737
- name: kai-scheduler
38-
version: v0.8.4
38+
version: v0.9.2
3939
repository: oci://ghcr.io/nvidia/kai-scheduler
4040
condition: kai-scheduler.enabled
4141
- name: grove-charts
4242
alias: grove
43-
version: v0.0.0-6e30275
43+
version: v0.1.0-alpha.1
4444
repository: oci://ghcr.io/nvidia/grove
4545
condition: grove.enabled

deploy/cloud/helm/platform/components/operator/templates/manager-rbac.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ rules:
119119
- apiGroups:
120120
- grove.io
121121
resources:
122-
- podgangsets
122+
- podcliquesets
123123
verbs:
124124
- create
125125
- delete

deploy/cloud/operator/cmd/main.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ func main() {
159159
flag.StringVar(&ingressHostSuffix, "ingress-host-suffix", "",
160160
"The suffix to use for the ingress host")
161161
flag.DurationVar(&groveTerminationDelay, "grove-termination-delay", consts.DefaultGroveTerminationDelay,
162-
"The termination delay for Grove PodGangSets")
162+
"The termination delay for Grove PodCliqueSets")
163163
flag.StringVar(&modelExpressURL, "model-express-url", "",
164164
"URL of the Model Express server to inject into all pods")
165165
flag.StringVar(&prometheusEndpoint, "prometheus-endpoint", "",

deploy/cloud/operator/config/rbac/role.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,7 @@ rules:
110110
- apiGroups:
111111
- grove.io
112112
resources:
113-
- podgangsets
113+
- podcliquesets
114114
verbs:
115115
- create
116116
- delete

deploy/cloud/operator/go.mod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,9 @@ toolchain go1.24.3
66

77
require (
88
emperror.dev/errors v0.8.1
9-
github.com/NVIDIA/grove/operator/api v0.0.0-20250825164137-da01400261a6
9+
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1
1010
github.com/bsm/gomega v1.27.10
11+
github.com/go-logr/logr v1.4.2
1112
github.com/google/go-cmp v0.7.0
1213
github.com/imdario/mergo v0.3.6
1314
github.com/onsi/ginkgo/v2 v2.23.4
@@ -39,7 +40,6 @@ require (
3940
github.com/evanphx/json-patch/v5 v5.9.11 // indirect
4041
github.com/fsnotify/fsnotify v1.7.0 // indirect
4142
github.com/fxamacker/cbor/v2 v2.7.0 // indirect
42-
github.com/go-logr/logr v1.4.2 // indirect
4343
github.com/go-logr/zapr v1.3.0 // indirect
4444
github.com/go-openapi/jsonpointer v0.21.0 // indirect
4545
github.com/go-openapi/jsonreference v0.21.0 // indirect

deploy/cloud/operator/go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
emperror.dev/errors v0.8.1 h1:UavXZ5cSX/4u9iyvH6aDcuGkVjeexUGJ7Ij7G4VfQT0=
22
emperror.dev/errors v0.8.1/go.mod h1:YcRvLPh626Ubn2xqtoprejnA5nFha+TJ+2vew48kWuE=
3-
github.com/NVIDIA/grove/operator/api v0.0.0-20250825164137-da01400261a6 h1:JkW8LeRVsQH/YkRTz80T/JxlDgfk0URKgTUKyYKxbso=
4-
github.com/NVIDIA/grove/operator/api v0.0.0-20250825164137-da01400261a6/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
3+
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1 h1:4DE6ZGa/3muBa5gk1GtJskMVss6GjeCPpn+xTnR1h9w=
4+
github.com/NVIDIA/grove/operator/api v0.1.0-alpha.1/go.mod h1:QlsR2wQLj9m/zVEqv5SsCPzyjN2ykYZ0r/NEnDf4WB4=
55
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
66
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
77
github.com/blang/semver/v4 v4.0.0 h1:1PFHFE6yCCTv8C1TeyNNarDzntLi7wMI5i/pzqYIsAM=

deploy/cloud/operator/internal/controller/dynamographdeployment_controller.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ type DynamoGraphDeploymentReconciler struct {
7171
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
7272
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
7373
// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
74-
// +kubebuilder:rbac:groups=grove.io,resources=podgangsets,verbs=get;list;watch;create;update;patch;delete
74+
// +kubebuilder:rbac:groups=grove.io,resources=podcliquesets,verbs=get;list;watch;create;update;patch;delete
7575
// +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
7676
// +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
7777
// +kubebuilder:rbac:groups=scheduling.run.ai,resources=queues,verbs=get;list
@@ -258,12 +258,12 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveScaling(ctx context.Cont
258258
func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Context, dynamoDeployment *nvidiacomv1alpha1.DynamoGraphDeployment) (State, Reason, Message, error) {
259259
logger := log.FromContext(ctx)
260260
// generate the Grove PodCliqueSet from the config
261-
groveGangSet, err := dynamo.GenerateGrovePodGangSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever)
261+
groveGangSet, err := dynamo.GenerateGrovePodCliqueSet(ctx, dynamoDeployment, r.Config, r.DockerSecretRetriever)
262262
if err != nil {
263263
logger.Error(err, "failed to generate the Grove GangSet")
264264
return "", "", "", fmt.Errorf("failed to generate the Grove GangSet: %w", err)
265265
}
266-
_, syncedGroveGangSet, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*grovev1alpha1.PodGangSet, bool, error) {
266+
_, syncedGroveGangSet, err := commonController.SyncResource(ctx, r, dynamoDeployment, func(ctx context.Context) (*grovev1alpha1.PodCliqueSet, bool, error) {
267267
return groveGangSet, false, nil
268268
})
269269
if err != nil {
@@ -421,7 +421,7 @@ func (r *DynamoGraphDeploymentReconciler) SetupWithManager(mgr ctrl.Manager) err
421421
})).
422422
WithEventFilter(commonController.EphemeralDeploymentEventFilter(r.Config))
423423
if r.Config.Grove.Enabled {
424-
ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodGangSet{}, builder.WithPredicates(predicate.Funcs{
424+
ctrlBuilder = ctrlBuilder.Owns(&grovev1alpha1.PodCliqueSet{}, builder.WithPredicates(predicate.Funcs{
425425
// ignore creation because we don't want to be called again after we create the PodCliqueSet
426426
CreateFunc: func(ce event.CreateEvent) bool { return false },
427427
DeleteFunc: func(de event.DeleteEvent) bool { return true },

deploy/cloud/operator/internal/controller_common/podgangset.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import (
66
grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
77
)
88

9-
func CanonicalizePodGangSet(gangSet *grovev1alpha1.PodGangSet) *grovev1alpha1.PodGangSet {
9+
func CanonicalizePodCliqueSet(gangSet *grovev1alpha1.PodCliqueSet) *grovev1alpha1.PodCliqueSet {
1010
// sort cliques by name
1111
sort.Slice(gangSet.Spec.Template.Cliques, func(i, j int) bool {
1212
return gangSet.Spec.Template.Cliques[i].Name < gangSet.Spec.Template.Cliques[j].Name

deploy/cloud/operator/internal/controller_common/predicate.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ import (
3333
type GroveConfig struct {
3434
// Enabled is automatically determined by checking if Grove CRDs are installed in the cluster
3535
Enabled bool
36-
// TerminationDelay configures the termination delay for Grove PodGangSets
36+
// TerminationDelay configures the termination delay for Grove PodCliqueSets
3737
TerminationDelay time.Duration
3838
}
3939

deploy/cloud/operator/internal/dynamo/graph.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -317,16 +317,16 @@ type SecretsRetriever interface {
317317
GetSecrets(namespace, registry string) ([]string, error)
318318
}
319319

320-
// applyCliqueStartupDependencies configures StartsAfter dependencies for cliques in a PodGangSet
320+
// applyCliqueStartupDependencies configures StartsAfter dependencies for cliques in a PodCliqueSet
321321
// based on the backend framework and multinode deployment patterns.
322322
//
323323
// Rules:
324324
// - For VLLM and SGLang: worker cliques start after leader clique
325325
// - For TRTLLM: leader clique starts after worker cliques
326326
// - Only applies to multinode deployments (numberOfNodes > 1)
327-
// - Sets the PodGangSet StartupType to Explicit if any dependencies are configured
327+
// - Sets the PodCliqueSet StartupType to Explicit if any dependencies are configured
328328
func applyCliqueStartupDependencies(
329-
gangSet *grovev1alpha1.PodGangSet,
329+
gangSet *grovev1alpha1.PodCliqueSet,
330330
roles []ServiceRole,
331331
backendFramework BackendFramework,
332332
numberOfNodes int32,
@@ -880,14 +880,14 @@ func GeneratePodSpecForComponent(
880880
return podSpec, nil
881881
}
882882

883-
// GenerateGrovePodGangSet generates a Grove PodGangSet for the given deployment, supporting both single-node and multinode cases.
884-
func GenerateGrovePodGangSet(
883+
// GenerateGrovePodCliqueSet generates a Grove PodCliqueSet for the given deployment, supporting both single-node and multinode cases.
884+
func GenerateGrovePodCliqueSet(
885885
ctx context.Context,
886886
dynamoDeployment *v1alpha1.DynamoGraphDeployment,
887887
controllerConfig controller_common.Config,
888888
secretsRetriever SecretsRetriever,
889-
) (*grovev1alpha1.PodGangSet, error) {
890-
gangSet := &grovev1alpha1.PodGangSet{}
889+
) (*grovev1alpha1.PodCliqueSet, error) {
890+
gangSet := &grovev1alpha1.PodCliqueSet{}
891891
gangSet.Name = dynamoDeployment.Name
892892
gangSet.Namespace = dynamoDeployment.Namespace
893893
gangSet.Spec.Replicas = 1
@@ -986,7 +986,7 @@ func GenerateGrovePodGangSet(
986986
gangSet.Spec.Template.PodCliqueScalingGroupConfigs = scalingGroups
987987
}
988988

989-
return controller_common.CanonicalizePodGangSet(gangSet), nil
989+
return controller_common.CanonicalizePodCliqueSet(gangSet), nil
990990
}
991991

992992
func generateLabels(component *v1alpha1.DynamoComponentDeploymentOverridesSpec, dynamoDeployment *v1alpha1.DynamoGraphDeployment, componentName string) (map[string]string, error) {

0 commit comments

Comments
 (0)