@@ -20,12 +20,17 @@ package controller
2020import (
2121 "context"
2222 "fmt"
23+ "strings"
2324
2425 grovev1alpha1 "github.com/NVIDIA/grove/operator/api/core/v1alpha1"
26+ "k8s.io/apimachinery/pkg/api/errors"
27+
2528 networkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
2629 corev1 "k8s.io/api/core/v1"
2730 networkingv1 "k8s.io/api/networking/v1"
2831 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
32+ "k8s.io/apimachinery/pkg/runtime/schema"
33+ "k8s.io/client-go/scale"
2934 "k8s.io/client-go/tools/record"
3035 ctrl "sigs.k8s.io/controller-runtime"
3136 "sigs.k8s.io/controller-runtime/pkg/builder"
@@ -50,6 +55,20 @@ const (
5055 PendingState State = "pending"
5156)
5257
58+ var (
59+ // Grove GroupVersionResources for scaling operations
60+ podCliqueGVR = schema.GroupVersionResource {
61+ Group : "grove.io" ,
62+ Version : "v1alpha1" ,
63+ Resource : "podcliques" ,
64+ }
65+ podCliqueScalingGroupGVR = schema.GroupVersionResource {
66+ Group : "grove.io" ,
67+ Version : "v1alpha1" ,
68+ Resource : "podcliquescalinggroups" ,
69+ }
70+ )
71+
// etcdStorage is the storage dependency consumed by this controller. It only
// requires the ability to delete keys by prefix.
type etcdStorage interface {
	// DeleteKeys takes a key prefix and reports any failure as an error.
	DeleteKeys(ctx context.Context, prefix string) error
}
@@ -60,12 +79,15 @@ type DynamoGraphDeploymentReconciler struct {
6079 Config commonController.Config
6180 Recorder record.EventRecorder
6281 DockerSecretRetriever dockerSecretRetriever
82+ ScaleClient scale.ScalesGetter
6383}
6484
6585// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments,verbs=get;list;watch;create;update;patch;delete
6686// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/status,verbs=get;update;patch
6787// +kubebuilder:rbac:groups=nvidia.com,resources=dynamographdeployments/finalizers,verbs=update
6888// +kubebuilder:rbac:groups=grove.io,resources=podgangsets,verbs=get;list;watch;create;update;patch;delete
89+ // +kubebuilder:rbac:groups=grove.io,resources=podcliques/scale,verbs=get;update;patch
90+ // +kubebuilder:rbac:groups=grove.io,resources=podcliquescalinggroups/scale,verbs=get;update;patch
6991
7092// Reconcile is part of the main kubernetes reconciliation loop which aims to
7193// move the current state of the cluster closer to the desired state.
@@ -156,6 +178,80 @@ func (r *DynamoGraphDeploymentReconciler) reconcileResources(ctx context.Context
156178
157179}
158180
181+ // scaleGroveResource scales a Grove resource using the generic scaling function
182+ func (r * DynamoGraphDeploymentReconciler ) scaleGroveResource (ctx context.Context , resourceName , namespace string , newReplicas int32 , resourceType string ) error {
183+ logger := log .FromContext (ctx )
184+ // Determine the GroupVersionResource based on resource type
185+ var gvr schema.GroupVersionResource
186+ switch resourceType {
187+ case "PodClique" :
188+ gvr = podCliqueGVR
189+ case "PodCliqueScalingGroup" :
190+ gvr = podCliqueScalingGroupGVR
191+ default :
192+ return fmt .Errorf ("unsupported Grove resource type: %s" , resourceType )
193+ }
194+
195+ // Use the generic scaling function
196+ err := commonController .ScaleResource (ctx , r .ScaleClient , gvr , namespace , resourceName , newReplicas )
197+ if err != nil {
198+ if errors .IsNotFound (err ) {
199+ // Resource doesn't exist yet - this is normal during initial creation when Grove is still creating the resources asynchronously
200+ logger .V (1 ).Info ("Grove resource not found yet, skipping scaling for now - will retry on next reconciliation" , "gvr" , gvr , "name" , resourceName , "namespace" , namespace )
201+ return nil
202+ }
203+ }
204+ return err
205+ }
206+
207+ // reconcileGroveScaling handles scaling operations for Grove resources based on service replica changes
208+ func (r * DynamoGraphDeploymentReconciler ) reconcileGroveScaling (ctx context.Context , dynamoDeployment * nvidiacomv1alpha1.DynamoGraphDeployment ) error {
209+ logger := log .FromContext (ctx )
210+ logger .V (1 ).Info ("Reconciling Grove scaling operations" )
211+
212+ replicaIndex := 0
213+ for serviceName , component := range dynamoDeployment .Spec .Services {
214+ // Skip if replicas are not specified
215+ if component .Replicas == nil {
216+ continue
217+ }
218+
219+ numberOfNodes := component .GetNumberOfNodes ()
220+ isMultinode := numberOfNodes > 1
221+
222+ if isMultinode {
223+ // Scale PodCliqueScalingGroup for multinode services
224+ // Grove naming pattern: {DGD.name}-{replicaIndex}-{serviceName}
225+ resourceName := fmt .Sprintf ("%s-%d-%s" , dynamoDeployment .Name , replicaIndex , strings .ToLower (serviceName ))
226+ err := r .scaleGroveResource (ctx ,
227+ resourceName ,
228+ dynamoDeployment .Namespace ,
229+ * component .Replicas ,
230+ "PodCliqueScalingGroup" )
231+ if err != nil {
232+ logger .Error (err , "Failed to scale PodCliqueScalingGroup" , "serviceName" , serviceName , "resourceName" , resourceName , "replicas" , * component .Replicas )
233+ return fmt .Errorf ("failed to scale PodCliqueScalingGroup %s: %w" , resourceName , err )
234+ }
235+ } else {
236+ // Scale individual PodClique for single-node services
237+ // Grove naming pattern: {DGD.name}-{replicaIndex}-{serviceName}
238+ resourceName := fmt .Sprintf ("%s-%d-%s" , dynamoDeployment .Name , replicaIndex , strings .ToLower (serviceName ))
239+ err := r .scaleGroveResource (ctx ,
240+ resourceName ,
241+ dynamoDeployment .Namespace ,
242+ * component .Replicas ,
243+ "PodClique" )
244+ if err != nil {
245+ logger .Error (err , "Failed to scale PodClique" , "serviceName" , serviceName , "resourceName" , resourceName , "replicas" , * component .Replicas )
246+ return fmt .Errorf ("failed to scale PodClique %s: %w" , resourceName , err )
247+ }
248+ }
249+ }
250+
251+ logger .V (1 ).Info ("Successfully reconciled Grove scaling operations" )
252+ return nil
253+ }
254+
159255func (r * DynamoGraphDeploymentReconciler ) reconcileGroveResources (ctx context.Context , dynamoDeployment * nvidiacomv1alpha1.DynamoGraphDeployment ) (State , Reason , Message , error ) {
160256 logger := log .FromContext (ctx )
161257 // generate the dynamoComponentsDeployments from the config
@@ -177,6 +273,13 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
177273 }
178274 return false
179275 })
276+
277+ // Handle Grove scaling operations after structural changes
278+ if err := r .reconcileGroveScaling (ctx , dynamoDeployment ); err != nil {
279+ logger .Error (err , "failed to reconcile Grove scaling" )
280+ return FailedState , "grove_scaling_failed" , Message (err .Error ()), err
281+ }
282+
180283 resources := []Resource {groveGangSetAsResource }
181284 for componentName , component := range dynamoDeployment .Spec .Services {
182285 if component .ComponentType == consts .ComponentTypeFrontend {
@@ -203,10 +306,6 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
203306 ingressSpec = * component .Ingress
204307 }
205308 mainComponentIngress := dynamo .GenerateComponentIngress (ctx , dynamo .GetDynamoComponentName (dynamoDeployment , componentName ), dynamoDeployment .Namespace , ingressSpec )
206- if err != nil {
207- logger .Error (err , "failed to generate the main component ingress" )
208- return "" , "" , "" , fmt .Errorf ("failed to generate the main component ingress: %w" , err )
209- }
210309 _ , syncedMainComponentIngress , err := commonController .SyncResource (ctx , r , dynamoDeployment , func (ctx context.Context ) (* networkingv1.Ingress , bool , error ) {
211310 if ! ingressSpec .Enabled || ingressSpec .IngressControllerClassName == nil {
212311 logger .Info ("Ingress is not enabled" )
@@ -224,10 +323,6 @@ func (r *DynamoGraphDeploymentReconciler) reconcileGroveResources(ctx context.Co
224323 // generate the main component virtual service
225324 if r .Config .IngressConfig .UseVirtualService () {
226325 mainComponentVirtualService := dynamo .GenerateComponentVirtualService (ctx , dynamo .GetDynamoComponentName (dynamoDeployment , componentName ), dynamoDeployment .Namespace , ingressSpec )
227- if err != nil {
228- logger .Error (err , "failed to generate the main component virtual service" )
229- return "" , "" , "" , fmt .Errorf ("failed to generate the main component virtual service: %w" , err )
230- }
231326 _ , syncedMainComponentVirtualService , err := commonController .SyncResource (ctx , r , dynamoDeployment , func (ctx context.Context ) (* networkingv1beta1.VirtualService , bool , error ) {
232327 if ! ingressSpec .IsVirtualServiceEnabled () {
233328 logger .Info ("VirtualService is not enabled" )
0 commit comments