@@ -14,6 +14,10 @@ import (
1414
1515 nvidiacomv1alpha1 "github.com/ai-dynamo/dynamo/deploy/cloud/operator/api/v1alpha1"
1616 commonconsts "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/consts"
17+ "github.com/ai-dynamo/dynamo/deploy/cloud/operator/internal/controller_common"
18+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
19+ "k8s.io/client-go/dynamic"
20+ ctrl "sigs.k8s.io/controller-runtime"
1721)
1822
1923type GroveMultinodeDeployer struct {
@@ -130,3 +134,95 @@ func checkPCSGReady(ctx context.Context, client client.Client, resourceName, nam
130134
131135 return true , ""
132136}
137+
138+ // resolveKaiSchedulerQueueName extracts the queue name from annotations or returns default
139+ // This is the shared logic between DetermineKaiSchedulerQueue and ResolveKaiSchedulerQueue
140+ func resolveKaiSchedulerQueueName (annotations map [string ]string ) string {
141+ queueName := commonconsts .DefaultKaiSchedulerQueue
142+ if annotations != nil {
143+ if annotationQueue , exists := annotations [commonconsts .KubeAnnotationKaiSchedulerQueue ]; exists && strings .TrimSpace (annotationQueue ) != "" {
144+ queueName = strings .TrimSpace (annotationQueue )
145+ }
146+ }
147+ return queueName
148+ }
149+
150+ // ensureQueueExists validates that a Queue resource with the given name exists in the cluster
151+ // Returns an error if the queue doesn't exist or if validation fails
152+ func ensureQueueExists (ctx context.Context , dynamicClient dynamic.Interface , queueName string ) error {
153+ logger := log .FromContext (ctx )
154+
155+ // Try to get the queue resource using the predefined GVR
156+ _ , err := dynamicClient .Resource (commonconsts .QueueGVR ).Get (ctx , queueName , metav1.GetOptions {})
157+ if err != nil {
158+ if errors .IsNotFound (err ) {
159+ logger .Error (err , "Queue not found" , "queueName" , queueName )
160+ return fmt .Errorf ("queue '%s' not found in cluster. Ensure the queue exists before using kai-scheduler" , queueName )
161+ }
162+ logger .Error (err , "Failed to validate queue" , "queueName" , queueName )
163+ return fmt .Errorf ("failed to validate queue '%s': %w" , queueName , err )
164+ }
165+
166+ logger .Info ("Queue validation successful" , "queueName" , queueName )
167+ return nil
168+ }
169+
170+ // DetermineKaiSchedulerQueue determines the queue name for kai-scheduler from deployment annotations or returns default
171+ // Also validates that the queue exists in the cluster
172+ func DetermineKaiSchedulerQueue (ctx context.Context , annotations map [string ]string ) (string , error ) {
173+ // Get the queue name from annotation or use default
174+ queueName := resolveKaiSchedulerQueueName (annotations )
175+
176+ // Create a dynamic client for CRD validation (Queue CRD might not be in the standard client scheme)
177+ cfg , err := ctrl .GetConfig ()
178+ if err != nil {
179+ return "" , fmt .Errorf ("failed to get kubernetes config for queue validation: %w" , err )
180+ }
181+
182+ dynamicClient , err := dynamic .NewForConfig (cfg )
183+ if err != nil {
184+ return "" , fmt .Errorf ("failed to create dynamic client for queue validation: %w" , err )
185+ }
186+
187+ // Validate that the queue exists
188+ if err := ensureQueueExists (ctx , dynamicClient , queueName ); err != nil {
189+ return "" , fmt .Errorf ("kai-scheduler queue validation failed: %w" , err )
190+ }
191+
192+ return queueName , nil
193+ }
194+
195+ // ResolveKaiSchedulerQueue determines the queue name for kai-scheduler from deployment annotations or returns default
196+ // Does NOT validate - use DetermineKaiSchedulerQueue for validation
197+ func ResolveKaiSchedulerQueue (annotations map [string ]string ) string {
198+ return resolveKaiSchedulerQueueName (annotations )
199+ }
200+
201+ // injectKaiSchedulerIfEnabled injects kai-scheduler settings into a clique if kai-scheduler is enabled and grove is enabled
202+ func injectKaiSchedulerIfEnabled (
203+ clique * grovev1alpha1.PodCliqueTemplateSpec ,
204+ controllerConfig controller_common.Config ,
205+ validatedQueueName string ,
206+ ) {
207+ // Only proceed if grove is enabled, kai-scheduler is enabled, and no manual schedulerName is set
208+ if ! controllerConfig .Grove .Enabled || ! controllerConfig .KaiScheduler .Enabled {
209+ return
210+ }
211+
212+ // Check if user has manually set schedulerName - if so, respect their choice
213+ if clique .Spec .PodSpec .SchedulerName != "" && clique .Spec .PodSpec .SchedulerName != commonconsts .KaiSchedulerName {
214+ return
215+ }
216+
217+ // Use the pre-validated queue name
218+ queueName := validatedQueueName
219+
220+ // Inject schedulerName
221+ clique .Spec .PodSpec .SchedulerName = commonconsts .KaiSchedulerName
222+
223+ // Inject queue label
224+ if clique .Labels == nil {
225+ clique .Labels = make (map [string ]string )
226+ }
227+ clique .Labels [commonconsts .KubeLabelKaiSchedulerQueue ] = queueName
228+ }
0 commit comments