From 1962437b37a4c55a68d60d5e0a62765d2a3d38d4 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 16 Aug 2024 14:11:37 +0200 Subject: [PATCH 01/15] Introduce MultiKueue adapter for MPIJob --- .../jobs/mpijob/mpijob_controller.go | 1 + .../jobs/mpijob/mpijob_controller_test.go | 16 +- .../jobs/mpijob/mpijob_multikueue_adapter.go | 117 +++++++++++++ .../mpijob/mpijob_multikueue_adapter_test.go | 160 ++++++++++++++++++ .../testingjobs/mpijob/wrappers_mpijob.go | 120 +++++++++---- .../jobs/mpijob/mpijob_controller_test.go | 13 +- 6 files changed, 384 insertions(+), 43 deletions(-) create mode 100644 pkg/controller/jobs/mpijob/mpijob_multikueue_adapter.go create mode 100644 pkg/controller/jobs/mpijob/mpijob_multikueue_adapter_test.go diff --git a/pkg/controller/jobs/mpijob/mpijob_controller.go b/pkg/controller/jobs/mpijob/mpijob_controller.go index 2dc7113d76..7beb9132b5 100644 --- a/pkg/controller/jobs/mpijob/mpijob_controller.go +++ b/pkg/controller/jobs/mpijob/mpijob_controller.go @@ -51,6 +51,7 @@ func init() { JobType: &kubeflow.MPIJob{}, AddToScheme: kubeflow.AddToScheme, IsManagingObjectsOwner: isMPIJob, + MultiKueueAdapter: &multikueueAdapter{}, })) } diff --git a/pkg/controller/jobs/mpijob/mpijob_controller_test.go b/pkg/controller/jobs/mpijob/mpijob_controller_test.go index a051520921..ada57145be 100644 --- a/pkg/controller/jobs/mpijob/mpijob_controller_test.go +++ b/pkg/controller/jobs/mpijob/mpijob_controller_test.go @@ -211,8 +211,8 @@ func TestReconciler(t *testing.T) { reconcilerOptions: []jobframework.Option{ jobframework.WithManageJobsWithoutQueueName(true), }, - job: testingmpijob.MakeMPIJob("mpijob", "ns").Parallelism(2).Obj(), - wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").Parallelism(2).Obj(), + job: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).Obj(), + wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).Obj(), wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("mpijob", "ns"). PodSets( @@ -226,11 +226,11 @@ func TestReconciler(t *testing.T) { reconcilerOptions: []jobframework.Option{ jobframework.WithManageJobsWithoutQueueName(true), }, - job: testingmpijob.MakeMPIJob("mpijob", "ns").Parallelism(2).WorkloadPriorityClass("test-wpc").Obj(), + job: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).WorkloadPriorityClass("test-wpc").Obj(), priorityClasses: []client.Object{ baseWPCWrapper.Obj(), }, - wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").Parallelism(2).WorkloadPriorityClass("test-wpc").Obj(), + wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).WorkloadPriorityClass("test-wpc").Obj(), wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("mpijob", "ns"). PodSets( @@ -245,11 +245,11 @@ func TestReconciler(t *testing.T) { reconcilerOptions: []jobframework.Option{ jobframework.WithManageJobsWithoutQueueName(true), }, - job: testingmpijob.MakeMPIJob("mpijob", "ns").Parallelism(2).PriorityClass("test-pc").Obj(), + job: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).PriorityClass("test-pc").Obj(), priorityClasses: []client.Object{ basePCWrapper.Obj(), }, - wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").Parallelism(2).PriorityClass("test-pc").Obj(), + wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).PriorityClass("test-pc").Obj(), wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("mpijob", "ns"). 
PodSets( @@ -264,12 +264,12 @@ func TestReconciler(t *testing.T) { reconcilerOptions: []jobframework.Option{ jobframework.WithManageJobsWithoutQueueName(true), }, - job: testingmpijob.MakeMPIJob("mpijob", "ns").Parallelism(2). + job: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2). WorkloadPriorityClass("test-wpc").PriorityClass("test-pc").Obj(), priorityClasses: []client.Object{ basePCWrapper.Obj(), baseWPCWrapper.Obj(), }, - wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").Parallelism(2). + wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2). WorkloadPriorityClass("test-wpc").PriorityClass("test-pc").Obj(), wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("mpijob", "ns"). diff --git a/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter.go b/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter.go new file mode 100644 index 0000000000..99209fd63d --- /dev/null +++ b/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter.go @@ -0,0 +1,117 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package mpijob + +import ( + "context" + "errors" + "fmt" + + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" + + kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1" + "sigs.k8s.io/kueue/pkg/controller/constants" + "sigs.k8s.io/kueue/pkg/controller/jobframework" + "sigs.k8s.io/kueue/pkg/util/api" + clientutil "sigs.k8s.io/kueue/pkg/util/client" +) + +type multikueueAdapter struct{} + +var _ jobframework.MultiKueueAdapter = (*multikueueAdapter)(nil) + +func (b *multikueueAdapter) SyncJob(ctx context.Context, localClient client.Client, remoteClient client.Client, key types.NamespacedName, workloadName, origin string) error { + localJob := kubeflow.MPIJob{} + err := localClient.Get(ctx, key, &localJob) + if err != nil { + return err + } + + remoteJob := kubeflow.MPIJob{} + err = remoteClient.Get(ctx, key, &remoteJob) + if client.IgnoreNotFound(err) != nil { + return err + } + + // if the remote exists, just copy the status + if err == nil { + return clientutil.PatchStatus(ctx, localClient, &localJob, func() (bool, error) { + localJob.Status = remoteJob.Status + return true, nil + }) + } + + remoteJob = kubeflow.MPIJob{ + ObjectMeta: api.CloneObjectMetaForCreation(&localJob.ObjectMeta), + Spec: *localJob.Spec.DeepCopy(), + } + + // add the prebuilt workload + if remoteJob.Labels == nil { + remoteJob.Labels = map[string]string{} + } + remoteJob.Labels[constants.PrebuiltWorkloadLabel] = workloadName + remoteJob.Labels[kueuealpha.MultiKueueOriginLabel] = origin + + return remoteClient.Create(ctx, &remoteJob) +} + +func (b *multikueueAdapter) DeleteRemoteObject(ctx context.Context, remoteClient client.Client, key types.NamespacedName) error { + job := kubeflow.MPIJob{} + err := remoteClient.Get(ctx, key, 
&job) + if err != nil { + return client.IgnoreNotFound(err) + } + return client.IgnoreNotFound(remoteClient.Delete(ctx, &job)) +} + +func (b *multikueueAdapter) KeepAdmissionCheckPending() bool { + return false +} + +func (b *multikueueAdapter) IsJobManagedByKueue(context.Context, client.Client, types.NamespacedName) (bool, string, error) { + return true, "", nil +} + +func (b *multikueueAdapter) GVK() schema.GroupVersionKind { + return gvk +} + +var _ jobframework.MultiKueueWatcher = (*multikueueAdapter)(nil) + +func (*multikueueAdapter) GetEmptyList() client.ObjectList { + return &kubeflow.MPIJobList{} +} + +func (*multikueueAdapter) WorkloadKeyFor(o runtime.Object) (types.NamespacedName, error) { + job, isJob := o.(*kubeflow.MPIJob) + if !isJob { + return types.NamespacedName{}, errors.New("not a mpijob") + } + + prebuiltWl, hasPrebuiltWorkload := job.Labels[constants.PrebuiltWorkloadLabel] + if !hasPrebuiltWorkload { + return types.NamespacedName{}, fmt.Errorf("no prebuilt workload found for mpijob: %s", klog.KObj(job)) + } + + return types.NamespacedName{Name: prebuiltWl, Namespace: job.Namespace}, nil +} diff --git a/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter_test.go b/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter_test.go new file mode 100644 index 0000000000..844bbf215d --- /dev/null +++ b/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter_test.go @@ -0,0 +1,160 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package mpijob + +import ( + "context" + "testing" + + "github.com/google/go-cmp/cmp" + "github.com/google/go-cmp/cmp/cmpopts" + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/interceptor" + + kueuealpha "sigs.k8s.io/kueue/apis/kueue/v1alpha1" + "sigs.k8s.io/kueue/pkg/controller/constants" + "sigs.k8s.io/kueue/pkg/util/slices" + utiltesting "sigs.k8s.io/kueue/pkg/util/testing" + utiltestingmpijob "sigs.k8s.io/kueue/pkg/util/testingjobs/mpijob" +) + +const ( + TestNamespace = "ns" +) + +func TestMultikueueAdapter(t *testing.T) { + objCheckOpts := []cmp.Option{ + cmpopts.IgnoreFields(metav1.ObjectMeta{}, "ResourceVersion"), + cmpopts.EquateEmpty(), + } + + mpiJobBuilder := utiltestingmpijob.MakeMPIJob("mpijob1", TestNamespace) + + cases := map[string]struct { + managersJobSets []kubeflow.MPIJob + workerJobSets []kubeflow.MPIJob + + operation func(ctx context.Context, adapter *multikueueAdapter, managerClient, workerClient client.Client) error + + wantError error + wantManagersJobSets []kubeflow.MPIJob + wantWorkerJobSets []kubeflow.MPIJob + }{ + + "sync creates missing remote mpijob": { + managersJobSets: []kubeflow.MPIJob{ + *mpiJobBuilder.DeepCopy(), + }, + operation: func(ctx context.Context, adapter *multikueueAdapter, managerClient, workerClient client.Client) error { + return adapter.SyncJob(ctx, managerClient, workerClient, types.NamespacedName{Name: "mpijob1", Namespace: TestNamespace}, "wl1", "origin1") + }, + + wantManagersJobSets: []kubeflow.MPIJob{ + *mpiJobBuilder.DeepCopy(), + }, + wantWorkerJobSets: []kubeflow.MPIJob{ + *mpiJobBuilder.Clone(). + Label(constants.PrebuiltWorkloadLabel, "wl1"). + Label(kueuealpha.MultiKueueOriginLabel, "origin1"). + Obj(), + }, + }, + "sync status from remote mpijob": { + managersJobSets: []kubeflow.MPIJob{ + *mpiJobBuilder.DeepCopy(), + }, + workerJobSets: []kubeflow.MPIJob{ + *mpiJobBuilder.Clone(). + Label(constants.PrebuiltWorkloadLabel, "wl1"). + Label(kueuealpha.MultiKueueOriginLabel, "origin1"). + StatusConditions(kubeflow.JobCondition{Type: kubeflow.JobSucceeded, Status: corev1.ConditionTrue}). + Obj(), + }, + operation: func(ctx context.Context, adapter *multikueueAdapter, managerClient, workerClient client.Client) error { + return adapter.SyncJob(ctx, managerClient, workerClient, types.NamespacedName{Name: "mpijob1", Namespace: TestNamespace}, "wl1", "origin1") + }, + + wantManagersJobSets: []kubeflow.MPIJob{ + *mpiJobBuilder.Clone(). + StatusConditions(kubeflow.JobCondition{Type: kubeflow.JobSucceeded, Status: corev1.ConditionTrue}). + Obj(), + }, + wantWorkerJobSets: []kubeflow.MPIJob{ + *mpiJobBuilder.Clone(). + Label(constants.PrebuiltWorkloadLabel, "wl1"). + Label(kueuealpha.MultiKueueOriginLabel, "origin1"). + StatusConditions(kubeflow.JobCondition{Type: kubeflow.JobSucceeded, Status: corev1.ConditionTrue}). + Obj(), + }, + }, + "remote mpijob is deleted": { + workerJobSets: []kubeflow.MPIJob{ + *mpiJobBuilder.Clone(). + Label(constants.PrebuiltWorkloadLabel, "wl1"). + Label(kueuealpha.MultiKueueOriginLabel, "origin1"). 
+ Obj(), + }, + operation: func(ctx context.Context, adapter *multikueueAdapter, managerClient, workerClient client.Client) error { + return adapter.DeleteRemoteObject(ctx, workerClient, types.NamespacedName{Name: "mpijob1", Namespace: TestNamespace}) + }, + }, + } + for name, tc := range cases { + t.Run(name, func(t *testing.T) { + managerBuilder := utiltesting.NewClientBuilder(kubeflow.AddToScheme).WithInterceptorFuncs(interceptor.Funcs{SubResourcePatch: utiltesting.TreatSSAAsStrategicMerge}) + managerBuilder = managerBuilder.WithLists(&kubeflow.MPIJobList{Items: tc.managersJobSets}) + managerBuilder = managerBuilder.WithStatusSubresource(slices.Map(tc.managersJobSets, func(w *kubeflow.MPIJob) client.Object { return w })...) + managerClient := managerBuilder.Build() + + workerBuilder := utiltesting.NewClientBuilder(kubeflow.AddToScheme).WithInterceptorFuncs(interceptor.Funcs{SubResourcePatch: utiltesting.TreatSSAAsStrategicMerge}) + workerBuilder = workerBuilder.WithLists(&kubeflow.MPIJobList{Items: tc.workerJobSets}) + workerClient := workerBuilder.Build() + + ctx, _ := utiltesting.ContextWithLog(t) + + adapter := &multikueueAdapter{} + + gotErr := tc.operation(ctx, adapter, managerClient, workerClient) + + if diff := cmp.Diff(tc.wantError, gotErr, cmpopts.EquateErrors()); diff != "" { + t.Errorf("unexpected error (-want/+got):\n%s", diff) + } + + gotManagersJobSets := &kubeflow.MPIJobList{} + if err := managerClient.List(ctx, gotManagersJobSets); err != nil { + t.Errorf("unexpected list manager's mpijobs error %s", err) + } else { + if diff := cmp.Diff(tc.wantManagersJobSets, gotManagersJobSets.Items, objCheckOpts...); diff != "" { + t.Errorf("unexpected manager's mpijobs (-want/+got):\n%s", diff) + } + } + + gotWorkerJobSets := &kubeflow.MPIJobList{} + if err := workerClient.List(ctx, gotWorkerJobSets); err != nil { + t.Errorf("unexpected list worker's mpijobs error %s", err) + } else { + if diff := cmp.Diff(tc.wantWorkerJobSets, gotWorkerJobSets.Items, objCheckOpts...); diff != "" { + t.Errorf("unexpected worker's mpijobs (-want/+got):\n%s", diff) + } + } + }) + } +} diff --git a/pkg/util/testingjobs/mpijob/wrappers_mpijob.go b/pkg/util/testingjobs/mpijob/wrappers_mpijob.go index 42be69ce49..f19661b73f 100644 --- a/pkg/util/testingjobs/mpijob/wrappers_mpijob.go +++ b/pkg/util/testingjobs/mpijob/wrappers_mpijob.go @@ -42,44 +42,88 @@ func MakeMPIJob(name, ns string) *MPIJobWrapper { RunPolicy: kubeflow.RunPolicy{ Suspend: ptr.To(true), }, - MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{ - kubeflow.MPIReplicaTypeLauncher: { - Replicas: ptr.To[int32](1), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - RestartPolicy: corev1.RestartPolicyNever, - Containers: []corev1.Container{ - { - Name: "c", - Image: "pause", - Command: []string{}, - Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{}}, - }, - }, - NodeSelector: map[string]string{}, - }, + MPIReplicaSpecs: make(map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec), + }, + }, + } +} + +type MPIJobReplicaSpecRequirement struct { + ReplicaType kubeflow.MPIReplicaType + Name string + ReplicaCount int32 + Annotations map[string]string + RestartPolicy kubeflow.RestartPolicy +} + +func (j *MPIJobWrapper) MPIJobReplicaSpecs(replicaSpecs ...MPIJobReplicaSpecRequirement) *MPIJobWrapper { + j = j.MPIJobReplicaSpecsDefault() + for _, rs := range replicaSpecs { + j.Spec.MPIReplicaSpecs[rs.ReplicaType].Replicas = ptr.To[int32](rs.ReplicaCount) + j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.Name = 
rs.Name + j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.Spec.RestartPolicy = corev1.RestartPolicy(rs.RestartPolicy) + j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.Spec.Containers[0].Name = "mpijob" + + if rs.Annotations != nil { + j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.ObjectMeta.Annotations = rs.Annotations + } + } + + return j +} + +func (j *MPIJobWrapper) MPIJobReplicaSpecsDefault() *MPIJobWrapper { + j.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher] = &kubeflow.ReplicaSpec{ + Replicas: ptr.To[int32](1), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + RestartPolicy: "Never", + Containers: []corev1.Container{ + { + Name: "c", + Image: "pause", + Command: []string{}, + Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{}}, }, }, - kubeflow.MPIReplicaTypeWorker: { - Replicas: ptr.To[int32](1), - Template: corev1.PodTemplateSpec{ - Spec: corev1.PodSpec{ - RestartPolicy: corev1.RestartPolicyNever, - Containers: []corev1.Container{ - { - Name: "c", - Image: "pause", - Command: []string{}, - Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{}}, - }, - }, - NodeSelector: map[string]string{}, - }, + NodeSelector: map[string]string{}, + }, + }, + } + + j.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker] = &kubeflow.ReplicaSpec{ + Replicas: ptr.To[int32](1), + Template: corev1.PodTemplateSpec{ + Spec: corev1.PodSpec{ + RestartPolicy: "Never", + Containers: []corev1.Container{ + { + Name: "c", + Image: "pause", + Command: []string{}, + Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{}}, }, }, + NodeSelector: map[string]string{}, }, }, - }} + } + + return j +} + +// Clone returns deep copy of the PaddleJobWrapper. +func (j *MPIJobWrapper) Clone() *MPIJobWrapper { + return &MPIJobWrapper{MPIJob: *j.DeepCopy()} +} + +// Label sets the label key and value +func (j *MPIJobWrapper) Label(key, value string) *MPIJobWrapper { + if j.Labels == nil { + j.Labels = make(map[string]string) + } + j.Labels[key] = value + return j } // PriorityClass updates job priorityclass. @@ -161,3 +205,15 @@ func (j *MPIJobWrapper) Generation(num int64) *MPIJobWrapper { j.ObjectMeta.Generation = num return j } + +// Condition adds a condition +func (j *MPIJobWrapper) StatusConditions(c kubeflow.JobCondition) *MPIJobWrapper { + j.Status.Conditions = append(j.Status.Conditions, c) + return j +} + +func (j *MPIJobWrapper) Image(replicaType kubeflow.MPIReplicaType, image string, args []string) *MPIJobWrapper { + j.Spec.MPIReplicaSpecs[replicaType].Template.Spec.Containers[0].Image = image + j.Spec.MPIReplicaSpecs[replicaType].Template.Spec.Containers[0].Args = args + return j +} diff --git a/test/integration/controller/jobs/mpijob/mpijob_controller_test.go b/test/integration/controller/jobs/mpijob/mpijob_controller_test.go index 4e503346e4..330d013fa3 100644 --- a/test/integration/controller/jobs/mpijob/mpijob_controller_test.go +++ b/test/integration/controller/jobs/mpijob/mpijob_controller_test.go @@ -89,7 +89,9 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu PriorityValue(int32(priorityValue)).Obj() gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed()) - job := testingmpijob.MakeMPIJob(jobName, ns.Name).PriorityClass(priorityClassName).Obj() + job := testingmpijob.MakeMPIJob(jobName, ns.Name). + MPIJobReplicaSpecsDefault(). 
+ PriorityClass(priorityClassName).Obj() err := k8sClient.Create(ctx, job) gomega.Expect(err).To(gomega.Succeed()) createdJob := &kubeflow.MPIJob{} @@ -316,6 +318,7 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu ginkgo.It("labels and annotations should be propagated from admission check to job", func() { job := testingmpijob.MakeMPIJob(jobName, ns.Name). + MPIJobReplicaSpecsDefault(). Queue(localQueue.Name). PodAnnotation(kubeflow.MPIReplicaTypeWorker, "old-ann-key", "old-ann-value"). PodLabel(kubeflow.MPIReplicaTypeWorker, "old-label-key", "old-label-value"). @@ -503,7 +506,7 @@ var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue ginkgo.It("Should reconcile jobs only when queue is set", func() { ginkgo.By("checking the workload is not created when queue name is not set") - job := testingmpijob.MakeMPIJob(jobName, ns.Name).Obj() + job := testingmpijob.MakeMPIJob(jobName, ns.Name).MPIJobReplicaSpecsDefault().Obj() gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed()) lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name} createdJob := &kubeflow.MPIJob{} @@ -616,7 +619,9 @@ var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.O ginkgo.DescribeTable("Single job at different stages of progress towards completion", func(podsReadyTestSpec podsReadyTestSpec) { ginkgo.By("Create a job") - job := testingmpijob.MakeMPIJob(jobName, ns.Name).Parallelism(2).Obj() + job := testingmpijob.MakeMPIJob(jobName, ns.Name). + MPIJobReplicaSpecsDefault(). + Parallelism(2).Obj() jobQueueName := "test-queue" job.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName} gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed()) @@ -839,6 +844,7 @@ var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Orde ginkgo.By("checking a dev job starts") job := testingmpijob.MakeMPIJob("dev-job", ns.Name).Queue(localQueue.Name). + MPIJobReplicaSpecsDefault(). Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceCPU, "3"). Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceCPU, "4"). Obj() @@ -859,6 +865,7 @@ var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Orde ginkgo.It("Should restore the original node selectors", func() { localQueue := testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj() job := testingmpijob.MakeMPIJob(jobName, ns.Name).Queue(localQueue.Name). + MPIJobReplicaSpecsDefault(). Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceCPU, "3"). Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceCPU, "4"). 
Obj() From 422ae06588b379d902feb3f59da21f975f539fa7 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Mon, 19 Aug 2024 14:37:26 +0200 Subject: [PATCH 02/15] Add MPIJobs integration tests --- .../integration/multikueue/multikueue_test.go | 87 ++++++++++++++++++- test/integration/multikueue/suite_test.go | 22 ++++- 2 files changed, 105 insertions(+), 4 deletions(-) diff --git a/test/integration/multikueue/multikueue_test.go b/test/integration/multikueue/multikueue_test.go index b347a16b24..14b00248cc 100644 --- a/test/integration/multikueue/multikueue_test.go +++ b/test/integration/multikueue/multikueue_test.go @@ -23,6 +23,7 @@ import ( "time" "github.com/google/go-cmp/cmp/cmpopts" + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" @@ -41,14 +42,18 @@ import ( "sigs.k8s.io/kueue/pkg/controller/admissionchecks/multikueue" workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job" workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset" + workloadpaddlejob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/paddlejob" workloadpytorchjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/pytorchjob" workloadtfjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/tfjob" workloadxgboostjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/xgboostjob" + workloadmpijob "sigs.k8s.io/kueue/pkg/controller/jobs/mpijob" "sigs.k8s.io/kueue/pkg/features" utiltesting "sigs.k8s.io/kueue/pkg/util/testing" testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job" testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset" + testingmpijob "sigs.k8s.io/kueue/pkg/util/testingjobs/mpijob" + testingpaddlejob "sigs.k8s.io/kueue/pkg/util/testingjobs/paddlejob" testingpytorchjob "sigs.k8s.io/kueue/pkg/util/testingjobs/pytorchjob" testingtfjob "sigs.k8s.io/kueue/pkg/util/testingjobs/tfjob" @@ -947,7 +952,7 @@ var _ = ginkgo.Describe("Multikueue", ginkgo.Ordered, ginkgo.ContinueOnFailure, }) ginkgo.It("Should run a XGBoostJob on worker if admitted", func() { - xgBoostJob := testingxgboostjob.MakeXGBoostJob("pytorchjob1", managerNs.Name). + xgBoostJob := testingxgboostjob.MakeXGBoostJob("xgboostjob1", managerNs.Name). Queue(managerLq.Name). XGBReplicaSpecs( testingxgboostjob.XGBReplicaSpecRequirement{ @@ -1026,6 +1031,86 @@ var _ = ginkgo.Describe("Multikueue", ginkgo.Ordered, ginkgo.ContinueOnFailure, }) }) + ginkgo.It("Should run a MPIJob on worker if admitted", func() { + mpijob := testingmpijob.MakeMPIJob("mpijob1", managerNs.Name). + Queue(managerLq.Name). + MPIJobReplicaSpecs( + testingmpijob.MPIJobReplicaSpecRequirement{ + ReplicaType: kubeflow.MPIReplicaTypeLauncher, + ReplicaCount: 1, + Name: "launcher", + RestartPolicy: "OnFailure", + }, + testingmpijob.MPIJobReplicaSpecRequirement{ + ReplicaType: kubeflow.MPIReplicaTypeWorker, + ReplicaCount: 1, + Name: "worker", + RestartPolicy: "Never", + }, + ). 
+ Obj() + gomega.Expect(managerTestCluster.client.Create(managerTestCluster.ctx, mpijob)).Should(gomega.Succeed()) + + wlLookupKey := types.NamespacedName{Name: workloadmpijob.GetWorkloadNameForMPIJob(mpijob.Name, mpijob.UID), Namespace: managerNs.Name} + admission := utiltesting.MakeAdmission(managerCq.Name).PodSets( + kueue.PodSetAssignment{ + Name: "launcher", + }, kueue.PodSetAssignment{ + Name: "worker", + }, + ) + + admitWorkloadAndCheckWorkerCopies(multikueueAC.Name, wlLookupKey, admission) + + ginkgo.By("changing the status of the MPIJob in the worker, updates the manager's MPIJob status", func() { + gomega.Eventually(func(g gomega.Gomega) { + createdMPIJob := kubeflow.MPIJob{} + g.Expect(worker2TestCluster.client.Get(worker2TestCluster.ctx, client.ObjectKeyFromObject(mpijob), &createdMPIJob)).To(gomega.Succeed()) + createdMPIJob.Status.ReplicaStatuses = map[kubeflow.MPIReplicaType]*kubeflow.ReplicaStatus{ + kubeflow.MPIReplicaTypeLauncher: { + Active: 1, + }, + kubeflow.MPIReplicaTypeWorker: { + Active: 1, + Succeeded: 1, + }, + } + g.Expect(worker2TestCluster.client.Status().Update(worker2TestCluster.ctx, &createdMPIJob)).To(gomega.Succeed()) + }, util.Timeout, util.Interval).Should(gomega.Succeed()) + gomega.Eventually(func(g gomega.Gomega) { + createdMPIJob := kubeflow.MPIJob{} + g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, client.ObjectKeyFromObject(mpijob), &createdMPIJob)).To(gomega.Succeed()) + g.Expect(createdMPIJob.Status.ReplicaStatuses).To(gomega.Equal( + map[kubeflow.MPIReplicaType]*kubeflow.ReplicaStatus{ + kubeflow.MPIReplicaTypeLauncher: { + Active: 1, + }, + kubeflow.MPIReplicaTypeWorker: { + Active: 1, + Succeeded: 1, + }, + })) + }, util.Timeout, util.Interval).Should(gomega.Succeed()) + }) + + ginkgo.By("finishing the worker MPIJob, the manager's wl is marked as finished and the worker2 wl removed", func() { + finishJobReason := "MPIJob finished successfully" + gomega.Eventually(func(g gomega.Gomega) { + createdMPIJob := kubeflow.MPIJob{} + g.Expect(worker2TestCluster.client.Get(worker2TestCluster.ctx, client.ObjectKeyFromObject(mpijob), &createdMPIJob)).To(gomega.Succeed()) + createdMPIJob.Status.Conditions = append(createdMPIJob.Status.Conditions, kubeflow.JobCondition{ + Type: kubeflow.JobSucceeded, + Status: corev1.ConditionTrue, + Reason: "ByTest", + Message: finishJobReason, + }) + g.Expect(worker2TestCluster.client.Status().Update(worker2TestCluster.ctx, &createdMPIJob)).To(gomega.Succeed()) + }, util.Timeout, util.Interval).Should(gomega.Succeed()) + + waitForWorkloadToFinishAndRemoteWorkloadToBeDeleted(wlLookupKey, finishJobReason) + }) + }) + ginkgo.It("Should remove the worker's workload and job after reconnect when the managers job and workload are deleted", func() { job := testingjob.MakeJob("job", managerNs.Name). Queue(managerLq.Name).
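For context on what the integration test above assumes on the manager cluster: the MultiKueue admission check (multikueueAC) and the ClusterQueue/LocalQueue pair are created by the suite's setup helpers elsewhere in this file. A rough, illustrative sketch of the equivalent user-facing configuration is shown below; all names are placeholders, the worker kubeconfig Secret is omitted, and this is not part of the patch itself.

# Illustrative MultiKueue wiring on the manager cluster (names are placeholders).
apiVersion: kueue.x-k8s.io/v1beta1
kind: AdmissionCheck
metadata:
  name: sample-multikueue
spec:
  controllerName: kueue.x-k8s.io/multikueue
  parameters:
    apiGroup: kueue.x-k8s.io
    kind: MultiKueueConfig
    name: multikueue-test
---
apiVersion: kueue.x-k8s.io/v1alpha1
kind: MultiKueueConfig
metadata:
  name: multikueue-test
spec:
  clusters:
  - multikueue-test-worker1
---
apiVersion: kueue.x-k8s.io/v1alpha1
kind: MultiKueueCluster
metadata:
  name: multikueue-test-worker1
spec:
  kubeConfig:
    locationType: Secret
    location: multikueue-test-worker1-secret   # Secret holding the worker kubeconfig (not shown)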
diff --git a/test/integration/multikueue/suite_test.go b/test/integration/multikueue/suite_test.go index 00db2ce734..9674dbb7a8 100644 --- a/test/integration/multikueue/suite_test.go +++ b/test/integration/multikueue/suite_test.go @@ -45,6 +45,7 @@ import ( workloadpytorchjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/pytorchjob" workloadtfjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/tfjob" workloadxgboostjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/xgboostjob" + workloadmpijob "sigs.k8s.io/kueue/pkg/controller/jobs/mpijob" "sigs.k8s.io/kueue/pkg/queue" "sigs.k8s.io/kueue/pkg/util/kubeversion" utiltesting "sigs.k8s.io/kueue/pkg/util/testing" @@ -90,9 +91,12 @@ func TestMultiKueue(t *testing.T) { func createCluster(setupFnc framework.ManagerSetup, apiFeatureGates ...string) cluster { c := cluster{} c.fwk = &framework.Framework{ - CRDPath: filepath.Join("..", "..", "..", "config", "components", "crd", "bases"), - WebhookPath: filepath.Join("..", "..", "..", "config", "components", "webhook"), - DepCRDPaths: []string{filepath.Join("..", "..", "..", "dep-crds", "jobset-operator"), filepath.Join("..", "..", "..", "dep-crds", "training-operator")}, + CRDPath: filepath.Join("..", "..", "..", "config", "components", "crd", "bases"), + WebhookPath: filepath.Join("..", "..", "..", "config", "components", "webhook"), + DepCRDPaths: []string{filepath.Join("..", "..", "..", "dep-crds", "jobset-operator"), + filepath.Join("..", "..", "..", "dep-crds", "training-operator"), + filepath.Join("..", "..", "..", "dep-crds", "mpi-operator"), + }, APIServerFeatureGates: apiFeatureGates, } c.cfg = c.fwk.Init() @@ -187,6 +191,18 @@ func managerSetup(ctx context.Context, mgr manager.Manager) { err = workloadxgboostjob.SetupXGBoostJobWebhook(mgr) gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + err = workloadmpijob.SetupIndexes(ctx, mgr.GetFieldIndexer()) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + mpiJobReconciler := workloadmpijob.NewReconciler( + mgr.GetClient(), + mgr.GetEventRecorderFor(constants.JobControllerName)) + err = mpiJobReconciler.SetupWithManager(mgr) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + + err = workloadmpijob.SetupMPIJobWebhook(mgr) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) } func managerAndMultiKueueSetup(ctx context.Context, mgr manager.Manager, gcInterval time.Duration) { From e10f9f720ebc0b85b12d8e43b999e079458a29e7 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Mon, 19 Aug 2024 14:33:05 +0200 Subject: [PATCH 03/15] Update access rights for mpijob --- charts/kueue/templates/rbac/role.yaml | 1 + config/components/rbac/role.yaml | 1 + pkg/controller/jobframework/validation.go | 4 +++- .../multikueue/create-multikueue-kubeconfig.sh | 16 ++++++++++++++++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/charts/kueue/templates/rbac/role.yaml b/charts/kueue/templates/rbac/role.yaml index 5ad1221638..41f948e72d 100644 --- a/charts/kueue/templates/rbac/role.yaml +++ b/charts/kueue/templates/rbac/role.yaml @@ -205,6 +205,7 @@ rules: - mpijobs/status verbs: - get + - patch - update - apiGroups: - kubeflow.org diff --git a/config/components/rbac/role.yaml b/config/components/rbac/role.yaml index 4296c83b70..3eaef50b2b 100644 --- a/config/components/rbac/role.yaml +++ b/config/components/rbac/role.yaml @@ -204,6 +204,7 @@ rules: - mpijobs/status verbs: - get + - patch - update - apiGroups: - kubeflow.org diff --git a/pkg/controller/jobframework/validation.go b/pkg/controller/jobframework/validation.go index 
6478648c87..97ee3bf617 100644 --- a/pkg/controller/jobframework/validation.go +++ b/pkg/controller/jobframework/validation.go @@ -20,6 +20,7 @@ import ( "fmt" "strings" + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" batchv1 "k8s.io/api/batch/v1" apivalidation "k8s.io/apimachinery/pkg/api/validation" @@ -42,7 +43,8 @@ var ( kftraining.SchemeGroupVersion.WithKind(kftraining.TFJobKind).String(), kftraining.SchemeGroupVersion.WithKind(kftraining.PaddleJobKind).String(), kftraining.SchemeGroupVersion.WithKind(kftraining.PyTorchJobKind).String(), - kftraining.SchemeGroupVersion.WithKind(kftraining.XGBoostJobKind).String()) + kftraining.SchemeGroupVersion.WithKind(kftraining.XGBoostJobKind).String(), + kubeflow.SchemeGroupVersion.WithKind(kubeflow.Kind).String()) ) // ValidateJobOnCreate encapsulates all GenericJob validations that must be performed on a Create operation diff --git a/site/static/examples/multikueue/create-multikueue-kubeconfig.sh b/site/static/examples/multikueue/create-multikueue-kubeconfig.sh index 971d7b9d36..4ad71e36df 100644 --- a/site/static/examples/multikueue/create-multikueue-kubeconfig.sh +++ b/site/static/examples/multikueue/create-multikueue-kubeconfig.sh @@ -149,6 +149,22 @@ rules: - xgboostjobs/status verbs: - get +- apiGroups: + - kubeflow.org + resources: + - mpijobs + verbs: + - create + - delete + - get + - list + - watch +- apiGroups: + - kubeflow.org + resources: + - mpijobs/status + verbs: + - get --- apiVersion: rbac.authorization.k8s.io/v1 kind: ClusterRoleBinding From edeeb9cc69a9a330bfea4c261f7712cd72272152 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Mon, 19 Aug 2024 14:35:03 +0200 Subject: [PATCH 04/15] Remove v1 MPIJob yaml from training-operator dep-crds --- Makefile-deps.mk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile-deps.mk b/Makefile-deps.mk index f266b27be3..724b8c04d0 100644 --- a/Makefile-deps.mk +++ b/Makefile-deps.mk @@ -112,7 +112,7 @@ KF_TRAINING_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github. .PHONY: kf-training-operator-crd kf-training-operator-crd: ## Copy the CRDs from the training-operator to the dep-crds directory. 
mkdir -p $(EXTERNAL_CRDS_DIR)/training-operator/ - cp -f $(KF_TRAINING_ROOT)/manifests/base/crds/* $(EXTERNAL_CRDS_DIR)/training-operator/ + find $(KF_TRAINING_ROOT)/manifests/base/crds/ -type f -not -name "kubeflow.org_mpijobs.yaml" -exec cp {} $(EXTERNAL_CRDS_DIR)/training-operator/ \; RAY_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github.com/ray-project/kuberay/ray-operator) .PHONY: ray-operator-crd From 2ae47f57748050849816cbebbb40d767bc73dc4a Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Mon, 19 Aug 2024 14:36:37 +0200 Subject: [PATCH 05/15] Update MPIJob version to v2beta1 --- pkg/controller/jobs/mpijob/mpijob_controller.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/controller/jobs/mpijob/mpijob_controller.go b/pkg/controller/jobs/mpijob/mpijob_controller.go index 7beb9132b5..0e75e65d55 100644 --- a/pkg/controller/jobs/mpijob/mpijob_controller.go +++ b/pkg/controller/jobs/mpijob/mpijob_controller.go @@ -58,7 +58,7 @@ func init() { // +kubebuilder:rbac:groups=scheduling.k8s.io,resources=priorityclasses,verbs=list;get;watch // +kubebuilder:rbac:groups="",resources=events,verbs=create;watch;update;patch // +kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs,verbs=get;list;watch;update;patch -// +kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs/status,verbs=get;update +// +kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs/status,verbs=get;update;patch // +kubebuilder:rbac:groups=kubeflow.org,resources=mpijobs/finalizers,verbs=get;update // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads,verbs=get;list;watch;create;update;patch;delete // +kubebuilder:rbac:groups=kueue.x-k8s.io,resources=workloads/status,verbs=get;update;patch @@ -73,7 +73,7 @@ func NewJob() jobframework.GenericJob { var NewReconciler = jobframework.NewGenericReconcilerFactory(NewJob) func isMPIJob(owner *metav1.OwnerReference) bool { - return owner.Kind == "MPIJob" && strings.HasPrefix(owner.APIVersion, "kubeflow.org/v2") + return owner.Kind == "MPIJob" && strings.HasPrefix(owner.APIVersion, kubeflow.SchemeGroupVersion.Group) } type MPIJob kubeflow.MPIJob From f4b2402e79c2d861fe7f97ceb2881a2cdd87fd99 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 23 Aug 2024 12:55:31 +0200 Subject: [PATCH 06/15] Introduce mpi-operator to multikueue e2e tests Modify training-operator setup to be able to work along side mpi-operator --- .gitignore | 1 + Makefile-deps.mk | 10 ++++++++- Makefile-test.mk | 10 +++++---- hack/e2e-common.sh | 43 +++++++++++++++++++++++++++++++++++-- hack/multikueue-e2e-test.sh | 6 ++++++ 5 files changed, 63 insertions(+), 7 deletions(-) diff --git a/.gitignore b/.gitignore index c20e0f25b1..5e6d17a14b 100644 --- a/.gitignore +++ b/.gitignore @@ -29,5 +29,6 @@ testbin/* # Directory with CRDs for dependencies dep-crds +dep-manifests kueue.yaml diff --git a/Makefile-deps.mk b/Makefile-deps.mk index 724b8c04d0..125414504c 100644 --- a/Makefile-deps.mk +++ b/Makefile-deps.mk @@ -16,6 +16,7 @@ PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) TOOLS_DIR := $(PROJECT_DIR)/hack/internal/tools BIN_DIR ?= $(PROJECT_DIR)/bin EXTERNAL_CRDS_DIR ?= $(PROJECT_DIR)/dep-crds +EXTERNAL_MANIFESTS_DIR ?= $(PROJECT_DIR)/dep-manifests ifeq (,$(shell go env GOBIN)) GOBIN=$(shell go env GOPATH)/bin @@ -112,7 +113,14 @@ KF_TRAINING_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github. 
.PHONY: kf-training-operator-crd kf-training-operator-crd: ## Copy the CRDs from the training-operator to the dep-crds directory. mkdir -p $(EXTERNAL_CRDS_DIR)/training-operator/ - find $(KF_TRAINING_ROOT)/manifests/base/crds/ -type f -not -name "kubeflow.org_mpijobs.yaml" -exec cp {} $(EXTERNAL_CRDS_DIR)/training-operator/ \; + # Remove `kubeflow.org_mpijobs.yaml` from the folder and from the kustomization.yaml + find $(KF_TRAINING_ROOT)/manifests/base/crds/ -type f -not -name "kubeflow.org_mpijobs.yaml" -exec cp -f {} $(EXTERNAL_CRDS_DIR)/training-operator/ \; + sed -i '/kubeflow.org_mpijobs.yaml/d' $(EXTERNAL_CRDS_DIR)/training-operator/kustomization.yaml + +.PHONY: kf-training-operator-manifest +kf-training-operator-manifest: ## Copy the manifest from the training-operator to the dep-manifests directory. + mkdir -p $(EXTERNAL_MANIFESTS_DIR)/training-operator/ + cp -rf $(KF_TRAINING_ROOT)/manifests/ $(EXTERNAL_MANIFESTS_DIR)/training-operator/ ; RAY_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github.com/ray-project/kuberay/ray-operator) .PHONY: ray-operator-crd diff --git a/Makefile-test.mk b/Makefile-test.mk index d839573dc7..72625d8714 100644 --- a/Makefile-test.mk +++ b/Makefile-test.mk @@ -59,6 +59,8 @@ IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG) # JobSet Version JOBSET_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" sigs.k8s.io/jobset) KUBEFLOW_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/training-operator) +# Mismatch between manifest semver tag and docker release tag, latter has no leading 'v' +KUBEFLOW_MPI_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/mpi-operator | sed 's/^v//') ##@ Tests @@ -76,10 +78,10 @@ test-integration: gomod-download envtest ginkgo mpi-operator-crd ray-operator-cr CREATE_KIND_CLUSTER ?= true .PHONY: test-e2e -test-e2e: kustomize ginkgo yq gomod-download jobset-operator-crd kf-training-operator-crd kueuectl run-test-e2e-$(E2E_KIND_VERSION:kindest/node:v%=%) +test-e2e: kustomize ginkgo yq gomod-download jobset-operator-crd kf-training-operator-crd mpi-operator-crd kueuectl run-test-e2e-$(E2E_KIND_VERSION:kindest/node:v%=%) .PHONY: test-multikueue-e2e -test-multikueue-e2e: kustomize ginkgo yq gomod-download jobset-operator-crd kf-training-operator-crd run-test-multikueue-e2e-$(E2E_KIND_VERSION:kindest/node:v%=%) +test-multikueue-e2e: kustomize ginkgo yq gomod-download jobset-operator-crd kf-training-operator-crd kf-training-operator-manifest mpi-operator-crd run-test-multikueue-e2e-$(E2E_KIND_VERSION:kindest/node:v%=%) E2E_TARGETS := $(addprefix run-test-e2e-,${E2E_K8S_VERSIONS}) @@ -92,12 +94,12 @@ FORCE: run-test-e2e-%: K8S_VERSION = $(@:run-test-e2e-%=%) run-test-e2e-%: FORCE @echo Running e2e for k8s ${K8S_VERSION} - E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" JOBSET_VERSION=$(JOBSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) ./hack/e2e-test.sh + E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" JOBSET_VERSION=$(JOBSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) KUBEFLOW_MPI_VERSION=$(KUBEFLOW_MPI_VERSION) ./hack/e2e-test.sh run-test-multikueue-e2e-%: K8S_VERSION = $(@:run-test-multikueue-e2e-%=%) run-test-multikueue-e2e-%: FORCE @echo Running multikueue 
e2e for k8s ${K8S_VERSION} - E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" JOBSET_VERSION=$(JOBSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) ./hack/multikueue-e2e-test.sh + E2E_KIND_VERSION="kindest/node:v$(K8S_VERSION)" KIND_CLUSTER_NAME=$(KIND_CLUSTER_NAME) CREATE_KIND_CLUSTER=$(CREATE_KIND_CLUSTER) ARTIFACTS="$(ARTIFACTS)/$@" IMAGE_TAG=$(IMAGE_TAG) GINKGO_ARGS="$(GINKGO_ARGS)" JOBSET_VERSION=$(JOBSET_VERSION) KUBEFLOW_VERSION=$(KUBEFLOW_VERSION) KUBEFLOW_MPI_VERSION=$(KUBEFLOW_MPI_VERSION) ./hack/multikueue-e2e-test.sh SCALABILITY_RUNNER := $(PROJECT_DIR)/bin/performance-scheduler-runner .PHONY: performance-scheduler-runner diff --git a/hack/e2e-common.sh b/hack/e2e-common.sh index 850e17f098..a2ee7c6aef 100644 --- a/hack/e2e-common.sh +++ b/hack/e2e-common.sh @@ -23,11 +23,15 @@ export JOBSET_MANIFEST="https://github.com/kubernetes-sigs/jobset/releases/downl export JOBSET_IMAGE=registry.k8s.io/jobset/jobset:${JOBSET_VERSION} export JOBSET_CRDS=${ROOT_DIR}/dep-crds/jobset-operator/ -export KUBEFLOW_MANIFEST="https://github.com/kubeflow/training-operator/manifests/overlays/standalone?ref=${KUBEFLOW_VERSION}" +export KUBEFLOW_MANIFEST_DIR="${ROOT_DIR}/dep-manifests/training-operator" #no matching semver tag unfortunately export KUBEFLOW_IMAGE=kubeflow/training-operator:v1-855e096 export KUBEFLOW_CRDS=${ROOT_DIR}/dep-crds/training-operator/ +export KUBEFLOW_MPI_MANIFEST="https://github.com/kubeflow/mpi-operator/manifests/overlays/standalone?ref=v${KUBEFLOW_MPI_VERSION}" +export KUBEFLOW_MPI_IMAGE=mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION} +export KUBEFLOW_MPI_CRD=${ROOT_DIR}/dep-crds/mpi-operator/kubeflow.org_mpijobs.yaml + # $1 - cluster name function cluster_cleanup { kubectl config use-context "kind-$1" @@ -72,11 +76,46 @@ function install_jobset { kubectl apply --server-side -f "${JOBSET_MANIFEST}" } +function patch_kubeflow_manifest { + # In order for MPI-operator and Training-operator to work on the same cluster it is required that: + # 1. 'kubeflow.org_mpijobs.yaml' is removed from base/crds/kustomization.yaml - https://github.com/kubeflow/training-operator/issues/1930 + chmod -R u+w "${KUBEFLOW_MANIFEST_DIR}/" + sed -i '/kubeflow.org_mpijobs.yaml/d' ${KUBEFLOW_MANIFEST_DIR}/base/crds/kustomization.yaml + + # 2. 
Training-operator deployment file is patched and manually enabled for all kubeflow jobs except for mpi - https://github.com/kubeflow/training-operator/issues/1777 + KUBEFLOW_DEPLOYMENT="${KUBEFLOW_MANIFEST_DIR}/base/deployment.yaml" + + # Find the line after which to insert the args + INSERT_LINE=$(grep -n "^ *- /manager" "${KUBEFLOW_DEPLOYMENT}" | head -n 1 | cut -d ':' -f 1) + + # Prepare patch with the args after the specified line + # EOF must be indented most left - doesn't work otherwise + ARGS_PATCH=$(cat < Date: Fri, 23 Aug 2024 13:09:29 +0200 Subject: [PATCH 07/15] Reduce the amount of KFJob e2e multikueue tests due to consolidation of MultiKueue adapters for the KFJobs --- test/e2e/multikueue/e2e_test.go | 135 -------------------------------- 1 file changed, 135 deletions(-) diff --git a/test/e2e/multikueue/e2e_test.go b/test/e2e/multikueue/e2e_test.go index 8504a54171..eb01249c79 100644 --- a/test/e2e/multikueue/e2e_test.go +++ b/test/e2e/multikueue/e2e_test.go @@ -40,17 +40,11 @@ import ( kueue "sigs.k8s.io/kueue/apis/kueue/v1beta1" workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job" workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset" - workloadpaddlejob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/paddlejob" workloadpytorchjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/pytorchjob" - workloadtfjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/tfjob" - workloadxgboostjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/xgboostjob" utiltesting "sigs.k8s.io/kueue/pkg/util/testing" testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job" testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset" - testingpaddlejob "sigs.k8s.io/kueue/pkg/util/testingjobs/paddlejob" testingpytorchjob "sigs.k8s.io/kueue/pkg/util/testingjobs/pytorchjob" - testingtfjob "sigs.k8s.io/kueue/pkg/util/testingjobs/tfjob" - testingxgboostjob "sigs.k8s.io/kueue/pkg/util/testingjobs/xgboostjob" "sigs.k8s.io/kueue/pkg/workload" "sigs.k8s.io/kueue/test/util" ) @@ -375,135 +369,6 @@ var _ = ginkgo.Describe("MultiKueue", func() { util.IgnoreConditionTimestampsAndObservedGeneration))) }) }) - ginkgo.It("Should run a kubeflow TFJob on worker if admitted", func() { - // Since it requires 1.5 CPU, this job can only be admitted in worker 1. - tfJob := testingtfjob.MakeTFJob("tfjob1", managerNs.Name). - Queue(managerLq.Name). - TFReplicaSpecs( - testingtfjob.TFReplicaSpecRequirement{ - ReplicaType: kftraining.TFJobReplicaTypeChief, - ReplicaCount: 1, - RestartPolicy: "OnFailure", - }, - testingtfjob.TFReplicaSpecRequirement{ - ReplicaType: kftraining.TFJobReplicaTypePS, - ReplicaCount: 1, - RestartPolicy: "Never", - }, - testingtfjob.TFReplicaSpecRequirement{ - ReplicaType: kftraining.TFJobReplicaTypeWorker, - ReplicaCount: 1, - RestartPolicy: "OnFailure", - }, - ). - Request(kftraining.TFJobReplicaTypeChief, corev1.ResourceCPU, "0.5"). - Request(kftraining.TFJobReplicaTypeChief, corev1.ResourceMemory, "200M"). - Request(kftraining.TFJobReplicaTypePS, corev1.ResourceCPU, "0.5"). - Request(kftraining.TFJobReplicaTypePS, corev1.ResourceMemory, "200M"). - Request(kftraining.TFJobReplicaTypeWorker, corev1.ResourceCPU, "0.5"). - Request(kftraining.TFJobReplicaTypeWorker, corev1.ResourceMemory, "100M"). - Image(kftraining.TFJobReplicaTypeChief, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). - Image(kftraining.TFJobReplicaTypePS, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). 
- Image(kftraining.TFJobReplicaTypeWorker, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). - Obj() - - ginkgo.By("Creating the TfJob", func() { - gomega.Expect(k8sManagerClient.Create(ctx, tfJob)).Should(gomega.Succeed()) - }) - wlLookupKey := types.NamespacedName{Name: workloadtfjob.GetWorkloadNameForTFJob(tfJob.Name, tfJob.UID), Namespace: managerNs.Name} - - // the execution should be given to the worker1 - waitForJobAdmitted(wlLookupKey, multiKueueAc.Name, "worker1") - - ginkgo.By("Waiting for the TfJob to finish", func() { - gomega.Eventually(func(g gomega.Gomega) { - createdTfJob := &kftraining.TFJob{} - g.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(tfJob), createdTfJob)).To(gomega.Succeed()) - g.Expect(createdTfJob.Status.ReplicaStatuses[kftraining.TFJobReplicaTypeChief]).To(gomega.BeComparableTo( - &kftraining.ReplicaStatus{ - Active: 0, - Succeeded: 1, - }, - util.IgnoreConditionTimestampsAndObservedGeneration)) - - finishReasonMessage := fmt.Sprintf("TFJob %s/%s successfully completed.", tfJob.Namespace, tfJob.Name) - checkFinishStatusCondition(g, wlLookupKey, finishReasonMessage) - }, util.LongTimeout, util.Interval).Should(gomega.Succeed()) - }) - - ginkgo.By("Checking no objects are left in the worker clusters and the TfJob is completed", func() { - gomega.Eventually(func(g gomega.Gomega) { - workerWl := &kueue.Workload{} - g.Expect(k8sWorker1Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError()) - g.Expect(k8sWorker2Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError()) - workerTfJob := &kftraining.TFJob{} - g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(tfJob), workerTfJob)).To(utiltesting.BeNotFoundError()) - g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(tfJob), workerTfJob)).To(utiltesting.BeNotFoundError()) - }, util.Timeout, util.Interval).Should(gomega.Succeed()) - }) - }) - - ginkgo.It("Should run a kubeflow PaddleJob on worker if admitted", func() { - // Since it requires 1600M memory, this job can only be admitted in worker 2. - paddleJob := testingpaddlejob.MakePaddleJob("paddlejob1", managerNs.Name). - Queue(managerLq.Name). - PaddleReplicaSpecs( - testingpaddlejob.PaddleReplicaSpecRequirement{ - ReplicaType: kftraining.PaddleJobReplicaTypeMaster, - ReplicaCount: 1, - RestartPolicy: "OnFailure", - }, - testingpaddlejob.PaddleReplicaSpecRequirement{ - ReplicaType: kftraining.PaddleJobReplicaTypeWorker, - ReplicaCount: 1, - RestartPolicy: "OnFailure", - }, - ). - Request(kftraining.PaddleJobReplicaTypeMaster, corev1.ResourceCPU, "0.2"). - Request(kftraining.PaddleJobReplicaTypeMaster, corev1.ResourceMemory, "800M"). - Request(kftraining.PaddleJobReplicaTypeWorker, corev1.ResourceCPU, "0.2"). - Request(kftraining.PaddleJobReplicaTypeWorker, corev1.ResourceMemory, "800M"). - Image("gcr.io/k8s-staging-perf-tests/sleep:v0.1.0"). - Args([]string{"1ms"}). 
- Obj() - - ginkgo.By("Creating the PaddleJob", func() { - gomega.Expect(k8sManagerClient.Create(ctx, paddleJob)).Should(gomega.Succeed()) - }) - - wlLookupKey := types.NamespacedName{Name: workloadpaddlejob.GetWorkloadNameForPaddleJob(paddleJob.Name, paddleJob.UID), Namespace: managerNs.Name} - - // the execution should be given to the worker2 - waitForJobAdmitted(wlLookupKey, multiKueueAc.Name, "worker2") - - ginkgo.By("Waiting for the PaddleJob to finish", func() { - gomega.Eventually(func(g gomega.Gomega) { - createdPaddleJob := &kftraining.PaddleJob{} - g.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(paddleJob), createdPaddleJob)).To(gomega.Succeed()) - g.Expect(createdPaddleJob.Status.ReplicaStatuses[kftraining.PaddleJobReplicaTypeMaster]).To(gomega.BeComparableTo( - &kftraining.ReplicaStatus{ - Active: 0, - Succeeded: 1, - Selector: fmt.Sprintf("training.kubeflow.org/job-name=%s,training.kubeflow.org/operator-name=paddlejob-controller,training.kubeflow.org/replica-type=master", createdPaddleJob.Name), - }, - util.IgnoreConditionTimestampsAndObservedGeneration)) - - finishReasonMessage := fmt.Sprintf("PaddleJob %s is successfully completed.", paddleJob.Name) - checkFinishStatusCondition(g, wlLookupKey, finishReasonMessage) - }, util.Timeout, util.Interval).Should(gomega.Succeed()) - }) - - ginkgo.By("Checking no objects are left in the worker clusters and the PaddleJob is completed", func() { - gomega.Eventually(func(g gomega.Gomega) { - workerWl := &kueue.Workload{} - g.Expect(k8sWorker1Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError()) - g.Expect(k8sWorker2Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError()) - workerPaddleJob := &kftraining.PaddleJob{} - g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(paddleJob), workerPaddleJob)).To(utiltesting.BeNotFoundError()) - g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(paddleJob), workerPaddleJob)).To(utiltesting.BeNotFoundError()) - }, util.Timeout, util.Interval).Should(gomega.Succeed()) - }) - }) ginkgo.It("Should run a kubeflow PyTorchJob on worker if admitted", func() { // Since it requires 1600M of memory, this job can only be admitted in worker 2. 
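The next commit adds end-to-end MultiKueue coverage for MPIJob. For reference, the user-facing flow the new adapter enables is an ordinary MPIJob submitted on the manager cluster against a MultiKueue-backed LocalQueue; a minimal, illustrative sketch follows (queue name, image, and commands are placeholders adapted from the upstream mpi-operator pi example, and the ClusterQueue/LocalQueue/AdmissionCheck wiring follows the standard MultiKueue setup).

# Illustrative MPIJob submitted on the manager cluster; Kueue mirrors it to the
# selected worker with the prebuilt-workload and MultiKueue origin labels.
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: pi
  namespace: default
  labels:
    kueue.x-k8s.io/queue-name: user-queue   # LocalQueue backed by a MultiKueue AdmissionCheck
spec:
  slotsPerWorker: 1
  runPolicy:
    cleanPodPolicy: Running
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          containers:
          - name: launcher
            image: mpioperator/mpi-pi:openmpi          # placeholder image
            command: ["mpirun", "-n", "2", "/home/mpiuser/pi"]
    Worker:
      replicas: 2
      template:
        spec:
          containers:
          - name: worker
            image: mpioperator/mpi-pi:openmpi          # placeholder image
            command: ["/usr/sbin/sshd", "-De", "-f", "/home/mpiuser/.sshd_config"]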
From f8d28b48ada8c4e3e5f28b6608d23b7a2cf2c334 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 23 Aug 2024 13:11:29 +0200 Subject: [PATCH 08/15] Add e2e Multikueue tests for MPIJob --- test/e2e/multikueue/e2e_test.go | 59 ++++++++++++++++--------------- test/e2e/multikueue/suite_test.go | 10 ++++-- test/util/e2e.go | 11 +++++- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/test/e2e/multikueue/e2e_test.go b/test/e2e/multikueue/e2e_test.go index eb01249c79..4971797351 100644 --- a/test/e2e/multikueue/e2e_test.go +++ b/test/e2e/multikueue/e2e_test.go @@ -33,6 +33,7 @@ import ( versionutil "k8s.io/apimachinery/pkg/util/version" "k8s.io/utils/ptr" + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" "sigs.k8s.io/controller-runtime/pkg/client" jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" @@ -41,9 +42,11 @@ import ( workloadjob "sigs.k8s.io/kueue/pkg/controller/jobs/job" workloadjobset "sigs.k8s.io/kueue/pkg/controller/jobs/jobset" workloadpytorchjob "sigs.k8s.io/kueue/pkg/controller/jobs/kubeflow/jobs/pytorchjob" + workloadmpijob "sigs.k8s.io/kueue/pkg/controller/jobs/mpijob" utiltesting "sigs.k8s.io/kueue/pkg/util/testing" testingjob "sigs.k8s.io/kueue/pkg/util/testingjobs/job" testingjobset "sigs.k8s.io/kueue/pkg/util/testingjobs/jobset" + testingmpijob "sigs.k8s.io/kueue/pkg/util/testingjobs/mpijob" testingpytorchjob "sigs.k8s.io/kueue/pkg/util/testingjobs/pytorchjob" "sigs.k8s.io/kueue/pkg/workload" "sigs.k8s.io/kueue/test/util" @@ -432,65 +435,63 @@ var _ = ginkgo.Describe("MultiKueue", func() { }) }) - ginkgo.It("Should run a kubeflow XGBoostJob on worker if admitted", func() { - // Skipped due to known bug - https://github.com/kubeflow/training-operator/issues/1711 - ginkgo.Skip("Skipped due to state transitioning bug in training-operator") + ginkgo.It("Should run a MPIJob on worker if admitted", func() { // Since it requires 1.5 CPU, this job can only be admitted in worker 1. - xgboostJob := testingxgboostjob.MakeXGBoostJob("xgboostjob1", managerNs.Name). + mpijob := testingmpijob.MakeMPIJob("mpijob1", managerNs.Name). Queue(managerLq.Name). - XGBReplicaSpecs( - testingxgboostjob.XGBReplicaSpecRequirement{ - ReplicaType: kftraining.XGBoostJobReplicaTypeMaster, + MPIJobReplicaSpecs( + testingmpijob.MPIJobReplicaSpecRequirement{ + ReplicaType: kubeflow.MPIReplicaTypeLauncher, ReplicaCount: 1, RestartPolicy: "OnFailure", }, - testingxgboostjob.XGBReplicaSpecRequirement{ - ReplicaType: kftraining.XGBoostJobReplicaTypeWorker, - ReplicaCount: 1, + testingmpijob.MPIJobReplicaSpecRequirement{ + ReplicaType: kubeflow.MPIReplicaTypeWorker, + ReplicaCount: 2, RestartPolicy: "OnFailure", }, ). - Request(kftraining.XGBoostJobReplicaTypeMaster, corev1.ResourceCPU, "1"). - Request(kftraining.XGBoostJobReplicaTypeMaster, corev1.ResourceMemory, "200M"). - Request(kftraining.XGBoostJobReplicaTypeWorker, corev1.ResourceCPU, "0.5"). - Request(kftraining.XGBoostJobReplicaTypeWorker, corev1.ResourceMemory, "100M"). - Image("gcr.io/k8s-staging-perf-tests/sleep:v0.1.0"). - Args([]string{"1ms"}). + Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceCPU, "1"). + Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceMemory, "200M"). + Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceCPU, "0.5"). + Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceMemory, "100M"). + Image(kubeflow.MPIReplicaTypeLauncher, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). 
+ Image(kubeflow.MPIReplicaTypeWorker, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). Obj() - ginkgo.By("Creating the XGBoostJob", func() { - gomega.Expect(k8sManagerClient.Create(ctx, xgboostJob)).Should(gomega.Succeed()) + ginkgo.By("Creating the MPIJob", func() { + gomega.Expect(k8sManagerClient.Create(ctx, mpijob)).Should(gomega.Succeed()) }) - wlLookupKey := types.NamespacedName{Name: workloadxgboostjob.GetWorkloadNameForXGBoostJob(xgboostJob.Name, xgboostJob.UID), Namespace: managerNs.Name} + wlLookupKey := types.NamespacedName{Name: workloadmpijob.GetWorkloadNameForMPIJob(mpijob.Name, mpijob.UID), Namespace: managerNs.Name} // the execution should be given to the worker1 waitForJobAdmitted(wlLookupKey, multiKueueAc.Name, "worker1") - ginkgo.By("Waiting for the XGBoostJob to finish", func() { + ginkgo.By("Waiting for the MPIJob to finish", func() { gomega.Eventually(func(g gomega.Gomega) { - createdXGBoostJob := &kftraining.XGBoostJob{} - g.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(xgboostJob), createdXGBoostJob)).To(gomega.Succeed()) - g.Expect(createdXGBoostJob.Status.ReplicaStatuses[kftraining.XGBoostJobReplicaTypeMaster]).To(gomega.BeComparableTo( - &kftraining.ReplicaStatus{ + createdMPIJob := &kubeflow.MPIJob{} + g.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(mpijob), createdMPIJob)).To(gomega.Succeed()) + g.Expect(createdMPIJob.Status.ReplicaStatuses[kubeflow.MPIReplicaTypeLauncher]).To(gomega.BeComparableTo( + &kubeflow.ReplicaStatus{ Active: 0, Succeeded: 1, }, util.IgnoreConditionTimestampsAndObservedGeneration)) - finishReasonMessage := fmt.Sprintf("XGBoostJob %s is successfully completed.", xgboostJob.Name) + finishReasonMessage := fmt.Sprintf("MPIJob %s/%s successfully completed.", mpijob.Namespace, mpijob.Name) checkFinishStatusCondition(g, wlLookupKey, finishReasonMessage) }, util.Timeout, util.Interval).Should(gomega.Succeed()) }) - ginkgo.By("Checking no objects are left in the worker clusters and the XGBoostJob is completed", func() { + ginkgo.By("Checking no objects are left in the worker clusters and the MPIJob is completed", func() { gomega.Eventually(func(g gomega.Gomega) { workerWl := &kueue.Workload{} g.Expect(k8sWorker1Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError()) g.Expect(k8sWorker2Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError()) - workerXGBoostJob := &kftraining.XGBoostJob{} - g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(xgboostJob), workerXGBoostJob)).To(utiltesting.BeNotFoundError()) - g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(xgboostJob), workerXGBoostJob)).To(utiltesting.BeNotFoundError()) + workerMPIJob := &kubeflow.MPIJob{} + g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(mpijob), workerMPIJob)).To(utiltesting.BeNotFoundError()) + g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(mpijob), workerMPIJob)).To(utiltesting.BeNotFoundError()) }, util.Timeout, util.Interval).Should(gomega.Succeed()) }) }) diff --git a/test/e2e/multikueue/suite_test.go b/test/e2e/multikueue/suite_test.go index c5a3f897f3..c3855405bc 100644 --- a/test/e2e/multikueue/suite_test.go +++ b/test/e2e/multikueue/suite_test.go @@ -23,6 +23,7 @@ import ( "testing" "time" + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" @@ -95,6 +96,8 @@ func kubeconfigForMultiKueueSA(ctx 
context.Context, c client.Client, restConfig policyRule(kftraining.SchemeGroupVersion.Group, "pytorchjobs/status", "get"), policyRule(kftraining.SchemeGroupVersion.Group, "xgboostjobs", resourceVerbs...), policyRule(kftraining.SchemeGroupVersion.Group, "xgboostjobs/status", "get"), + policyRule(kubeflow.SchemeGroupVersion.Group, "mpijobs", resourceVerbs...), + policyRule(kubeflow.SchemeGroupVersion.Group, "mpijobs/status", "get"), }, } err := c.Create(ctx, cr) @@ -233,8 +236,11 @@ var _ = ginkgo.BeforeSuite(func() { util.WaitForJobSetAvailability(ctx, k8sWorker2Client) // there should not be a kubeflow operator in manager cluster - util.WaitForKubeFlowAvailability(ctx, k8sWorker1Client) - util.WaitForKubeFlowAvailability(ctx, k8sWorker2Client) + util.WaitForKubeFlowTrainingOperatorAvailability(ctx, k8sWorker1Client) + util.WaitForKubeFlowTrainingOperatorAvailability(ctx, k8sWorker2Client) + + util.WaitForKubeFlowMPIOperatorAvailability(ctx, k8sWorker1Client) + util.WaitForKubeFlowMPIOperatorAvailability(ctx, k8sWorker2Client) ginkgo.GinkgoLogr.Info("Kueue and JobSet operators are available in all the clusters", "waitingTime", time.Since(waitForAvailableStart)) diff --git a/test/util/e2e.go b/test/util/e2e.go index f4dc7a81dc..e7dacc2d0d 100644 --- a/test/util/e2e.go +++ b/test/util/e2e.go @@ -6,6 +6,7 @@ import ( "os" "github.com/google/go-cmp/cmp/cmpopts" + kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" @@ -47,6 +48,9 @@ func CreateClientUsingCluster(kContext string) (client.WithWatch, *rest.Config) err = kftraining.AddToScheme(scheme.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) + err = kubeflow.AddToScheme(scheme.Scheme) + gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) + client, err := client.NewWithWatch(cfg, client.Options{Scheme: scheme.Scheme}) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) return client, cfg @@ -108,7 +112,12 @@ func WaitForJobSetAvailability(ctx context.Context, k8sClient client.Client) { waitForOperatorAvailability(ctx, k8sClient, jcmKey) } -func WaitForKubeFlowAvailability(ctx context.Context, k8sClient client.Client) { +func WaitForKubeFlowTrainingOperatorAvailability(ctx context.Context, k8sClient client.Client) { kftoKey := types.NamespacedName{Namespace: "kubeflow", Name: "training-operator"} waitForOperatorAvailability(ctx, k8sClient, kftoKey) } + +func WaitForKubeFlowMPIOperatorAvailability(ctx context.Context, k8sClient client.Client) { + kftoKey := types.NamespacedName{Namespace: "mpi-operator", Name: "mpi-operator"} + waitForOperatorAvailability(ctx, k8sClient, kftoKey) +} From fdcbc952e9260f3ae8606eaf3c239e6ec0daa3f9 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Fri, 23 Aug 2024 14:51:08 +0200 Subject: [PATCH 09/15] Apply suggestions from verify --- hack/e2e-common.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/hack/e2e-common.sh b/hack/e2e-common.sh index a2ee7c6aef..3961c1bbc6 100644 --- a/hack/e2e-common.sh +++ b/hack/e2e-common.sh @@ -80,7 +80,7 @@ function patch_kubeflow_manifest { # In order for MPI-operator and Training-operator to work on the same cluster it is required that: # 1. 
'kubeflow.org_mpijobs.yaml' is removed from base/crds/kustomization.yaml - https://github.com/kubeflow/training-operator/issues/1930 chmod -R u+w "${KUBEFLOW_MANIFEST_DIR}/" - sed -i '/kubeflow.org_mpijobs.yaml/d' ${KUBEFLOW_MANIFEST_DIR}/base/crds/kustomization.yaml + sed -i '/kubeflow.org_mpijobs.yaml/d' "${KUBEFLOW_MANIFEST_DIR}/base/crds/kustomization.yaml" # 2. Training-operator deployment file is patched and manually enabled for all kubeflow jobs except for mpi - https://github.com/kubeflow/training-operator/issues/1777 KUBEFLOW_DEPLOYMENT="${KUBEFLOW_MANIFEST_DIR}/base/deployment.yaml" @@ -106,14 +106,14 @@ EOF #$1 - cluster name function install_kubeflow { - cluster_kind_load_image "${1}" ${KUBEFLOW_IMAGE} + cluster_kind_load_image "${1}" "${KUBEFLOW_IMAGE}" kubectl config use-context "kind-${1}" kubectl apply -k "${KUBEFLOW_MANIFEST_DIR}/overlays/standalone" } #$1 - cluster name function install_mpi { - cluster_kind_load_image "${1}" ${KUBEFLOW_MPI_IMAGE} + cluster_kind_load_image "${1}" "${KUBEFLOW_MPI_IMAGE}" kubectl config use-context "kind-${1}" kubectl create -k "${KUBEFLOW_MPI_MANIFEST}" } From 332239061818143ba8a2c9c2085d92e595786202 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Wed, 28 Aug 2024 09:28:02 +0200 Subject: [PATCH 10/15] Use one makefile target to prep kubeflow traiing-operator manifest and crds --- Makefile-deps.mk | 12 ++++-------- Makefile-test.mk | 4 ++-- hack/e2e-common.sh | 12 ++++++------ hack/multikueue-e2e-test.sh | 2 +- test/integration/controller/jobs/mxjob/suite_test.go | 2 +- .../controller/jobs/paddlejob/suite_test.go | 2 +- .../controller/jobs/pytorchjob/suite_test.go | 2 +- test/integration/controller/jobs/tfjob/suite_test.go | 2 +- .../controller/jobs/xgboostjob/suite_test.go | 2 +- test/integration/multikueue/suite_test.go | 2 +- 10 files changed, 19 insertions(+), 23 deletions(-) diff --git a/Makefile-deps.mk b/Makefile-deps.mk index 125414504c..572979fc53 100644 --- a/Makefile-deps.mk +++ b/Makefile-deps.mk @@ -113,14 +113,10 @@ KF_TRAINING_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github. .PHONY: kf-training-operator-crd kf-training-operator-crd: ## Copy the CRDs from the training-operator to the dep-crds directory. mkdir -p $(EXTERNAL_CRDS_DIR)/training-operator/ - # Remove `kubeflow.org_mpijobs.yaml` from the folder and from the kustomization.yaml - find $(KF_TRAINING_ROOT)/manifests/base/crds/ -type f -not -name "kubeflow.org_mpijobs.yaml" -exec cp -f {} $(EXTERNAL_CRDS_DIR)/training-operator/ \; - sed -i '/kubeflow.org_mpijobs.yaml/d' $(EXTERNAL_CRDS_DIR)/training-operator/kustomization.yaml - -.PHONY: kf-training-operator-manifest -kf-training-operator-manifest: ## Copy the manifest from the training-operator to the dep-manifests directory. 
- mkdir -p $(EXTERNAL_MANIFESTS_DIR)/training-operator/ - cp -rf $(KF_TRAINING_ROOT)/manifests/ $(EXTERNAL_MANIFESTS_DIR)/training-operator/ ; + # Remove `kubeflow.org_mpijobs.yaml` version v1 from the folder and from the kustomization.yaml, kueue uses v2beta1 CRD from mpi-operator + cp -prf $(KF_TRAINING_ROOT)/manifests/* $(EXTERNAL_CRDS_DIR)/training-operator/ + chmod -R u+w "${EXTERNAL_CRDS_DIR}/training-operator/" + sed -i '/kubeflow.org_mpijobs.yaml/d' $(EXTERNAL_CRDS_DIR)/training-operator/base/crds/kustomization.yaml RAY_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github.com/ray-project/kuberay/ray-operator) .PHONY: ray-operator-crd diff --git a/Makefile-test.mk b/Makefile-test.mk index 72625d8714..6a2048f222 100644 --- a/Makefile-test.mk +++ b/Makefile-test.mk @@ -78,10 +78,10 @@ test-integration: gomod-download envtest ginkgo mpi-operator-crd ray-operator-cr CREATE_KIND_CLUSTER ?= true .PHONY: test-e2e -test-e2e: kustomize ginkgo yq gomod-download jobset-operator-crd kf-training-operator-crd mpi-operator-crd kueuectl run-test-e2e-$(E2E_KIND_VERSION:kindest/node:v%=%) +test-e2e: kustomize ginkgo yq gomod-download dep-crds kueuectl run-test-e2e-$(E2E_KIND_VERSION:kindest/node:v%=%) .PHONY: test-multikueue-e2e -test-multikueue-e2e: kustomize ginkgo yq gomod-download jobset-operator-crd kf-training-operator-crd kf-training-operator-manifest mpi-operator-crd run-test-multikueue-e2e-$(E2E_KIND_VERSION:kindest/node:v%=%) +test-multikueue-e2e: kustomize ginkgo yq gomod-download dep-crds run-test-multikueue-e2e-$(E2E_KIND_VERSION:kindest/node:v%=%) E2E_TARGETS := $(addprefix run-test-e2e-,${E2E_K8S_VERSIONS}) diff --git a/hack/e2e-common.sh b/hack/e2e-common.sh index 3961c1bbc6..94ebf9c643 100644 --- a/hack/e2e-common.sh +++ b/hack/e2e-common.sh @@ -23,10 +23,11 @@ export JOBSET_MANIFEST="https://github.com/kubernetes-sigs/jobset/releases/downl export JOBSET_IMAGE=registry.k8s.io/jobset/jobset:${JOBSET_VERSION} export JOBSET_CRDS=${ROOT_DIR}/dep-crds/jobset-operator/ -export KUBEFLOW_MANIFEST_DIR="${ROOT_DIR}/dep-manifests/training-operator" #no matching semver tag unfortunately export KUBEFLOW_IMAGE=kubeflow/training-operator:v1-855e096 -export KUBEFLOW_CRDS=${ROOT_DIR}/dep-crds/training-operator/ +export KUBEFLOW_CRDS=${ROOT_DIR}/dep-crds/training-operator +export KUBEFLOW_CRDS_BASE=${KUBEFLOW_CRDS}/base/crds +export KUBEFLOW_MANIFEST=${KUBEFLOW_CRDS}/overlays/standalone export KUBEFLOW_MPI_MANIFEST="https://github.com/kubeflow/mpi-operator/manifests/overlays/standalone?ref=v${KUBEFLOW_MPI_VERSION}" export KUBEFLOW_MPI_IMAGE=mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION} @@ -79,11 +80,10 @@ function install_jobset { function patch_kubeflow_manifest { # In order for MPI-operator and Training-operator to work on the same cluster it is required that: # 1. 'kubeflow.org_mpijobs.yaml' is removed from base/crds/kustomization.yaml - https://github.com/kubeflow/training-operator/issues/1930 - chmod -R u+w "${KUBEFLOW_MANIFEST_DIR}/" - sed -i '/kubeflow.org_mpijobs.yaml/d' "${KUBEFLOW_MANIFEST_DIR}/base/crds/kustomization.yaml" + sed -i '/kubeflow.org_mpijobs.yaml/d' "${KUBEFLOW_CRDS}/base/crds/kustomization.yaml" # 2. 
Training-operator deployment file is patched and manually enabled for all kubeflow jobs except for mpi - https://github.com/kubeflow/training-operator/issues/1777 - KUBEFLOW_DEPLOYMENT="${KUBEFLOW_MANIFEST_DIR}/base/deployment.yaml" + KUBEFLOW_DEPLOYMENT="${KUBEFLOW_CRDS}/base/deployment.yaml" # Find the line after which to insert the args INSERT_LINE=$(grep -n "^ *- /manager" "${KUBEFLOW_DEPLOYMENT}" | head -n 1 | cut -d ':' -f 1) @@ -108,7 +108,7 @@ EOF function install_kubeflow { cluster_kind_load_image "${1}" "${KUBEFLOW_IMAGE}" kubectl config use-context "kind-${1}" - kubectl apply -k "${KUBEFLOW_MANIFEST_DIR}/overlays/standalone" + kubectl apply -k "${KUBEFLOW_MANIFEST}" } #$1 - cluster name diff --git a/hack/multikueue-e2e-test.sh b/hack/multikueue-e2e-test.sh index 1e53d7f10a..fbcfcc7be4 100755 --- a/hack/multikueue-e2e-test.sh +++ b/hack/multikueue-e2e-test.sh @@ -84,7 +84,7 @@ function kind_load { # Only install the CRDs and not the controller to be able to # have Kubeflow Jobs admitted without execution in the manager cluster. kubectl config use-context "kind-${MANAGER_KIND_CLUSTER_NAME}" - kubectl apply -k "${KUBEFLOW_CRDS}" + kubectl apply -k "${KUBEFLOW_CRDS_BASE}" ## MPI kubectl apply --server-side -f "${KUBEFLOW_MPI_CRD}" diff --git a/test/integration/controller/jobs/mxjob/suite_test.go b/test/integration/controller/jobs/mxjob/suite_test.go index f29689c1b5..f50e9e1faa 100644 --- a/test/integration/controller/jobs/mxjob/suite_test.go +++ b/test/integration/controller/jobs/mxjob/suite_test.go @@ -45,7 +45,7 @@ var ( ctx context.Context fwk *framework.Framework crdPath = filepath.Join("..", "..", "..", "..", "..", "config", "components", "crd", "bases") - mxnetCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator") + mxnetCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator", "base", "crds") ) func TestAPIs(t *testing.T) { diff --git a/test/integration/controller/jobs/paddlejob/suite_test.go b/test/integration/controller/jobs/paddlejob/suite_test.go index 754bbb9334..02b0e36347 100644 --- a/test/integration/controller/jobs/paddlejob/suite_test.go +++ b/test/integration/controller/jobs/paddlejob/suite_test.go @@ -45,7 +45,7 @@ var ( ctx context.Context fwk *framework.Framework crdPath = filepath.Join("..", "..", "..", "..", "..", "config", "components", "crd", "bases") - paddleCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator") + paddleCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator", "base", "crds") ) func TestAPIs(t *testing.T) { diff --git a/test/integration/controller/jobs/pytorchjob/suite_test.go b/test/integration/controller/jobs/pytorchjob/suite_test.go index c7d3982d5c..de7367f0fc 100644 --- a/test/integration/controller/jobs/pytorchjob/suite_test.go +++ b/test/integration/controller/jobs/pytorchjob/suite_test.go @@ -46,7 +46,7 @@ var ( ctx context.Context fwk *framework.Framework crdPath = filepath.Join("..", "..", "..", "..", "..", "config", "components", "crd", "bases") - pytorchCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator") + pytorchCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator", "base", "crds") ) func TestAPIs(t *testing.T) { diff --git a/test/integration/controller/jobs/tfjob/suite_test.go b/test/integration/controller/jobs/tfjob/suite_test.go index ecd5328333..16e3a69270 100644 --- a/test/integration/controller/jobs/tfjob/suite_test.go +++ 
b/test/integration/controller/jobs/tfjob/suite_test.go @@ -46,7 +46,7 @@ var ( ctx context.Context fwk *framework.Framework crdPath = filepath.Join("..", "..", "..", "..", "..", "config", "components", "crd", "bases") - tensorflowCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator") + tensorflowCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator", "base", "crds") ) func TestAPIs(t *testing.T) { diff --git a/test/integration/controller/jobs/xgboostjob/suite_test.go b/test/integration/controller/jobs/xgboostjob/suite_test.go index fc764c5e47..3bb2f1adb0 100644 --- a/test/integration/controller/jobs/xgboostjob/suite_test.go +++ b/test/integration/controller/jobs/xgboostjob/suite_test.go @@ -46,7 +46,7 @@ var ( ctx context.Context fwk *framework.Framework crdPath = filepath.Join("..", "..", "..", "..", "..", "config", "components", "crd", "bases") - xgbCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator") + xgbCrdPath = filepath.Join("..", "..", "..", "..", "..", "dep-crds", "training-operator", "base", "crds") ) func TestAPIs(t *testing.T) { diff --git a/test/integration/multikueue/suite_test.go b/test/integration/multikueue/suite_test.go index 9674dbb7a8..4798e31717 100644 --- a/test/integration/multikueue/suite_test.go +++ b/test/integration/multikueue/suite_test.go @@ -94,7 +94,7 @@ func createCluster(setupFnc framework.ManagerSetup, apiFeatureGates ...string) c CRDPath: filepath.Join("..", "..", "..", "config", "components", "crd", "bases"), WebhookPath: filepath.Join("..", "..", "..", "config", "components", "webhook"), DepCRDPaths: []string{filepath.Join("..", "..", "..", "dep-crds", "jobset-operator"), - filepath.Join("..", "..", "..", "dep-crds", "training-operator"), + filepath.Join("..", "..", "..", "dep-crds", "training-operator", "base", "crds"), filepath.Join("..", "..", "..", "dep-crds", "mpi-operator"), }, APIServerFeatureGates: apiFeatureGates, From 69d10c7bf1967b4beea882f63390c854ff0338ce Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 29 Aug 2024 09:20:18 +0200 Subject: [PATCH 11/15] Apply code review changes --- .gitignore | 1 - Makefile-deps.mk | 3 +-- Makefile-test.mk | 3 +-- hack/e2e-common.sh | 11 +++++------ hack/multikueue-e2e-test.sh | 2 +- pkg/util/testingjobs/mpijob/wrappers_mpijob.go | 11 +++++------ test/integration/multikueue/multikueue_test.go | 4 ++-- 7 files changed, 15 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 5e6d17a14b..c20e0f25b1 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,5 @@ testbin/* # Directory with CRDs for dependencies dep-crds -dep-manifests kueue.yaml diff --git a/Makefile-deps.mk b/Makefile-deps.mk index 572979fc53..7783ea7b97 100644 --- a/Makefile-deps.mk +++ b/Makefile-deps.mk @@ -16,7 +16,6 @@ PROJECT_DIR := $(shell dirname $(abspath $(lastword $(MAKEFILE_LIST)))) TOOLS_DIR := $(PROJECT_DIR)/hack/internal/tools BIN_DIR ?= $(PROJECT_DIR)/bin EXTERNAL_CRDS_DIR ?= $(PROJECT_DIR)/dep-crds -EXTERNAL_MANIFESTS_DIR ?= $(PROJECT_DIR)/dep-manifests ifeq (,$(shell go env GOBIN)) GOBIN=$(shell go env GOPATH)/bin @@ -113,7 +112,7 @@ KF_TRAINING_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github. .PHONY: kf-training-operator-crd kf-training-operator-crd: ## Copy the CRDs from the training-operator to the dep-crds directory. 
mkdir -p $(EXTERNAL_CRDS_DIR)/training-operator/ - # Remove `kubeflow.org_mpijobs.yaml` version v1 from the folder and from the kustomization.yaml, kueue uses v2beta1 CRD from mpi-operator + # Remove `kubeflow.org_mpijobs.yaml` version v1 from the kustomization.yaml, kueue uses v2beta1 CRD from mpi-operator cp -prf $(KF_TRAINING_ROOT)/manifests/* $(EXTERNAL_CRDS_DIR)/training-operator/ chmod -R u+w "${EXTERNAL_CRDS_DIR}/training-operator/" sed -i '/kubeflow.org_mpijobs.yaml/d' $(EXTERNAL_CRDS_DIR)/training-operator/base/crds/kustomization.yaml diff --git a/Makefile-test.mk b/Makefile-test.mk index 6a2048f222..837501b974 100644 --- a/Makefile-test.mk +++ b/Makefile-test.mk @@ -59,8 +59,7 @@ IMAGE_TAG ?= $(IMAGE_REPO):$(GIT_TAG) # JobSet Version JOBSET_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" sigs.k8s.io/jobset) KUBEFLOW_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/training-operator) -# Mismatch between manifest semver tag and docker release tag, latter has no leading 'v' -KUBEFLOW_MPI_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/mpi-operator | sed 's/^v//') +KUBEFLOW_MPI_VERSION = $(shell $(GO_CMD) list -m -f "{{.Version}}" github.com/kubeflow/mpi-operator) ##@ Tests diff --git a/hack/e2e-common.sh b/hack/e2e-common.sh index 94ebf9c643..e3e63a1bdc 100644 --- a/hack/e2e-common.sh +++ b/hack/e2e-common.sh @@ -29,8 +29,8 @@ export KUBEFLOW_CRDS=${ROOT_DIR}/dep-crds/training-operator export KUBEFLOW_CRDS_BASE=${KUBEFLOW_CRDS}/base/crds export KUBEFLOW_MANIFEST=${KUBEFLOW_CRDS}/overlays/standalone -export KUBEFLOW_MPI_MANIFEST="https://github.com/kubeflow/mpi-operator/manifests/overlays/standalone?ref=v${KUBEFLOW_MPI_VERSION}" -export KUBEFLOW_MPI_IMAGE=mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION} +export KUBEFLOW_MPI_MANIFEST="https://raw.githubusercontent.com/kubeflow/mpi-operator/${KUBEFLOW_MPI_VERSION}/deploy/v2beta1/mpi-operator.yaml" #"https://github.com/kubeflow/mpi-operator/manifests/overlays/standalone?ref=${KUBEFLOW_MPI_VERSION}" +export KUBEFLOW_MPI_IMAGE=mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION/#v} export KUBEFLOW_MPI_CRD=${ROOT_DIR}/dep-crds/mpi-operator/kubeflow.org_mpijobs.yaml # $1 - cluster name @@ -80,8 +80,7 @@ function install_jobset { function patch_kubeflow_manifest { # In order for MPI-operator and Training-operator to work on the same cluster it is required that: # 1. 'kubeflow.org_mpijobs.yaml' is removed from base/crds/kustomization.yaml - https://github.com/kubeflow/training-operator/issues/1930 - sed -i '/kubeflow.org_mpijobs.yaml/d' "${KUBEFLOW_CRDS}/base/crds/kustomization.yaml" - + # Done already in Makefile-deps.mk target `kf-training-operator-crd` # 2. 
Training-operator deployment file is patched and manually enabled for all kubeflow jobs except for mpi - https://github.com/kubeflow/training-operator/issues/1777 KUBEFLOW_DEPLOYMENT="${KUBEFLOW_CRDS}/base/deployment.yaml" @@ -113,9 +112,9 @@ function install_kubeflow { #$1 - cluster name function install_mpi { - cluster_kind_load_image "${1}" "${KUBEFLOW_MPI_IMAGE}" + cluster_kind_load_image "${1}" "${KUBEFLOW_MPI_IMAGE/#v}" kubectl config use-context "kind-${1}" - kubectl create -k "${KUBEFLOW_MPI_MANIFEST}" + kubectl apply --server-side -f "${KUBEFLOW_MPI_MANIFEST}" } INITIAL_IMAGE=$($YQ '.images[] | select(.name == "controller") | [.newName, .newTag] | join(":")' config/components/manager/kustomization.yaml) diff --git a/hack/multikueue-e2e-test.sh b/hack/multikueue-e2e-test.sh index fbcfcc7be4..75f2aaa390 100755 --- a/hack/multikueue-e2e-test.sh +++ b/hack/multikueue-e2e-test.sh @@ -90,7 +90,7 @@ function kind_load { # WORKERS docker pull kubeflow/training-operator:v1-855e096 - docker pull "mpioperator/mpi-operator:$KUBEFLOW_MPI_VERSION" + docker pull "mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION/#v}" patch_kubeflow_manifest install_kubeflow "$WORKER1_KIND_CLUSTER_NAME" install_kubeflow "$WORKER2_KIND_CLUSTER_NAME" diff --git a/pkg/util/testingjobs/mpijob/wrappers_mpijob.go b/pkg/util/testingjobs/mpijob/wrappers_mpijob.go index f19661b73f..2050fedcee 100644 --- a/pkg/util/testingjobs/mpijob/wrappers_mpijob.go +++ b/pkg/util/testingjobs/mpijob/wrappers_mpijob.go @@ -53,7 +53,7 @@ type MPIJobReplicaSpecRequirement struct { Name string ReplicaCount int32 Annotations map[string]string - RestartPolicy kubeflow.RestartPolicy + RestartPolicy corev1.RestartPolicy } func (j *MPIJobWrapper) MPIJobReplicaSpecs(replicaSpecs ...MPIJobReplicaSpecRequirement) *MPIJobWrapper { @@ -61,8 +61,7 @@ func (j *MPIJobWrapper) MPIJobReplicaSpecs(replicaSpecs ...MPIJobReplicaSpecRequ for _, rs := range replicaSpecs { j.Spec.MPIReplicaSpecs[rs.ReplicaType].Replicas = ptr.To[int32](rs.ReplicaCount) j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.Name = rs.Name - j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.Spec.RestartPolicy = corev1.RestartPolicy(rs.RestartPolicy) - j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.Spec.Containers[0].Name = "mpijob" + j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.Spec.RestartPolicy = rs.RestartPolicy if rs.Annotations != nil { j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.ObjectMeta.Annotations = rs.Annotations @@ -80,7 +79,7 @@ func (j *MPIJobWrapper) MPIJobReplicaSpecsDefault() *MPIJobWrapper { RestartPolicy: "Never", Containers: []corev1.Container{ { - Name: "c", + Name: "mpijob", Image: "pause", Command: []string{}, Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{}}, @@ -98,7 +97,7 @@ func (j *MPIJobWrapper) MPIJobReplicaSpecsDefault() *MPIJobWrapper { RestartPolicy: "Never", Containers: []corev1.Container{ { - Name: "c", + Name: "mpijob", Image: "pause", Command: []string{}, Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{}}, @@ -206,7 +205,7 @@ func (j *MPIJobWrapper) Generation(num int64) *MPIJobWrapper { return j } -// Condition adds a condition +// StatusConditions adds a condition func (j *MPIJobWrapper) StatusConditions(c kubeflow.JobCondition) *MPIJobWrapper { j.Status.Conditions = append(j.Status.Conditions, c) return j diff --git a/test/integration/multikueue/multikueue_test.go b/test/integration/multikueue/multikueue_test.go index 14b00248cc..372a6704b3 100644 --- a/test/integration/multikueue/multikueue_test.go 
+++ b/test/integration/multikueue/multikueue_test.go @@ -1039,13 +1039,13 @@ var _ = ginkgo.Describe("Multikueue", ginkgo.Ordered, ginkgo.ContinueOnFailure, ReplicaType: kubeflow.MPIReplicaTypeLauncher, ReplicaCount: 1, Name: "launcher", - RestartPolicy: "OnFailure", + RestartPolicy: corev1.RestartPolicyOnFailure, }, testingmpijob.MPIJobReplicaSpecRequirement{ ReplicaType: kubeflow.MPIReplicaTypeWorker, ReplicaCount: 1, Name: "worker", - RestartPolicy: "Never", + RestartPolicy: corev1.RestartPolicyNever, }, ). Obj() From cba68de7f3719d95f9dc4a6557d2d2fb6aecba57 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Thu, 29 Aug 2024 12:41:24 +0200 Subject: [PATCH 12/15] Yet another small fix --- hack/e2e-common.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hack/e2e-common.sh b/hack/e2e-common.sh index e3e63a1bdc..c829596256 100644 --- a/hack/e2e-common.sh +++ b/hack/e2e-common.sh @@ -29,7 +29,7 @@ export KUBEFLOW_CRDS=${ROOT_DIR}/dep-crds/training-operator export KUBEFLOW_CRDS_BASE=${KUBEFLOW_CRDS}/base/crds export KUBEFLOW_MANIFEST=${KUBEFLOW_CRDS}/overlays/standalone -export KUBEFLOW_MPI_MANIFEST="https://raw.githubusercontent.com/kubeflow/mpi-operator/${KUBEFLOW_MPI_VERSION}/deploy/v2beta1/mpi-operator.yaml" #"https://github.com/kubeflow/mpi-operator/manifests/overlays/standalone?ref=${KUBEFLOW_MPI_VERSION}" +export KUBEFLOW_MPI_MANIFEST="https://raw.githubusercontent.com/kubeflow/mpi-operator/${KUBEFLOW_MPI_VERSION}/deploy/v2beta1/mpi-operator.yaml" export KUBEFLOW_MPI_IMAGE=mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION/#v} export KUBEFLOW_MPI_CRD=${ROOT_DIR}/dep-crds/mpi-operator/kubeflow.org_mpijobs.yaml From 467b4ee707286f1f86f4f94cdb681111152bee61 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Mon, 2 Sep 2024 13:53:06 +0200 Subject: [PATCH 13/15] Rework after code review * move modifications of training-operator deployment and crds to kustomize --- Makefile-deps.mk | 20 +++++++--- Makefile-test.mk | 2 +- hack/e2e-common.sh | 38 +++---------------- hack/multikueue-e2e-test.sh | 15 +++++--- .../jobs/mpijob/mpijob_multikueue_adapter.go | 2 +- .../testingjobs/mpijob/wrappers_mpijob.go | 10 ++--- .../multikueue/manager/kustomization.yaml | 14 +++++++ .../config/multikueue/manager/namespace.yaml | 4 ++ .../config/multikueue/manager/patch_crds.yaml | 5 +++ .../multikueue/worker/kustomization.yaml | 8 ++++ .../config/multikueue/worker/patch_crds.yaml | 5 +++ .../multikueue/worker/patch_deployment.yaml | 15 ++++++++ test/e2e/multikueue/e2e_test.go | 4 +- .../controller/jobs/mxjob/suite_test.go | 2 +- .../controller/jobs/paddlejob/suite_test.go | 2 +- .../controller/jobs/pytorchjob/suite_test.go | 2 +- .../controller/jobs/tfjob/suite_test.go | 2 +- .../controller/jobs/xgboostjob/suite_test.go | 2 +- .../integration/multikueue/multikueue_test.go | 4 +- test/integration/multikueue/suite_test.go | 2 +- 20 files changed, 96 insertions(+), 62 deletions(-) create mode 100644 test/e2e/config/multikueue/manager/kustomization.yaml create mode 100644 test/e2e/config/multikueue/manager/namespace.yaml create mode 100644 test/e2e/config/multikueue/manager/patch_crds.yaml create mode 100644 test/e2e/config/multikueue/worker/kustomization.yaml create mode 100644 test/e2e/config/multikueue/worker/patch_crds.yaml create mode 100644 test/e2e/config/multikueue/worker/patch_deployment.yaml diff --git a/Makefile-deps.mk b/Makefile-deps.mk index 7783ea7b97..4f5e063371 100644 --- a/Makefile-deps.mk +++ b/Makefile-deps.mk @@ -111,11 +111,19 @@ mpi-operator-crd: ## Copy the CRDs from 
the mpi-operator to the dep-crds directo KF_TRAINING_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github.com/kubeflow/training-operator) .PHONY: kf-training-operator-crd kf-training-operator-crd: ## Copy the CRDs from the training-operator to the dep-crds directory. - mkdir -p $(EXTERNAL_CRDS_DIR)/training-operator/ - # Remove `kubeflow.org_mpijobs.yaml` version v1 from the kustomization.yaml, kueue uses v2beta1 CRD from mpi-operator - cp -prf $(KF_TRAINING_ROOT)/manifests/* $(EXTERNAL_CRDS_DIR)/training-operator/ - chmod -R u+w "${EXTERNAL_CRDS_DIR}/training-operator/" - sed -i '/kubeflow.org_mpijobs.yaml/d' $(EXTERNAL_CRDS_DIR)/training-operator/base/crds/kustomization.yaml + ## Removing kubeflow.org_mpijobs.yaml is required as the version of MPIJob is conflicting between training-operator and mpi-operator - in integration tests. + mkdir -p $(EXTERNAL_CRDS_DIR)/training-operator-crds/ + find $(KF_TRAINING_ROOT)/manifests/base/crds/* -type f -not -name "kubeflow.org_mpijobs.yaml" -exec cp -pf {} $(EXTERNAL_CRDS_DIR)/training-operator-crds/ \; + +.PHONY: kf-training-operator-manifests +kf-training-operator-manifests: ## Copy whole manifests folder from the training-operator to the dep-crds directory. + ## Full version of the manifest is required for e2e multikueue tests. + if [ -d "$(EXTERNAL_CRDS_DIR)/training-operator" ]; then \ + chmod -R u+w "$(EXTERNAL_CRDS_DIR)/training-operator" && \ + rm -rf "$(EXTERNAL_CRDS_DIR)/training-operator"; \ + fi + mkdir -p "$(EXTERNAL_CRDS_DIR)/training-operator" + cp -rf "$(KF_TRAINING_ROOT)/manifests" "$(EXTERNAL_CRDS_DIR)/training-operator" RAY_ROOT = $(shell $(GO_CMD) list -m -mod=readonly -f "{{.Dir}}" github.com/ray-project/kuberay/ray-operator) .PHONY: ray-operator-crd @@ -136,7 +144,7 @@ cluster-autoscaler-crd: ## Copy the CRDs from the cluster-autoscaler to the dep- cp -f $(CLUSTER_AUTOSCALER_ROOT)/config/crd/* $(EXTERNAL_CRDS_DIR)/cluster-autoscaler/ .PHONY: dep-crds -dep-crds: mpi-operator-crd kf-training-operator-crd ray-operator-crd jobset-operator-crd cluster-autoscaler-crd ## Copy the CRDs from the external operators to the dep-crds directory. +dep-crds: mpi-operator-crd kf-training-operator-crd ray-operator-crd jobset-operator-crd cluster-autoscaler-crd kf-training-operator-manifests ## Copy the CRDs from the external operators to the dep-crds directory. @echo "Copying CRDs from external operators to dep-crds directory" .PHONY: kueuectl-docs diff --git a/Makefile-test.mk b/Makefile-test.mk index 837501b974..52e4417c38 100644 --- a/Makefile-test.mk +++ b/Makefile-test.mk @@ -68,7 +68,7 @@ test: gotestsum ## Run tests. $(GOTESTSUM) --junitfile $(ARTIFACTS)/junit.xml -- $(GO_TEST_FLAGS) $(shell $(GO_CMD) list ./... | grep -v '/test/') -coverpkg=./... -coverprofile $(ARTIFACTS)/cover.out .PHONY: test-integration -test-integration: gomod-download envtest ginkgo mpi-operator-crd ray-operator-crd jobset-operator-crd kf-training-operator-crd cluster-autoscaler-crd kueuectl ## Run tests. +test-integration: gomod-download envtest ginkgo dep-crds kueuectl ## Run tests. 
KUBEBUILDER_ASSETS="$(shell $(ENVTEST) use $(ENVTEST_K8S_VERSION) -p path)" \ KUEUE_BIN=$(PROJECT_DIR)/bin \ ENVTEST_K8S_VERSION=$(ENVTEST_K8S_VERSION) \ diff --git a/hack/e2e-common.sh b/hack/e2e-common.sh index c829596256..5c1b5e9b3a 100644 --- a/hack/e2e-common.sh +++ b/hack/e2e-common.sh @@ -23,11 +23,11 @@ export JOBSET_MANIFEST="https://github.com/kubernetes-sigs/jobset/releases/downl export JOBSET_IMAGE=registry.k8s.io/jobset/jobset:${JOBSET_VERSION} export JOBSET_CRDS=${ROOT_DIR}/dep-crds/jobset-operator/ -#no matching semver tag unfortunately -export KUBEFLOW_IMAGE=kubeflow/training-operator:v1-855e096 -export KUBEFLOW_CRDS=${ROOT_DIR}/dep-crds/training-operator -export KUBEFLOW_CRDS_BASE=${KUBEFLOW_CRDS}/base/crds -export KUBEFLOW_MANIFEST=${KUBEFLOW_CRDS}/overlays/standalone +export KUBEFLOW_MANIFEST_MANAGER=${ROOT_DIR}/test/e2e/config/multikueue/manager +export KUBEFLOW_MANIFEST_WORKER=${ROOT_DIR}/test/e2e/config/multikueue/worker +KUBEFLOW_IMAGE_VERSION=$($KUSTOMIZE build "$KUBEFLOW_MANIFEST_WORKER" | $YQ e 'select(.kind == "Deployment") | .spec.template.spec.containers[0].image | split(":") | .[1]') +export KUBEFLOW_IMAGE_VERSION +export KUBEFLOW_IMAGE=kubeflow/training-operator:${KUBEFLOW_IMAGE_VERSION} export KUBEFLOW_MPI_MANIFEST="https://raw.githubusercontent.com/kubeflow/mpi-operator/${KUBEFLOW_MPI_VERSION}/deploy/v2beta1/mpi-operator.yaml" export KUBEFLOW_MPI_IMAGE=mpioperator/mpi-operator:${KUBEFLOW_MPI_VERSION/#v} @@ -77,37 +77,11 @@ function install_jobset { kubectl apply --server-side -f "${JOBSET_MANIFEST}" } -function patch_kubeflow_manifest { - # In order for MPI-operator and Training-operator to work on the same cluster it is required that: - # 1. 'kubeflow.org_mpijobs.yaml' is removed from base/crds/kustomization.yaml - https://github.com/kubeflow/training-operator/issues/1930 - # Done already in Makefile-deps.mk target `kf-training-operator-crd` - # 2. 
Training-operator deployment file is patched and manually enabled for all kubeflow jobs except for mpi - https://github.com/kubeflow/training-operator/issues/1777 - KUBEFLOW_DEPLOYMENT="${KUBEFLOW_CRDS}/base/deployment.yaml" - - # Find the line after which to insert the args - INSERT_LINE=$(grep -n "^ *- /manager" "${KUBEFLOW_DEPLOYMENT}" | head -n 1 | cut -d ':' -f 1) - - # Prepare patch with the args after the specified line - # EOF must be indented most left - doesn't work otherwise - ARGS_PATCH=$(cat < Date: Mon, 9 Sep 2024 10:22:54 +0200 Subject: [PATCH 14/15] Another rework after code review --- hack/multikueue-e2e-test.sh | 2 +- .../jobframework/reconciler_test.go | 10 +- pkg/controller/jobframework/setup_test.go | 16 +-- pkg/controller/jobframework/validation.go | 4 +- .../jobs/mpijob/mpijob_controller.go | 42 +++---- .../jobs/mpijob/mpijob_controller_test.go | 98 +++++++-------- .../jobs/mpijob/mpijob_multikueue_adapter.go | 20 ++- .../mpijob/mpijob_multikueue_adapter_test.go | 46 +++---- pkg/controller/jobs/mpijob/mpijob_webhook.go | 4 +- .../jobs/mpijob/mpijob_webhook_test.go | 6 +- .../testingjobs/mpijob/wrappers_mpijob.go | 38 +++--- .../en/docs/tasks/run/kubeflow/mpijobs.md | 5 + test/e2e/multikueue/e2e_test.go | 26 ++-- test/e2e/multikueue/suite_test.go | 6 +- .../jobs/mpijob/mpijob_controller_test.go | 116 +++++++++--------- test/integration/framework/framework.go | 4 +- .../integration/multikueue/multikueue_test.go | 28 ++--- test/util/e2e.go | 4 +- 18 files changed, 239 insertions(+), 236 deletions(-) diff --git a/hack/multikueue-e2e-test.sh b/hack/multikueue-e2e-test.sh index f1f39e581f..1c9902ace3 100755 --- a/hack/multikueue-e2e-test.sh +++ b/hack/multikueue-e2e-test.sh @@ -57,7 +57,7 @@ function startup { MANAGER_KIND_CONFIG="${SOURCE_DIR}/multikueue/manager-cluster.kind.yaml" fi - echo "Using manager config: $MANAGER_KIND_CONFIG" + echo "Using manager config: $MANAGER_KIND_CONFIG" cluster_create "$MANAGER_KIND_CLUSTER_NAME" "$MANAGER_KIND_CONFIG" cluster_create "$WORKER1_KIND_CLUSTER_NAME" "$SOURCE_DIR/multikueue/worker-cluster.kind.yaml" cluster_create "$WORKER2_KIND_CLUSTER_NAME" "$SOURCE_DIR/multikueue/worker-cluster.kind.yaml" diff --git a/pkg/controller/jobframework/reconciler_test.go b/pkg/controller/jobframework/reconciler_test.go index e2d1b6ffb1..d536e3e2f9 100644 --- a/pkg/controller/jobframework/reconciler_test.go +++ b/pkg/controller/jobframework/reconciler_test.go @@ -22,7 +22,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -59,7 +59,7 @@ func TestIsParentJobManaged(t *testing.T) { }, "child job has ownerReference with known non-existing workload owner": { job: testingjob.MakeJob(childJobName, jobNamespace). - OwnerReference(parentJobName, kubeflow.SchemeGroupVersionKind). + OwnerReference(parentJobName, kfmpi.SchemeGroupVersionKind). Obj(), wantErr: ErrWorkloadOwnerNotFound, }, @@ -69,7 +69,7 @@ func TestIsParentJobManaged(t *testing.T) { Queue("test-q"). Obj(), job: testingjob.MakeJob(childJobName, jobNamespace). - OwnerReference(parentJobName, kubeflow.SchemeGroupVersionKind). + OwnerReference(parentJobName, kfmpi.SchemeGroupVersionKind). Obj(), wantManaged: true, }, @@ -78,14 +78,14 @@ func TestIsParentJobManaged(t *testing.T) { UID(parentJobName). 
Obj(), job: testingjob.MakeJob(childJobName, jobNamespace). - OwnerReference(parentJobName, kubeflow.SchemeGroupVersionKind). + OwnerReference(parentJobName, kfmpi.SchemeGroupVersionKind). Obj(), }, } for name, tc := range cases { t.Run(name, func(t *testing.T) { t.Cleanup(EnableIntegrationsForTest(t, "kubeflow.org/mpijob")) - builder := utiltesting.NewClientBuilder(kubeflow.AddToScheme) + builder := utiltesting.NewClientBuilder(kfmpi.AddToScheme) if tc.parentJob != nil { builder = builder.WithObjects(tc.parentJob) } diff --git a/pkg/controller/jobframework/setup_test.go b/pkg/controller/jobframework/setup_test.go index 69f6e5cf08..988db4d408 100644 --- a/pkg/controller/jobframework/setup_test.go +++ b/pkg/controller/jobframework/setup_test.go @@ -26,7 +26,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" rayv1 "github.com/ray-project/kuberay/ray-operator/apis/ray/v1" batchv1 "k8s.io/api/batch/v1" @@ -58,7 +58,7 @@ func TestSetupControllers(t *testing.T) { "kubeflow.org/mpijob": { NewReconciler: testNewReconciler, SetupWebhook: testSetupWebhook, - JobType: &kubeflow.MPIJob{}, + JobType: &kfmpi.MPIJob{}, SetupIndexes: testSetupIndexes, AddToScheme: testAddToScheme, CanSupportIntegration: testCanSupportIntegration, @@ -98,7 +98,7 @@ func TestSetupControllers(t *testing.T) { }, mapperGVKs: []schema.GroupVersionKind{ batchv1.SchemeGroupVersion.WithKind("Job"), - kubeflow.SchemeGroupVersionKind, + kfmpi.SchemeGroupVersionKind, }, wantEnabledIntegrations: []string{"batch/job", "kubeflow.org/mpijob"}, }, @@ -117,7 +117,7 @@ func TestSetupControllers(t *testing.T) { }, mapperGVKs: []schema.GroupVersionKind{ batchv1.SchemeGroupVersion.WithKind("Job"), - kubeflow.SchemeGroupVersionKind, + kfmpi.SchemeGroupVersionKind, // Not including RayCluster }, delayedGVKs: []*schema.GroupVersionKind{ @@ -137,7 +137,7 @@ func TestSetupControllers(t *testing.T) { } ctx, logger := utiltesting.ContextWithLog(t) - k8sClient := utiltesting.NewClientBuilder(jobset.AddToScheme, kubeflow.AddToScheme, kftraining.AddToScheme, rayv1.AddToScheme).Build() + k8sClient := utiltesting.NewClientBuilder(jobset.AddToScheme, kfmpi.AddToScheme, kftraining.AddToScheme, rayv1.AddToScheme).Build() mgrOpts := ctrlmgr.Options{ Scheme: k8sClient.Scheme(), @@ -245,16 +245,16 @@ func TestSetupIndexes(t *testing.T) { "kubeflow.org/mpijob is disabled in the configAPI": { workloads: []kueue.Workload{ *utiltesting.MakeWorkload("alpha-wl", testNamespace). - ControllerReference(kubeflow.SchemeGroupVersionKind, "alpha", "mpijob"). + ControllerReference(kfmpi.SchemeGroupVersionKind, "alpha", "mpijob"). Obj(), *utiltesting.MakeWorkload("beta-wl", testNamespace). - ControllerReference(kubeflow.SchemeGroupVersionKind, "beta", "mpijob"). + ControllerReference(kfmpi.SchemeGroupVersionKind, "beta", "mpijob"). 
Obj(), }, opts: []Option{ WithEnabledFrameworks([]string{"batch/job"}), }, - filter: client.MatchingFields{GetOwnerKey(kubeflow.SchemeGroupVersionKind): "alpha"}, + filter: client.MatchingFields{GetOwnerKey(kfmpi.SchemeGroupVersionKind): "alpha"}, wantFieldMatcherError: true, }, } diff --git a/pkg/controller/jobframework/validation.go b/pkg/controller/jobframework/validation.go index 97ee3bf617..da355f35a0 100644 --- a/pkg/controller/jobframework/validation.go +++ b/pkg/controller/jobframework/validation.go @@ -20,7 +20,7 @@ import ( "fmt" "strings" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" batchv1 "k8s.io/api/batch/v1" apivalidation "k8s.io/apimachinery/pkg/api/validation" @@ -44,7 +44,7 @@ var ( kftraining.SchemeGroupVersion.WithKind(kftraining.PaddleJobKind).String(), kftraining.SchemeGroupVersion.WithKind(kftraining.PyTorchJobKind).String(), kftraining.SchemeGroupVersion.WithKind(kftraining.XGBoostJobKind).String(), - kubeflow.SchemeGroupVersion.WithKind(kubeflow.Kind).String()) + kfmpi.SchemeGroupVersion.WithKind(kfmpi.Kind).String()) ) // ValidateJobOnCreate encapsulates all GenericJob validations that must be performed on a Create operation diff --git a/pkg/controller/jobs/mpijob/mpijob_controller.go b/pkg/controller/jobs/mpijob/mpijob_controller.go index 0e75e65d55..43b2c6dba1 100644 --- a/pkg/controller/jobs/mpijob/mpijob_controller.go +++ b/pkg/controller/jobs/mpijob/mpijob_controller.go @@ -21,7 +21,7 @@ import ( "fmt" "strings" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" @@ -37,7 +37,7 @@ import ( ) var ( - gvk = kubeflow.SchemeGroupVersionKind + gvk = kfmpi.SchemeGroupVersionKind FrameworkName = "kubeflow.org/mpijob" ) @@ -48,8 +48,8 @@ func init() { NewJob: NewJob, NewReconciler: NewReconciler, SetupWebhook: SetupMPIJobWebhook, - JobType: &kubeflow.MPIJob{}, - AddToScheme: kubeflow.AddToScheme, + JobType: &kfmpi.MPIJob{}, + AddToScheme: kfmpi.AddToScheme, IsManagingObjectsOwner: isMPIJob, MultiKueueAdapter: &multikueueAdapter{}, })) @@ -73,20 +73,20 @@ func NewJob() jobframework.GenericJob { var NewReconciler = jobframework.NewGenericReconcilerFactory(NewJob) func isMPIJob(owner *metav1.OwnerReference) bool { - return owner.Kind == "MPIJob" && strings.HasPrefix(owner.APIVersion, kubeflow.SchemeGroupVersion.Group) + return owner.Kind == "MPIJob" && strings.HasPrefix(owner.APIVersion, kfmpi.SchemeGroupVersion.Group) } -type MPIJob kubeflow.MPIJob +type MPIJob kfmpi.MPIJob var _ jobframework.GenericJob = (*MPIJob)(nil) var _ jobframework.JobWithPriorityClass = (*MPIJob)(nil) func (j *MPIJob) Object() client.Object { - return (*kubeflow.MPIJob)(j) + return (*kfmpi.MPIJob)(j) } func fromObject(o runtime.Object) *MPIJob { - return (*MPIJob)(o.(*kubeflow.MPIJob)) + return (*MPIJob)(o.(*kfmpi.MPIJob)) } func (j *MPIJob) IsSuspended() bool { @@ -111,7 +111,7 @@ func (j *MPIJob) GVK() schema.GroupVersionKind { } func (j *MPIJob) PodLabelSelector() string { - return fmt.Sprintf("%s=%s,%s=%s", kubeflow.JobNameLabel, j.Name, kubeflow.OperatorNameLabel, kubeflow.OperatorName) + return fmt.Sprintf("%s=%s,%s=%s", kfmpi.JobNameLabel, j.Name, kfmpi.OperatorNameLabel, kfmpi.OperatorName) } func (j *MPIJob) PodSets() 
[]kueue.PodSet { @@ -161,8 +161,8 @@ func (j *MPIJob) RestorePodSetsInfo(podSetsInfo []podset.PodSetInfo) bool { func (j *MPIJob) Finished() (message string, success, finished bool) { for _, c := range j.Status.Conditions { - if (c.Type == kubeflow.JobSucceeded || c.Type == kubeflow.JobFailed) && c.Status == corev1.ConditionTrue { - return c.Message, c.Type != kubeflow.JobFailed, true + if (c.Type == kfmpi.JobSucceeded || c.Type == kfmpi.JobFailed) && c.Status == corev1.ConditionTrue { + return c.Message, c.Type != kfmpi.JobFailed, true } } @@ -179,9 +179,9 @@ func (j *MPIJob) Finished() (message string, success, finished bool) { func (j *MPIJob) PriorityClass() string { if j.Spec.RunPolicy.SchedulingPolicy != nil && len(j.Spec.RunPolicy.SchedulingPolicy.PriorityClass) != 0 { return j.Spec.RunPolicy.SchedulingPolicy.PriorityClass - } else if l := j.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher]; l != nil && len(l.Template.Spec.PriorityClassName) != 0 { + } else if l := j.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher]; l != nil && len(l.Template.Spec.PriorityClassName) != 0 { return l.Template.Spec.PriorityClassName - } else if w := j.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker]; w != nil && len(w.Template.Spec.PriorityClassName) != 0 { + } else if w := j.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker]; w != nil && len(w.Template.Spec.PriorityClassName) != 0 { return w.Template.Spec.PriorityClassName } return "" @@ -189,7 +189,7 @@ func (j *MPIJob) PriorityClass() string { func (j *MPIJob) PodsReady() bool { for _, c := range j.Status.Conditions { - if c.Type == kubeflow.JobRunning && c.Status == corev1.ConditionTrue { + if c.Type == kfmpi.JobRunning && c.Status == corev1.ConditionTrue { return true } } @@ -200,18 +200,18 @@ func SetupIndexes(ctx context.Context, indexer client.FieldIndexer) error { return jobframework.SetupWorkloadOwnerIndex(ctx, indexer, gvk) } -func orderedReplicaTypes(jobSpec *kubeflow.MPIJobSpec) []kubeflow.MPIReplicaType { - var result []kubeflow.MPIReplicaType - if _, ok := jobSpec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher]; ok { - result = append(result, kubeflow.MPIReplicaTypeLauncher) +func orderedReplicaTypes(jobSpec *kfmpi.MPIJobSpec) []kfmpi.MPIReplicaType { + var result []kfmpi.MPIReplicaType + if _, ok := jobSpec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher]; ok { + result = append(result, kfmpi.MPIReplicaTypeLauncher) } - if _, ok := jobSpec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker]; ok { - result = append(result, kubeflow.MPIReplicaTypeWorker) + if _, ok := jobSpec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker]; ok { + result = append(result, kfmpi.MPIReplicaTypeWorker) } return result } -func podsCount(jobSpec *kubeflow.MPIJobSpec, mpiReplicaType kubeflow.MPIReplicaType) int32 { +func podsCount(jobSpec *kfmpi.MPIJobSpec, mpiReplicaType kfmpi.MPIReplicaType) int32 { return ptr.Deref(jobSpec.MPIReplicaSpecs[mpiReplicaType].Replicas, 1) } diff --git a/pkg/controller/jobs/mpijob/mpijob_controller_test.go b/pkg/controller/jobs/mpijob/mpijob_controller_test.go index ada57145be..095c4d869a 100644 --- a/pkg/controller/jobs/mpijob/mpijob_controller_test.go +++ b/pkg/controller/jobs/mpijob/mpijob_controller_test.go @@ -21,7 +21,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" 
"k8s.io/client-go/tools/record" @@ -37,30 +37,30 @@ import ( func TestCalcPriorityClassName(t *testing.T) { testcases := map[string]struct { - job kubeflow.MPIJob + job kfmpi.MPIJob wantPriorityClassName string }{ "none priority class name specified": { - job: kubeflow.MPIJob{}, + job: kfmpi.MPIJob{}, wantPriorityClassName: "", }, "priority specified at runPolicy and replicas; use priority in runPolicy": { - job: kubeflow.MPIJob{ - Spec: kubeflow.MPIJobSpec{ - RunPolicy: kubeflow.RunPolicy{ - SchedulingPolicy: &kubeflow.SchedulingPolicy{ + job: kfmpi.MPIJob{ + Spec: kfmpi.MPIJobSpec{ + RunPolicy: kfmpi.RunPolicy{ + SchedulingPolicy: &kfmpi.SchedulingPolicy{ PriorityClass: "scheduling-priority", }, }, - MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{ - kubeflow.MPIReplicaTypeLauncher: { + MPIReplicaSpecs: map[kfmpi.MPIReplicaType]*kfmpi.ReplicaSpec{ + kfmpi.MPIReplicaTypeLauncher: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ PriorityClassName: "launcher-priority", }, }, }, - kubeflow.MPIReplicaTypeWorker: { + kfmpi.MPIReplicaTypeWorker: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ PriorityClassName: "worker-priority", @@ -73,13 +73,13 @@ func TestCalcPriorityClassName(t *testing.T) { wantPriorityClassName: "scheduling-priority", }, "runPolicy present, but without priority; fallback to launcher": { - job: kubeflow.MPIJob{ - Spec: kubeflow.MPIJobSpec{ - RunPolicy: kubeflow.RunPolicy{ - SchedulingPolicy: &kubeflow.SchedulingPolicy{}, + job: kfmpi.MPIJob{ + Spec: kfmpi.MPIJobSpec{ + RunPolicy: kfmpi.RunPolicy{ + SchedulingPolicy: &kfmpi.SchedulingPolicy{}, }, - MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{ - kubeflow.MPIReplicaTypeLauncher: { + MPIReplicaSpecs: map[kfmpi.MPIReplicaType]*kfmpi.ReplicaSpec{ + kfmpi.MPIReplicaTypeLauncher: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ PriorityClassName: "launcher-priority", @@ -92,17 +92,17 @@ func TestCalcPriorityClassName(t *testing.T) { wantPriorityClassName: "launcher-priority", }, "specified on launcher takes precedence over worker": { - job: kubeflow.MPIJob{ - Spec: kubeflow.MPIJobSpec{ - MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{ - kubeflow.MPIReplicaTypeLauncher: { + job: kfmpi.MPIJob{ + Spec: kfmpi.MPIJobSpec{ + MPIReplicaSpecs: map[kfmpi.MPIReplicaType]*kfmpi.ReplicaSpec{ + kfmpi.MPIReplicaTypeLauncher: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ PriorityClassName: "launcher-priority", }, }, }, - kubeflow.MPIReplicaTypeWorker: { + kfmpi.MPIReplicaTypeWorker: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ PriorityClassName: "worker-priority", @@ -115,15 +115,15 @@ func TestCalcPriorityClassName(t *testing.T) { wantPriorityClassName: "launcher-priority", }, "launcher present, but without priority; fallback to worker": { - job: kubeflow.MPIJob{ - Spec: kubeflow.MPIJobSpec{ - MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{ - kubeflow.MPIReplicaTypeLauncher: { + job: kfmpi.MPIJob{ + Spec: kfmpi.MPIJobSpec{ + MPIReplicaSpecs: map[kfmpi.MPIReplicaType]*kfmpi.ReplicaSpec{ + kfmpi.MPIReplicaTypeLauncher: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{}, }, }, - kubeflow.MPIReplicaTypeWorker: { + kfmpi.MPIReplicaTypeWorker: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ PriorityClassName: "worker-priority", @@ -136,11 +136,11 @@ func TestCalcPriorityClassName(t *testing.T) { wantPriorityClassName: "worker-priority", }, "specified on worker only": { - job: kubeflow.MPIJob{ - Spec: 
kubeflow.MPIJobSpec{ - MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{ - kubeflow.MPIReplicaTypeLauncher: {}, - kubeflow.MPIReplicaTypeWorker: { + job: kfmpi.MPIJob{ + Spec: kfmpi.MPIJobSpec{ + MPIReplicaSpecs: map[kfmpi.MPIReplicaType]*kfmpi.ReplicaSpec{ + kfmpi.MPIReplicaTypeLauncher: {}, + kfmpi.MPIReplicaTypeWorker: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ PriorityClassName: "worker-priority", @@ -153,11 +153,11 @@ func TestCalcPriorityClassName(t *testing.T) { wantPriorityClassName: "worker-priority", }, "worker present, but without priority; fallback to empty": { - job: kubeflow.MPIJob{ - Spec: kubeflow.MPIJobSpec{ - MPIReplicaSpecs: map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec{ - kubeflow.MPIReplicaTypeLauncher: {}, - kubeflow.MPIReplicaTypeWorker: { + job: kfmpi.MPIJob{ + Spec: kfmpi.MPIJobSpec{ + MPIReplicaSpecs: map[kfmpi.MPIReplicaType]*kfmpi.ReplicaSpec{ + kfmpi.MPIReplicaTypeLauncher: {}, + kfmpi.MPIReplicaTypeWorker: { Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{}, }, @@ -183,7 +183,7 @@ func TestCalcPriorityClassName(t *testing.T) { var ( jobCmpOpts = []cmp.Option{ cmpopts.EquateEmpty(), - cmpopts.IgnoreFields(kubeflow.MPIJob{}, "TypeMeta", "ObjectMeta"), + cmpopts.IgnoreFields(kfmpi.MPIJob{}, "TypeMeta", "ObjectMeta"), } workloadCmpOpts = []cmp.Option{ cmpopts.EquateEmpty(), @@ -201,9 +201,9 @@ func TestReconciler(t *testing.T) { PriorityValue(200) cases := map[string]struct { reconcilerOptions []jobframework.Option - job *kubeflow.MPIJob + job *kfmpi.MPIJob priorityClasses []client.Object - wantJob *kubeflow.MPIJob + wantJob *kfmpi.MPIJob wantWorkloads []kueue.Workload wantErr error }{ @@ -211,8 +211,8 @@ func TestReconciler(t *testing.T) { reconcilerOptions: []jobframework.Option{ jobframework.WithManageJobsWithoutQueueName(true), }, - job: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).Obj(), - wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).Obj(), + job: testingmpijob.MakeMPIJob("mpijob", "ns").GenericLauncherAndWorker().Parallelism(2).Obj(), + wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").GenericLauncherAndWorker().Parallelism(2).Obj(), wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("mpijob", "ns"). PodSets( @@ -226,11 +226,11 @@ func TestReconciler(t *testing.T) { reconcilerOptions: []jobframework.Option{ jobframework.WithManageJobsWithoutQueueName(true), }, - job: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).WorkloadPriorityClass("test-wpc").Obj(), + job: testingmpijob.MakeMPIJob("mpijob", "ns").GenericLauncherAndWorker().Parallelism(2).WorkloadPriorityClass("test-wpc").Obj(), priorityClasses: []client.Object{ baseWPCWrapper.Obj(), }, - wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).WorkloadPriorityClass("test-wpc").Obj(), + wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").GenericLauncherAndWorker().Parallelism(2).WorkloadPriorityClass("test-wpc").Obj(), wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("mpijob", "ns"). 
PodSets( @@ -245,11 +245,11 @@ func TestReconciler(t *testing.T) { reconcilerOptions: []jobframework.Option{ jobframework.WithManageJobsWithoutQueueName(true), }, - job: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).PriorityClass("test-pc").Obj(), + job: testingmpijob.MakeMPIJob("mpijob", "ns").GenericLauncherAndWorker().Parallelism(2).PriorityClass("test-pc").Obj(), priorityClasses: []client.Object{ basePCWrapper.Obj(), }, - wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2).PriorityClass("test-pc").Obj(), + wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").GenericLauncherAndWorker().Parallelism(2).PriorityClass("test-pc").Obj(), wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("mpijob", "ns"). PodSets( @@ -264,12 +264,12 @@ func TestReconciler(t *testing.T) { reconcilerOptions: []jobframework.Option{ jobframework.WithManageJobsWithoutQueueName(true), }, - job: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2). + job: testingmpijob.MakeMPIJob("mpijob", "ns").GenericLauncherAndWorker().Parallelism(2). WorkloadPriorityClass("test-wpc").PriorityClass("test-pc").Obj(), priorityClasses: []client.Object{ basePCWrapper.Obj(), baseWPCWrapper.Obj(), }, - wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").MPIJobReplicaSpecsDefault().Parallelism(2). + wantJob: testingmpijob.MakeMPIJob("mpijob", "ns").GenericLauncherAndWorker().Parallelism(2). WorkloadPriorityClass("test-wpc").PriorityClass("test-pc").Obj(), wantWorkloads: []kueue.Workload{ *utiltesting.MakeWorkload("mpijob", "ns"). @@ -286,7 +286,7 @@ func TestReconciler(t *testing.T) { for name, tc := range cases { t.Run(name, func(t *testing.T) { ctx, _ := utiltesting.ContextWithLog(t) - clientBuilder := utiltesting.NewClientBuilder(kubeflow.AddToScheme) + clientBuilder := utiltesting.NewClientBuilder(kfmpi.AddToScheme) if err := SetupIndexes(ctx, utiltesting.AsIndexer(clientBuilder)); err != nil { t.Fatalf("Could not setup indexes: %v", err) } @@ -303,7 +303,7 @@ func TestReconciler(t *testing.T) { t.Errorf("Reconcile returned error (-want,+got):\n%s", diff) } - var gotMpiJob kubeflow.MPIJob + var gotMpiJob kfmpi.MPIJob if err := kClient.Get(ctx, jobKey, &gotMpiJob); err != nil { t.Fatalf("Could not get Job after reconcile: %v", err) } diff --git a/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter.go b/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter.go index c660b226ab..a4e973fa8d 100644 --- a/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter.go +++ b/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter.go @@ -21,7 +21,7 @@ import ( "errors" "fmt" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" @@ -40,13 +40,13 @@ type multikueueAdapter struct{} var _ jobframework.MultiKueueAdapter = (*multikueueAdapter)(nil) func (b *multikueueAdapter) SyncJob(ctx context.Context, localClient client.Client, remoteClient client.Client, key types.NamespacedName, workloadName, origin string) error { - localJob := kubeflow.MPIJob{} + localJob := kfmpi.MPIJob{} err := localClient.Get(ctx, key, &localJob) if err != nil { return err } - remoteJob := kubeflow.MPIJob{} + remoteJob := kfmpi.MPIJob{} err = remoteClient.Get(ctx, key, &remoteJob) if client.IgnoreNotFound(err) != nil { return err @@ -60,7 +60,7 @@ func (b *multikueueAdapter) 
SyncJob(ctx context.Context, localClient client.Clie }) } - remoteJob = kubeflow.MPIJob{ + remoteJob = kfmpi.MPIJob{ ObjectMeta: api.CloneObjectMetaForCreation(&localJob.ObjectMeta), Spec: *localJob.Spec.DeepCopy(), } @@ -76,11 +76,9 @@ func (b *multikueueAdapter) SyncJob(ctx context.Context, localClient client.Clie } func (b *multikueueAdapter) DeleteRemoteObject(ctx context.Context, remoteClient client.Client, key types.NamespacedName) error { - job := kubeflow.MPIJob{} - err := remoteClient.Get(ctx, key, &job) - if err != nil { - return client.IgnoreNotFound(err) - } + job := kfmpi.MPIJob{} + job.SetName(key.Name) + job.SetNamespace(key.Namespace) return client.IgnoreNotFound(remoteClient.Delete(ctx, &job)) } @@ -99,11 +97,11 @@ func (b *multikueueAdapter) GVK() schema.GroupVersionKind { var _ jobframework.MultiKueueWatcher = (*multikueueAdapter)(nil) func (*multikueueAdapter) GetEmptyList() client.ObjectList { - return &kubeflow.MPIJobList{} + return &kfmpi.MPIJobList{} } func (*multikueueAdapter) WorkloadKeyFor(o runtime.Object) (types.NamespacedName, error) { - job, isJob := o.(*kubeflow.MPIJob) + job, isJob := o.(*kfmpi.MPIJob) if !isJob { return types.NamespacedName{}, errors.New("not a mpijob") } diff --git a/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter_test.go b/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter_test.go index 844bbf215d..a658591707 100644 --- a/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter_test.go +++ b/pkg/controller/jobs/mpijob/mpijob_multikueue_adapter_test.go @@ -22,7 +22,7 @@ import ( "github.com/google/go-cmp/cmp" "github.com/google/go-cmp/cmp/cmpopts" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" @@ -49,28 +49,28 @@ func TestMultikueueAdapter(t *testing.T) { mpiJobBuilder := utiltestingmpijob.MakeMPIJob("mpijob1", TestNamespace) cases := map[string]struct { - managersJobSets []kubeflow.MPIJob - workerJobSets []kubeflow.MPIJob + managersJobSets []kfmpi.MPIJob + workerJobSets []kfmpi.MPIJob operation func(ctx context.Context, adapter *multikueueAdapter, managerClient, workerClient client.Client) error wantError error - wantManagersJobSets []kubeflow.MPIJob - wantWorkerJobSets []kubeflow.MPIJob + wantManagersJobSets []kfmpi.MPIJob + wantWorkerJobSets []kfmpi.MPIJob }{ "sync creates missing remote mpijob": { - managersJobSets: []kubeflow.MPIJob{ + managersJobSets: []kfmpi.MPIJob{ *mpiJobBuilder.DeepCopy(), }, operation: func(ctx context.Context, adapter *multikueueAdapter, managerClient, workerClient client.Client) error { return adapter.SyncJob(ctx, managerClient, workerClient, types.NamespacedName{Name: "mpijob1", Namespace: TestNamespace}, "wl1", "origin1") }, - wantManagersJobSets: []kubeflow.MPIJob{ + wantManagersJobSets: []kfmpi.MPIJob{ *mpiJobBuilder.DeepCopy(), }, - wantWorkerJobSets: []kubeflow.MPIJob{ + wantWorkerJobSets: []kfmpi.MPIJob{ *mpiJobBuilder.Clone(). Label(constants.PrebuiltWorkloadLabel, "wl1"). Label(kueuealpha.MultiKueueOriginLabel, "origin1"). @@ -78,35 +78,35 @@ func TestMultikueueAdapter(t *testing.T) { }, }, "sync status from remote mpijob": { - managersJobSets: []kubeflow.MPIJob{ + managersJobSets: []kfmpi.MPIJob{ *mpiJobBuilder.DeepCopy(), }, - workerJobSets: []kubeflow.MPIJob{ + workerJobSets: []kfmpi.MPIJob{ *mpiJobBuilder.Clone(). Label(constants.PrebuiltWorkloadLabel, "wl1"). 
Label(kueuealpha.MultiKueueOriginLabel, "origin1"). - StatusConditions(kubeflow.JobCondition{Type: kubeflow.JobSucceeded, Status: corev1.ConditionTrue}). + StatusConditions(kfmpi.JobCondition{Type: kfmpi.JobSucceeded, Status: corev1.ConditionTrue}). Obj(), }, operation: func(ctx context.Context, adapter *multikueueAdapter, managerClient, workerClient client.Client) error { return adapter.SyncJob(ctx, managerClient, workerClient, types.NamespacedName{Name: "mpijob1", Namespace: TestNamespace}, "wl1", "origin1") }, - wantManagersJobSets: []kubeflow.MPIJob{ + wantManagersJobSets: []kfmpi.MPIJob{ *mpiJobBuilder.Clone(). - StatusConditions(kubeflow.JobCondition{Type: kubeflow.JobSucceeded, Status: corev1.ConditionTrue}). + StatusConditions(kfmpi.JobCondition{Type: kfmpi.JobSucceeded, Status: corev1.ConditionTrue}). Obj(), }, - wantWorkerJobSets: []kubeflow.MPIJob{ + wantWorkerJobSets: []kfmpi.MPIJob{ *mpiJobBuilder.Clone(). Label(constants.PrebuiltWorkloadLabel, "wl1"). Label(kueuealpha.MultiKueueOriginLabel, "origin1"). - StatusConditions(kubeflow.JobCondition{Type: kubeflow.JobSucceeded, Status: corev1.ConditionTrue}). + StatusConditions(kfmpi.JobCondition{Type: kfmpi.JobSucceeded, Status: corev1.ConditionTrue}). Obj(), }, }, "remote mpijob is deleted": { - workerJobSets: []kubeflow.MPIJob{ + workerJobSets: []kfmpi.MPIJob{ *mpiJobBuilder.Clone(). Label(constants.PrebuiltWorkloadLabel, "wl1"). Label(kueuealpha.MultiKueueOriginLabel, "origin1"). @@ -119,13 +119,13 @@ func TestMultikueueAdapter(t *testing.T) { } for name, tc := range cases { t.Run(name, func(t *testing.T) { - managerBuilder := utiltesting.NewClientBuilder(kubeflow.AddToScheme).WithInterceptorFuncs(interceptor.Funcs{SubResourcePatch: utiltesting.TreatSSAAsStrategicMerge}) - managerBuilder = managerBuilder.WithLists(&kubeflow.MPIJobList{Items: tc.managersJobSets}) - managerBuilder = managerBuilder.WithStatusSubresource(slices.Map(tc.managersJobSets, func(w *kubeflow.MPIJob) client.Object { return w })...) + managerBuilder := utiltesting.NewClientBuilder(kfmpi.AddToScheme).WithInterceptorFuncs(interceptor.Funcs{SubResourcePatch: utiltesting.TreatSSAAsStrategicMerge}) + managerBuilder = managerBuilder.WithLists(&kfmpi.MPIJobList{Items: tc.managersJobSets}) + managerBuilder = managerBuilder.WithStatusSubresource(slices.Map(tc.managersJobSets, func(w *kfmpi.MPIJob) client.Object { return w })...) 
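For context, the manager and worker clients built in this test runner are purely in-memory fakes; utiltesting.NewClientBuilder is Kueue's wrapper around controller-runtime's fake client builder. A minimal, self-contained sketch of the same idea using the fake package directly (the helper name below is illustrative and not part of this patch):

```go
package example

import (
	kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
	"k8s.io/apimachinery/pkg/runtime"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/client/fake"
)

// newFakeMPIJobClient returns an in-memory client pre-loaded with MPIJobs.
// Registering the status subresource up front is what lets Status() updates
// and patches behave as they would against a real API server.
func newFakeMPIJobClient(jobs ...*kfmpi.MPIJob) client.Client {
	scheme := runtime.NewScheme()
	utilruntime.Must(kfmpi.AddToScheme(scheme))

	objs := make([]client.Object, 0, len(jobs))
	for _, j := range jobs {
		objs = append(objs, j)
	}
	return fake.NewClientBuilder().
		WithScheme(scheme).
		WithObjects(objs...).
		WithStatusSubresource(objs...).
		Build()
}
```

This mirrors what the manager and worker builders above do before the adapter's SyncJob is exercised against them.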
managerClient := managerBuilder.Build() - workerBuilder := utiltesting.NewClientBuilder(kubeflow.AddToScheme).WithInterceptorFuncs(interceptor.Funcs{SubResourcePatch: utiltesting.TreatSSAAsStrategicMerge}) - workerBuilder = workerBuilder.WithLists(&kubeflow.MPIJobList{Items: tc.workerJobSets}) + workerBuilder := utiltesting.NewClientBuilder(kfmpi.AddToScheme).WithInterceptorFuncs(interceptor.Funcs{SubResourcePatch: utiltesting.TreatSSAAsStrategicMerge}) + workerBuilder = workerBuilder.WithLists(&kfmpi.MPIJobList{Items: tc.workerJobSets}) workerClient := workerBuilder.Build() ctx, _ := utiltesting.ContextWithLog(t) @@ -138,7 +138,7 @@ func TestMultikueueAdapter(t *testing.T) { t.Errorf("unexpected error (-want/+got):\n%s", diff) } - gotManagersJobSets := &kubeflow.MPIJobList{} + gotManagersJobSets := &kfmpi.MPIJobList{} if err := managerClient.List(ctx, gotManagersJobSets); err != nil { t.Errorf("unexpected list manager's mpijobs error %s", err) } else { @@ -147,7 +147,7 @@ func TestMultikueueAdapter(t *testing.T) { } } - gotWorkerJobSets := &kubeflow.MPIJobList{} + gotWorkerJobSets := &kfmpi.MPIJobList{} if err := workerClient.List(ctx, gotWorkerJobSets); err != nil { t.Errorf("unexpected list worker's mpijobs error %s", err) } else { diff --git a/pkg/controller/jobs/mpijob/mpijob_webhook.go b/pkg/controller/jobs/mpijob/mpijob_webhook.go index f3167b3efd..253e75264e 100644 --- a/pkg/controller/jobs/mpijob/mpijob_webhook.go +++ b/pkg/controller/jobs/mpijob/mpijob_webhook.go @@ -19,7 +19,7 @@ package mpijob import ( "context" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/klog/v2" @@ -41,7 +41,7 @@ func SetupMPIJobWebhook(mgr ctrl.Manager, opts ...jobframework.Option) error { manageJobsWithoutQueueName: options.ManageJobsWithoutQueueName, } return ctrl.NewWebhookManagedBy(mgr). - For(&kubeflow.MPIJob{}). + For(&kfmpi.MPIJob{}). WithDefaulter(wh). WithValidator(wh). Complete() diff --git a/pkg/controller/jobs/mpijob/mpijob_webhook_test.go b/pkg/controller/jobs/mpijob/mpijob_webhook_test.go index e6e9a06ec8..57e0c2b76c 100644 --- a/pkg/controller/jobs/mpijob/mpijob_webhook_test.go +++ b/pkg/controller/jobs/mpijob/mpijob_webhook_test.go @@ -21,16 +21,16 @@ import ( "testing" "github.com/google/go-cmp/cmp" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" testingutil "sigs.k8s.io/kueue/pkg/util/testingjobs/mpijob" ) func TestDefault(t *testing.T) { testcases := map[string]struct { - job *kubeflow.MPIJob + job *kfmpi.MPIJob manageJobsWithoutQueueName bool - want *kubeflow.MPIJob + want *kfmpi.MPIJob }{ "update the suspend field with 'manageJobsWithoutQueueName=false'": { job: testingutil.MakeMPIJob("job", "default").Queue("queue").Suspend(false).Obj(), diff --git a/pkg/util/testingjobs/mpijob/wrappers_mpijob.go b/pkg/util/testingjobs/mpijob/wrappers_mpijob.go index 597be10fca..22939f1761 100644 --- a/pkg/util/testingjobs/mpijob/wrappers_mpijob.go +++ b/pkg/util/testingjobs/mpijob/wrappers_mpijob.go @@ -17,7 +17,7 @@ limitations under the License. 
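The wrapper changes below rename MPIJobReplicaSpecsDefault to GenericLauncherAndWorker and switch the package alias to kfmpi. As a quick orientation, a typical chained use of these helpers, mirroring the integration tests later in this patch, looks roughly like this; the queue name and resource amount are illustrative:

```go
package example

import (
	kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1"
	corev1 "k8s.io/api/core/v1"

	testingmpijob "sigs.k8s.io/kueue/pkg/util/testingjobs/mpijob"
)

// buildQueuedMPIJob builds a suspended MPIJob with one generic launcher and
// one generic worker replica, assigns it to a local queue, scales the workers
// to 2, and sets a CPU request on the worker container.
func buildQueuedMPIJob() *kfmpi.MPIJob {
	return testingmpijob.MakeMPIJob("mpijob", "ns").
		GenericLauncherAndWorker().
		Queue("user-queue").
		Parallelism(2).
		Request(kfmpi.MPIReplicaTypeWorker, corev1.ResourceCPU, "1").
		Obj()
}
```

Spelling out GenericLauncherAndWorker() at the call site makes the replica layout explicit rather than an implicit default, which appears to be the motivation for the rename.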
package testing import ( - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -28,35 +28,35 @@ import ( ) // MPIJobWrapper wraps a Job. -type MPIJobWrapper struct{ kubeflow.MPIJob } +type MPIJobWrapper struct{ kfmpi.MPIJob } // MakeMPIJob creates a wrapper for a suspended job with a single container and parallelism=1. func MakeMPIJob(name, ns string) *MPIJobWrapper { - return &MPIJobWrapper{kubeflow.MPIJob{ + return &MPIJobWrapper{kfmpi.MPIJob{ ObjectMeta: metav1.ObjectMeta{ Name: name, Namespace: ns, Annotations: make(map[string]string, 1), }, - Spec: kubeflow.MPIJobSpec{ - RunPolicy: kubeflow.RunPolicy{ + Spec: kfmpi.MPIJobSpec{ + RunPolicy: kfmpi.RunPolicy{ Suspend: ptr.To(true), }, - MPIReplicaSpecs: make(map[kubeflow.MPIReplicaType]*kubeflow.ReplicaSpec), + MPIReplicaSpecs: make(map[kfmpi.MPIReplicaType]*kfmpi.ReplicaSpec), }, }, } } type MPIJobReplicaSpecRequirement struct { - ReplicaType kubeflow.MPIReplicaType + ReplicaType kfmpi.MPIReplicaType ReplicaCount int32 Annotations map[string]string RestartPolicy corev1.RestartPolicy } func (j *MPIJobWrapper) MPIJobReplicaSpecs(replicaSpecs ...MPIJobReplicaSpecRequirement) *MPIJobWrapper { - j = j.MPIJobReplicaSpecsDefault() + j = j.GenericLauncherAndWorker() for _, rs := range replicaSpecs { j.Spec.MPIReplicaSpecs[rs.ReplicaType].Replicas = ptr.To[int32](rs.ReplicaCount) j.Spec.MPIReplicaSpecs[rs.ReplicaType].Template.Spec.RestartPolicy = rs.RestartPolicy @@ -69,8 +69,8 @@ func (j *MPIJobWrapper) MPIJobReplicaSpecs(replicaSpecs ...MPIJobReplicaSpecRequ return j } -func (j *MPIJobWrapper) MPIJobReplicaSpecsDefault() *MPIJobWrapper { - j.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher] = &kubeflow.ReplicaSpec{ +func (j *MPIJobWrapper) GenericLauncherAndWorker() *MPIJobWrapper { + j.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher] = &kfmpi.ReplicaSpec{ Replicas: ptr.To[int32](1), Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ @@ -88,7 +88,7 @@ func (j *MPIJobWrapper) MPIJobReplicaSpecsDefault() *MPIJobWrapper { }, } - j.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker] = &kubeflow.ReplicaSpec{ + j.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker] = &kfmpi.ReplicaSpec{ Replicas: ptr.To[int32](1), Template: corev1.PodTemplateSpec{ Spec: corev1.PodSpec{ @@ -126,7 +126,7 @@ func (j *MPIJobWrapper) Label(key, value string) *MPIJobWrapper { // PriorityClass updates job priorityclass. func (j *MPIJobWrapper) PriorityClass(pc string) *MPIJobWrapper { if j.Spec.RunPolicy.SchedulingPolicy == nil { - j.Spec.RunPolicy.SchedulingPolicy = &kubeflow.SchedulingPolicy{} + j.Spec.RunPolicy.SchedulingPolicy = &kfmpi.SchedulingPolicy{} } j.Spec.RunPolicy.SchedulingPolicy.PriorityClass = pc return j @@ -142,7 +142,7 @@ func (j *MPIJobWrapper) WorkloadPriorityClass(wpc string) *MPIJobWrapper { } // Obj returns the inner Job. -func (j *MPIJobWrapper) Obj() *kubeflow.MPIJob { +func (j *MPIJobWrapper) Obj() *kfmpi.MPIJob { return &j.MPIJob } @@ -156,14 +156,14 @@ func (j *MPIJobWrapper) Queue(queue string) *MPIJobWrapper { } // Request adds a resource request to the default container. 
-func (j *MPIJobWrapper) Request(replicaType kubeflow.MPIReplicaType, r corev1.ResourceName, v string) *MPIJobWrapper { +func (j *MPIJobWrapper) Request(replicaType kfmpi.MPIReplicaType, r corev1.ResourceName, v string) *MPIJobWrapper { j.Spec.MPIReplicaSpecs[replicaType].Template.Spec.Containers[0].Resources.Requests[r] = resource.MustParse(v) return j } // Parallelism updates job parallelism. func (j *MPIJobWrapper) Parallelism(p int32) *MPIJobWrapper { - j.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Replicas = ptr.To(p) + j.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Replicas = ptr.To(p) return j } @@ -180,7 +180,7 @@ func (j *MPIJobWrapper) UID(uid string) *MPIJobWrapper { } // PodAnnotation sets annotation at the pod template level -func (j *MPIJobWrapper) PodAnnotation(replicaType kubeflow.MPIReplicaType, k, v string) *MPIJobWrapper { +func (j *MPIJobWrapper) PodAnnotation(replicaType kfmpi.MPIReplicaType, k, v string) *MPIJobWrapper { if j.Spec.MPIReplicaSpecs[replicaType].Template.Annotations == nil { j.Spec.MPIReplicaSpecs[replicaType].Template.Annotations = make(map[string]string) } @@ -189,7 +189,7 @@ func (j *MPIJobWrapper) PodAnnotation(replicaType kubeflow.MPIReplicaType, k, v } // PodLabel sets label at the pod template level -func (j *MPIJobWrapper) PodLabel(replicaType kubeflow.MPIReplicaType, k, v string) *MPIJobWrapper { +func (j *MPIJobWrapper) PodLabel(replicaType kfmpi.MPIReplicaType, k, v string) *MPIJobWrapper { if j.Spec.MPIReplicaSpecs[replicaType].Template.Labels == nil { j.Spec.MPIReplicaSpecs[replicaType].Template.Labels = make(map[string]string) } @@ -204,12 +204,12 @@ func (j *MPIJobWrapper) Generation(num int64) *MPIJobWrapper { } // StatusConditions adds a condition -func (j *MPIJobWrapper) StatusConditions(c kubeflow.JobCondition) *MPIJobWrapper { +func (j *MPIJobWrapper) StatusConditions(c kfmpi.JobCondition) *MPIJobWrapper { j.Status.Conditions = append(j.Status.Conditions, c) return j } -func (j *MPIJobWrapper) Image(replicaType kubeflow.MPIReplicaType, image string, args []string) *MPIJobWrapper { +func (j *MPIJobWrapper) Image(replicaType kfmpi.MPIReplicaType, image string, args []string) *MPIJobWrapper { j.Spec.MPIReplicaSpecs[replicaType].Template.Spec.Containers[0].Image = image j.Spec.MPIReplicaSpecs[replicaType].Template.Spec.Containers[0].Args = args return j diff --git a/site/content/en/docs/tasks/run/kubeflow/mpijobs.md b/site/content/en/docs/tasks/run/kubeflow/mpijobs.md index 7abbcbc7b5..4ff7640410 100644 --- a/site/content/en/docs/tasks/run/kubeflow/mpijobs.md +++ b/site/content/en/docs/tasks/run/kubeflow/mpijobs.md @@ -23,6 +23,11 @@ In order to use MPIJob you need to restart Kueue after the installation. You can do it by running: `kubectl delete pods -lcontrol-plane=controller-manager -nkueue-system`. {{% /alert %}} +{{% alert title="Note" color="primary" %}} +When you use both the MPI Operator and the Training Operator, you must disable the Training Operator's MPIJob support. +To do so, modify the Training Operator deployment to enable all Kubeflow jobs except MPIJob, as described [here](https://github.com/kubeflow/training-operator/issues/1777). +{{% /alert %}} + ## MPI Operator definition ### a.
Queue selection diff --git a/test/e2e/multikueue/e2e_test.go b/test/e2e/multikueue/e2e_test.go index 64accbc6fd..bf1ccf3f59 100644 --- a/test/e2e/multikueue/e2e_test.go +++ b/test/e2e/multikueue/e2e_test.go @@ -33,7 +33,7 @@ import ( versionutil "k8s.io/apimachinery/pkg/util/version" "k8s.io/utils/ptr" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" "sigs.k8s.io/controller-runtime/pkg/client" jobset "sigs.k8s.io/jobset/api/jobset/v1alpha2" @@ -441,22 +441,22 @@ var _ = ginkgo.Describe("MultiKueue", func() { Queue(managerLq.Name). MPIJobReplicaSpecs( testingmpijob.MPIJobReplicaSpecRequirement{ - ReplicaType: kubeflow.MPIReplicaTypeLauncher, + ReplicaType: kfmpi.MPIReplicaTypeLauncher, ReplicaCount: 1, RestartPolicy: "OnFailure", }, testingmpijob.MPIJobReplicaSpecRequirement{ - ReplicaType: kubeflow.MPIReplicaTypeWorker, + ReplicaType: kfmpi.MPIReplicaTypeWorker, ReplicaCount: 1, RestartPolicy: "OnFailure", }, ). - Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceCPU, "1"). - Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceMemory, "200M"). - Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceCPU, "0.5"). - Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceMemory, "100M"). - Image(kubeflow.MPIReplicaTypeLauncher, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). - Image(kubeflow.MPIReplicaTypeWorker, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). + Request(kfmpi.MPIReplicaTypeLauncher, corev1.ResourceCPU, "1"). + Request(kfmpi.MPIReplicaTypeLauncher, corev1.ResourceMemory, "200M"). + Request(kfmpi.MPIReplicaTypeWorker, corev1.ResourceCPU, "0.5"). + Request(kfmpi.MPIReplicaTypeWorker, corev1.ResourceMemory, "100M"). + Image(kfmpi.MPIReplicaTypeLauncher, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). + Image(kfmpi.MPIReplicaTypeWorker, "gcr.io/k8s-staging-perf-tests/sleep:v0.1.0", []string{"1ms"}). 
Obj() ginkgo.By("Creating the MPIJob", func() { @@ -470,10 +470,10 @@ var _ = ginkgo.Describe("MultiKueue", func() { ginkgo.By("Waiting for the MPIJob to finish", func() { gomega.Eventually(func(g gomega.Gomega) { - createdMPIJob := &kubeflow.MPIJob{} + createdMPIJob := &kfmpi.MPIJob{} g.Expect(k8sManagerClient.Get(ctx, client.ObjectKeyFromObject(mpijob), createdMPIJob)).To(gomega.Succeed()) - g.Expect(createdMPIJob.Status.ReplicaStatuses[kubeflow.MPIReplicaTypeLauncher]).To(gomega.BeComparableTo( - &kubeflow.ReplicaStatus{ + g.Expect(createdMPIJob.Status.ReplicaStatuses[kfmpi.MPIReplicaTypeLauncher]).To(gomega.BeComparableTo( + &kfmpi.ReplicaStatus{ Active: 0, Succeeded: 1, }, @@ -489,7 +489,7 @@ var _ = ginkgo.Describe("MultiKueue", func() { workerWl := &kueue.Workload{} g.Expect(k8sWorker1Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError()) g.Expect(k8sWorker2Client.Get(ctx, wlLookupKey, workerWl)).To(utiltesting.BeNotFoundError()) - workerMPIJob := &kubeflow.MPIJob{} + workerMPIJob := &kfmpi.MPIJob{} g.Expect(k8sWorker1Client.Get(ctx, client.ObjectKeyFromObject(mpijob), workerMPIJob)).To(utiltesting.BeNotFoundError()) g.Expect(k8sWorker2Client.Get(ctx, client.ObjectKeyFromObject(mpijob), workerMPIJob)).To(utiltesting.BeNotFoundError()) }, util.Timeout, util.Interval).Should(gomega.Succeed()) diff --git a/test/e2e/multikueue/suite_test.go b/test/e2e/multikueue/suite_test.go index c3855405bc..e40ed7696e 100644 --- a/test/e2e/multikueue/suite_test.go +++ b/test/e2e/multikueue/suite_test.go @@ -23,7 +23,7 @@ import ( "testing" "time" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" @@ -96,8 +96,8 @@ func kubeconfigForMultiKueueSA(ctx context.Context, c client.Client, restConfig policyRule(kftraining.SchemeGroupVersion.Group, "pytorchjobs/status", "get"), policyRule(kftraining.SchemeGroupVersion.Group, "xgboostjobs", resourceVerbs...), policyRule(kftraining.SchemeGroupVersion.Group, "xgboostjobs/status", "get"), - policyRule(kubeflow.SchemeGroupVersion.Group, "mpijobs", resourceVerbs...), - policyRule(kubeflow.SchemeGroupVersion.Group, "mpijobs/status", "get"), + policyRule(kfmpi.SchemeGroupVersion.Group, "mpijobs", resourceVerbs...), + policyRule(kfmpi.SchemeGroupVersion.Group, "mpijobs/status", "get"), }, } err := c.Create(ctx, cr) diff --git a/test/integration/controller/jobs/mpijob/mpijob_controller_test.go b/test/integration/controller/jobs/mpijob/mpijob_controller_test.go index 330d013fa3..8ecc686044 100644 --- a/test/integration/controller/jobs/mpijob/mpijob_controller_test.go +++ b/test/integration/controller/jobs/mpijob/mpijob_controller_test.go @@ -19,7 +19,7 @@ package mpijob import ( "fmt" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" corev1 "k8s.io/api/core/v1" @@ -90,11 +90,11 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu gomega.Expect(k8sClient.Create(ctx, priorityClass)).Should(gomega.Succeed()) job := testingmpijob.MakeMPIJob(jobName, ns.Name). - MPIJobReplicaSpecsDefault(). + GenericLauncherAndWorker(). 
PriorityClass(priorityClassName).Obj() err := k8sClient.Create(ctx, job) gomega.Expect(err).To(gomega.Succeed()) - createdJob := &kubeflow.MPIJob{} + createdJob := &kfmpi.MPIJob{} gomega.Eventually(func() bool { if err := k8sClient.Get(ctx, types.NamespacedName{Name: jobName, Namespace: ns.Name}, createdJob); err != nil { @@ -190,10 +190,10 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "Started", corev1.EventTypeNormal, fmt.Sprintf("Admitted by clusterQueue %v", clusterQueue.Name)) return ok }, util.Timeout, util.Interval).Should(gomega.BeTrue()) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector).Should(gomega.HaveLen(1)) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name)) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector).Should(gomega.HaveLen(1)) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template.Spec.NodeSelector).Should(gomega.HaveLen(1)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template.Spec.NodeSelector).Should(gomega.HaveLen(1)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name)) gomega.Eventually(func() bool { if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil { return false @@ -202,16 +202,16 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu }, util.Timeout, util.Interval).Should(gomega.BeTrue()) ginkgo.By("checking the job gets suspended when parallelism changes and the added node selectors are removed") - parallelism := ptr.Deref(job.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Replicas, 1) + parallelism := ptr.Deref(job.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Replicas, 1) newParallelism := parallelism + 1 - createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Replicas = &newParallelism + createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Replicas = &newParallelism gomega.Expect(k8sClient.Update(ctx, createdJob)).Should(gomega.Succeed()) gomega.Eventually(func() bool { if err := k8sClient.Get(ctx, lookupKey, createdJob); err != nil { return false } return createdJob.Spec.RunPolicy.Suspend != nil && *createdJob.Spec.RunPolicy.Suspend && - len(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector) == 0 + len(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template.Spec.NodeSelector) == 0 }, util.Timeout, util.Interval).Should(gomega.BeTrue()) gomega.Eventually(func() bool { ok, _ := testing.CheckLatestEvent(ctx, k8sClient, "DeletedWorkload", corev1.EventTypeNormal, fmt.Sprintf("Deleted not matching Workload: %v", wlLookupKey.String())) @@ -254,10 +254,10 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu } return !*createdJob.Spec.RunPolicy.Suspend }, util.Timeout, util.Interval).Should(gomega.BeTrue()) - 
gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector).Should(gomega.HaveLen(1)) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name)) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector).Should(gomega.HaveLen(1)) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template.Spec.NodeSelector).Should(gomega.HaveLen(1)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template.Spec.NodeSelector).Should(gomega.HaveLen(1)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotFlavor.Name)) gomega.Eventually(func() bool { if err := k8sClient.Get(ctx, wlLookupKey, createdWorkload); err != nil { return false @@ -267,8 +267,8 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu ginkgo.By("checking the workload is finished when job is completed") createdJob.Status.Conditions = append(createdJob.Status.Conditions, - kubeflow.JobCondition{ - Type: kubeflow.JobSucceeded, + kfmpi.JobCondition{ + Type: kfmpi.JobSucceeded, Status: corev1.ConditionTrue, LastTransitionTime: metav1.Now(), }) @@ -318,12 +318,12 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu ginkgo.It("labels and annotations should be propagated from admission check to job", func() { job := testingmpijob.MakeMPIJob(jobName, ns.Name). - MPIJobReplicaSpecsDefault(). + GenericLauncherAndWorker(). Queue(localQueue.Name). - PodAnnotation(kubeflow.MPIReplicaTypeWorker, "old-ann-key", "old-ann-value"). - PodLabel(kubeflow.MPIReplicaTypeWorker, "old-label-key", "old-label-value"). + PodAnnotation(kfmpi.MPIReplicaTypeWorker, "old-ann-key", "old-ann-value"). + PodLabel(kfmpi.MPIReplicaTypeWorker, "old-label-key", "old-label-value"). 
Obj() - createdJob := &kubeflow.MPIJob{} + createdJob := &kfmpi.MPIJob{} createdWorkload := &kueue.Workload{} ginkgo.By("creating the job with pod labels & annotations", func() { @@ -410,7 +410,7 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu }) ginkgo.By("verify the PodSetUpdates are propagated to the running job, for worker", func() { - worker := createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template + worker := createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("ann1", "ann-value1")) gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value")) gomega.Expect(worker.Labels).Should(gomega.HaveKeyWithValue("label1", "label-value1")) @@ -430,7 +430,7 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu }) ginkgo.By("verify the node selectors are propagated to the running job, for launcher", func() { - launcher := createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template + launcher := createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template gomega.Expect(launcher.Spec.NodeSelector).Should(gomega.HaveKeyWithValue(instanceKey, "test-flavor")) }) @@ -452,7 +452,7 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu }) ginkgo.By("verify the PodSetUpdates are restored for worker", func() { - worker := createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template + worker := createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template gomega.Expect(worker.Annotations).ShouldNot(gomega.HaveKey("ann1")) gomega.Expect(worker.Annotations).Should(gomega.HaveKeyWithValue("old-ann-key", "old-ann-value")) gomega.Expect(worker.Labels).ShouldNot(gomega.HaveKey("label1")) @@ -462,7 +462,7 @@ var _ = ginkgo.Describe("Job controller", ginkgo.Ordered, ginkgo.ContinueOnFailu }) ginkgo.By("verify the PodSetUpdates are restored for launcher", func() { - launcher := createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template + launcher := createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template gomega.Expect(launcher.Annotations).ShouldNot(gomega.HaveKey("ann1")) gomega.Expect(launcher.Labels).ShouldNot(gomega.HaveKey("label1")) gomega.Expect(launcher.Spec.NodeSelector).ShouldNot(gomega.HaveKey(instanceKey)) @@ -506,10 +506,10 @@ var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue ginkgo.It("Should reconcile jobs only when queue is set", func() { ginkgo.By("checking the workload is not created when queue name is not set") - job := testingmpijob.MakeMPIJob(jobName, ns.Name).MPIJobReplicaSpecsDefault().Obj() + job := testingmpijob.MakeMPIJob(jobName, ns.Name).GenericLauncherAndWorker().Obj() gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed()) lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name} - createdJob := &kubeflow.MPIJob{} + createdJob := &kfmpi.MPIJob{} gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed()) createdWorkload := &kueue.Workload{} @@ -537,7 +537,7 @@ var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue ginkgo.By("Creating the child job") childJob := testingjob.MakeJob(childJobName, ns.Name). - OwnerReference(parentJobName, kubeflow.SchemeGroupVersionKind). + OwnerReference(parentJobName, kfmpi.SchemeGroupVersionKind). Suspend(false). 
Obj() gomega.Expect(ctrl.SetControllerReference(parentJob, childJob, k8sClient.Scheme())).To(gomega.Succeed()) @@ -560,7 +560,7 @@ var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue ginkgo.By("Creating the child job which has ownerReference with known existing workload owner") childJob := testingjob.MakeJob(childJobName, ns.Name). - OwnerReference(parentJobName, kubeflow.SchemeGroupVersionKind). + OwnerReference(parentJobName, kfmpi.SchemeGroupVersionKind). Suspend(false). Obj() gomega.Expect(ctrl.SetControllerReference(parentJob, childJob, k8sClient.Scheme())).To(gomega.Succeed()) @@ -576,9 +576,9 @@ var _ = ginkgo.Describe("Job controller for workloads when only jobs with queue var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.Ordered, ginkgo.ContinueOnFailure, func() { type podsReadyTestSpec struct { - beforeJobStatus *kubeflow.JobStatus + beforeJobStatus *kfmpi.JobStatus beforeCondition *metav1.Condition - jobStatus kubeflow.JobStatus + jobStatus kfmpi.JobStatus suspended bool wantCondition *metav1.Condition } @@ -620,13 +620,13 @@ var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.O func(podsReadyTestSpec podsReadyTestSpec) { ginkgo.By("Create a job") job := testingmpijob.MakeMPIJob(jobName, ns.Name). - MPIJobReplicaSpecsDefault(). + GenericLauncherAndWorker(). Parallelism(2).Obj() jobQueueName := "test-queue" job.Annotations = map[string]string{constants.QueueAnnotation: jobQueueName} gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed()) lookupKey := types.NamespacedName{Name: jobName, Namespace: ns.Name} - createdJob := &kubeflow.MPIJob{} + createdJob := &kfmpi.MPIJob{} gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed()) ginkgo.By("Fetch the workload created for the job") @@ -713,10 +713,10 @@ var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.O }, }), ginkgo.Entry("Running MPIJob", podsReadyTestSpec{ - jobStatus: kubeflow.JobStatus{ - Conditions: []kubeflow.JobCondition{ + jobStatus: kfmpi.JobStatus{ + Conditions: []kfmpi.JobCondition{ { - Type: kubeflow.JobRunning, + Type: kfmpi.JobRunning, Status: corev1.ConditionTrue, Reason: "Running", }, @@ -736,10 +736,10 @@ var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.O Reason: "PodsReady", Message: "Not all pods are ready or succeeded", }, - jobStatus: kubeflow.JobStatus{ - Conditions: []kubeflow.JobCondition{ + jobStatus: kfmpi.JobStatus{ + Conditions: []kfmpi.JobCondition{ { - Type: kubeflow.JobRunning, + Type: kfmpi.JobRunning, Status: corev1.ConditionTrue, Reason: "Running", }, @@ -753,10 +753,10 @@ var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.O }, }), ginkgo.Entry("Job suspended; PodsReady=True before", podsReadyTestSpec{ - beforeJobStatus: &kubeflow.JobStatus{ - Conditions: []kubeflow.JobCondition{ + beforeJobStatus: &kfmpi.JobStatus{ + Conditions: []kfmpi.JobCondition{ { - Type: kubeflow.JobRunning, + Type: kfmpi.JobRunning, Status: corev1.ConditionTrue, Reason: "Running", }, @@ -768,10 +768,10 @@ var _ = ginkgo.Describe("Job controller when waitForPodsReady enabled", ginkgo.O Reason: "PodsReady", Message: "All pods were ready or succeeded since the workload admission", }, - jobStatus: kubeflow.JobStatus{ - Conditions: []kubeflow.JobCondition{ + jobStatus: kfmpi.JobStatus{ + Conditions: []kfmpi.JobCondition{ { - Type: kubeflow.JobRunning, + Type: kfmpi.JobRunning, Status: corev1.ConditionFalse, Reason: 
"Suspended", }, @@ -844,19 +844,19 @@ var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Orde ginkgo.By("checking a dev job starts") job := testingmpijob.MakeMPIJob("dev-job", ns.Name).Queue(localQueue.Name). - MPIJobReplicaSpecsDefault(). - Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceCPU, "3"). - Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceCPU, "4"). + GenericLauncherAndWorker(). + Request(kfmpi.MPIReplicaTypeLauncher, corev1.ResourceCPU, "3"). + Request(kfmpi.MPIReplicaTypeWorker, corev1.ResourceCPU, "4"). Obj() gomega.Expect(k8sClient.Create(ctx, job)).Should(gomega.Succeed()) - createdJob := &kubeflow.MPIJob{} + createdJob := &kfmpi.MPIJob{} gomega.Eventually(func() *bool { gomega.Expect(k8sClient.Get(ctx, types.NamespacedName{Name: job.Name, Namespace: job.Namespace}, createdJob)). Should(gomega.Succeed()) return createdJob.Spec.RunPolicy.Suspend }, util.Timeout, util.Interval).Should(gomega.Equal(ptr.To(false))) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name)) - gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kubeflow.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeLauncher].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(spotUntaintedFlavor.Name)) + gomega.Expect(createdJob.Spec.MPIReplicaSpecs[kfmpi.MPIReplicaTypeWorker].Template.Spec.NodeSelector[instanceKey]).Should(gomega.Equal(onDemandFlavor.Name)) util.ExpectPendingWorkloadsMetric(clusterQueue, 0, 0) util.ExpectReservingActiveWorkloadsMetric(clusterQueue, 1) }) @@ -865,15 +865,15 @@ var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Orde ginkgo.It("Should restore the original node selectors", func() { localQueue := testing.MakeLocalQueue("local-queue", ns.Name).ClusterQueue(clusterQueue.Name).Obj() job := testingmpijob.MakeMPIJob(jobName, ns.Name).Queue(localQueue.Name). - MPIJobReplicaSpecsDefault(). - Request(kubeflow.MPIReplicaTypeLauncher, corev1.ResourceCPU, "3"). - Request(kubeflow.MPIReplicaTypeWorker, corev1.ResourceCPU, "4"). + GenericLauncherAndWorker(). + Request(kfmpi.MPIReplicaTypeLauncher, corev1.ResourceCPU, "3"). + Request(kfmpi.MPIReplicaTypeWorker, corev1.ResourceCPU, "4"). 
Obj() lookupKey := types.NamespacedName{Name: job.Name, Namespace: job.Namespace} - createdJob := &kubeflow.MPIJob{} + createdJob := &kfmpi.MPIJob{} - nodeSelectors := func(j *kubeflow.MPIJob) map[kubeflow.MPIReplicaType]map[string]string { - ret := map[kubeflow.MPIReplicaType]map[string]string{} + nodeSelectors := func(j *kfmpi.MPIJob) map[kfmpi.MPIReplicaType]map[string]string { + ret := map[kfmpi.MPIReplicaType]map[string]string{} for k := range j.Spec.MPIReplicaSpecs { ret[k] = j.Spec.MPIReplicaSpecs[k].Template.Spec.NodeSelector } @@ -906,7 +906,7 @@ var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Orde }) ginkgo.By("the node selectors should be updated", func() { - gomega.Eventually(func() map[kubeflow.MPIReplicaType]map[string]string { + gomega.Eventually(func() map[kfmpi.MPIReplicaType]map[string]string { gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed()) return nodeSelectors(createdJob) }, util.Timeout, util.Interval).ShouldNot(gomega.Equal(originalNodeSelectors)) @@ -925,7 +925,7 @@ var _ = ginkgo.Describe("Job controller interacting with scheduler", ginkgo.Orde }) ginkgo.By("the node selectors should be restored", func() { - gomega.Eventually(func() map[kubeflow.MPIReplicaType]map[string]string { + gomega.Eventually(func() map[kfmpi.MPIReplicaType]map[string]string { gomega.Expect(k8sClient.Get(ctx, lookupKey, createdJob)).Should(gomega.Succeed()) return nodeSelectors(createdJob) }, util.Timeout, util.Interval).Should(gomega.Equal(originalNodeSelectors)) diff --git a/test/integration/framework/framework.go b/test/integration/framework/framework.go index 02686f43a1..664a02c457 100644 --- a/test/integration/framework/framework.go +++ b/test/integration/framework/framework.go @@ -26,7 +26,7 @@ import ( "strings" "time" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" @@ -97,7 +97,7 @@ func (f *Framework) SetupClient(cfg *rest.Config) (context.Context, client.Clien err = kueuealpha.AddToScheme(scheme.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = kubeflow.AddToScheme(scheme.Scheme) + err = kfmpi.AddToScheme(scheme.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) err = rayv1.AddToScheme(scheme.Scheme) diff --git a/test/integration/multikueue/multikueue_test.go b/test/integration/multikueue/multikueue_test.go index a222886d7d..f44a6c609a 100644 --- a/test/integration/multikueue/multikueue_test.go +++ b/test/integration/multikueue/multikueue_test.go @@ -23,7 +23,7 @@ import ( "time" "github.com/google/go-cmp/cmp/cmpopts" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" @@ -1036,12 +1036,12 @@ var _ = ginkgo.Describe("Multikueue", ginkgo.Ordered, ginkgo.ContinueOnFailure, Queue(managerLq.Name). 
MPIJobReplicaSpecs( testingmpijob.MPIJobReplicaSpecRequirement{ - ReplicaType: kubeflow.MPIReplicaTypeLauncher, + ReplicaType: kfmpi.MPIReplicaTypeLauncher, ReplicaCount: 1, RestartPolicy: corev1.RestartPolicyOnFailure, }, testingmpijob.MPIJobReplicaSpecRequirement{ - ReplicaType: kubeflow.MPIReplicaTypeWorker, + ReplicaType: kfmpi.MPIReplicaTypeWorker, ReplicaCount: 1, RestartPolicy: corev1.RestartPolicyNever, }, @@ -1062,13 +1062,13 @@ var _ = ginkgo.Describe("Multikueue", ginkgo.Ordered, ginkgo.ContinueOnFailure, ginkgo.By("changing the status of the MPIJob in the worker, updates the manager's MPIJob status", func() { gomega.Eventually(func(g gomega.Gomega) { - createdMPIJob := kubeflow.MPIJob{} + createdMPIJob := kfmpi.MPIJob{} g.Expect(worker2TestCluster.client.Get(worker2TestCluster.ctx, client.ObjectKeyFromObject(mpijob), &createdMPIJob)).To(gomega.Succeed()) - createdMPIJob.Status.ReplicaStatuses = map[kubeflow.MPIReplicaType]*kubeflow.ReplicaStatus{ - kubeflow.MPIReplicaTypeLauncher: { + createdMPIJob.Status.ReplicaStatuses = map[kfmpi.MPIReplicaType]*kfmpi.ReplicaStatus{ + kfmpi.MPIReplicaTypeLauncher: { Active: 1, }, - kubeflow.MPIReplicaTypeWorker: { + kfmpi.MPIReplicaTypeWorker: { Active: 1, Succeeded: 1, }, @@ -1076,14 +1076,14 @@ var _ = ginkgo.Describe("Multikueue", ginkgo.Ordered, ginkgo.ContinueOnFailure, g.Expect(worker2TestCluster.client.Status().Update(worker2TestCluster.ctx, &createdMPIJob)).To(gomega.Succeed()) }, util.Timeout, util.Interval).Should(gomega.Succeed()) gomega.Eventually(func(g gomega.Gomega) { - createdMPIJob := kubeflow.MPIJob{} + createdMPIJob := kfmpi.MPIJob{} g.Expect(managerTestCluster.client.Get(managerTestCluster.ctx, client.ObjectKeyFromObject(mpijob), &createdMPIJob)).To(gomega.Succeed()) g.Expect(createdMPIJob.Status.ReplicaStatuses).To(gomega.Equal( - map[kubeflow.MPIReplicaType]*kubeflow.ReplicaStatus{ - kubeflow.MPIReplicaTypeLauncher: { + map[kfmpi.MPIReplicaType]*kfmpi.ReplicaStatus{ + kfmpi.MPIReplicaTypeLauncher: { Active: 1, }, - kubeflow.MPIReplicaTypeWorker: { + kfmpi.MPIReplicaTypeWorker: { Active: 1, Succeeded: 1, }, @@ -1094,10 +1094,10 @@ var _ = ginkgo.Describe("Multikueue", ginkgo.Ordered, ginkgo.ContinueOnFailure, ginkgo.By("finishing the worker MPIJob, the manager's wl is marked as finished and the worker2 wl removed", func() { finishJobReason := "MPIJob finished successfully" gomega.Eventually(func(g gomega.Gomega) { - createdMPIJob := kubeflow.MPIJob{} + createdMPIJob := kfmpi.MPIJob{} g.Expect(worker2TestCluster.client.Get(worker2TestCluster.ctx, client.ObjectKeyFromObject(mpijob), &createdMPIJob)).To(gomega.Succeed()) - createdMPIJob.Status.Conditions = append(createdMPIJob.Status.Conditions, kubeflow.JobCondition{ - Type: kubeflow.JobSucceeded, + createdMPIJob.Status.Conditions = append(createdMPIJob.Status.Conditions, kfmpi.JobCondition{ + Type: kfmpi.JobSucceeded, Status: corev1.ConditionTrue, Reason: "ByTest", Message: finishJobReason, diff --git a/test/util/e2e.go b/test/util/e2e.go index e7dacc2d0d..02dc9686df 100644 --- a/test/util/e2e.go +++ b/test/util/e2e.go @@ -6,7 +6,7 @@ import ( "os" "github.com/google/go-cmp/cmp/cmpopts" - kubeflow "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" + kfmpi "github.com/kubeflow/mpi-operator/pkg/apis/kubeflow/v2beta1" kftraining "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1" "github.com/onsi/gomega" appsv1 "k8s.io/api/apps/v1" @@ -48,7 +48,7 @@ func CreateClientUsingCluster(kContext string) (client.WithWatch, *rest.Config) err = 
kftraining.AddToScheme(scheme.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) - err = kubeflow.AddToScheme(scheme.Scheme) + err = kfmpi.AddToScheme(scheme.Scheme) gomega.ExpectWithOffset(1, err).NotTo(gomega.HaveOccurred()) client, err := client.NewWithWatch(cfg, client.Options{Scheme: scheme.Scheme}) From 8daa5f7b18041b7c23d9d8d33d2457e10f52cef2 Mon Sep 17 00:00:00 2001 From: Michal Szadkowski Date: Wed, 11 Sep 2024 21:42:13 +0200 Subject: [PATCH 15/15] Cleanup after manager kustomize modification --- hack/multikueue-e2e-test.sh | 2 ++ .../config/multikueue/manager/kustomization.yaml | 14 ++++++-------- test/e2e/config/multikueue/manager/namespace.yaml | 4 ---- 3 files changed, 8 insertions(+), 12 deletions(-) delete mode 100644 test/e2e/config/multikueue/manager/namespace.yaml diff --git a/hack/multikueue-e2e-test.sh b/hack/multikueue-e2e-test.sh index 1c9902ace3..54925f56ce 100755 --- a/hack/multikueue-e2e-test.sh +++ b/hack/multikueue-e2e-test.sh @@ -41,6 +41,8 @@ function cleanup { fi #do the image restore here for the case when an error happened during deploy restore_managers_image + # Remove the `newTag` for the `kubeflow/training-operator` to revert to the default version + $YQ eval 'del(.images[] | select(.name == "kubeflow/training-operator").newTag)' -i "$KUBEFLOW_MANIFEST_MANAGER/kustomization.yaml" } diff --git a/test/e2e/config/multikueue/manager/kustomization.yaml b/test/e2e/config/multikueue/manager/kustomization.yaml index 38348037ca..bf62f9ec13 100644 --- a/test/e2e/config/multikueue/manager/kustomization.yaml +++ b/test/e2e/config/multikueue/manager/kustomization.yaml @@ -1,14 +1,12 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization -namespace: kubeflow resources: -- ../../../../../dep-crds/training-operator/manifests/base/crds -- namespace.yaml + - ../../../../../dep-crds/training-operator/manifests/base/crds images: -- name: kubeflow/training-operator + - name: kubeflow/training-operator secretGenerator: -- name: training-operator-webhook-cert - options: - disableNameSuffixHash: true + - name: training-operator-webhook-cert + options: + disableNameSuffixHash: true patchesStrategicMerge: -- patch_crds.yaml + - patch_crds.yaml diff --git a/test/e2e/config/multikueue/manager/namespace.yaml b/test/e2e/config/multikueue/manager/namespace.yaml deleted file mode 100644 index 7a940e4673..0000000000 --- a/test/e2e/config/multikueue/manager/namespace.yaml +++ /dev/null @@ -1,4 +0,0 @@ -apiVersion: v1 -kind: Namespace -metadata: - name: kubeflow