From aefd100dcfe50e693ce1255999986744a5e233aa Mon Sep 17 00:00:00 2001 From: Yuki Iwai Date: Fri, 10 Feb 2023 03:48:51 +0900 Subject: [PATCH] adapt hostfile to IntelMPI Signed-off-by: Yuki Iwai --- build/base/intel-entrypoint.sh | 2 +- pkg/controller/mpi_job_controller.go | 6 +++++- pkg/controller/mpi_job_controller_test.go | 13 ++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/build/base/intel-entrypoint.sh b/build/base/intel-entrypoint.sh index 10d16646..bfddddac 100755 --- a/build/base/intel-entrypoint.sh +++ b/build/base/intel-entrypoint.sh @@ -27,7 +27,7 @@ function resolve_host() { if [ "$K_MPI_JOB_ROLE" == "launcher" ]; then resolve_host "$HOSTNAME" - cut -d ' ' -f 1 /etc/mpi/hostfile | while read -r host + cut -d ':' -f 1 /etc/mpi/hostfile | while read -r host do resolve_host "$host" done diff --git a/pkg/controller/mpi_job_controller.go b/pkg/controller/mpi_job_controller.go index 0a08379e..a005c89f 100644 --- a/pkg/controller/mpi_job_controller.go +++ b/pkg/controller/mpi_job_controller.go @@ -1187,7 +1187,11 @@ func newConfigMap(mpiJob *kubeflow.MPIJob, workerReplicas int32) *corev1.ConfigM slots = int(*mpiJob.Spec.SlotsPerWorker) } for i := 0; i < int(workerReplicas); i++ { - buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc slots=%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots)) + if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationOpenMPI { + buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc slots=%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots)) + } else if mpiJob.Spec.MPIImplementation == kubeflow.MPIImplementationIntel { + buffer.WriteString(fmt.Sprintf("%s%s-%d.%s.%s.svc:%d\n", mpiJob.Name, workerSuffix, i, workersService, mpiJob.Namespace, slots)) + } } return &corev1.ConfigMap{ diff --git a/pkg/controller/mpi_job_controller_test.go b/pkg/controller/mpi_job_controller_test.go index b71c7eea..d36f9ddc 100644 --- a/pkg/controller/mpi_job_controller_test.go +++ b/pkg/controller/mpi_job_controller_test.go @@ -1506,13 +1506,15 @@ func TestNewConfigMap(t *testing.T) { workerReplicas int32 wantCM *corev1.ConfigMap }{ - "without slots": { + "OpenMPI without slots": { mpiJob: &kubeflow.MPIJob{ ObjectMeta: metav1.ObjectMeta{ Name: "without-slots", Namespace: "tenant-a", }, - Spec: kubeflow.MPIJobSpec{}, + Spec: kubeflow.MPIJobSpec{ + MPIImplementation: kubeflow.MPIImplementationOpenMPI, + }, }, workerReplicas: 2, wantCM: &corev1.ConfigMap{ @@ -1528,14 +1530,15 @@ func TestNewConfigMap(t *testing.T) { }, }, }, - "with slots": { + "IntelMPI with slots": { mpiJob: &kubeflow.MPIJob{ ObjectMeta: metav1.ObjectMeta{ Name: "with-slots", Namespace: "project-x", }, Spec: kubeflow.MPIJobSpec{ - SlotsPerWorker: pointer.Int32(10), + SlotsPerWorker: pointer.Int32(10), + MPIImplementation: kubeflow.MPIImplementationIntel, }, }, workerReplicas: 1, @@ -1548,7 +1551,7 @@ func TestNewConfigMap(t *testing.T) { }, }, Data: map[string]string{ - "hostfile": "with-slots-worker-0.with-slots-worker.project-x.svc slots=10\n", + "hostfile": "with-slots-worker-0.with-slots-worker.project-x.svc:10\n", }, }, },