pmodels · raffenet · Jul 9, 2024 · Oct 1, 2024
diff --git a/src/mpid/ch4/netmod/ofi/ofi_impl.h b/src/mpid/ch4/netmod/ofi/ofi_impl.h
@@ -11,6 +11,9 @@
 #include "ofi_types.h"
 #include "mpidch4r.h"
 #include "ch4_impl.h"
+#ifdef MPL_HAVE_CUDA
+#include <cuda.h>       /* for cuDeviceGet */
+#endif
 
 extern unsigned long long PVAR_COUNTER_nic_sent_bytes_count[MPIDI_OFI_MAX_NICS] ATTRIBUTE((unused));
 extern unsigned long long PVAR_COUNTER_nic_recvd_bytes_count[MPIDI_OFI_MAX_NICS]
@@ -707,8 +710,15 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_register_memory(char *send_buf, size_t da
     mr_attr.context = NULL;
     if (MPL_gpu_attr_is_strict_dev(attr)) {
 #ifdef MPL_HAVE_CUDA
+        CUdevice device;
+        int dev_id;
+
+        /* libfabric expects the CUdevice handle obtained from cuDeviceGet, not the device id */
+        dev_id = MPL_gpu_get_dev_id_from_attr(attr);
+        cuDeviceGet(&device, dev_id);   /* NOTE(review): CUresult is ignored */
+
         mr_attr.iface = FI_HMEM_CUDA;
-        mr_attr.device.cuda = MPL_gpu_get_dev_id_from_attr(attr);
+        mr_attr.device.cuda = device;
 #elif defined MPL_HAVE_ZE
         /* OFI does not support tiles yet, need to pass the root device. */
         mr_attr.iface = FI_HMEM_ZE;

diff --git a/src/mpl/include/mpl_gpu.h b/src/mpl/include/mpl_gpu.h
@@ -36,7 +36,7 @@ typedef enum {
 
 typedef struct {
     MPL_pointer_type_t type;
-    MPL_gpu_device_handle_t device;
+    int device;
     MPL_gpu_device_attr device_attr;
 } MPL_pointer_attr_t;
 
@@ -125,7 +125,7 @@ int MPL_gpu_free_host(void *ptr);
 int MPL_gpu_register_host(const void *ptr, size_t size);
 int MPL_gpu_unregister_host(const void *ptr);
 
-int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device);
+int MPL_gpu_malloc(void **ptr, size_t size, int h_device);
 int MPL_gpu_free(void *ptr);
 
 int MPL_gpu_init(int debug_summary);

diff --git a/src/mpl/include/mpl_gpu_cuda.h b/src/mpl/include/mpl_gpu_cuda.h
@@ -10,7 +10,6 @@
 #include "cuda_runtime_api.h"
 
 typedef cudaIpcMemHandle_t MPL_gpu_ipc_mem_handle_t;
-typedef int MPL_gpu_device_handle_t;
 typedef struct cudaPointerAttributes MPL_gpu_device_attr;
 typedef int MPL_gpu_request;
 typedef cudaStream_t MPL_gpu_stream_t;

diff --git a/src/mpl/include/mpl_gpu_fallback.h b/src/mpl/include/mpl_gpu_fallback.h
@@ -7,7 +7,6 @@
 #define MPL_GPU_CUDA_H_INCLUDED
 
 typedef int MPL_gpu_ipc_mem_handle_t;
-typedef int MPL_gpu_device_handle_t;
 typedef int MPL_gpu_device_attr;        /* dummy type */
 typedef int MPL_gpu_request;
 typedef int MPL_gpu_stream_t;

diff --git a/src/mpl/include/mpl_gpu_hip.h b/src/mpl/include/mpl_gpu_hip.h
@@ -14,7 +14,6 @@
 #include "hip/hip_runtime_api.h"
 
 typedef hipIpcMemHandle_t MPL_gpu_ipc_mem_handle_t;
-typedef int MPL_gpu_device_handle_t;
 typedef struct hipPointerAttribute_t MPL_gpu_device_attr;
 typedef int MPL_gpu_request;
 typedef hipStream_t MPL_gpu_stream_t;

diff --git a/src/mpl/include/mpl_gpu_ze.h b/src/mpl/include/mpl_gpu_ze.h
@@ -26,7 +26,6 @@ typedef struct _MPL_gpu_ipc_mem_handle_t {
     fd_pid_t data;
 } MPL_gpu_ipc_mem_handle_t;
 
-typedef ze_device_handle_t MPL_gpu_device_handle_t;
 typedef ze_alloc_attr_t MPL_gpu_device_attr;
 
 typedef struct MPL_cmdlist_pool {
@@ -52,7 +51,7 @@ typedef int MPL_gpu_stream_t;
 typedef volatile int MPL_gpu_event_t;
 
 #define MPL_GPU_STREAM_DEFAULT 0
-#define MPL_GPU_DEVICE_INVALID NULL
+#define MPL_GPU_DEVICE_INVALID (-1)     /* parenthesized so the negative literal expands safely */
 
 #define MPL_GPU_DEV_AFFINITY_ENV "ZE_AFFINITY_MASK"
 
@@ -67,7 +66,7 @@ int MPL_ze_ipc_handle_map(MPL_gpu_ipc_mem_handle_t * ipc_handle, int is_shared_h
 int MPL_ze_ipc_handle_mmap_host(MPL_gpu_ipc_mem_handle_t * ipc_handle, int shared_handle,
                                 int dev_id, size_t size, void **ptr);
 int MPL_ze_mmap_device_pointer(void *dptr, MPL_gpu_device_attr * attr,
-                               MPL_gpu_device_handle_t device, void **mmaped_ptr);
+                               int device, void **mmaped_ptr);
 int MPL_ze_mmap_handle_unmap(void *ptr, int dev_id);
 
 #endif /* ifndef MPL_GPU_ZE_H_INCLUDED */
diff --git a/src/mpl/src/gpu/mpl_gpu_cuda.c b/src/mpl/src/gpu/mpl_gpu_cuda.c
@@ -264,7 +264,7 @@ int MPL_gpu_unregister_host(const void *ptr)
     goto fn_exit;
 }
 
-int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device)
+int MPL_gpu_malloc(void **ptr, size_t size, int h_device)
 {
     int mpl_err = MPL_SUCCESS;
     int prev_devid;

diff --git a/src/mpl/src/gpu/mpl_gpu_fallback.c b/src/mpl/src/gpu/mpl_gpu_fallback.c
@@ -78,7 +78,7 @@ int MPL_gpu_unregister_host(const void *ptr)
     return MPL_SUCCESS;
 }
 
-int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device)
+int MPL_gpu_malloc(void **ptr, size_t size, int h_device)
 {
     abort();
     return MPL_ERR_GPU_INTERNAL;

diff --git a/src/mpl/src/gpu/mpl_gpu_hip.c b/src/mpl/src/gpu/mpl_gpu_hip.c
@@ -291,7 +291,7 @@ int MPL_gpu_unregister_host(const void *ptr)
     goto fn_exit;
 }
 
-int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device)
+int MPL_gpu_malloc(void **ptr, size_t size, int h_device)
 {
     int mpl_err = MPL_SUCCESS;
     int prev_devid;

diff --git a/src/mpl/src/gpu/mpl_gpu_ze.c b/src/mpl/src/gpu/mpl_gpu_ze.c
@@ -692,7 +692,7 @@ static int get_physical_device(int dev_id)
 }
 
 /* Get dev_id from device handle */
-MPL_STATIC_INLINE_PREFIX int device_to_dev_id(MPL_gpu_device_handle_t device)
+MPL_STATIC_INLINE_PREFIX int device_to_dev_id(ze_device_handle_t device)
 {
     int dev_id = -1;
     for (int d = 0; d < local_ze_device_count; d++) {
@@ -706,7 +706,7 @@ MPL_STATIC_INLINE_PREFIX int device_to_dev_id(MPL_gpu_device_handle_t device)
 }
 
 /* Get device from dev_id */
-MPL_STATIC_INLINE_PREFIX int dev_id_to_device(int dev_id, MPL_gpu_device_handle_t * device)
+MPL_STATIC_INLINE_PREFIX int dev_id_to_device(int dev_id, ze_device_handle_t * device)
 {
     int mpl_err = MPL_SUCCESS;
 
@@ -1774,7 +1774,7 @@ int MPL_gpu_ipc_handle_destroy(const void *ptr, MPL_pointer_attr_t * gpu_attr)
     }
 
     if (likely(MPL_gpu_info.specialized_cache)) {
-        dev_id = device_to_dev_id(gpu_attr->device);
+        dev_id = gpu_attr->device;
         if (dev_id == -1) {
             goto fn_fail;
         }
@@ -2053,7 +2053,7 @@ int MPL_gpu_query_pointer_attr(const void *ptr, MPL_pointer_attr_t * attr)
     ret = zeMemGetAllocProperties(ze_context, ptr,
                                   &attr->device_attr.prop, &attr->device_attr.device);
     ZE_ERR_CHECK(ret);
-    attr->device = attr->device_attr.device;
+    attr->device = device_to_dev_id(attr->device_attr.device);
     switch (attr->device_attr.prop.type) {
         case ZE_MEMORY_TYPE_UNKNOWN:
             attr->type = MPL_GPU_POINTER_UNREGISTERED_HOST;
@@ -2127,7 +2127,7 @@ int MPL_gpu_query_is_same_dev(int global_dev1, int global_dev2)
 #endif
 }
 
-int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device)
+int MPL_gpu_malloc(void **ptr, size_t size, int h_device)
 {
     int mpl_err = MPL_SUCCESS;
     int ret;
@@ -2138,10 +2138,16 @@ int MPL_gpu_malloc(void **ptr, size_t size, MPL_gpu_device_handle_t h_device)
         .flags = 0,
         .ordinal = 0,   /* We currently support a single memory type */
     };
+
+    ze_device_handle_t device_handle;
+    ret = dev_id_to_device(h_device, &device_handle);
+    if (ret) {
+        goto fn_fail;
+    }
     /* Currently ZE ignores this argument and uses an internal alignment
      * value. However, this behavior can change in the future. */
     mem_alignment = 1;
-    ret = zeMemAllocDevice(ze_context, &device_desc, size, mem_alignment, h_device, ptr);
+    ret = zeMemAllocDevice(ze_context, &device_desc, size, mem_alignment, device_handle, ptr);
 
     ZE_ERR_CHECK(ret);
 
@@ -2238,11 +2244,7 @@ int MPL_gpu_unregister_host(const void *ptr)
 
 int MPL_gpu_get_dev_id_from_attr(MPL_pointer_attr_t * attr)
 {
-    int dev_id = -1;
-
-    dev_id = device_to_dev_id(attr->device);
-
-    return dev_id;
+    return attr->device;
 }
 
 int MPL_gpu_get_buffer_bounds(const void *ptr, void **pbase, uintptr_t * len)
@@ -3044,7 +3046,7 @@ int MPL_ze_ipc_handle_map(MPL_gpu_ipc_mem_handle_t * mpl_ipc_handle, int is_shar
     ze_result_t ret;
     int status;
     uint32_t nfds;
-    MPL_gpu_device_handle_t dev_handle;
+    ze_device_handle_t dev_handle;
 
     fd_pid_t h;
     h = mpl_ipc_handle->data;
@@ -3210,7 +3212,7 @@ int MPL_ze_ipc_handle_mmap_host(MPL_gpu_ipc_mem_handle_t * mpl_ipc_handle, int i
 
 /* this function takes a local device pointer and mmap to host */
 int MPL_ze_mmap_device_pointer(void *dptr, MPL_gpu_device_attr * attr,
-                               MPL_gpu_device_handle_t device, void **mmaped_ptr)
+                               int device, void **mmaped_ptr)
 {
     ze_result_t ret;
     int mpl_err = MPL_SUCCESS;
@@ -3228,7 +3230,7 @@ int MPL_ze_mmap_device_pointer(void *dptr, MPL_gpu_device_attr * attr,
     offset = (char *) dptr - (char *) pbase;
 
     mem_id = attr->prop.id;
-    local_dev_id = device_to_dev_id(device);
+    local_dev_id = device;
     if (local_dev_id == -1) {
         goto fn_fail;
     }