Skip to content

Commit

Permalink
Merge pull request #7168 from zhenggb72/fast-avx
Browse files Browse the repository at this point in the history
gpu/ze: use stream load/store for GPU fast copy
  • Loading branch information
yfguo authored Oct 15, 2024
2 parents 64788e6 + 341542d commit c88cc63
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 9 deletions.
14 changes: 13 additions & 1 deletion src/mpi/misc/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
- name : MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE
category : CH4
type : int
default : 1024
default : 4096
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
Expand All @@ -29,6 +29,16 @@
If a receive message size is less than or equal to MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_H2D (in
bytes), then enable GPU-based fast memcpy.
- name : MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_D2H
category : CH4
type : int
default : 32768
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
If a send message size is less than or equal to MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_D2H (in
bytes), then enable GPU-based fast memcpy.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

Expand Down Expand Up @@ -285,6 +295,8 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp
if (dir == MPL_GPU_COPY_H2D) {
/* Used in ofi_events.h when unpacking from received pack_buffer to original device buffer */
fast_copy_threshold = MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_H2D;
} else if (dir == MPL_GPU_COPY_D2H) {
fast_copy_threshold = MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_D2H;
}
if (copy_sz <= fast_copy_threshold) {
mpl_errno = MPL_gpu_fast_memcpy(send_ptr, send_attr, recv_ptr, recv_attr, copy_sz);
Expand Down
3 changes: 2 additions & 1 deletion src/mpid/ch4/netmod/ofi/ofi_recv.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf,

/* Unpack */
MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_PACK;
MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) = MPL_malloc(data_sz, MPL_MEM_OTHER);
MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) =
MPL_aligned_alloc(64, data_sz, MPL_MEM_OTHER);
MPIR_ERR_CHKANDJUMP1(MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) == NULL, mpi_errno,
MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv Pack Buffer alloc");
recv_buf = MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer);
Expand Down
8 changes: 4 additions & 4 deletions src/mpid/ch4/netmod/ofi/ofi_send.h
Original file line number Diff line number Diff line change
Expand Up @@ -625,10 +625,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
/* inject path */
void *pack_buf = NULL;
if (need_pack) {
pack_buf = MPL_malloc(data_sz, MPL_MEM_OTHER);
pack_buf = MPL_aligned_alloc(64, data_sz, MPL_MEM_OTHER);
mpi_errno = MPIR_Localcopy_gpu(buf, count, datatype, 0, &attr,
pack_buf, data_sz, MPI_BYTE, 0, MPIR_GPU_ATTR_HOST,
MPL_GPU_COPY_DIRECTION_NONE,
MPL_GPU_COPY_D2H,
MPIDI_OFI_gpu_get_send_engine_type(), true);
MPIR_ERR_CHECK(mpi_errno);
send_buf = pack_buf;
Expand All @@ -649,13 +649,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI

void *data = NULL;
if (need_pack) {
void *pack_buf = MPL_malloc(data_sz, MPL_MEM_OTHER);
void *pack_buf = MPL_aligned_alloc(64, data_sz, MPL_MEM_OTHER);
MPIR_ERR_CHKANDJUMP1(pack_buf == NULL, mpi_errno,
MPI_ERR_OTHER, "**nomem", "**nomem %s", "Send Pack buffer alloc");

mpi_errno = MPIR_Localcopy_gpu(buf, count, datatype, 0, &attr,
pack_buf, data_sz, MPI_BYTE, 0, MPIR_GPU_ATTR_HOST,
MPL_GPU_COPY_DIRECTION_NONE,
MPL_GPU_COPY_D2H,
MPIDI_OFI_gpu_get_send_engine_type(), true);
MPIR_ERR_CHECK(mpi_errno);

Expand Down
32 changes: 32 additions & 0 deletions src/mpl/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,38 @@ if test "$enable_fast_avx512f_instr" = "yes" ; then
if test "$pac_cv_found__mm512_storeu_si512" = "yes" ; then
AC_DEFINE(HAVE__MM512_STOREU_SI512,1,[Define if _mm512_storeu_si512 is available])
fi

# Probe for AVX-512 non-temporal (streaming) copy support.  The guarded code
# in mpl_gpu_ze.c uses _mm512_stream_load_si512 AND _mm512_stream_si512 under
# this one macro, so the runtime test must exercise both intrinsics, not just
# the store.  Streaming load/store require 64-byte-aligned buffers, hence
# posix_memalign; an allocation failure makes the probe fail cleanly instead
# of dereferencing NULL.
AC_CACHE_CHECK([whether _mm512_stream_si512 is supported], pac_cv_found__mm512_stream_si512,[
AC_RUN_IFELSE([AC_LANG_SOURCE([[
#include <immintrin.h>
#include <stdlib.h>
int main(int argc, char **argv) {
    int ret = 0;
    char *source = NULL, *dest = NULL;
    /* streaming load/store need 64-byte alignment; bail out on OOM */
    if (posix_memalign((void **)&source, 64, 64) != 0)
        return 1;
    if (posix_memalign((void **)&dest, 64, 64) != 0) {
        free(source);
        return 1;
    }
    for (int i = 0; i < 64; i++) source[i] = 'a';
    /* exercise the streaming load as well as the streaming store */
    __m512i zmm0 = _mm512_stream_load_si512((__m512i *) source);
    _mm512_stream_si512((__m512i *) dest, zmm0);
    _mm_sfence();
    if (dest[0] == source[0]) ret = 0;
    else ret = 1;
    free(source);
    free(dest);
    return ret;
}
]])], pac_cv_found__mm512_stream_si512="yes",
      pac_cv_found__mm512_stream_si512="no",
      pac_cv_found__mm512_stream_si512="unknown")
])
if test "$pac_cv_found__mm512_stream_si512" = "yes" ; then
    AC_DEFINE(HAVE__MM512_STREAM_SI512,1,[Define if _mm512_stream_si512 is available])
fi
fi
fi

Expand Down
5 changes: 4 additions & 1 deletion src/mpl/include/mpl_trmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,14 @@ MPL_STATIC_INLINE_PREFIX void *MPL_aligned_alloc(size_t alignment, size_t size,
MPL_memory_class class)
{
#if defined (MPL_HAVE_ALIGNED_ALLOC)
return aligned_alloc(alignment, size);
/* aligned_alloc requires size to be multiples of alignment, we round it up here */
return aligned_alloc(alignment, MPL_ROUND_UP_ALIGN(size, alignment));
#elif defined (MPL_HAVE_POSIX_MEMALIGN)
void *ptr;
int ret;

/* posix_memalign requires alignment to be multiples of sizeof(void *) */
assert(alignment % sizeof(void *) == 0);
ret = posix_memalign(&ptr, alignment, size);
if (ret != 0)
return NULL;
Expand Down
35 changes: 33 additions & 2 deletions src/mpl/src/gpu/mpl_gpu_ze.c
Original file line number Diff line number Diff line change
Expand Up @@ -3364,7 +3364,34 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
if (mpl_err != MPL_SUCCESS)
goto fn_fail;
}
#if defined(MPL_HAVE__MM512_STOREU_SI512)
#if defined(MPL_HAVE__MM512_STREAM_SI512) || defined(MPL_HAVE__MM256_STREAM_SI256)
/* fallback to MPL_Memcpy_stream if not 64-byte aligned */
if (((uintptr_t) s) & 63 || ((uintptr_t) d) & 63) {
MPL_Memcpy_stream(d, s, size);
goto fn_exit;
}
#if defined(MPL_HAVE__MM512_STREAM_SI512)
while (n >= 64) {
_mm512_stream_si512((__m512i *) d, _mm512_stream_load_si512((__m512i const *) s));
d += 64;
s += 64;
n -= 64;
}
if (n & 32) {
_mm256_stream_si256((__m256i *) d, _mm256_stream_load_si256((__m256i const *) s));
d += 32;
s += 32;
n -= 32;
}
#elif defined(MPL_HAVE__MM256_STOREU_SI256)
while (n >= 32) {
_mm256_storeu_si256((__m256i *) d, _mm256_loadu_si256((__m256i const *) s));
d += 32;
s += 32;
n -= 32;
}
#endif /* MPL_HAVE__MM512_STREAM_SI512 */
#elif defined(MPL_HAVE__MM512_STOREU_SI512)
while (n >= 64) {
_mm512_storeu_si512((__m512i *) d, _mm512_loadu_si512((__m512i const *) s));
d += 64;
Expand All @@ -3388,7 +3415,11 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
goto fallback;
#endif
if (n & 16) {
#if defined(MPL_HAVE__MM_STREAM_SI128)
_mm_stream_si128((__m128i *) d, _mm_stream_load_si128((__m128i const *) s));
#else
_mm_storeu_si128((__m128i *) d, _mm_loadu_si128((__m128i const *) s));
#endif
d += 16;
s += 16;
n -= 16;
Expand All @@ -3414,7 +3445,7 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
if (n == 1) {
*(char *) d = *(char *) s;
}
#if defined(MPL_HAVE__MM256_STOREU_SI256) || defined(MPL_HAVE__MM512_STOREU_SI512)
#if defined(MPL_HAVE__MM512_STOREU_SI512) || defined(MPL_HAVE__MM512_STREAM_SI512) || defined(MPL_HAVE__MM256_STREAM_SI256) || defined(MPL_HAVE__MM256_STOREU_SI256)
_mm_sfence();
#endif
goto fn_exit;
Expand Down

0 comments on commit c88cc63

Please sign in to comment.