Skip to content

Commit

Permalink
Merge pull request #7168 from zhenggb72/fast-avx
Browse files Browse the repository at this point in the history
gpu/ze: use stream load/store for GPU fast copy
  • Loading branch information
yfguo authored Oct 15, 2024
2 parents 64788e6 + 341542d commit c88cc63
Show file tree
Hide file tree
Showing 6 changed files with 88 additions and 9 deletions.
14 changes: 13 additions & 1 deletion src/mpi/misc/utils.c
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
- name : MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE
category : CH4
type : int
default : 1024
default : 4096
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
Expand All @@ -29,6 +29,16 @@
If a receive message size is less than or equal to MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_H2D (in
bytes), then enable GPU-based fast memcpy.
- name : MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_D2H
category : CH4
type : int
default : 32768
class : none
verbosity : MPI_T_VERBOSITY_USER_BASIC
scope : MPI_T_SCOPE_ALL_EQ
description : >-
If a send message size is less than or equal to MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_D2H (in
bytes), then enable GPU-based fast memcpy.
=== END_MPI_T_CVAR_INFO_BLOCK ===
*/

Expand Down Expand Up @@ -285,6 +295,8 @@ static int do_localcopy_gpu(const void *sendbuf, MPI_Aint sendcount, MPI_Datatyp
if (dir == MPL_GPU_COPY_H2D) {
/* Used in ofi_events.h when unpacking from received pack_buffer to original device buffer */
fast_copy_threshold = MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_H2D;
} else if (dir == MPL_GPU_COPY_D2H) {
fast_copy_threshold = MPIR_CVAR_GPU_FAST_COPY_MAX_SIZE_D2H;
}
if (copy_sz <= fast_copy_threshold) {
mpl_errno = MPL_gpu_fast_memcpy(send_ptr, send_attr, recv_ptr, recv_attr, copy_sz);
Expand Down
3 changes: 2 additions & 1 deletion src/mpid/ch4/netmod/ofi/ofi_recv.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,8 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_do_irecv(void *buf,

/* Unpack */
MPIDI_OFI_REQUEST(rreq, event_id) = MPIDI_OFI_EVENT_RECV_PACK;
MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) = MPL_malloc(data_sz, MPL_MEM_OTHER);
MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) =
MPL_aligned_alloc(64, data_sz, MPL_MEM_OTHER);
MPIR_ERR_CHKANDJUMP1(MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer) == NULL, mpi_errno,
MPI_ERR_OTHER, "**nomem", "**nomem %s", "Recv Pack Buffer alloc");
recv_buf = MPIDI_OFI_REQUEST(rreq, noncontig.pack.pack_buffer);
Expand Down
8 changes: 4 additions & 4 deletions src/mpid/ch4/netmod/ofi/ofi_send.h
Original file line number Diff line number Diff line change
Expand Up @@ -625,10 +625,10 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI
/* inject path */
void *pack_buf = NULL;
if (need_pack) {
pack_buf = MPL_malloc(data_sz, MPL_MEM_OTHER);
pack_buf = MPL_aligned_alloc(64, data_sz, MPL_MEM_OTHER);
mpi_errno = MPIR_Localcopy_gpu(buf, count, datatype, 0, &attr,
pack_buf, data_sz, MPI_BYTE, 0, MPIR_GPU_ATTR_HOST,
MPL_GPU_COPY_DIRECTION_NONE,
MPL_GPU_COPY_D2H,
MPIDI_OFI_gpu_get_send_engine_type(), true);
MPIR_ERR_CHECK(mpi_errno);
send_buf = pack_buf;
Expand All @@ -649,13 +649,13 @@ MPL_STATIC_INLINE_PREFIX int MPIDI_OFI_send(const void *buf, MPI_Aint count, MPI

void *data = NULL;
if (need_pack) {
void *pack_buf = MPL_malloc(data_sz, MPL_MEM_OTHER);
void *pack_buf = MPL_aligned_alloc(64, data_sz, MPL_MEM_OTHER);
MPIR_ERR_CHKANDJUMP1(pack_buf == NULL, mpi_errno,
MPI_ERR_OTHER, "**nomem", "**nomem %s", "Send Pack buffer alloc");

mpi_errno = MPIR_Localcopy_gpu(buf, count, datatype, 0, &attr,
pack_buf, data_sz, MPI_BYTE, 0, MPIR_GPU_ATTR_HOST,
MPL_GPU_COPY_DIRECTION_NONE,
MPL_GPU_COPY_D2H,
MPIDI_OFI_gpu_get_send_engine_type(), true);
MPIR_ERR_CHECK(mpi_errno);

Expand Down
32 changes: 32 additions & 0 deletions src/mpl/configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -365,6 +365,38 @@ if test "$enable_fast_avx512f_instr" = "yes" ; then
if test "$pac_cv_found__mm512_storeu_si512" = "yes" ; then
AC_DEFINE(HAVE__MM512_STOREU_SI512,1,[Define if _mm512_storeu_si512 is available])
fi

# Probe for AVX-512 non-temporal (streaming) copy support.  The guarded code
# in mpl_gpu_ze.c uses _mm512_stream_load_si512 AND _mm512_stream_si512 under
# this one macro, so the runtime test must exercise both intrinsics, not just
# the store.  Streaming load/store require 64-byte-aligned buffers, hence
# posix_memalign; an allocation failure makes the probe fail cleanly instead
# of dereferencing NULL.
AC_CACHE_CHECK([whether _mm512_stream_si512 is supported], pac_cv_found__mm512_stream_si512,[
AC_RUN_IFELSE([AC_LANG_SOURCE([[
#include <immintrin.h>
#include <stdlib.h>
int main(int argc, char **argv) {
    int ret = 0;
    char *source = NULL, *dest = NULL;
    /* streaming load/store need 64-byte alignment; bail out on OOM */
    if (posix_memalign((void **)&source, 64, 64) != 0)
        return 1;
    if (posix_memalign((void **)&dest, 64, 64) != 0) {
        free(source);
        return 1;
    }
    for (int i = 0; i < 64; i++) source[i] = 'a';
    /* exercise the streaming load as well as the streaming store */
    __m512i zmm0 = _mm512_stream_load_si512((__m512i *) source);
    _mm512_stream_si512((__m512i *) dest, zmm0);
    _mm_sfence();
    if (dest[0] == source[0]) ret = 0;
    else ret = 1;
    free(source);
    free(dest);
    return ret;
}
]])], pac_cv_found__mm512_stream_si512="yes",
      pac_cv_found__mm512_stream_si512="no",
      pac_cv_found__mm512_stream_si512="unknown")
])
if test "$pac_cv_found__mm512_stream_si512" = "yes" ; then
    AC_DEFINE(HAVE__MM512_STREAM_SI512,1,[Define if _mm512_stream_si512 is available])
fi
fi
fi

Expand Down
5 changes: 4 additions & 1 deletion src/mpl/include/mpl_trmem.h
Original file line number Diff line number Diff line change
Expand Up @@ -317,11 +317,14 @@ MPL_STATIC_INLINE_PREFIX void *MPL_aligned_alloc(size_t alignment, size_t size,
MPL_memory_class class)
{
#if defined (MPL_HAVE_ALIGNED_ALLOC)
return aligned_alloc(alignment, size);
/* aligned_alloc requires size to be multiples of alignment, we round it up here */
return aligned_alloc(alignment, MPL_ROUND_UP_ALIGN(size, alignment));
#elif defined (MPL_HAVE_POSIX_MEMALIGN)
void *ptr;
int ret;

/* posix_memalign requires alignment to be multiples of sizeof(void *) */
assert(alignment % sizeof(void *) == 0);
ret = posix_memalign(&ptr, alignment, size);
if (ret != 0)
return NULL;
Expand Down
35 changes: 33 additions & 2 deletions src/mpl/src/gpu/mpl_gpu_ze.c
Original file line number Diff line number Diff line change
Expand Up @@ -3364,7 +3364,34 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
if (mpl_err != MPL_SUCCESS)
goto fn_fail;
}
#if defined(MPL_HAVE__MM512_STOREU_SI512)
#if defined(MPL_HAVE__MM512_STREAM_SI512) || defined(MPL_HAVE__MM256_STREAM_SI256)
/* fallback to MPL_Memcpy_stream if not 64-byte aligned */
if (((uintptr_t) s) & 63 || ((uintptr_t) d) & 63) {
MPL_Memcpy_stream(d, s, size);
goto fn_exit;
}
#if defined(MPL_HAVE__MM512_STREAM_SI512)
while (n >= 64) {
_mm512_stream_si512((__m512i *) d, _mm512_stream_load_si512((__m512i const *) s));
d += 64;
s += 64;
n -= 64;
}
if (n & 32) {
_mm256_stream_si256((__m256i *) d, _mm256_stream_load_si256((__m256i const *) s));
d += 32;
s += 32;
n -= 32;
}
#elif defined(MPL_HAVE__MM256_STOREU_SI256)
while (n >= 32) {
_mm256_storeu_si256((__m256i *) d, _mm256_loadu_si256((__m256i const *) s));
d += 32;
s += 32;
n -= 32;
}
#endif /* MPL_HAVE__MM512_STREAM_SI512 */
#elif defined(MPL_HAVE__MM512_STOREU_SI512)
while (n >= 64) {
_mm512_storeu_si512((__m512i *) d, _mm512_loadu_si512((__m512i const *) s));
d += 64;
Expand All @@ -3388,7 +3415,11 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
goto fallback;
#endif
if (n & 16) {
#if defined(MPL_HAVE__MM_STREAM_SI128)
_mm_stream_si128((__m128i *) d, _mm_stream_load_si128((__m128i const *) s));
#else
_mm_storeu_si128((__m128i *) d, _mm_loadu_si128((__m128i const *) s));
#endif
d += 16;
s += 16;
n -= 16;
Expand All @@ -3414,7 +3445,7 @@ int MPL_gpu_fast_memcpy(void *src, MPL_pointer_attr_t * src_attr, void *dest,
if (n == 1) {
*(char *) d = *(char *) s;
}
#if defined(MPL_HAVE__MM256_STOREU_SI256) || defined(MPL_HAVE__MM512_STOREU_SI512)
#if defined(MPL_HAVE__MM512_STOREU_SI512) || defined(MPL_HAVE__MM512_STREAM_SI512) || defined(MPL_HAVE__MM256_STREAM_SI256) || defined(MPL_HAVE__MM256_STOREU_SI256)
_mm_sfence();
#endif
goto fn_exit;
Expand Down

0 comments on commit c88cc63

Please sign in to comment.