diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp index 6d51a56f66e65..9580dafc48a54 100644 --- a/sycl/include/syclcompat/memory.hpp +++ b/sycl/include/syclcompat/memory.hpp @@ -199,23 +199,23 @@ static inline sycl::event memset(sycl::queue q, void *dev_ptr, int value, return q.memset(dev_ptr, value, size); } -/// Set \p value to the 3D memory region pointed by \p data in \p q. \p size -/// specifies the 3D memory size to set. -/// -/// \param q The queue in which the operation is done. -/// \param data Pointer to the device memory region. -/// \param value Value to be set. -/// \param size Memory region size. -/// \returns An event list representing the memset operations. -static inline std::vector memset(sycl::queue q, pitched_data data, - int value, sycl::range<3> size) { +/// \brief Sets \p value to the 3D memory region pointed by \p data in \p q. +/// \tparam T The type of the element to be set. +/// \param [in] q The queue in which the operation is done. +/// \param [in] data Pointer to the pitched device memory region. +/// \param [in] value The value to be set. +/// \param [in] size 3D memory region by number of elements. +/// \return An event list representing the memset operations. +template +static inline std::vector +memset(sycl::queue q, pitched_data data, const T &value, sycl::range<3> size) { std::vector event_list; size_t slice = data.get_pitch() * data.get_y(); unsigned char *data_surface = (unsigned char *)data.get_data_ptr(); for (size_t z = 0; z < size.get(2); ++z) { unsigned char *data_ptr = data_surface; for (size_t y = 0; y < size.get(1); ++y) { - event_list.push_back(memset(q, data_ptr, value, size.get(0))); + event_list.push_back(detail::fill(q, data_ptr, value, size.get(0))); data_ptr += data.get_pitch(); } data_surface += slice; @@ -225,6 +225,7 @@ static inline std::vector memset(sycl::queue q, pitched_data data, /// \brief Sets \p val to the pitched 2D memory region pointed by \p ptr in \p /// q. +/// \tparam T The type of the element to be set. /// \param [in] q The queue in which the operation is done. /// \param [in] ptr Pointer to the virtual device memory. /// \param [in] pitch The pitch size by number of elements, including padding. @@ -232,8 +233,10 @@ static inline std::vector memset(sycl::queue q, pitched_data data, /// \param [in] x The width of memory region by number of elements. /// \param [in] y The height of memory region by number of elements. /// \return An event list representing the memset operations. -static inline std::vector -memset(sycl::queue q, void *ptr, size_t pitch, int value, size_t x, size_t y) { +template +static inline std::vector memset(sycl::queue q, void *ptr, + size_t pitch, const T &value, + size_t x, size_t y) { return memset(q, pitched_data(ptr, pitch, x, 1), value, sycl::range<3>(x, y, 1)); } @@ -407,8 +410,7 @@ memcpy(sycl::queue q, void *to_ptr, const void *from_ptr, })); break; default: - throw std::runtime_error("[SYCLcompat]" - "memcpy: invalid direction value"); + throw std::runtime_error("[SYCLcompat] memcpy: invalid direction value"); } return event_list; } @@ -731,7 +733,7 @@ static void inline fill(void *dev_ptr, const T &pattern, size_t count, /// \param pattern Pattern of type \p T to be set. /// \param count Number of elements to be set to the patten. /// \param q The queue in which the operation is done. -/// \returns no return value. +/// \returns An event representing the fill operation. template static sycl::event inline fill_async(void *dev_ptr, const T &pattern, size_t count, @@ -752,51 +754,151 @@ static void memset(void *dev_ptr, int value, size_t size, detail::memset(q, dev_ptr, value, size).wait(); } -/// Asynchronously sets \p value to the first \p size bytes starting from \p -/// dev_ptr. The return of the function does NOT guarantee the memset operation -/// is completed. -/// +/// \brief Sets 2 bytes data \p value to the first \p size elements starting +/// from \p dev_ptr in \p q synchronously. +/// \param [in] dev_ptr Pointer to the virtual device memory address. +/// \param [in] value The value to be set. +/// \param [in] size Number of elements to be set to the value. +/// \param [in] q The queue in which the operation is done. +static inline void memset_d16(void *dev_ptr, unsigned short value, size_t size, + sycl::queue q = get_default_queue()) { + detail::fill(q, dev_ptr, value, size).wait(); +} + +/// \brief Sets 4 bytes data \p value to the first \p size elements starting +/// from \p dev_ptr in \p q synchronously. +/// \param [in] dev_ptr Pointer to the virtual device memory address. +/// \param [in] value The value to be set. +/// \param [in] size Number of elements to be set to the value. +/// \param [in] q The queue in which the operation is done. +static inline void memset_d32(void *dev_ptr, unsigned int value, size_t size, + sycl::queue q = get_default_queue()) { + detail::fill(q, dev_ptr, value, size).wait(); +} + +/// \brief Sets 1 byte data \p value to the first \p size elements starting +/// from \p dev_ptr in \p q asynchronously. /// \param dev_ptr Pointer to the device memory address. /// \param value Value to be set. /// \param size Number of bytes to be set to the value. -/// \returns no return value. -static sycl::event memset_async(void *dev_ptr, int value, size_t size, - sycl::queue q = get_default_queue()) { +/// \returns An event representing the memset operation. +static inline sycl::event memset_async(void *dev_ptr, int value, size_t size, + sycl::queue q = get_default_queue()) { return detail::memset(q, dev_ptr, value, size); } -/// Sets \p value to the 2D memory region pointed by \p ptr in \p q. \p x and -/// \p y specify the setted 2D memory size. \p pitch is the bytes in linear -/// dimension, including padding bytes. The function will return after the -/// memset operation is completed. -/// -/// \param ptr Pointer to the device memory region. -/// \param pitch Bytes in linear dimension, including padding bytes. -/// \param value Value to be set. -/// \param x The setted memory size in linear dimension. -/// \param y The setted memory size in second dimension. -/// \param q The queue in which the operation is done. -/// \returns no return value. +/// \brief Sets 2 bytes data \p value to the first \p size elements starting +/// from \p dev_ptr in \p q asynchronously. +/// \param [in] dev_ptr Pointer to the virtual device memory address. +/// \param [in] value The value to be set. +/// \param [in] size Number of elements to be set to the value. +/// \param [in] q The queue in which the operation is done. +/// \returns An event representing the memset operation. +static inline sycl::event +memset_d16_async(void *dev_ptr, unsigned short value, size_t size, + sycl::queue q = get_default_queue()) { + return detail::fill(q, dev_ptr, value, size); +} + +/// \brief Sets 4 bytes data \p value to the first \p size elements starting +/// from \p dev_ptr in \p q asynchronously. +/// \param [in] dev_ptr Pointer to the virtual device memory address. +/// \param [in] value The value to be set. +/// \param [in] size Number of elements to be set to the value. +/// \param [in] q The queue in which the operation is done. +/// \returns An event representing the memset operation. +static inline sycl::event +memset_d32_async(void *dev_ptr, unsigned int value, size_t size, + sycl::queue q = get_default_queue()) { + return detail::fill(q, dev_ptr, value, size); +} + +/// \brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p +/// ptr in \p q synchronously. +/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] val The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \param [in] q The queue in which the operation is done. static inline void memset(void *ptr, size_t pitch, int val, size_t x, size_t y, sycl::queue q = get_default_queue()) { + sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y)); +} + +/// \brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by +/// ptr in \p q synchronously. +/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] val The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \param [in] q The queue in which the operation is done. +static inline void memset_d16(void *ptr, size_t pitch, unsigned short val, + size_t x, size_t y, + sycl::queue q = get_default_queue()) { sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y)); } -/// Sets \p value to the 2D memory region pointed by \p ptr in \p q. \p x and -/// \p y specify the setted 2D memory size. \p pitch is the bytes in linear -/// dimension, including padding bytes. The return of the function does NOT -/// guarantee the memset operation is completed. -/// -/// \param ptr Pointer to the device memory region. -/// \param pitch Bytes in linear dimension, including padding bytes. -/// \param value Value to be set. -/// \param x The setted memory size in linear dimension. -/// \param y The setted memory size in second dimension. -/// \param q The queue in which the operation is done. -/// \returns no return value. +/// \brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by +/// ptr in \p q synchronously. +/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] val The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \param [in] q The queue in which the operation is done. +static inline void memset_d32(void *ptr, size_t pitch, unsigned int val, + size_t x, size_t y, + sycl::queue q = get_default_queue()) { + sycl::event::wait(detail::memset(q, ptr, pitch, val, x, y)); +} + +/// \brief Sets 1 byte data \p val to the pitched 2D memory region pointed by \p +/// ptr in \p q asynchronously. +/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] val The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \param [in] q The queue in which the operation is done. +/// \returns An event representing the memset operation. static inline sycl::event memset_async(void *ptr, size_t pitch, int val, size_t x, size_t y, sycl::queue q = get_default_queue()) { + + auto events = detail::memset(q, ptr, pitch, val, x, y); + return detail::combine_events(events, q); +} + +/// \brief Sets 2 bytes data \p val to the pitched 2D memory region pointed by +/// \p ptr in \p q asynchronously. +/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] val The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \param [in] q The queue in which the operation is done. +/// \returns An event representing the memset operation. +static inline sycl::event +memset_d16_async(void *ptr, size_t pitch, unsigned short val, size_t x, + size_t y, sycl::queue q = get_default_queue()) { + auto events = detail::memset(q, ptr, pitch, val, x, y); + return detail::combine_events(events, q); +} + +/// \brief Sets 4 bytes data \p val to the pitched 2D memory region pointed by +/// \p ptr in \p q asynchronously. +/// \param [in] ptr Pointer to the virtual device memory. +/// \param [in] pitch The pitch size by number of elements, including padding. +/// \param [in] val The value to be set. +/// \param [in] x The width of memory region by number of elements. +/// \param [in] y The height of memory region by number of elements. +/// \param [in] q The queue in which the operation is done. +/// \returns An event representing the memset operation. +static inline sycl::event +memset_d32_async(void *ptr, size_t pitch, unsigned int val, size_t x, size_t y, + sycl::queue q = get_default_queue()) { auto events = detail::memset(q, ptr, pitch, val, x, y); return detail::combine_events(events, q); } @@ -812,7 +914,7 @@ static inline sycl::event memset_async(void *ptr, size_t pitch, int val, /// \returns no return value. static inline void memset(pitched_data pitch, int val, sycl::range<3> size, sycl::queue q = get_default_queue()) { - sycl::event::wait(detail::memset(q, pitch, val, size)); + sycl::event::wait(detail::memset(q, pitch, val, size)); } /// Sets \p value to the 3D memory region specified by \p pitch in \p q. \p size @@ -823,11 +925,11 @@ static inline void memset(pitched_data pitch, int val, sycl::range<3> size, /// \param value Value to be set. /// \param size The setted 3D memory size. /// \param q The queue in which the operation is done. -/// \returns no return value. +/// \returns An event representing the memset operation. static inline sycl::event memset_async(pitched_data pitch, int val, sycl::range<3> size, sycl::queue q = get_default_queue()) { - auto events = detail::memset(q, pitch, val, size); + auto events = detail::memset(q, pitch, val, size); return detail::combine_events(events, q); } diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp index 1f0662992038b..d13ab2357e632 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp @@ -128,27 +128,36 @@ void test_memcpy_q() { free(h_C); } -void test_memset() { +template void test_memset_impl() { std::cout << __PRETTY_FUNCTION__ << std::endl; + // ValueT -> int for memset and memset_d32, short for memset_d16. + using ValueT = std::conditional_t< + memset_size_bits == 8 || memset_size_bits == 32, int, + std::conditional_t>; + static_assert(!std::is_void_v, + "memset tests only work for 8, 16 and 32 bits"); constexpr int Num = 10; - int *h_A = (int *)malloc(Num * sizeof(int)); + ValueT *h_A = (ValueT *)malloc(Num * sizeof(ValueT)); for (int i = 0; i < Num; i++) { h_A[i] = 4; } - int *d_A = nullptr; - - d_A = (int *)syclcompat::malloc(Num * sizeof(int)); + ValueT *d_A = (ValueT *)syclcompat::malloc(Num * sizeof(ValueT)); // hostA -> deviceA - syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(int)); + syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(ValueT)); // set d_A[0,..., 6] = 0 - syclcompat::memset((void *)d_A, 0, (Num - 3) * sizeof(int)); + if constexpr (memset_size_bits == 8) + syclcompat::memset((void *)d_A, 0, (Num - 3) * sizeof(ValueT)); + else if constexpr (memset_size_bits == 16) + syclcompat::memset_d16((void *)d_A, 0, (Num - 3)); + else if constexpr (memset_size_bits == 32) + syclcompat::memset_d32((void *)d_A, 0, (Num - 3)); // deviceA -> hostA - syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(int)); + syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(ValueT)); syclcompat::free((void *)d_A); @@ -165,28 +174,37 @@ void test_memset() { free(h_A); } -void test_memset_q() { +template void test_memset_q_impl() { std::cout << __PRETTY_FUNCTION__ << std::endl; + // ValueT -> int for memset and memset_d32, short for memset_d16. + using ValueT = std::conditional_t< + memset_size_bits == 8 || memset_size_bits == 32, int, + std::conditional_t>; + static_assert(!std::is_void_v, + "memset tests only work for 8, 16 and 32 bits"); sycl::queue q{{sycl::property::queue::in_order()}}; constexpr int Num = 10; - int *h_A = (int *)malloc(Num * sizeof(int)); + ValueT *h_A = (ValueT *)malloc(Num * sizeof(ValueT)); for (int i = 0; i < Num; i++) { h_A[i] = 4; } - int *d_A = nullptr; - - d_A = (int *)syclcompat::malloc(Num * sizeof(int), q); + ValueT *d_A = (ValueT *)syclcompat::malloc(Num * sizeof(ValueT), q); // hostA -> deviceA - syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(int), q); + syclcompat::memcpy((void *)d_A, (void *)h_A, Num * sizeof(ValueT), q); // set d_A[0,..., 6] = 0 - syclcompat::memset((void *)d_A, 0, (Num - 3) * sizeof(int), q); + if constexpr (memset_size_bits == 8) + syclcompat::memset((void *)d_A, 0, (Num - 3) * sizeof(ValueT), q); + else if constexpr (memset_size_bits == 16) + syclcompat::memset_d16((void *)d_A, 0, (Num - 3), q); + else if constexpr (memset_size_bits == 32) + syclcompat::memset_d32((void *)d_A, 0, (Num - 3), q); // deviceA -> hostA - syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(int), q); + syclcompat::memcpy((void *)h_A, (void *)d_A, Num * sizeof(ValueT), q); syclcompat::free((void *)d_A, q); @@ -203,6 +221,42 @@ void test_memset_q() { free(h_A); } +void test_memset() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 8; + test_memset_impl(); +} + +void test_memset_d16() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 16; + test_memset_impl(); +} + +void test_memset_d32() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 32; + test_memset_impl(); +} + +void test_memset_q() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 8; + test_memset_q_impl(); +} + +void test_memset_d16_q() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 16; + test_memset_q_impl(); +} + +void test_memset_d32_q() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 32; + test_memset_q_impl(); +} + template void test_memcpy_t() { std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -480,6 +534,10 @@ int main() { test_memcpy_q(); test_memset(); test_memset_q(); + test_memset_d16(); + test_memset_d16_q(); + test_memset_d32(); + test_memset_d32_q(); test_constant_memcpy(); test_constant_memcpy_q(); diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp index 0d35a0721bbc7..ee0c5fc146d59 100644 --- a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp +++ b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp @@ -244,25 +244,36 @@ void test_memcpy_async_pitched_q() { syclcompat::free((void *)d_data, q); } -void test_memset_async() { +template void test_memset_async_impl() { std::cout << __PRETTY_FUNCTION__ << std::endl; + // ValueT -> int for memset and memset_d32, short for memset_d16. + using ValueT = std::conditional_t< + memset_size_bits == 8 || memset_size_bits == 32, int, + std::conditional_t>; + static_assert(!std::is_void_v, + "memset tests only work for 8, 16 and 32 bits"); int Num = 10; - int *h_A = (int *)malloc(Num * sizeof(int)); + ValueT *h_A = (ValueT *)malloc(Num * sizeof(ValueT)); for (int i = 0; i < Num; i++) { h_A[i] = 4; } - int *d_A = (int *)syclcompat::malloc(Num * sizeof(int)); + ValueT *d_A = (ValueT *)syclcompat::malloc(Num * sizeof(ValueT)); // hostA -> deviceA - syclcompat::memcpy_async((void *)d_A, (void *)h_A, Num * sizeof(int)); + syclcompat::memcpy_async((void *)d_A, (void *)h_A, Num * sizeof(ValueT)); // set d_A[0,..., 6] = 0 - syclcompat::memset_async((void *)d_A, 0, (Num - 3) * sizeof(int)); + if constexpr (memset_size_bits == 8) + syclcompat::memset_async((void *)d_A, 0, (Num - 3) * sizeof(ValueT)); + else if constexpr (memset_size_bits == 16) + syclcompat::memset_d16_async((void *)d_A, 0, (Num - 3)); + else if constexpr (memset_size_bits == 32) + syclcompat::memset_d32_async((void *)d_A, 0, (Num - 3)); // deviceA -> hostA - syclcompat::memcpy_async((void *)h_A, (void *)d_A, Num * sizeof(int)); + syclcompat::memcpy_async((void *)h_A, (void *)d_A, Num * sizeof(ValueT)); syclcompat::get_default_queue().wait_and_throw(); @@ -281,26 +292,37 @@ void test_memset_async() { free(h_A); } -void test_memset_async_q() { +template void test_memset_async_q_impl() { std::cout << __PRETTY_FUNCTION__ << std::endl; + // int for memset and memset_d32, short for memset_d16. + using ValueT = + std::conditional_t>; + static_assert(!std::is_void_v, + "memset tests only work for 8, 16 and 32 bits"); sycl::queue q{{sycl::property::queue::in_order()}}; int Num = 10; - int *h_A = (int *)malloc(Num * sizeof(int)); + ValueT *h_A = (ValueT *)malloc(Num * sizeof(ValueT)); for (int i = 0; i < Num; i++) { h_A[i] = 4; } - int *d_A = (int *)syclcompat::malloc(Num * sizeof(int), q); + ValueT *d_A = (ValueT *)syclcompat::malloc(Num * sizeof(ValueT), q); // hostA -> deviceA - syclcompat::memcpy_async((void *)d_A, (void *)h_A, Num * sizeof(int), q); + syclcompat::memcpy_async((void *)d_A, (void *)h_A, Num * sizeof(ValueT), q); // set d_A[0,..., 6] = 0 - syclcompat::memset_async((void *)d_A, 0, (Num - 3) * sizeof(int), q); + if constexpr (bits == 8) + syclcompat::memset_async((void *)d_A, 0, (Num - 3) * sizeof(ValueT), q); + else if constexpr (bits == 16) + syclcompat::memset_d16_async((void *)d_A, 0, (Num - 3), q); + else if constexpr (bits == 32) + syclcompat::memset_d32_async((void *)d_A, 0, (Num - 3), q); // deviceA -> hostA - syclcompat::memcpy_async((void *)h_A, (void *)d_A, Num * sizeof(int), q); + syclcompat::memcpy_async((void *)h_A, (void *)d_A, Num * sizeof(ValueT), q); q.wait_and_throw(); syclcompat::free((void *)d_A, q); @@ -317,6 +339,42 @@ void test_memset_async_q() { free(h_A); } +void test_memset_async() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 8; + test_memset_async_impl(); +} + +void test_memset_d16_async() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 16; + test_memset_async_impl(); +} + +void test_memset_d32_async() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 32; + test_memset_async_impl(); +} + +void test_memset_async_q() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 8; + test_memset_async_q_impl(); +} + +void test_memset_d16_async_q() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 16; + test_memset_async_q_impl(); +} + +void test_memset_d32_async_q() { + std::cout << __PRETTY_FUNCTION__ << std::endl; + constexpr size_t memset_size_in_bits = 32; + test_memset_async_q_impl(); +} + template void test_memcpy_async_t_q() { std::cout << __PRETTY_FUNCTION__ << std::endl; @@ -610,6 +668,10 @@ int main() { test_memcpy_async_pitched_q(); test_memset_async(); test_memset_async_q(); + test_memset_d16_async(); + test_memset_d16_async_q(); + test_memset_d32_async(); + test_memset_d32_async_q(); test_constant_memcpy_async(); test_constant_memcpy_async_q();