From 0f6e5f29f6976e569f1ecd7d4e4db647339ff681 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Sun, 25 Apr 2021 15:54:54 +0300 Subject: [PATCH 1/5] [SYCL] Support 3- and 16-elememnt vectors for sub-group load/store --- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 101 +++++++++++++++++++++- 1 file changed, 99 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index 11d09c114bf81..d488e6cbd71a3 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -299,7 +299,7 @@ struct sub_group { template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && - N != 1, + N != 1 && N != 16 && N != 3, vec> load(const multi_ptr src) const { #ifdef __SYCL_DEVICE_ONLY__ @@ -319,6 +319,59 @@ struct sub_group { #endif } + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && + N == 16, + vec> + load(const multi_ptr src) const { +#ifdef __SYCL_DEVICE_ONLY__ +#ifdef __NVPTX__ + vec res; + for (int i = 0; i < N; ++i) { + res[i] = *(src.get() + i * get_max_local_range()[0] + get_local_id()[0]); + } + return res; +#else + return {sycl::detail::sub_group::load<8, T>(src), + sycl::detail::sub_group::load<8, T>(src + + 8 * get_max_local_range()[0])}; +#endif // __NVPTX__ +#else + (void)src; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif + } + + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && + N == 3, + vec> + load(const multi_ptr src) const { +#ifdef __SYCL_DEVICE_ONLY__ +#ifdef __NVPTX__ + vec res; + for (int i = 0; i < N; ++i) { + res[i] = *(src.get() + i * get_max_local_range()[0] + get_local_id()[0]); + } + return res; +#else + auto res = sycl::detail::sub_group::load<4, T>(src); + return {res.s2(), res.s1(), res.s0()}; + /* return { + sycl::detail::sub_group::load<1, T>(src), + sycl::detail::sub_group::load<2, T>(src + + get_max_local_range()[0])};*/ +#endif // __NVPTX__ +#else + (void)src; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif + } + template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForLocalLoadStore::value, @@ -459,7 +512,7 @@ struct sub_group { template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && - N != 1> + N != 1 && N != 16 && N != 3> store(multi_ptr dst, const vec &x) const { #ifdef __SYCL_DEVICE_ONLY__ #ifdef __NVPTX__ @@ -477,6 +530,50 @@ struct sub_group { #endif } + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && + N == 16> + store(multi_ptr dst, const vec &x) const { +#ifdef __SYCL_DEVICE_ONLY__ +#ifdef __NVPTX__ + for (int i = 0; i < N; ++i) { + *(dst.get() + i * get_max_local_range()[0] + get_local_id()[0]) = x[i]; + } +#else + store<8, T, Space>(dst, x.lo()); + store<8, T, Space>(dst + 8 * get_max_local_range()[0], x.hi()); +#endif // __NVPTX__ +#else + (void)dst; + (void)x; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif + } + + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && + N == 3> + store(multi_ptr dst, const vec &x) const { +#ifdef __SYCL_DEVICE_ONLY__ +#ifdef __NVPTX__ + for (int i = 0; i < N; ++i) { + *(dst.get() + i * get_max_local_range()[0] + get_local_id()[0]) = x[i]; + } +#else + store<1, T, Space>(dst, x.s0()); + store<2, T, Space>(dst + get_max_local_range()[0], {x.s1(), x.s2()}); +#endif // __NVPTX__ +#else + (void)dst; + (void)x; + throw runtime_error("Sub-groups are not supported on host device.", + PI_INVALID_DEVICE); +#endif + } + template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForLocalLoadStore::value> From 7be54390d119449c7c56015a42250277cdde8405 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Sun, 25 Apr 2021 16:32:42 +0300 Subject: [PATCH 2/5] Decrease code ducplication for load --- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 95 ++++++++--------------- 1 file changed, 32 insertions(+), 63 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index d488e6cbd71a3..4fdf789e57cda 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -295,82 +295,70 @@ struct sub_group { PI_INVALID_DEVICE); #endif } - +#ifdef __SYCL_DEVICE_ONLY__ +#ifdef __NVPTX__ template sycl::detail::enable_if_t< - sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && - N != 1 && N != 16 && N != 3, + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value, vec> load(const multi_ptr src) const { -#ifdef __SYCL_DEVICE_ONLY__ -#ifdef __NVPTX__ vec res; for (int i = 0; i < N; ++i) { res[i] = *(src.get() + i * get_max_local_range()[0] + get_local_id()[0]); } return res; -#else + } +#else // __NVPTX__ + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && + N != 1 && N != 3 && N != 16, + vec> + load(const multi_ptr src) const { return sycl::detail::sub_group::load(src); -#endif // __NVPTX__ -#else - (void)src; - throw runtime_error("Sub-groups are not supported on host device.", - PI_INVALID_DEVICE); -#endif } template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && N == 16, - vec> + vec> load(const multi_ptr src) const { -#ifdef __SYCL_DEVICE_ONLY__ -#ifdef __NVPTX__ - vec res; - for (int i = 0; i < N; ++i) { - res[i] = *(src.get() + i * get_max_local_range()[0] + get_local_id()[0]); - } - return res; -#else return {sycl::detail::sub_group::load<8, T>(src), sycl::detail::sub_group::load<8, T>(src + 8 * get_max_local_range()[0])}; -#endif // __NVPTX__ -#else - (void)src; - throw runtime_error("Sub-groups are not supported on host device.", - PI_INVALID_DEVICE); -#endif } template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && N == 3, - vec> + vec> load(const multi_ptr src) const { -#ifdef __SYCL_DEVICE_ONLY__ -#ifdef __NVPTX__ - vec res; - for (int i = 0; i < N; ++i) { - res[i] = *(src.get() + i * get_max_local_range()[0] + get_local_id()[0]); - } - return res; -#else auto res = sycl::detail::sub_group::load<4, T>(src); return {res.s2(), res.s1(), res.s0()}; - /* return { - sycl::detail::sub_group::load<1, T>(src), - sycl::detail::sub_group::load<2, T>(src + - get_max_local_range()[0])};*/ -#endif // __NVPTX__ -#else - (void)src; + } + + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && + N == 1, + vec> + load(const multi_ptr src) const { + return sycl::detail::sub_group::load(src); + } +#endif // ___NVPTX___ +#else // __SYCL_DEVICE_ONLY__ + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value, + vec> + load(const multi_ptr src) const { + (void)src; throw runtime_error("Sub-groups are not supported on host device.", PI_INVALID_DEVICE); -#endif } +#endif // __SYCL_DEVICE_ONLY__ template sycl::detail::enable_if_t< @@ -390,25 +378,6 @@ struct sub_group { #endif } - template - sycl::detail::enable_if_t< - sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && - N == 1, - vec> - load(const multi_ptr src) const { -#ifdef __SYCL_DEVICE_ONLY__ -#ifdef __NVPTX__ - return src.get()[get_local_id()[0]]; -#else - return sycl::detail::sub_group::load(src); -#endif // __NVPTX__ -#else - (void)src; - throw runtime_error("Sub-groups are not supported on host device.", - PI_INVALID_DEVICE); -#endif - } - #ifdef __SYCL_DEVICE_ONLY__ // Method for decorated pointer template From db5415cb2b783f198e830e063b0a49874d59c733 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 26 Apr 2021 08:32:21 +0300 Subject: [PATCH 3/5] Reduce code duplication for store function --- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 90 ++++++++--------------- 1 file changed, 32 insertions(+), 58 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index 4fdf789e57cda..57dff28c668fa 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -308,7 +308,7 @@ struct sub_group { } return res; } -#else // __NVPTX__ +#else // __NVPTX__ template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && @@ -348,13 +348,13 @@ struct sub_group { return sycl::detail::sub_group::load(src); } #endif // ___NVPTX___ -#else // __SYCL_DEVICE_ONLY__ +#else // __SYCL_DEVICE_ONLY__ template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value, vec> load(const multi_ptr src) const { - (void)src; + (void)src; throw runtime_error("Sub-groups are not supported on host device.", PI_INVALID_DEVICE); } @@ -459,89 +459,63 @@ struct sub_group { #endif } - template - sycl::detail::enable_if_t< - sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && - N == 1> - store(multi_ptr dst, const vec &x) const { #ifdef __SYCL_DEVICE_ONLY__ #ifdef __NVPTX__ - dst.get()[get_local_id()[0]] = x[0]; -#else - store(dst, x); -#endif // __NVPTX__ -#else - (void)dst; - (void)x; - throw runtime_error("Sub-groups are not supported on host device.", - PI_INVALID_DEVICE); -#endif - } - template sycl::detail::enable_if_t< - sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && - N != 1 && N != 16 && N != 3> + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value> store(multi_ptr dst, const vec &x) const { -#ifdef __SYCL_DEVICE_ONLY__ -#ifdef __NVPTX__ for (int i = 0; i < N; ++i) { *(dst.get() + i * get_max_local_range()[0] + get_local_id()[0]) = x[i]; } -#else + } +#else // __NVPTX__ + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && + N != 1 && N != 3 && N != 16> + store(multi_ptr dst, const vec &x) const { sycl::detail::sub_group::store(dst, x); -#endif // __NVPTX__ -#else - (void)dst; - (void)x; - throw runtime_error("Sub-groups are not supported on host device.", - PI_INVALID_DEVICE); -#endif } template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && - N == 16> - store(multi_ptr dst, const vec &x) const { -#ifdef __SYCL_DEVICE_ONLY__ -#ifdef __NVPTX__ - for (int i = 0; i < N; ++i) { - *(dst.get() + i * get_max_local_range()[0] + get_local_id()[0]) = x[i]; - } -#else - store<8, T, Space>(dst, x.lo()); - store<8, T, Space>(dst + 8 * get_max_local_range()[0], x.hi()); -#endif // __NVPTX__ -#else - (void)dst; - (void)x; - throw runtime_error("Sub-groups are not supported on host device.", - PI_INVALID_DEVICE); -#endif + N == 1> + store(multi_ptr dst, const vec &x) const { + sycl::detail::sub_group::store(dst, x); } template sycl::detail::enable_if_t< sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && N == 3> - store(multi_ptr dst, const vec &x) const { -#ifdef __SYCL_DEVICE_ONLY__ -#ifdef __NVPTX__ - for (int i = 0; i < N; ++i) { - *(dst.get() + i * get_max_local_range()[0] + get_local_id()[0]) = x[i]; - } -#else + store(multi_ptr dst, const vec &x) const { store<1, T, Space>(dst, x.s0()); store<2, T, Space>(dst + get_max_local_range()[0], {x.s1(), x.s2()}); + } + + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value && + N == 16> + store(multi_ptr dst, const vec &x) const { + store<8, T, Space>(dst, x.lo()); + store<8, T, Space>(dst + 8 * get_max_local_range()[0], x.hi()); + } + #endif // __NVPTX__ -#else +#else // __SYCL_DEVICE_ONLY__ + template + sycl::detail::enable_if_t< + sycl::detail::sub_group::AcceptableForGlobalLoadStore::value> + store(multi_ptr dst, const vec &x) const { (void)dst; (void)x; throw runtime_error("Sub-groups are not supported on host device.", PI_INVALID_DEVICE); -#endif } +#endif // __SYCL_DEVICE_ONLY__ template sycl::detail::enable_if_t< From 89aeb4914517109b65ef7d39e8f8692f6b91c6f3 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 26 Apr 2021 14:24:58 +0300 Subject: [PATCH 4/5] Apply review comments --- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index 57dff28c668fa..984540f8685c9 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -336,7 +336,7 @@ struct sub_group { vec> load(const multi_ptr src) const { auto res = sycl::detail::sub_group::load<4, T>(src); - return {res.s2(), res.s1(), res.s0()}; + return {res.s0(), res.s1(), res.s2()}; } template From 27456b5afd125d3f1163918d8a0e3e63d533e600 Mon Sep 17 00:00:00 2001 From: Vladimir Lazarev Date: Mon, 26 Apr 2021 19:11:57 +0300 Subject: [PATCH 5/5] Apply review comments --- sycl/include/CL/sycl/ONEAPI/sub_group.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp index 984540f8685c9..8989c4cf027a2 100644 --- a/sycl/include/CL/sycl/ONEAPI/sub_group.hpp +++ b/sycl/include/CL/sycl/ONEAPI/sub_group.hpp @@ -335,8 +335,9 @@ struct sub_group { N == 3, vec> load(const multi_ptr src) const { - auto res = sycl::detail::sub_group::load<4, T>(src); - return {res.s0(), res.s1(), res.s2()}; + return { + sycl::detail::sub_group::load<1, T>(src), + sycl::detail::sub_group::load<2, T>(src + get_max_local_range()[0])}; } template