From 7fae328f52d3a08234aa18c3f37123735aca0fea Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Fri, 4 Jun 2021 09:24:00 -0700 Subject: [PATCH 1/6] [SYCL] Force instantiation of parallel_for before use of KernelInfo With the new __builtin_sycl_unique_stable_name implementation, we need to make sure we instantiate the kernel call BEFORE we use the builtin via KernelInfo. We can do this by naming the function (but only if it is a static/global, so this also changes those functions to static). Additionally, we need to make sure we do this in the same way that parallel_for_wrapper does, otherwise we'll end up with 2 kernels instantiated with the same name. --- sycl/include/CL/sycl/handler.hpp | 36 +++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 0642f36588b6c..2777643b0d90e 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -796,6 +796,41 @@ class __SYCL_EXPORT handler { // Get the kernal name to check condition 3. std::string KName = typeid(NameT *).name(); + + // Force instantiation of the kernel before we use the kernel info to make + // sure __builtin_sycl_unique_stable_name doesn't cause problems. + // The instantiations of kernel_parallel_for must match the logic caused by + // kernel_parallel_for_wrapper. + using WrapperTy = + decltype(getRangeRoundedKernelLambda( + KernelFunc, NumWorkItems)); + using NameWT = typename detail::get_kernel_wrapper_name_t::name; +#ifdef __SYCL_NONCONST_FUNCTOR__ + using WrapperKernelParamTy = WrapperTy; + using KernelParamTy = KernelFunc; +#else + using WrapperKernelParamTy = const WrapperTy &; + using KernelParamTy = const KernelFunc &; +#endif + + if constexpr (detail::isKernelLambdaCallableWithKernelHandler< + WrapperTy, ElementType>()) { + (void)static_cast( + kernel_parallel_for); + } else { + (void)static_cast( + kernel_parallel_for); + } + + if constexpr (detail::isKernelLambdaCallableWithKernelHandler< + KernelType, ElementType>()) { + (void)static_cast( + kernel_parallel_for); + } else { + (void)static_cast( + kernel_parallel_for); + } + using KI = detail::KernelInfo; bool DisableRounding = (getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) || @@ -815,7 +850,6 @@ class __SYCL_EXPORT handler { // will yield a rounded-up value for the total range. size_t NewValX = ((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX; - using NameWT = typename detail::get_kernel_wrapper_name_t::name; if (getenv("SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE") != nullptr) std::cout << "parallel_for range adjusted from " << NumWorkItems[0] << " to " << NewValX << std::endl; From 6c946114c39eb2e21958e9510cf92efb4fb0de9d Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Fri, 4 Jun 2021 09:32:28 -0700 Subject: [PATCH 2/6] Forgot to mark parallel_for static --- sycl/include/CL/sycl/handler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 2777643b0d90e..a3deac2922ede 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -944,7 +944,7 @@ class __SYCL_EXPORT handler { // NOTE: the name of these functions - "kernel_parallel_for" - are used by the // Front End to determine kernel invocation kind. template - __SYCL_KERNEL_ATTR__ void + __SYCL_KERNEL_ATTR__ static void #ifdef __SYCL_NONCONST_FUNCTOR__ kernel_parallel_for(KernelType KernelFunc) { #else @@ -960,7 +960,7 @@ class __SYCL_EXPORT handler { // NOTE: the name of these functions - "kernel_parallel_for" - are used by the // Front End to determine kernel invocation kind. template - __SYCL_KERNEL_ATTR__ void + __SYCL_KERNEL_ATTR__ static void #ifdef __SYCL_NONCONST_FUNCTOR__ kernel_parallel_for(KernelType KernelFunc, kernel_handler KH) { #else From a4010907713c07a89122e3c41140ddaf6cdad59f Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Fri, 4 Jun 2021 13:04:49 -0700 Subject: [PATCH 3/6] Extract the instantiator out into its own type to simplify its use --- sycl/include/CL/sycl/handler.hpp | 56 ++++++++++++++++++-------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index a3deac2922ede..b763a96f85980 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -805,31 +805,11 @@ class __SYCL_EXPORT handler { decltype(getRangeRoundedKernelLambda( KernelFunc, NumWorkItems)); using NameWT = typename detail::get_kernel_wrapper_name_t::name; -#ifdef __SYCL_NONCONST_FUNCTOR__ - using WrapperKernelParamTy = WrapperTy; - using KernelParamTy = KernelFunc; -#else - using WrapperKernelParamTy = const WrapperTy &; - using KernelParamTy = const KernelFunc &; -#endif - - if constexpr (detail::isKernelLambdaCallableWithKernelHandler< - WrapperTy, ElementType>()) { - (void)static_cast( - kernel_parallel_for); - } else { - (void)static_cast( - kernel_parallel_for); - } - if constexpr (detail::isKernelLambdaCallableWithKernelHandler< - KernelType, ElementType>()) { - (void)static_cast( - kernel_parallel_for); - } else { - (void)static_cast( - kernel_parallel_for); - } + (void)kernel_parallel_for_wrapper_instantiator::value; + (void)kernel_parallel_for_wrapper_instantiator::value; using KI = detail::KernelInfo; bool DisableRounding = @@ -1050,6 +1030,34 @@ class __SYCL_EXPORT handler { } } + // Helper instantiator type for kernel_parallel_for_wrapper that + // instantiates but not calls the appropriate kernel. Needed to support use of + // KernelInfo in parallel_for_lambda_impl when supporting + // SCYL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING. + template + class kernel_parallel_for_wrapper_instantiator { + static constexpr auto func_loader() { +#ifdef __SYCL_NONCONST_FUNCTOR__ + using ParamTy = KernelType; +#else + using ParamTy = const KernelType &; +#endif + if constexpr (detail::isKernelLambdaCallableWithKernelHandler< + KernelType, ElementType>()) { + using FuncTy = void (*)(ParamTy, kernel_handler); + return static_cast( + kernel_parallel_for); + } else { + using FuncTy = void (*)(ParamTy); + return static_cast( + kernel_parallel_for); + } + } + + public: + static constexpr auto value = func_loader(); + }; + // Wrappers for kernel_parallel_for_work_group(...) template From 9bb9bcda63bb885d0e1f15dba188a4234b895436 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Fri, 4 Jun 2021 13:09:04 -0700 Subject: [PATCH 4/6] Clang-format fix --- sycl/include/CL/sycl/handler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index b763a96f85980..4d6d58199960e 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -807,9 +807,9 @@ class __SYCL_EXPORT handler { using NameWT = typename detail::get_kernel_wrapper_name_t::name; (void)kernel_parallel_for_wrapper_instantiator::value; + WrapperTy>::value; (void)kernel_parallel_for_wrapper_instantiator::value; + KernelType>::value; using KI = detail::KernelInfo; bool DisableRounding = From f60aaa7a3b2dc6c8d9ef6503c907a61c589fa995 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Fri, 4 Jun 2021 16:27:20 -0700 Subject: [PATCH 5/6] Revert most of the change, just remove the DISABLE_PARALLEL_FOR_RANGE_ROUNDING feature. --- sycl/include/CL/sycl/handler.hpp | 58 +++----------------------------- 1 file changed, 4 insertions(+), 54 deletions(-) diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 4d6d58199960e..ce8c25268c717 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -797,39 +797,17 @@ class __SYCL_EXPORT handler { // Get the kernal name to check condition 3. std::string KName = typeid(NameT *).name(); - // Force instantiation of the kernel before we use the kernel info to make - // sure __builtin_sycl_unique_stable_name doesn't cause problems. - // The instantiations of kernel_parallel_for must match the logic caused by - // kernel_parallel_for_wrapper. - using WrapperTy = - decltype(getRangeRoundedKernelLambda( - KernelFunc, NumWorkItems)); - using NameWT = typename detail::get_kernel_wrapper_name_t::name; - - (void)kernel_parallel_for_wrapper_instantiator::value; - (void)kernel_parallel_for_wrapper_instantiator::value; - - using KI = detail::KernelInfo; - bool DisableRounding = - (getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) || - (KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != - std::string::npos) || - (KI::getName() == nullptr || KI::getName()[0] == '\0') || - (KI::callsThisItem()); - // Perform range rounding if rounding-up is enabled // and there are sufficient work-items to need rounding // and the user-specified range is not a multiple of a "good" value. - if (!DisableRounding && (NumWorkItems[0] >= MinRangeX) && - (NumWorkItems[0] % MinFactorX != 0)) { + if ((NumWorkItems[0] >= MinRangeX) && (NumWorkItems[0] % MinFactorX != 0)) { // It is sufficient to round up just the first dimension. // Multiplying the rounded-up value of the first dimension // by the values of the remaining dimensions (if any) // will yield a rounded-up value for the total range. size_t NewValX = ((NumWorkItems[0] + GoodFactorX - 1) / GoodFactorX) * GoodFactorX; + using NameWT = typename detail::get_kernel_wrapper_name_t::name; if (getenv("SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE") != nullptr) std::cout << "parallel_for range adjusted from " << NumWorkItems[0] << " to " << NewValX << std::endl; @@ -924,7 +902,7 @@ class __SYCL_EXPORT handler { // NOTE: the name of these functions - "kernel_parallel_for" - are used by the // Front End to determine kernel invocation kind. template - __SYCL_KERNEL_ATTR__ static void + __SYCL_KERNEL_ATTR__ void #ifdef __SYCL_NONCONST_FUNCTOR__ kernel_parallel_for(KernelType KernelFunc) { #else @@ -940,7 +918,7 @@ class __SYCL_EXPORT handler { // NOTE: the name of these functions - "kernel_parallel_for" - are used by the // Front End to determine kernel invocation kind. template - __SYCL_KERNEL_ATTR__ static void + __SYCL_KERNEL_ATTR__ void #ifdef __SYCL_NONCONST_FUNCTOR__ kernel_parallel_for(KernelType KernelFunc, kernel_handler KH) { #else @@ -1030,34 +1008,6 @@ class __SYCL_EXPORT handler { } } - // Helper instantiator type for kernel_parallel_for_wrapper that - // instantiates but not calls the appropriate kernel. Needed to support use of - // KernelInfo in parallel_for_lambda_impl when supporting - // SCYL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING. - template - class kernel_parallel_for_wrapper_instantiator { - static constexpr auto func_loader() { -#ifdef __SYCL_NONCONST_FUNCTOR__ - using ParamTy = KernelType; -#else - using ParamTy = const KernelType &; -#endif - if constexpr (detail::isKernelLambdaCallableWithKernelHandler< - KernelType, ElementType>()) { - using FuncTy = void (*)(ParamTy, kernel_handler); - return static_cast( - kernel_parallel_for); - } else { - using FuncTy = void (*)(ParamTy); - return static_cast( - kernel_parallel_for); - } - } - - public: - static constexpr auto value = func_loader(); - }; - // Wrappers for kernel_parallel_for_work_group(...) template From 70d05d180d448c0dd03acf6aa3dbf9736a87bd46 Mon Sep 17 00:00:00 2001 From: Alexey Sachkov Date: Mon, 7 Jun 2021 14:38:06 +0300 Subject: [PATCH 6/6] Add a few regression tests --- sycl/test/on-device/range-rounding.cpp | 187 ++++++++++++++++++++++++ sycl/test/regression/unnamed-lambda.cpp | 10 ++ 2 files changed, 197 insertions(+) create mode 100644 sycl/test/on-device/range-rounding.cpp create mode 100644 sycl/test/regression/unnamed-lambda.cpp diff --git a/sycl/test/on-device/range-rounding.cpp b/sycl/test/on-device/range-rounding.cpp new file mode 100644 index 0000000000000..fe8194ce5b738 --- /dev/null +++ b/sycl/test/on-device/range-rounding.cpp @@ -0,0 +1,187 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// REQUIRES: cpu +// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER + +#include + +using namespace sycl; + +range<1> Range1 = {0}; +range<2> Range2 = {0, 0}; +range<3> Range3 = {0, 0, 0}; + +void check(const char *msg, size_t v, size_t ref) { + std::cout << msg << v << std::endl; + assert(v == ref); +} + +int try_item1(size_t size) { + range<1> Size{size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range1, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; + + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](item<1> ITEM) { + AccCounter[0].fetch_add(1); + AccRange[0] = this_item<1>().get_range(0); + }); + }); + myQueue.wait(); + } + check("Size seen by user = ", Range1.get(0), size); + check("Counter = ", Counter, size); + return 0; +} + +void try_item2(size_t size) { + range<2> Size{size, size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range2, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; + + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](item<2> ITEM) { + AccCounter[0].fetch_add(1); + AccRange[0][0] = this_item<2>().get_range(0); + }); + }); + myQueue.wait(); + } + check("Size seen by user = ", Range2.get(0), size); + check("Counter = ", Counter, size * size); +} + +void try_item3(size_t size) { + range<3> Size{size, size, size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range3, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; + + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](item<3> ITEM) { + AccCounter[0].fetch_add(1); + AccRange[0][0] = this_item<3>().get_range(0); + }); + }); + myQueue.wait(); + } + check("Size seen by user = ", Range3.get(0), size); + check("Counter = ", Counter, size * size * size); +} + +void try_id1(size_t size) { + range<1> Size{size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range1, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; + + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](id<1> ID) { + AccCounter[0].fetch_add(1); + AccRange[0] = this_id<1>()[0]; + }); + }); + myQueue.wait(); + } + check("Counter = ", Counter, size); +} + +void try_id2(size_t size) { + range<2> Size{size, size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range2, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; + + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](id<2> ID) { + AccCounter[0].fetch_add(1); + AccRange[0][0] = this_id<2>()[0]; + }); + }); + myQueue.wait(); + } + check("Counter = ", Counter, size * size); +} + +void try_id3(size_t size) { + range<3> Size{size, size, size}; + int Counter = 0; + { + buffer, 1> BufRange(&Range3, 1); + buffer BufCounter(&Counter, 1); + queue myQueue; + + myQueue.submit([&](handler &cgh) { + auto AccRange = BufRange.get_access(cgh); + auto AccCounter = BufCounter.get_access(cgh); + cgh.parallel_for(Size, [=](id<3> ID) { + AccCounter[0].fetch_add(1); + AccRange[0][0] = this_id<3>()[0]; + }); + }); + myQueue.wait(); + } + check("Counter = ", Counter, size * size * size); +} + +int main() { + int x; + + x = 10; + try_item1(x); + try_item2(x); + try_item3(x); + try_id1(x); + try_id2(x); + try_id3(x); + + x = 1025; + try_item1(x); + try_item2(x); + try_item3(x); + try_id1(x); + try_id2(x); + try_id3(x); + + return 0; +} + +// CHECK: Size seen by user = 10 +// CHECK-NEXT: Counter = 10 +// CHECK-NEXT: Size seen by user = 10 +// CHECK-NEXT: Counter = 100 +// CHECK-NEXT: Size seen by user = 10 +// CHECK-NEXT: Counter = 1000 +// CHECK-NEXT: Counter = 10 +// CHECK-NEXT: Counter = 100 +// CHECK-NEXT: Counter = 1000 +// CHECK-NEXT: Size seen by user = 1025 +// CHECK-NEXT: Counter = 1025 +// CHECK-NEXT: Size seen by user = 1025 +// CHECK-NEXT: Counter = 1050625 +// CHECK-NEXT: Size seen by user = 1025 +// CHECK-NEXT: Counter = 1076890625 +// CHECK-NEXT: Counter = 1025 +// CHECK-NEXT: Counter = 1050625 +// CHECK-NEXT: Counter = 1076890625 diff --git a/sycl/test/regression/unnamed-lambda.cpp b/sycl/test/regression/unnamed-lambda.cpp new file mode 100644 index 0000000000000..162bf86189dbb --- /dev/null +++ b/sycl/test/regression/unnamed-lambda.cpp @@ -0,0 +1,10 @@ +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-device-only -c %s -o %t.temp + +#include "CL/sycl.hpp" + +void foo(cl::sycl::queue queue) { + cl::sycl::event queue_event2 = queue.submit([&](cl::sycl::handler &cgh) { + cgh.parallel_for(cl::sycl::range<1>{1}, + [=](cl::sycl::item<1> id) {}); + }); +}