diff --git a/sycl/include/sycl/reduction.hpp b/sycl/include/sycl/reduction.hpp index bc84422033fbe..bf8abd1332ce2 100644 --- a/sycl/include/sycl/reduction.hpp +++ b/sycl/include/sycl/reduction.hpp @@ -915,7 +915,7 @@ struct NDRangeReduction< auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH); accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init); - bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity(); + bool IsUpdateOfUserVar = !Redu.initializeToIdentity(); auto Rest = [&](auto NWorkGroupsFinished) { local_accessor DoReducePartialSumsInLastWG{1, CGH}; @@ -1480,8 +1480,6 @@ template <> struct NDRangeReduction { // group size may be not power of two. Those two cases considered // inefficient as they require additional code and checks in the kernel. bool HasUniformWG = NWorkGroups * WGSize == NWorkItems; - if (!Reduction::has_fast_reduce) - HasUniformWG = HasUniformWG && (WGSize & (WGSize - 1)) == 0; // Get read accessor to the buffer that was used as output // in the previous kernel. @@ -1493,7 +1491,7 @@ template <> struct NDRangeReduction { !Redu.initializeToIdentity() && NWorkGroups == 1; - bool UniformPow2WG = HasUniformWG; + bool UniformPow2WG = HasUniformWG && (WGSize & (WGSize - 1)) == 0; // Use local memory to reduce elements in work-groups into 0-th element. // If WGSize is not power of two, then WGSize+1 elements are allocated. // The additional last element is used to catch elements that could