Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SYCL] Force instantiation of parallel_for before use of KernelInfo #3886

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 1 addition & 9 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -796,19 +796,11 @@ class __SYCL_EXPORT handler {

// Get the kernal name to check condition 3.
std::string KName = typeid(NameT *).name();
using KI = detail::KernelInfo<KernelName>;
bool DisableRounding =
(getenv("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") != nullptr) ||
(KName.find("SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING") !=
std::string::npos) ||
(KI::getName() == nullptr || KI::getName()[0] == '\0') ||
(KI::callsThisItem());
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm concerned about this particular change. Not calling this_item free functions is a requirement for range rounding feature. Having all tests passed means that we don't have coverage for this.

@romanovvlad, @rdeodhar, what do you think?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test SYCL/Basic/free_function_queries/free_function_queries.cpp tested this. However, for performance reasons, the heuristics for range rounding were later adjusted so that rounding is only done when the range >= 1024. The test uses a range of 10 so the kernel that uses this_item is not subject to range rounding and the test passes.
The test will need to be modified to increase the range (I will do it separately).
So removing that check from handler.hpp is not correct.


// Perform range rounding if rounding-up is enabled
// and there are sufficient work-items to need rounding
// and the user-specified range is not a multiple of a "good" value.
if (!DisableRounding && (NumWorkItems[0] >= MinRangeX) &&
(NumWorkItems[0] % MinFactorX != 0)) {
if ((NumWorkItems[0] >= MinRangeX) && (NumWorkItems[0] % MinFactorX != 0)) {
// It is sufficient to round up just the first dimension.
// Multiplying the rounded-up value of the first dimension
// by the values of the remaining dimensions (if any)
Expand Down
187 changes: 187 additions & 0 deletions sycl/test/on-device/range-rounding.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,187 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// REQUIRES: cpu
// RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_TRACE=1 %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER

#include <CL/sycl.hpp>

using namespace sycl;

range<1> Range1 = {0};
range<2> Range2 = {0, 0};
range<3> Range3 = {0, 0, 0};

void check(const char *msg, size_t v, size_t ref) {
std::cout << msg << v << std::endl;
assert(v == ref);
}

int try_item1(size_t size) {
range<1> Size{size};
int Counter = 0;
{
buffer<range<1>, 1> BufRange(&Range1, 1);
buffer<int, 1> BufCounter(&Counter, 1);
queue myQueue;

myQueue.submit([&](handler &cgh) {
auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
cgh.parallel_for<class PF_init_item1>(Size, [=](item<1> ITEM) {
AccCounter[0].fetch_add(1);
AccRange[0] = this_item<1>().get_range(0);
});
});
myQueue.wait();
}
check("Size seen by user = ", Range1.get(0), size);
check("Counter = ", Counter, size);
return 0;
}

void try_item2(size_t size) {
range<2> Size{size, size};
int Counter = 0;
{
buffer<range<2>, 1> BufRange(&Range2, 1);
buffer<int, 1> BufCounter(&Counter, 1);
queue myQueue;

myQueue.submit([&](handler &cgh) {
auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
cgh.parallel_for<class PF_init_item2>(Size, [=](item<2> ITEM) {
AccCounter[0].fetch_add(1);
AccRange[0][0] = this_item<2>().get_range(0);
});
});
myQueue.wait();
}
check("Size seen by user = ", Range2.get(0), size);
check("Counter = ", Counter, size * size);
}

void try_item3(size_t size) {
range<3> Size{size, size, size};
int Counter = 0;
{
buffer<range<3>, 1> BufRange(&Range3, 1);
buffer<int, 1> BufCounter(&Counter, 1);
queue myQueue;

myQueue.submit([&](handler &cgh) {
auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
cgh.parallel_for<class PF_init_item3>(Size, [=](item<3> ITEM) {
AccCounter[0].fetch_add(1);
AccRange[0][0] = this_item<3>().get_range(0);
});
});
myQueue.wait();
}
check("Size seen by user = ", Range3.get(0), size);
check("Counter = ", Counter, size * size * size);
}

void try_id1(size_t size) {
range<1> Size{size};
int Counter = 0;
{
buffer<range<1>, 1> BufRange(&Range1, 1);
buffer<int, 1> BufCounter(&Counter, 1);
queue myQueue;

myQueue.submit([&](handler &cgh) {
auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
cgh.parallel_for<class PF_init_id1>(Size, [=](id<1> ID) {
AccCounter[0].fetch_add(1);
AccRange[0] = this_id<1>()[0];
});
});
myQueue.wait();
}
check("Counter = ", Counter, size);
}

void try_id2(size_t size) {
range<2> Size{size, size};
int Counter = 0;
{
buffer<range<2>, 1> BufRange(&Range2, 1);
buffer<int, 1> BufCounter(&Counter, 1);
queue myQueue;

myQueue.submit([&](handler &cgh) {
auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
cgh.parallel_for<class PF_init_id2>(Size, [=](id<2> ID) {
AccCounter[0].fetch_add(1);
AccRange[0][0] = this_id<2>()[0];
});
});
myQueue.wait();
}
check("Counter = ", Counter, size * size);
}

void try_id3(size_t size) {
range<3> Size{size, size, size};
int Counter = 0;
{
buffer<range<3>, 1> BufRange(&Range3, 1);
buffer<int, 1> BufCounter(&Counter, 1);
queue myQueue;

myQueue.submit([&](handler &cgh) {
auto AccRange = BufRange.get_access<access::mode::read_write>(cgh);
auto AccCounter = BufCounter.get_access<access::mode::atomic>(cgh);
cgh.parallel_for<class PF_init_id3>(Size, [=](id<3> ID) {
AccCounter[0].fetch_add(1);
AccRange[0][0] = this_id<3>()[0];
});
});
myQueue.wait();
}
check("Counter = ", Counter, size * size * size);
}

int main() {
int x;

x = 10;
try_item1(x);
try_item2(x);
try_item3(x);
try_id1(x);
try_id2(x);
try_id3(x);

x = 1025;
try_item1(x);
try_item2(x);
try_item3(x);
try_id1(x);
try_id2(x);
try_id3(x);

return 0;
}

// CHECK: Size seen by user = 10
// CHECK-NEXT: Counter = 10
// CHECK-NEXT: Size seen by user = 10
// CHECK-NEXT: Counter = 100
// CHECK-NEXT: Size seen by user = 10
// CHECK-NEXT: Counter = 1000
// CHECK-NEXT: Counter = 10
// CHECK-NEXT: Counter = 100
// CHECK-NEXT: Counter = 1000
// CHECK-NEXT: Size seen by user = 1025
// CHECK-NEXT: Counter = 1025
// CHECK-NEXT: Size seen by user = 1025
// CHECK-NEXT: Counter = 1050625
// CHECK-NEXT: Size seen by user = 1025
// CHECK-NEXT: Counter = 1076890625
// CHECK-NEXT: Counter = 1025
// CHECK-NEXT: Counter = 1050625
// CHECK-NEXT: Counter = 1076890625
10 changes: 10 additions & 0 deletions sycl/test/regression/unnamed-lambda.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-device-only -c %s -o %t.temp
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@erichkeane, this test should probably be copied into #3894 to ensure that we have fixed the original problem

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done, should be in the latest patch.


#include "CL/sycl.hpp"

void foo(cl::sycl::queue queue) {
cl::sycl::event queue_event2 = queue.submit([&](cl::sycl::handler &cgh) {
cgh.parallel_for<class K1>(cl::sycl::range<1>{1},
[=](cl::sycl::item<1> id) {});
});
}