From 1a1413f98d0c523bc58de5f566d32ff9e1d3fbce Mon Sep 17 00:00:00 2001 From: Ewan Crawford Date: Fri, 18 Nov 2022 16:27:54 +0000 Subject: [PATCH] [SYCL] Repro for reduction fail * Add RUN lines to tests so that tests are run by LIT * clang-format existing tests, and other minor cleanups * Add `graph-explicit-reduction.cpp`, which reproduces the failure from https://github.com/reble/llvm/issues/24 by using the `sycl::ext::oneapi::property::queue::lazy_execution` property on a queue that submits a reduction outside of the graph building API --- sycl/test/graph/graph-explicit-dotp.cpp | 149 ++++++++++--------- sycl/test/graph/graph-explicit-reduction.cpp | 37 +++++ sycl/test/graph/graph-explicit-simple.cpp | 73 +++++---- 3 files changed, 149 insertions(+), 110 deletions(-) create mode 100644 sycl/test/graph/graph-explicit-reduction.cpp diff --git a/sycl/test/graph/graph-explicit-dotp.cpp b/sycl/test/graph/graph-explicit-dotp.cpp index 8525b04904b23..c32a839c919e9 100644 --- a/sycl/test/graph/graph-explicit-dotp.cpp +++ b/sycl/test/graph/graph-explicit-dotp.cpp @@ -1,94 +1,97 @@ -#include +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out #include +#include #include const size_t n = 10; float host_gold_result() { - float alpha = 1.0f; - float beta = 2.0f; - float gamma = 3.0f; - - float sum = 0.0f; - - for(size_t i = 0; i < n; ++i) { - sum += (alpha * 1.0f + beta * 2.0f) - * (gamma * 3.0f + beta * 2.0f); - } - - return sum; + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + float sum = 0.0f; + + for (size_t i = 0; i < n; ++i) { + sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f); + } + + return sum; } int main() { - float alpha = 1.0f; - float beta = 2.0f; - float gamma = 3.0f; - - float *x , *y, *z; - - sycl::property_list properties{ - sycl::property::queue::in_order(), - sycl::ext::oneapi::property::queue::lazy_execution{} - }; - - sycl::queue q{sycl::gpu_selector_v, properties}; - - sycl::ext::oneapi::experimental::command_graph g; - 
- float *dotp = sycl::malloc_shared(1, q); - - x = sycl::malloc_shared(n, q); - y = sycl::malloc_shared(n, q); - z = sycl::malloc_shared(n, q); - - /* init data on the device */ - auto n_i = g.add([&](sycl::handler &h) { - h.parallel_for(n, [=](sycl::id<1> it){ - const size_t i = it[0]; - x[i] = 1.0f; - y[i] = 2.0f; - z[i] = 3.0f; - }); + float alpha = 1.0f; + float beta = 2.0f; + float gamma = 3.0f; + + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + float *dotp = sycl::malloc_shared(1, q); + + float *x = sycl::malloc_shared(n, q); + float *y = sycl::malloc_shared(n, q); + float *z = sycl::malloc_shared(n, q); + + /* init data on the device */ + auto n_i = g.add([&](sycl::handler &h) { + h.parallel_for(n, [=](sycl::id<1> it) { + const size_t i = it[0]; + x[i] = 1.0f; + y[i] = 2.0f; + z[i] = 3.0f; }); + }); - auto node_a = g.add([&](sycl::handler &h) { + auto node_a = g.add( + [&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { - const size_t i = it[0]; - x[i] = alpha * x[i] + beta * y[i]; + const size_t i = it[0]; + x[i] = alpha * x[i] + beta * y[i]; }); - }, {n_i}); - - auto node_b = g.add([&](sycl::handler &h) { + }, + {n_i}); + + auto node_b = g.add( + [&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) { - const size_t i = it[0]; - z[i] = gamma * z[i] + beta * y[i]; + const size_t i = it[0]; + z[i] = gamma * z[i] + beta * y[i]; }); - }, {n_i}); + }, + {n_i}); - auto node_c = g.add([&](sycl::handler &h) { + auto node_c = g.add( + [&](sycl::handler &h) { h.parallel_for(sycl::range<1>{n}, sycl::reduction(dotp, 0.0f, std::plus()), [=](sycl::id<1> it, auto &sum) { - const size_t i = it[0]; - sum += x[i] * z[i]; + const size_t i = it[0]; + sum += x[i] * z[i]; }); - }, {node_a, node_b}); - - auto exec_graph = 
g.finalize(q.get_context()); - - exec_graph.exec_and_wait(q); - - if (*dotp != host_gold_result()) { - std::cout << "Error unexpected result!\n"; - } - - sycl::free(dotp, q); - sycl::free(x, q); - sycl::free(y, q); - sycl::free(z, q); - - std::cout << "done.\n"; - - return 0; -} \ No newline at end of file + }, + {node_a, node_b}); + + auto exec_graph = g.finalize(q.get_context()); + + exec_graph.exec_and_wait(q); + + if (*dotp != host_gold_result()) { + std::cout << "Error unexpected result!\n"; + } + + sycl::free(dotp, q); + sycl::free(x, q); + sycl::free(y, q); + sycl::free(z, q); + + std::cout << "done.\n"; + + return 0; +} diff --git a/sycl/test/graph/graph-explicit-reduction.cpp b/sycl/test/graph/graph-explicit-reduction.cpp new file mode 100644 index 0000000000000..9a2788079570c --- /dev/null +++ b/sycl/test/graph/graph-explicit-reduction.cpp @@ -0,0 +1,37 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +#include +#include + +#include + +int main() { + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + const size_t n = 10; + float *input = sycl::malloc_shared(n, q); + float *output = sycl::malloc_shared(1, q); + for (size_t i = 0; i < n; i++) { + input[i] = i; + } + + auto e = q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, + sycl::reduction(output, 0.0f, std::plus()), + [=](sycl::id<1> idx, auto &sum) { sum += input[idx]; }); + }); + + e.wait(); + + sycl::free(input, q); + sycl::free(output, q); + + std::cout << "done\n"; + + return 0; +} diff --git a/sycl/test/graph/graph-explicit-simple.cpp b/sycl/test/graph/graph-explicit-simple.cpp index 339d5b6e76150..d2f0098322172 100644 --- a/sycl/test/graph/graph-explicit-simple.cpp +++ b/sycl/test/graph/graph-explicit-simple.cpp @@ -1,43 +1,42 @@ -#include +// RUN: %clangxx -fsycl 
-fsycl-targets=%sycl_triple %s -o %t.out #include +#include #include -const size_t n = 10; - int main() { - - sycl::property_list properties{ - sycl::property::queue::in_order(), - sycl::ext::oneapi::property::queue::lazy_execution{} - }; - - //sycl::gpu_selector device_selector; - - sycl::queue q{sycl::gpu_selector_v, properties}; - - //sycl::queue copy_q{}; - - sycl::ext::oneapi::experimental::command_graph g; - - float *arr = sycl::malloc_shared(n, q); - - g.add( - [&](sycl::handler& h){ - h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx){size_t i = idx; arr[i]=1; }); + + sycl::property_list properties{ + sycl::property::queue::in_order{}, + sycl::ext::oneapi::property::queue::lazy_execution{}}; + + sycl::queue q{sycl::gpu_selector_v, properties}; + + sycl::ext::oneapi::experimental::command_graph g; + + const size_t n = 10; + float *arr = sycl::malloc_shared(n, q); + + g.add([&](sycl::handler &h) { + h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) { + size_t i = idx; + arr[i] = 1; }); - - auto result_before_exec1 = arr[0]; - - auto exec_graph = g.finalize(q.get_context()); - - auto result_before_exec2 = arr[0]; - - exec_graph.exec_and_wait(q); - - auto result = arr[0]; - - std::cout << "done.\n"; - - return 0; -} \ No newline at end of file + }); + + auto result_before_exec1 = arr[0]; + + auto exec_graph = g.finalize(q.get_context()); + + auto result_before_exec2 = arr[0]; + + exec_graph.exec_and_wait(q); + + auto result = arr[0]; + + sycl::free(arr, q); + + std::cout << "done.\n"; + + return 0; +}