-
Notifications
You must be signed in to change notification settings - Fork 803
Closed
Labels
Description
Describe the bug
sycl::vec "as" operation usage leads to significant performance drop. Is this expected?
Have to use reinterpret_cast workaround to get expected performance back.
To Reproduce
Test program:
# cat as.cpp
#include <sycl.hpp>
#include <chrono>
const unsigned N = 1024 * 1024;
sycl::float4 Mem[N] = { sycl::float4(0) };
// Kernel as1 execution time: 17.643s
inline sycl::float4 as1(sycl::float4 a, const uint32_t b) {
return (a.as<sycl::uint4>() & b).as<sycl::float4>();
}
// Kernel as2 execution time: 0.052s
inline sycl::float4 as2(const sycl::float4 a, const uint32_t b) {
const sycl::uint4 i = reinterpret_cast<const sycl::uint4&>(a) & b;
return reinterpret_cast<const sycl::float4&>(i);
}
typedef sycl::float4 (*FOO)(sycl::float4, uint32_t);
template<FOO foo> void do_sycl(const char* const name) {
try {
auto exception_handler = [] (sycl::exception_list exceptions) {
for (std::exception_ptr const& e : exceptions) {
try {
std::rethrow_exception(e);
} catch(sycl::exception const& e) {
std::cerr << "Caught asynchronous SYCL exception:\n" << e.what() << std::endl;
}
}
};
auto q = sycl::queue{sycl::gpu_selector_v, exception_handler};
auto bMem = sycl::buffer(Mem, sycl::range(N));
// compile kernels
auto kb_begin = std::chrono::high_resolution_clock::now();
auto kb = sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
{ auto kb_end = std::chrono::high_resolution_clock::now();
auto kb_elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(kb_end - kb_begin);
printf("Kernel %s compile time: %.3fs\n", name, kb_elapsed.count() * 1e-9);
}
auto begin = std::chrono::high_resolution_clock::now();
Mem[0] = 1;
q.submit([&](sycl::handler& h) {
auto mem = bMem.get_access<sycl::access::mode::read_write>(h);
h.use_kernel_bundle(kb);
h.parallel_for(sycl::range(N), [=](sycl::id<1> n) {
for (int i = 0; i != 1000*1000; ++ i)
mem[n] = foo(mem[n], 0x3F400000);
});
});
q.wait_and_throw();
{ auto end = std::chrono::high_resolution_clock::now();
auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin);
printf("Kernel %s execution time: %.3fs\n", name, elapsed.count() * 1e-9);
}
} catch (sycl::exception const& e) {
std::cerr << "Caught synchronous SYCL exception:\n" << e.what() << std::endl;
}
}
int main() {
do_sycl<as1>("as1");
if (Mem[0][0] != 0.5f) {
printf("FAILED: %f (%X)\n", Mem[0][0], *(unsigned*)Mem);
return 1;
}
do_sycl<as2>("as2");
if (Mem[0][0] != 0.5f) {
printf("FAILED: %f (%X)\n", Mem[0][0], *(unsigned*)Mem);
return 1;
}
puts("PASSED");
return 0;
}
Test program output:
# clang++ -fsycl as.cpp && time ./a.out
Kernel as1 compile time: 0.115s
Kernel as1 execution time: 17.648s
Kernel as2 compile time: 0.000s
Kernel as2 execution time: 0.052s
PASSED
real 0m17.882s
user 0m7.578s
sys 0m10.291s
Environment (please complete the following information):
- OS: Ubuntu 22.04
- Target device and vendor: Intel Arc A380 GPU
- DPC++ version: clang version 16.0.0 (https://github.com/intel/llvm 1299e21
- Dependencies version: ghcr.io/intel/llvm/sycl_ubuntu2004_nightly:latest docker container
zjin-lcf