From a3ed51a71b0db96c6474d1397b1f542d4a8477d0 Mon Sep 17 00:00:00 2001 From: Jeff Hammond Date: Mon, 26 Oct 2020 10:59:21 -0700 Subject: [PATCH] add LOCAL_SIZE nd_range to SYCL to optimize for V100 (no observed negative impact on x86) Hades Canyon NUC verification with Intel DPC++ from oneAPI beta09. $ dpcpp -DSYCL -O3 main.cpp SYCLStream.cpp $ ./a.out --device 0 BabelStream Version: 3.4 Implementation: SYCL Running kernels 100 times Precision: double Array size: 268.4 MB (=0.3 GB) Total size: 805.3 MB (=0.8 GB) Using SYCL device Intel(R) Core(TM) i7-8809G CPU @ 3.10GHz Driver: 2020.11.8.0.27 Reduction kernel config: 8 groups of size 8 Function MBytes/sec Min (sec) Max Average Copy 20332.291 0.02640 0.02767 0.02659 Mul 20267.131 0.02649 0.02714 0.02658 Add 22927.175 0.03512 0.03744 0.03546 Triad 22780.968 0.03535 0.03585 0.03545 Dot 31974.622 0.01679 0.04985 0.02091 $ ./a.out --device 1 BabelStream Version: 3.4 Implementation: SYCL Running kernels 100 times Precision: double Array size: 268.4 MB (=0.3 GB) Total size: 805.3 MB (=0.8 GB) Using SYCL device Intel(R) Gen9 HD Graphics NEO Driver: 20.18.16699 Reduction kernel config: 96 groups of size 256 Function MBytes/sec Min (sec) Max Average Copy 32263.941 0.01664 0.02050 0.01698 Mul 32463.600 0.01654 0.01950 0.01690 Add 31267.789 0.02576 0.02953 0.02617 Triad 31988.618 0.02517 0.02876 0.02563 Dot 28149.343 0.01907 0.03302 0.02016 Signed-off-by: Jeff Hammond --- SYCLStream.cpp | 16 ++++++++++------ SYCLStream.h | 2 ++ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/SYCLStream.cpp b/SYCLStream.cpp index 89605305..0d4b2293 100644 --- a/SYCLStream.cpp +++ b/SYCLStream.cpp @@ -91,8 +91,9 @@ void SYCLStream::copy() { auto ka = d_a->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + cgh.parallel_for(nd_range<1>{array_size,LOCAL_SIZE}, [=](nd_item<1> it) { + const auto idx = it.get_global_id(0); kc[idx] = ka[idx]; }); }); @@ -107,8 +108,9 @@ void SYCLStream::mul() { auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + cgh.parallel_for(nd_range<1>{array_size,LOCAL_SIZE}, [=](nd_item<1> it) { + const auto idx = it.get_global_id(0); kb[idx] = scalar * kc[idx]; }); }); @@ -123,8 +125,9 @@ void SYCLStream::add() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + cgh.parallel_for(nd_range<1>{array_size,LOCAL_SIZE}, [=](nd_item<1> it) { + const auto idx = it.get_global_id(0); kc[idx] = ka[idx] + kb[idx]; }); }); @@ -140,8 +143,9 @@ void SYCLStream::triad() auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](id<1> idx) + cgh.parallel_for(nd_range<1>{array_size,LOCAL_SIZE}, [=](nd_item<1> it) { + const auto idx = it.get_global_id(0); ka[idx] = kb[idx] + scalar * kc[idx]; }); }); @@ -201,9 +205,9 @@ void SYCLStream::init_arrays(T initA, T initB, T initC) auto ka = d_a->template get_access(cgh); auto kb = d_b->template get_access(cgh); auto kc = d_c->template get_access(cgh); - cgh.parallel_for(range<1>{array_size}, [=](item<1> item) + cgh.parallel_for(nd_range<1>{array_size,LOCAL_SIZE}, [=](nd_item<1> item) { - auto id = item.get_id(0); + const auto id = item.get_global_id(0); ka[id] = initA; kb[id] = initB; kc[id] = initC; diff --git a/SYCLStream.h b/SYCLStream.h index ab62ecde..fbfe4764 100644 --- a/SYCLStream.h +++ b/SYCLStream.h @@ -15,6 +15,8 @@ #define IMPLEMENTATION_STRING "SYCL" +#define LOCAL_SIZE 256 + namespace sycl_kernels { template class init;