diff --git a/SYCL/Basic/aspects.cpp b/SYCL/Basic/aspects.cpp new file mode 100644 index 0000000000..bc5b04f4ff --- /dev/null +++ b/SYCL/Basic/aspects.cpp @@ -0,0 +1,106 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be %t.out +// +// Hip is missing some of the parameters tested here so it fails with ROCm for +// NVIDIA +// XFAIL: rocm_nvidia + +//==--------------- aspects.cpp - SYCL device test ------------------------==// +// +// Returns the various aspects of a device and platform. +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +using namespace cl::sycl; + +// platform::has() calls device::has() for each device on the platform. + +int main() { + bool failed = false; + int pltIdx = 0; + for (const auto &plt : platform::get_platforms()) { + pltIdx++; + if (plt.has(aspect::host)) { + std::cout << "Platform #" << pltIdx + << " type: Host supports:" << std::endl; + } else if (plt.has(aspect::cpu)) { + std::cout << "Platform #" << pltIdx + << " type: CPU supports:" << std::endl; + } else if (plt.has(aspect::gpu)) { + std::cout << "Platform #" << pltIdx + << " type: GPU supports:" << std::endl; + } else if (plt.has(aspect::accelerator)) { + std::cout << "Platform #" << pltIdx + << " type: Accelerator supports:" << std::endl; + } else if (plt.has(aspect::custom)) { + std::cout << "Platform #" << pltIdx + << " type: Custom supports:" << std::endl; + } else { + failed = true; + std::cout << "Failed: platform #" << pltIdx << " type: unknown" + << std::endl; + return 1; + } + + if (plt.has(aspect::fp16)) { + std::cout << " fp16" << std::endl; + } + if (plt.has(aspect::fp64)) { + std::cout << " fp64" << std::endl; + } + if (plt.has(aspect::int64_base_atomics)) { + std::cout << " base atomic operations" << std::endl; + } + if (plt.has(aspect::int64_extended_atomics)) { + std::cout << " extended atomic operations" << std::endl; + } + if (plt.has(aspect::atomic64)) { + std::cout << " atomic64" << std::endl; + } + if (plt.has(aspect::image)) { + std::cout << " images" << std::endl; + } + if (plt.has(aspect::online_compiler)) { + std::cout << " online compiler" << std::endl; + } + if (plt.has(aspect::online_linker)) { + std::cout << " online linker" << std::endl; + } + if (plt.has(aspect::queue_profiling)) { + std::cout << " queue profiling" << std::endl; + } + if (plt.has(aspect::usm_device_allocations)) { + std::cout << " USM allocations" << std::endl; + } + if (plt.has(aspect::usm_host_allocations)) { + std::cout << " USM host allocations" << std::endl; + } + if (plt.has(aspect::usm_atomic_host_allocations)) { + std::cout << " USM atomic host allocations" << std::endl; + } + if (plt.has(aspect::usm_shared_allocations)) { + std::cout << " USM shared allocations" << std::endl; + } + if (plt.has(aspect::usm_atomic_shared_allocations)) { + std::cout << " USM atomic shared allocations" << std::endl; + } + if (plt.has(aspect::usm_restricted_shared_allocations)) { + std::cout << " USM restricted shared allocations" << std::endl; + } + if (plt.has(aspect::usm_system_allocator)) { + std::cout << " USM system allocator" << std::endl; + } + if (plt.has(aspect::usm_system_allocations)) { + std::cout << " USM system allocations" << std::endl; + } + } + std::cout << "Passed." << std::endl; + return 0; +} diff --git a/SYCL/Basic/diagnostics/device-check.cpp b/SYCL/Basic/diagnostics/device-check.cpp new file mode 100644 index 0000000000..931881cc28 --- /dev/null +++ b/SYCL/Basic/diagnostics/device-check.cpp @@ -0,0 +1,51 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=cpu %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=gpu %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=acc %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=host %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=CPU %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=GPU %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=ACC %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=HOST %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=Cpu %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=Gpu %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=Acc %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=Host %t.out +// RUN: env SYCL_DEVICE_FILTER=%sycl_be SYCL_DEVICE_TYPE=XPU %t.out + +//==------------------- device-check.cpp --------------------------==// +// This is a diagnostic test which ensures that +// device types are case-insensitive. +// It also checks for SYCL_DEVICE being set incorrectly. +//==---------------------------------------------------------------==// + +#include +#include + +using namespace cl::sycl; + +int main() { + try { + queue q = queue(); + auto device = q.get_device(); + auto deviceName = device.get_info(); + std::cout << " Device Name: " << deviceName << std::endl; + } + + catch (runtime_error &E) { + if (std::string(E.what()).find("SYCL_DEVICE_TYPE is not recognized. Must " + "be GPU, CPU, ACC or HOST.") == + std::string::npos && + std::string(E.what()).find("No device of requested type available.") == + std::string::npos) { + std::cout << "Test failed: received error is incorrect." << std::endl; + return 1; + } else { + std::cout << "Test passed: caught the expected error." << std::endl; + return 0; + } + } + + std::cout << "Test passed: results are correct." << std::endl; + return 0; +} diff --git a/SYCL/Basic/image/srgba-read.cpp b/SYCL/Basic/image/srgba-read.cpp new file mode 100644 index 0000000000..dd377af36d --- /dev/null +++ b/SYCL/Basic/image/srgba-read.cpp @@ -0,0 +1,136 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out %CPU_CHECK_PLACEHOLDER +// RUN: %GPU_RUN_PLACEHOLDER %t.out %GPU_CHECK_PLACEHOLDER + +// XFAIL: level_zero +// UNSUPPORTED: cuda +// UNSUPPORTED: rocm_nvidia +// UNSUPPORTED: rocm_amd + +#include + +using namespace cl::sycl; + +using accessorPixelT = sycl::float4; +using dataPixelT = uint32_t; + +// will output a pixel as {r,g,b,a}. provide override if a different pixelT is +// defined. +void outputPixel(sycl::float4 somePixel) { + std::cout << "{" << somePixel[0] << "," << somePixel[1] << "," << somePixel[2] + << "," << somePixel[3] << "} "; +} + +constexpr long width = 4; +constexpr long height = 3; + +void test_rd(image_channel_order ChanOrder, image_channel_type ChanType) { + + int numTests = 4; // drives the size of the testResults buffer, and the number + // of report iterations. Kludge. + + // this should yield a read of approximate 0.5 for each channel + // when read directly with a normal non-linearized image (e.g. + // image_channel_order::rgba). For sRGB + // (image_channel_order::ext_oneapi_srgba), this is the value with maximum + // conversion. So we should read values of approximately 0.2 + dataPixelT basicPixel{127 << 24 | 127 << 16 | 127 << 8 | 127}; + + queue Q; + const sycl::range<2> ImgRange_2D(width, height); + + // IMPORTANT: const data is *required* for sRGBA images. + // OpenCL support is limited for 2D/3D images that are read only. + const std::vector ImgData(ImgRange_2D.size(), basicPixel); + try { // closure + + image<2> image_2D(ImgData.data(), ChanOrder, ChanType, ImgRange_2D); + // use a buffer to report back test results. + buffer testResults((range<1>(numTests))); + + Q.submit([&](handler &cgh) { + auto image_acc = + image_2D.get_access(cgh); + auto test_acc = testResults.get_access(cgh); + + cgh.single_task([=]() { + int i = 0; // the index for writing into the testResult buffer. + + // verify our four pixels were set up correctly. + // 0-3 read four pixels. no sampler + test_acc[i++] = image_acc.read(sycl::int2{0, 0}); + test_acc[i++] = image_acc.read(sycl::int2{1, 0}); + test_acc[i++] = image_acc.read(sycl::int2{0, 1}); + test_acc[i++] = image_acc.read(sycl::int2{2, 2}); + }); + }); + Q.wait_and_throw(); + + // REPORT RESULTS + auto test_acc = testResults.get_access(); + for (int i = 0, idx = 0; i < numTests; i++, idx++) { + if (i == 0) { + idx = 0; + std::cout << "read four pixels, no sampler" << std::endl; + } + + accessorPixelT testPixel = test_acc[i]; + std::cout << i << /* " -- " << idx << */ ": "; + outputPixel(testPixel); + std::cout << std::endl; + } + } catch (sycl::exception e) { + std::cout << "exception caught: " << e.what() << std::endl; + } // ~image / ~buffer +} + +int main() { + +#ifdef SYCL_EXT_ONEAPI_SRGB + std::cout << "SYCL_EXT_ONEAPI_SRGB defined" << std::endl; +#endif + + queue Q; + device D = Q.get_device(); + + // test aspect + if (D.has(aspect::ext_oneapi_srgb)) + std::cout << "aspect::ext_oneapi_srgb detected" << std::endl; + + if (D.has(aspect::image)) { + // RGBA -- (normal, non-linearized) + std::cout << "rgba -------" << std::endl; + test_rd(image_channel_order::rgba, image_channel_type::unorm_int8); + + // sRGBA -- (linearized reads) + std::cout << "srgba -------" << std::endl; + test_rd(image_channel_order::ext_oneapi_srgba, + image_channel_type::unorm_int8); + } else { + std::cout << "device does not support image operations" << std::endl; + } + + return 0; +} + +// clang-format off +// CHECK: SYCL_EXT_ONEAPI_SRGB defined +// CHECK: aspect::ext_oneapi_srgb detected + +// CHECK: rgba ------- +// CHECK-NEXT: read four pixels, no sampler +// these next four reads should all be close to 0.5 +// CHECK-NEXT: 0: {0.498039,0.498039,0.498039,0.498039} +// CHECK-NEXT: 1: {0.498039,0.498039,0.498039,0.498039} +// CHECK-NEXT: 2: {0.498039,0.498039,0.498039,0.498039} +// CHECK-NEXT: 3: {0.498039,0.498039,0.498039,0.498039} +// CHECK: srgba ------- +// CHECK-NEXT: read four pixels, no sampler +// these next four reads should have R, G, B values close to 0.2 +// presently the values differ slightly between OpenCL GPU and CPU +// (e.g. GPU: 0.21231, CPU: 0.211795 ) +// CHECK-NEXT: 0: {0.21 +// CHECK-NEXT: 1: {0.21 +// CHECK-NEXT: 2: {0.21 +// CHECK-NEXT: 3: {0.21 +// clang-format on diff --git a/SYCL/Basic/intel-ext-device.cpp b/SYCL/Basic/intel-ext-device.cpp new file mode 100644 index 0000000000..e64cb21eee --- /dev/null +++ b/SYCL/Basic/intel-ext-device.cpp @@ -0,0 +1,114 @@ +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: env SYCL_DEVICE_FILTER=level_zero:gpu %t.out +// RUN: env SYCL_DEVICE_FILTER=opencl:gpu %t.out +// +// REQUIRES: gpu +// UNSUPPORTED: cuda +// UNSUPPORTED: rocm_nvidia +// UNSUPPORTED: rocm_amd + +//==--------- intel-ext-device.cpp - SYCL device test ------------==// +// +// Returns the low-level device details. These are Intel-specific extensions +// that are only supported on Level Zero. +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +#include + +using namespace cl::sycl; + +#ifdef _WIN32 +#define setenv(name, value, overwrite) _putenv_s(name, value) +#endif + +int main(int argc, char **argv) { + // Must be enabled at the beginning of the application + // to obtain the PCI address + setenv("SYCL_ENABLE_PCI", "1", 0); + + int pltCount = 1; + for (const auto &plt : platform::get_platforms()) { + if (!plt.has(aspect::host)) { + int devCount = 1; + int totalEUs = 0; + int numSlices = 0; + int numSubslices = 0; + int numEUsPerSubslice = 0; + for (const auto &dev : plt.get_devices()) { + std::cout << "Platform #" << pltCount++ << ":" << std::endl; + if (dev.has(aspect::gpu)) { + auto name = dev.get_info(); + std::cout << "Device #" << devCount++ << ": " + << dev.get_info() << ":" << std::endl; + + std::cout << "Backend: "; + if (plt.get_backend() == backend::level_zero) { + std::cout << "Level Zero" << std::endl; + } else if (plt.get_backend() == backend::opencl) { + std::cout << "OpenCL" << std::endl; + } else if (plt.get_backend() == backend::cuda) { + std::cout << "CUDA" << std::endl; + } else { + std::cout << "Unknown" << std::endl; + } + + // Use Feature Test macro to see if extensions are supported. + if (SYCL_EXT_INTEL_DEVICE_INFO >= 1) { + + if (dev.has(aspect::ext_intel_pci_address)) { + std::cout << "PCI address = " + << dev.get_info() + << std::endl; + } + if (dev.has(aspect::ext_intel_gpu_eu_count)) { + totalEUs = dev.get_info(); + std::cout << "Number of EUs = " << totalEUs << std::endl; + } + if (dev.has(aspect::ext_intel_gpu_eu_simd_width)) { + int w = dev.get_info(); + std::cout << "EU SIMD width = " << w << std::endl; + } + if (dev.has(aspect::ext_intel_gpu_slices)) { + numSlices = dev.get_info(); + std::cout << "Number of slices = " << numSlices << std::endl; + } + if (dev.has(aspect::ext_intel_gpu_subslices_per_slice)) { + numSubslices = dev.get_info< + info::device::ext_intel_gpu_subslices_per_slice>(); + std::cout << "Number of subslices per slice = " << numSubslices + << std::endl; + } + if (dev.has(aspect::ext_intel_gpu_eu_count_per_subslice)) { + numEUsPerSubslice = dev.get_info< + info::device::ext_intel_gpu_eu_count_per_subslice>(); + std::cout << "Number of EUs per subslice = " << numEUsPerSubslice + << std::endl; + } + if (dev.has(aspect::ext_intel_max_mem_bandwidth)) { + // not supported yet + long m = + dev.get_info(); + std::cout << "Maximum memory bandwidth = " << m << std::endl; + } + // This is the only data we can verify. + if (totalEUs != numSlices * numSubslices * numEUsPerSubslice) { + std::cout << "Error: EU Count is incorrect!" << std::endl; + std::cout << "Failed!" << std::endl; + return 1; + } + } // SYCL_EXT_INTEL_DEVICE_INFO + } + std::cout << std::endl; + } + } + } + std::cout << "Passed!" << std::endl; + return 0; +} diff --git a/SYCL/Basic/interop/construction_ocl.cpp b/SYCL/Basic/interop/construction_ocl.cpp new file mode 100644 index 0000000000..b1f1cb65ea --- /dev/null +++ b/SYCL/Basic/interop/construction_ocl.cpp @@ -0,0 +1,160 @@ +// REQUIRES: opencl, opencl_icd +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %opencl_lib %s -o %t.ocl.out +// RUN: env SYCL_DEVICE_FILTER="opencl" %t.ocl.out + +#include +#include + +#include + +constexpr auto BE = sycl::backend::opencl; + +int main() { + sycl::device Dev{sycl::default_selector{}}; + auto NativeDev = Dev.get_native(); + + sycl::device NewDev = sycl::make_device(NativeDev); + assert(NewDev.get_info() == + Dev.get_info()); + + sycl::platform Plt = Dev.get_platform(); + auto NativePlt = Plt.get_native(); + + sycl::platform NewPlt = sycl::make_platform(NativePlt); + assert(NewPlt == Plt); + + sycl::context Ctx{Dev}; + auto NativeCtx = Ctx.get_native(); + + sycl::context NewCtx = sycl::make_context(NativeCtx); + assert(NewCtx.get_native() == NativeCtx); + + sycl::queue Q{Ctx, Dev}; + auto NativeQ = Q.get_native(); + + sycl::queue NewQ = sycl::make_queue(NativeQ, Ctx); + assert(NativeQ == NewQ.get_native()); + + sycl::event Evt = Q.single_task([] {}); + auto NativeEvt = Evt.get_native(); + + sycl::event NewEvt = sycl::make_event(NativeEvt, Ctx); + assert(NativeEvt == NewEvt.get_native()); + + cl_mem NativeBuf = + clCreateBuffer(NativeCtx, CL_MEM_READ_WRITE, 128, nullptr, nullptr); + auto NewBuf = sycl::make_buffer(NativeBuf, Ctx); + assert(NewBuf.get_range()[0] == 128); + + const char *ProgSrc = "kernel void _() {}"; + cl_int Err; + + // Program in state NONE + { + cl_program OclProg = + clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err); + assert(Err == CL_SUCCESS && "Program creation failed"); + + auto KB = + sycl::make_kernel_bundle(OclProg, Ctx); + auto KernelIDs = KB.get_kernel_ids(); + assert(KernelIDs.empty()); + + cl_program OclProg2 = + clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err); + assert(Err == CL_SUCCESS && "Program creation failed"); + + auto KB2 = + sycl::make_kernel_bundle(OclProg2, Ctx); + auto KernelIDs2 = KB2.get_kernel_ids(); + assert(KernelIDs2.empty()); + + cl_program OclProg3 = + clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err); + assert(Err == CL_SUCCESS && "Program creation failed"); + + auto KB3 = sycl::make_kernel_bundle( + OclProg3, Ctx); + auto KernelIDs3 = KB3.get_kernel_ids(); + assert(KernelIDs3.empty()); + } + + // Compiled program + { + cl_program OclProg = + clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err); + assert(Err == CL_SUCCESS && "Program creation failed"); + + Err = clCompileProgram(OclProg, 1, &NativeDev, "", 0, nullptr, nullptr, + nullptr, nullptr); + assert(Err == CL_SUCCESS && "Program compile failed"); + + auto KB = + sycl::make_kernel_bundle(OclProg, Ctx); + auto KernelIDs = KB.get_kernel_ids(); + assert(KernelIDs.empty()); + + bool StateMismatch = false; + try { + auto KB2 = + sycl::make_kernel_bundle(OclProg, Ctx); + } catch (sycl::runtime_error Ex) { + StateMismatch = true; + } + assert(StateMismatch); + + cl_program OclProg3 = + clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err); + assert(Err == CL_SUCCESS && "Program creation failed"); + + Err = clCompileProgram(OclProg3, 1, &NativeDev, "", 0, nullptr, nullptr, + nullptr, nullptr); + assert(Err == CL_SUCCESS && "Program compile failed"); + + auto KB3 = sycl::make_kernel_bundle( + OclProg3, Ctx); + auto KernelIDs3 = KB3.get_kernel_ids(); + assert(KernelIDs3.empty()); + } + + // Linked program + { + cl_program OclProg = + clCreateProgramWithSource(NativeCtx, 1, &ProgSrc, nullptr, &Err); + assert(Err == CL_SUCCESS && "Program creation failed"); + + Err = clBuildProgram(OclProg, 1, &NativeDev, "", nullptr, nullptr); + assert(Err == CL_SUCCESS && "Program build failed"); + + auto KB = sycl::make_kernel_bundle( + OclProg, Ctx); + auto KernelIDs = KB.get_kernel_ids(); + assert(KernelIDs.empty()); + + cl_kernel NativeKer = clCreateKernel(OclProg, "_", &Err); + assert(Err == CL_SUCCESS && "Kernel creation failed"); + + auto Kernel = sycl::make_kernel(NativeKer, Ctx); + assert(Kernel.get_info() == 0); + + bool StateMismatch = false; + try { + auto KB2 = + sycl::make_kernel_bundle(OclProg, Ctx); + } catch (sycl::runtime_error Ex) { + StateMismatch = true; + } + assert(StateMismatch); + + StateMismatch = false; + try { + auto KB3 = sycl::make_kernel_bundle( + OclProg, Ctx); + } catch (sycl::runtime_error Ex) { + StateMismatch = true; + } + assert(StateMismatch); + } + + return 0; +} diff --git a/SYCL/Basic/interop/construction_ze.cpp b/SYCL/Basic/interop/construction_ze.cpp new file mode 100644 index 0000000000..55e2933586 --- /dev/null +++ b/SYCL/Basic/interop/construction_ze.cpp @@ -0,0 +1,28 @@ +// REQUIRES: level_zero, level_zero_dev_kit +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.ze.out +// RUN: env SYCL_DEVICE_FILTER="level_zero" %t.ze.out + +#include + +#include +#include + +constexpr auto BE = sycl::backend::level_zero; + +int main() { + sycl::device Dev{sycl::default_selector{}}; + + sycl::queue Q{Dev}; + + if (0) { + Q.submit([](sycl::handler &CGH) { CGH.single_task([] {}); }); + } + + sycl::platform Plt = Dev.get_platform(); + auto NativePlt = Plt.get_native(); + + sycl::platform NewPlt = sycl::make_platform(NativePlt); + assert(NewPlt == Plt); + + return 0; +} diff --git a/SYCL/Basic/interop/get_native_ocl.cpp b/SYCL/Basic/interop/get_native_ocl.cpp new file mode 100644 index 0000000000..da6c85dc46 --- /dev/null +++ b/SYCL/Basic/interop/get_native_ocl.cpp @@ -0,0 +1,36 @@ +// REQUIRES: opencl, opencl_dev_kit +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %opencl_options %s -o %t.ocl.out +// RUN: %t.ocl.out + +#include + +#include +#include + +constexpr auto BE = sycl::backend::opencl; + +class TestKernel; + +int main() { + sycl::queue Q; + + if (0) { + Q.submit([](sycl::handler &CGH) { CGH.single_task([] {}); }); + } + + sycl::kernel_id KernelID = sycl::get_kernel_id(); + + sycl::kernel_bundle KernelBundle = + sycl::get_kernel_bundle(Q.get_context()); + + sycl::kernel Kernel = KernelBundle.get_kernel(KernelID); + + cl_kernel Handle = Kernel.get_native(); + + size_t Size = 0; + cl_int Err = + clGetKernelInfo(Handle, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &Size); + assert(Err == CL_SUCCESS); + + return 0; +} diff --git a/SYCL/Basic/interop/get_native_ze.cpp b/SYCL/Basic/interop/get_native_ze.cpp new file mode 100644 index 0000000000..a8911ddd9c --- /dev/null +++ b/SYCL/Basic/interop/get_native_ze.cpp @@ -0,0 +1,35 @@ +// REQUIRES: level_zero, level_zero_dev_kit +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %level_zero_options %s -o %t.ze.out +// RUN: %t.ze.out + +#include + +#include +#include + +constexpr auto BE = sycl::backend::level_zero; + +class TestKernel; + +int main() { + sycl::queue Q; + + if (0) { + Q.submit([](sycl::handler &CGH) { CGH.single_task([] {}); }); + } + + sycl::kernel_id KernelID = sycl::get_kernel_id(); + + sycl::kernel_bundle KernelBundle = + sycl::get_kernel_bundle(Q.get_context()); + + sycl::kernel Kernel = KernelBundle.get_kernel(KernelID); + + ze_kernel_handle_t Handle = Kernel.get_native(); + + ze_kernel_properties_t KernelProperties; + ze_result_t Err = zeKernelGetProperties(Handle, &KernelProperties); + assert(Err == ZE_RESULT_SUCCESS); + + return 0; +} diff --git a/SYCL/Basic/interop/traits.cpp b/SYCL/Basic/interop/traits.cpp new file mode 100644 index 0000000000..92c824c696 --- /dev/null +++ b/SYCL/Basic/interop/traits.cpp @@ -0,0 +1,69 @@ +// RUN: %clangxx -fsycl -DUSE_OPENCL %s +// RUN: %clangxx -fsycl -DUSE_L0 %s +// RUN: %clangxx -fsycl -DUSE_CUDA %s + +#ifdef USE_OPENCL +#include + +#include + +constexpr auto Backend = sycl::backend::opencl; +#endif + +#ifdef USE_L0 +#include + +#include + +constexpr auto Backend = sycl::backend::level_zero; +#endif + +#ifdef USE_CUDA +#include + +constexpr auto Backend = sycl::backend::cuda; +#endif + +#include + +int main() { +#ifdef USE_OPENCL + static_assert( + std::is_same_v::input_type, + sycl::interop::type>); + static_assert( + std::is_same_v::input_type, + sycl::interop::type>); + static_assert( + std::is_same_v::input_type, + sycl::interop::type>); + static_assert(std::is_same_v< + sycl::backend_traits::input_type>, + sycl::interop>::type>); + static_assert( + std::is_same_v::input_type, + sycl::interop::type>); +#endif + +// CUDA does not have a native type for platforms +#ifndef USE_CUDA + static_assert( + std::is_same_v::input_type, + sycl::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::interop::type>); +#endif + + static_assert( + std::is_same_v::return_type, + sycl::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::interop::type>); + static_assert( + std::is_same_v::return_type, + sycl::interop::type>); + + return 0; +} diff --git a/SYCL/Basic/platform.cpp b/SYCL/Basic/platform.cpp new file mode 100644 index 0000000000..f6ee302c19 --- /dev/null +++ b/SYCL/Basic/platform.cpp @@ -0,0 +1,80 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_FILTER=host,%sycl_be %t.out +//==--------------- platform.cpp - SYCL platform test ----------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include +#include +#include + +using namespace cl::sycl; + +int main() { + int i = 1; + vector_class openclPlatforms; + for (const auto &plt : platform::get_platforms()) { + std::cout << "Platform " << i++ + << " is available: " << ((plt.is_host()) ? "host: " : "OpenCL: ") + << std::hex + << ((plt.is_host() || + plt.get_backend() != cl::sycl::backend::opencl) + ? nullptr + : plt.get()) + << std::endl; + } + + auto platforms = platform::get_platforms(); + platform &platformA = platforms[0]; + platform &platformB = (platforms.size() > 1 ? platforms[1] : platforms[0]); + { + std::cout << "move constructor" << std::endl; + platform Platform(platformA); + size_t hash = hash_class()(Platform); + platform MovedPlatform(std::move(Platform)); + assert(hash == hash_class()(MovedPlatform)); + assert(platformA.is_host() == MovedPlatform.is_host()); + if (!platformA.is_host() && + platformA.get_backend() == cl::sycl::backend::opencl) { + assert(MovedPlatform.get() != nullptr); + } + } + { + std::cout << "move assignment operator" << std::endl; + platform Platform(platformA); + size_t hash = hash_class()(Platform); + platform WillMovedPlatform(platformB); + WillMovedPlatform = std::move(Platform); + assert(hash == hash_class()(WillMovedPlatform)); + assert(platformA.is_host() == WillMovedPlatform.is_host()); + if (!platformA.is_host() && + platformA.get_backend() == cl::sycl::backend::opencl) { + assert(WillMovedPlatform.get() != nullptr); + } + } + { + std::cout << "copy constructor" << std::endl; + platform Platform(platformA); + size_t hash = hash_class()(Platform); + platform PlatformCopy(Platform); + assert(hash == hash_class()(Platform)); + assert(hash == hash_class()(PlatformCopy)); + assert(Platform == PlatformCopy); + assert(Platform.is_host() == PlatformCopy.is_host()); + } + { + std::cout << "copy assignment operator" << std::endl; + platform Platform(platformA); + size_t hash = hash_class()(Platform); + platform WillPlatformCopy(platformB); + WillPlatformCopy = Platform; + assert(hash == hash_class()(Platform)); + assert(hash == hash_class()(WillPlatformCopy)); + assert(Platform == WillPlatformCopy); + assert(Platform.is_host() == WillPlatformCopy.is_host()); + } +} diff --git a/SYCL/Basic/span.cpp b/SYCL/Basic/span.cpp new file mode 100644 index 0000000000..f7c4091244 --- /dev/null +++ b/SYCL/Basic/span.cpp @@ -0,0 +1,112 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Fails to release USM pointer on ROCm for NVIDIA +// XFAIL: rocm_nvidia + +#include +#include + +using namespace sycl; + +void testSpanCapture() { + // This test creates spans that are backed by USM. + // ensures they can be captured by device lambda + // and that read and write operations function correctly + // across capture. + queue Q; + + constexpr long numReadTests = 2; + const range<1> NumberOfReadTestsRange(numReadTests); + buffer SpanRead(NumberOfReadTestsRange); + + // span from a vector + // We will create a vector, backed by a USM allocator. And a span from that. + using vec_alloc = usm_allocator; + // Create allocator for device associated with q + vec_alloc myAlloc(Q); + // Create std vector with the allocator + std::vector vecUSM(4, myAlloc); + std::iota(vecUSM.begin(), vecUSM.end(), 1); + sycl::span vecUSM_span{vecUSM}; + vecUSM_span[0] += 100; // 101 modify first value using span affordance. + + // span from USM memory + auto *usm_data = malloc_shared(4, Q); + sycl::span usm_span(usm_data, 4); + std::iota(usm_span.begin(), usm_span.end(), 1); + usm_span[0] += 100; // 101 modify first value using span affordance. + + event E = Q.submit([&](handler &cgh) { + auto can_read_from_span_acc = SpanRead.get_access(cgh); + cgh.single_task([=] { + // read from the spans. + can_read_from_span_acc[0] = vecUSM_span[0]; + can_read_from_span_acc[1] = usm_span[0]; + + // write to the spans + vecUSM_span[1] += 1000; + usm_span[1] += 1000; + }); + }); + E.wait(); + + // check out the read operations, should have gotten 101 from each + auto can_read_from_span_acc = SpanRead.get_access(); + for (int i = 0; i < numReadTests; i++) { + assert(can_read_from_span_acc[i] == 101 && + "read check should have gotten 100"); + } + + // were the spans successfully modified via write? + assert(vecUSM_span[1] == 1002 && + "vecUSM_span write check should have gotten 1001"); + assert(usm_span[1] == 1002 && "usm_span write check should have gotten 1001"); + + free(usm_data, Q); +} + +void set_all_span_values(sycl::span container, int v) { + for (auto &e : container) + e = v; +} + +void testSpanOnDevice() { + // this test creates a simple span on device, + // passes it to a function that operates on it + // and ensures it worked correctly + queue Q; + constexpr long numReadTests = 4; + const range<1> NumberOfReadTestsRange(numReadTests); + buffer SpanRead(NumberOfReadTestsRange); + + event E = Q.submit([&](handler &cgh) { + auto can_read_from_span_acc = SpanRead.get_access(cgh); + cgh.single_task([=] { + // create a span on device, pass it to function that modifies it + // read values back out. + int a[]{1, 2, 3, 4}; + sycl::span a_span{a}; + set_all_span_values(a_span, 10); + for (int i = 0; i < numReadTests; i++) + can_read_from_span_acc[i] = a_span[i]; + }); + }); + E.wait(); + + // check out the read operations, should have gotten 10 from each + auto can_read_from_span_acc = SpanRead.get_access(); + for (int i = 0; i < numReadTests; i++) { + assert(can_read_from_span_acc[i] == 10 && + "read check should have gotten 10"); + } +} + +int main() { + testSpanCapture(); + testSpanOnDevice(); + + return 0; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/all_of.cpp b/SYCL/GroupAlgorithm/SYCL2020/all_of.cpp new file mode 100644 index 0000000000..80723b5f3f --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/all_of.cpp @@ -0,0 +1,66 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_GroupAll on AMD: +// XFAIL: rocm_amd + +#include "support.h" +#include +#include +#include +#include +using namespace sycl; + +template class all_of_kernel; + +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef class all_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 64; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = all_of_group(g, pred(in[lid])); + out[1] = all_of_group(g, in[lid], pred); + out[2] = joint_all_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::all_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, IsEven()); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/any_of.cpp b/SYCL/GroupAlgorithm/SYCL2020/any_of.cpp new file mode 100644 index 0000000000..9f681ec20c --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/any_of.cpp @@ -0,0 +1,76 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_GroupAny on AMD: +// XFAIL: rocm_amd + +#include "support.h" +#include +#include +#include +#include +using namespace sycl; + +template class any_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class any_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 64; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = any_of_group(g, pred(in[lid])); + out[1] = any_of_group(g, in[lid], pred); + out[2] = joint_any_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::any_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/exclusive_scan.cpp b/SYCL/GroupAlgorithm/SYCL2020/exclusive_scan.cpp new file mode 100644 index 0000000000..d827d71ce6 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/exclusive_scan.cpp @@ -0,0 +1,168 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_GroupIAdd, __spirv_GroupBroadcast, __spirv_GroupSMin and +// __spirv_GroupSMax on AMD: +// XFAIL: rocm_amd + +// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3. +// That requires either adding a switch to clang (-spirv-max-version=1.3) or +// raising the spirv version from 1.1. to 1.3 for spirv translator +// unconditionally. Using operators specific for spirv 1.3 and higher with +// -spirv-max-version=1.1 being set by default causes assert/check fails +// in spirv translator. +// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \ + %t13.out + +#include "support.h" +#include +#include +#include +#include +#include +#include +using namespace sycl; + +template +class exclusive_scan_kernel; + +// std::exclusive_scan isn't implemented yet, so use serial implementation +// instead +namespace emu { +template +OutputIterator exclusive_scan(InputIterator first, InputIterator last, + OutputIterator result, T init, + BinaryOperation binary_op) { + T partial = init; + for (InputIterator it = first; it != last; ++it) { + *(result++) = partial; + partial = binary_op(partial, *it); + } + return result; +} +} // namespace emu + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class exclusive_scan_kernel kernel_name0; + typedef class exclusive_scan_kernel kernel_name1; + typedef class exclusive_scan_kernel kernel_name2; + typedef class exclusive_scan_kernel kernel_name3; + OutputT init = 42; + size_t N = input.size(); + size_t G = 64; + std::vector expected(N); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = exclusive_scan_over_group(g, in[lid], binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), + identity, binary_op); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = exclusive_scan_over_group(g, in[lid], init, binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init, + binary_op); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + joint_exclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), + identity, binary_op); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + joint_exclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), init, binary_op); + }); + }); + } + emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init, + binary_op); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + + test(q, input, output, sycl::plus<>(), 0); + test(q, input, output, sycl::minimum<>(), + std::numeric_limits::max()); + test(q, input, output, sycl::maximum<>(), + std::numeric_limits::lowest()); + + test(q, input, output, sycl::plus(), 0); + test(q, input, output, sycl::minimum(), + std::numeric_limits::max()); + test(q, input, output, sycl::maximum(), + std::numeric_limits::lowest()); + +#ifdef SPIRV_1_3 + test(q, input, output, + sycl::multiplies(), 1); + test(q, input, output, sycl::bit_or(), 0); + test(q, input, output, sycl::bit_xor(), + 0); + test(q, input, output, sycl::bit_and(), + ~0); +#endif // SPIRV_1_3 + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp b/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp new file mode 100644 index 0000000000..dafec4e8eb --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/group_broadcast.cpp @@ -0,0 +1,101 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_GroupBroadcast on AMD: +// XFAIL: rocm_amd + +#include "support.h" +#include +#include +#include +#include +#include +using namespace sycl; + +template +void test(queue q, InputContainer input, OutputContainer output) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + size_t N = input.size(); + size_t G = 4; + range<2> R(G, G); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<2>(R, R), [=](nd_item<2> it) { + group<2> g = it.get_group(); + int lid = it.get_local_linear_id(); + out[0] = group_broadcast(g, in[lid]); + out[1] = group_broadcast(g, in[lid], group<2>::id_type(1, 2)); + out[2] = + group_broadcast(g, in[lid], group<2>::linear_id_type(2 * G + 1)); + }); + }); + } + assert(output[0] == input[0]); + assert(output[1] == input[1 * G + 2]); + assert(output[2] == input[2 * G + 1]); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 16; + + // Test built-in scalar type + { + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 1); + std::fill(output.begin(), output.end(), false); + test(q, input, output); + } + + // Test pointer type + // { + // std::array input; + // std::array output; + // for (int i = 0; i < N; ++i) { + // input[i] = static_cast(0x0) + i; + // } + // std::fill(output.begin(), output.end(), static_cast(0x0)); + // test(q, input, output); + // } + + // Test user-defined type + // - Use complex as a proxy for this + // - Test float and double to test 64-bit and 128-bit types + // { + // std::array, N> input; + // std::array, 3> output; + // for (int i = 0; i < N; ++i) { + // input[i] = + // std::complex(0, 1) + (float)i * std::complex(2, 2); + // } + // std::fill(output.begin(), output.end(), std::complex(0, 0)); + // test(q, input, output); + // } + // { + // std::array, N> input; + // std::array, 3> output; + // for (int i = 0; i < N; ++i) { + // input[i] = + // std::complex(0, 1) + (double)i * std::complex(2, + // 2); + // } + // std::fill(output.begin(), output.end(), std::complex(0, 0)); + // test(q, input, output); + // } + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/helpers.hpp b/SYCL/GroupAlgorithm/SYCL2020/helpers.hpp new file mode 100644 index 0000000000..e4b74966a6 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/helpers.hpp @@ -0,0 +1,168 @@ +//==---------- helpers.hpp -*- C++ -*--------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include +#include +#include +#include +#include + +using namespace cl::sycl; + +// ---- utils +template struct utils { + static T1 add_vec(const vec &v); + static bool cmp_vec(const vec &v, const vec &r); + static std::string stringify_vec(const vec &v); +}; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0(); + } + static std::string stringify_vec(const vec &v) { + return std::to_string((T2)v.s0()); + } +}; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0() + v.s1(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + " )"; + } +}; +template struct utils { + static T2 add_vec(const vec &v) { return v.s0() + v.s1() + v.s2(); } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + ", " + std::to_string((T2)v.s3()) + + " )"; + } +}; +template struct utils { + static T2 add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + " )"; + } +}; +template struct utils { + static T2 add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() + + v.s7(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() && + v.s6() == r.s6() && v.s7() == r.s7(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + std::to_string((T2)v.s4()) + ", " + + std::to_string((T2)v.s5()) + std::to_string((T2)v.s6()) + ", " + + std::to_string((T2)v.s7()) + " )"; + } +}; + +template struct utils { + static T2 add_vec(const vec &v) { + return v.s0() + v.s1() + v.s2() + v.s3() + v.s4() + v.s5() + v.s6() + + v.s7() + v.s8() + v.s9() + v.sA() + v.sB() + v.sC() + v.sD() + + v.sE() + v.sF(); + } + static bool cmp_vec(const vec &v, const vec &r) { + return v.s0() == r.s0() && v.s1() == r.s1() && v.s2() == r.s2() && + v.s3() == r.s3() && v.s4() == r.s4() && v.s5() == r.s5() && + v.s6() == r.s6() && v.s7() == r.s7() && v.s8() == r.s8() && + v.s9() == r.s9() && v.sA() == r.sA() && v.sB() == r.sB() && + v.sC() == r.sC() && v.sD() == r.sD() && v.sE() == r.sE() && + v.sF() == r.sF(); + } + static std::string stringify_vec(const vec &v) { + return std::string("(") + std::to_string((T2)v.s0()) + ", " + + std::to_string((T2)v.s1()) + std::to_string((T2)v.s2()) + ", " + + std::to_string((T2)v.s3()) + std::to_string((T2)v.s4()) + ", " + + std::to_string((T2)v.s5()) + std::to_string((T2)v.s6()) + ", " + + std::to_string((T2)v.s7()) + std::to_string((T2)v.s8()) + ", " + + std::to_string((T2)v.s9()) + std::to_string((T2)v.sA()) + ", " + + std::to_string((T2)v.sB()) + std::to_string((T2)v.sC()) + ", " + + std::to_string((T2)v.sE()) + std::to_string((T2)v.sD()) + ", " + + std::to_string((T2)v.sF()) + " )"; + } +}; + +// ---- exit_if_not_equal +template void exit_if_not_equal(T val, T ref, const char *name) { + if (std::is_floating_point::value) { + auto cmp_val = std::bitset(val); + auto cmp_ref = std::bitset(ref); + if (cmp_val != cmp_ref) { + std::cout << "Unexpected result for " << name << ": " << val << "(" + << cmp_val << ") expected value: " << ref << "(" << cmp_ref + << ")" << std::endl; + exit(1); + } + } else { + if ((val - ref) != 0) { + std::cout << "Unexpected result for " << name << ": " << (long)val + << " expected value: " << (long)ref << std::endl; + exit(1); + } + } +} + +// template +// void exit_if_not_equal(std::complex val, std::complex ref, +// const char *name) { +// std::string Name{name}; +// exit_if_not_equal(val.real(), ref.real(), (Name + ".real()").c_str()); +// exit_if_not_equal(val.imag(), ref.imag(), (Name + ".imag()").c_str()); +// } + +template void exit_if_not_equal(T *val, T *ref, const char *name) { + if ((val - ref) != 0) { + std::cout << "Unexpected result for " << name << ": " << val + << " expected value: " << ref << std::endl; + exit(1); + } +} + +template <> void exit_if_not_equal(half val, half ref, const char *name) { + int16_t cmp_val = reinterpret_cast(val); + int16_t cmp_ref = reinterpret_cast(ref); + if (std::abs(cmp_val - cmp_ref) > 1) { + std::cout << "Unexpected result for " << name << ": " << (float)val + << " expected value: " << (float)ref << std::endl; + exit(1); + } +} + +template +void exit_if_not_equal_vec(vec val, vec ref, const char *name) { + if (!utils::cmp_vec(ref, val)) { + std::cout << "Unexpected result for " << name << ": " + << utils::stringify_vec(val) + << " expected value: " << utils::stringify_vec(ref) + << std::endl; + + exit(1); + } +} \ No newline at end of file diff --git a/SYCL/GroupAlgorithm/SYCL2020/inclusive_scan.cpp b/SYCL/GroupAlgorithm/SYCL2020/inclusive_scan.cpp new file mode 100644 index 0000000000..f2b9f7ce53 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/inclusive_scan.cpp @@ -0,0 +1,169 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_GroupIAdd, __spirv_GroupBroadcast, __spirv_GroupSMin and +// __spirv_GroupSMax on AMD: +// XFAIL: rocm_amd + +// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3. +// That requires either adding a switch to clang (-spirv-max-version=1.3) or +// raising the spirv version from 1.1. to 1.3 for spirv translator +// unconditionally. Using operators specific for spirv 1.3 and higher with +// -spirv-max-version=1.1 being set by default causes assert/check fails +// in spirv translator. +// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \ + %t13.out + +#include "support.h" +#include +#include +#include +#include +#include +#include +using namespace sycl; + +template +class inclusive_scan_kernel; + +// std::inclusive_scan isn't implemented yet, so use serial implementation +// instead +namespace emu { +template +OutputIterator inclusive_scan(InputIterator first, InputIterator last, + OutputIterator result, BinaryOperation binary_op, + T init) { + T partial = init; + for (InputIterator it = first; it != last; ++it) { + partial = binary_op(partial, *it); + *(result++) = partial; + } + return result; +} +} // namespace emu + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + typedef class inclusive_scan_kernel kernel_name0; + typedef class inclusive_scan_kernel kernel_name1; + typedef class inclusive_scan_kernel kernel_name2; + typedef class inclusive_scan_kernel kernel_name3; + OutputT init = 42; + size_t N = input.size(); + size_t G = 64; + std::vector expected(N); + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = inclusive_scan_over_group(g, in[lid], binary_op); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(), + binary_op, identity); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[lid] = inclusive_scan_over_group(g, in[lid], binary_op, init); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + G, expected.begin(), + binary_op, init); + assert(std::equal(output.begin(), output.begin() + G, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + joint_inclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(), + binary_op, identity); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); + + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + joint_inclusive_scan(g, in.get_pointer(), in.get_pointer() + N, + out.get_pointer(), binary_op, init); + }); + }); + } + emu::inclusive_scan(input.begin(), input.begin() + N, expected.begin(), + binary_op, init); + assert(std::equal(output.begin(), output.begin() + N, expected.begin())); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + + test(q, input, output, sycl::plus<>(), 0); + test(q, input, output, sycl::minimum<>(), + std::numeric_limits::max()); + test(q, input, output, sycl::maximum<>(), + std::numeric_limits::lowest()); + + test(q, input, output, sycl::plus(), 0); + test(q, input, output, sycl::minimum(), + std::numeric_limits::max()); + test(q, input, output, sycl::maximum(), + std::numeric_limits::lowest()); + +#ifdef SPIRV_1_3 + test(q, input, output, + sycl::multiplies(), 1); + test(q, input, output, + sycl::bit_or(), 0); + test(q, input, output, + sycl::bit_xor(), 0); + test(q, input, output, sycl::bit_and(), + ~0); +#endif // SPIRV_1_3 + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/none_of.cpp b/SYCL/GroupAlgorithm/SYCL2020/none_of.cpp new file mode 100644 index 0000000000..b2b0eb0d7d --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/none_of.cpp @@ -0,0 +1,74 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_GroupAll and __spirv_GroupAny on AMD: +// XFAIL: rocm_amd + +#include "support.h" +#include +#include +#include +#include +using namespace sycl; + +template class none_of_kernel; + +struct GeZero { + bool operator()(int i) const { return i >= 0; } +}; +struct IsEven { + bool operator()(int i) const { return (i % 2) == 0; } +}; +struct LtZero { + bool operator()(int i) const { return i < 0; } +}; + +template +void test(queue q, InputContainer input, OutputContainer output, + Predicate pred) { + typedef class none_of_kernel kernel_name; + size_t N = input.size(); + size_t G = 64; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for(nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = none_of_group(g, pred(in[lid])); + out[1] = none_of_group(g, in[lid], pred); + out[2] = joint_none_of(g, in.get_pointer(), in.get_pointer() + N, pred); + }); + }); + } + bool expected = std::none_of(input.begin(), input.end(), pred); + assert(output[0] == expected); + assert(output[1] == expected); + assert(output[2] == expected); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), false); + + test(q, input, output, GeZero()); + test(q, input, output, IsEven()); + test(q, input, output, LtZero()); + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/permute_select.cpp b/SYCL/GroupAlgorithm/SYCL2020/permute_select.cpp new file mode 100644 index 0000000000..a863942294 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/permute_select.cpp @@ -0,0 +1,43 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_SubgroupId, __spirv_SubgroupMaxSize, __spirv_SubgroupShuffle* +// on AMD: +// XFAIL: rocm_amd +// +//==------------ permute_select.cpp -*- C++ -*-----------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "permute_select.hpp" + +int main() { + queue Queue; + if (Queue.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/permute_select.hpp b/SYCL/GroupAlgorithm/SYCL2020/permute_select.hpp new file mode 100644 index 0000000000..cfabdd9e22 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/permute_select.hpp @@ -0,0 +1,140 @@ +//==----- permute_select.hpp -*- C++ -*------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helpers.hpp" +#include +template class sycl_subgr; + +using namespace cl::sycl; + +// TODO remove this workaround when clang will support correct generation of +// half typename in integration header +struct wa_half; + +template +void check(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer> buf_select(G); + buffer> buf_xor(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + accessor acc_select{buf_select, cgh, sycl::read_write}; + accessor acc_xor{buf_xor, cgh, sycl::read_write}; + accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write}; + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + vec vwggid(wggid), vsgid(sgid); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + /*GID of middle element in every subgroup*/ + acc_select[NdItem.get_global_id()] = + select_from_group(SG, vwggid, SG.get_max_local_range()[0] / 2); + /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + acc_xor[NdItem.get_global_id()] = permute_group_by_xor( + SG, vwggid, sgid % SG.get_max_local_range()[0]); + }); + }); + host_accessor acc_select{buf_select, sycl::read_write}; + host_accessor acc_xor{buf_xor, sycl::read_write}; + host_accessor sgsizeacc{sgsizebuf, sycl::read_write}; + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + /*GID of middle element in every subgroup*/ + exit_if_not_equal_vec( + acc_select[j], vec(j / L * L + SGid * sg_size + sg_size / 2), + "select_from_group"); + /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + exit_if_not_equal_vec(acc_xor[j], + vec(SGBeginGid + (SGLid ^ (SGid % sg_size))), + "permute_group_by_xor"); + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template void check(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer buf_select(G); + buffer buf_xor(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + accessor acc_select{buf_select, cgh, sycl::read_write}; + accessor acc_xor{buf_xor, cgh, sycl::read_write}; + accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write}; + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + /*GID of middle element in every subgroup*/ + acc_select[NdItem.get_global_id()] = + select_from_group(SG, wggid, SG.get_max_local_range()[0] / 2); + /* Save GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + acc_xor[NdItem.get_global_id()] = + permute_group_by_xor(SG, wggid, sgid % SG.get_max_local_range()[0]); + }); + }); + host_accessor acc_select{buf_select, sycl::read_write}; + host_accessor acc_xor{buf_xor, sycl::read_write}; + host_accessor sgsizeacc{sgsizebuf, sycl::read_write}; + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /*GID of middle element in every subgroup*/ + exit_if_not_equal(acc_select[j], + j / L * L + SGid * sg_size + sg_size / 2, + "select_from_group"); + + /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */ + exit_if_not_equal(acc_xor[j], SGBeginGid + (SGLid ^ (SGid % sg_size)), + "permute_group_by_xor"); + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/reduce.cpp b/SYCL/GroupAlgorithm/SYCL2020/reduce.cpp new file mode 100644 index 0000000000..2c968ce04f --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/reduce.cpp @@ -0,0 +1,103 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missinsg __spirv_GroupIAdd, __spirv_GroupSMin and __spirv_GroupSMax on AMD: +// XFAIL: rocm_amd + +// TODO: enable compile+runtime checks for operations defined in SPIR-V 1.3. +// That requires either adding a switch to clang (-spirv-max-version=1.3) or +// raising the spirv version from 1.1. to 1.3 for spirv translator +// unconditionally. Using operators specific for spirv 1.3 and higher with +// -spirv-max-version=1.1 being set by default causes assert/check fails +// in spirv translator. +// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \ + %t13.out + +#include "support.h" +#include +#include +#include +#include +#include +using namespace sycl; + +template +void test(queue q, InputContainer input, OutputContainer output, + BinaryOperation binary_op, + typename OutputContainer::value_type identity) { + typedef typename InputContainer::value_type InputT; + typedef typename OutputContainer::value_type OutputT; + OutputT init = 42; + size_t N = input.size(); + size_t G = 64; + { + buffer in_buf(input.data(), input.size()); + buffer out_buf(output.data(), output.size()); + + q.submit([&](handler &cgh) { + accessor in{in_buf, cgh, sycl::read_only}; + accessor out{out_buf, cgh, sycl::write_only, sycl::no_init}; + cgh.parallel_for( + nd_range<1>(G, G), [=](nd_item<1> it) { + group<1> g = it.get_group(); + int lid = it.get_local_id(0); + out[0] = reduce_over_group(g, in[lid], binary_op); + out[1] = reduce_over_group(g, in[lid], init, binary_op); + out[2] = joint_reduce(g, in.get_pointer(), in.get_pointer() + N, + binary_op); + out[3] = joint_reduce(g, in.get_pointer(), in.get_pointer() + N, + init, binary_op); + }); + }); + } + // std::reduce is not implemented yet, so use std::accumulate instead + assert(output[0] == std::accumulate(input.begin(), input.begin() + G, + identity, binary_op)); + assert(output[1] == + std::accumulate(input.begin(), input.begin() + G, init, binary_op)); + assert(output[2] == + std::accumulate(input.begin(), input.end(), identity, binary_op)); + assert(output[3] == + std::accumulate(input.begin(), input.end(), init, binary_op)); +} + +int main() { + queue q; + if (!isSupportedDevice(q.get_device())) { + std::cout << "Skipping test\n"; + return 0; + } + + constexpr int N = 128; + std::array input; + std::array output; + std::iota(input.begin(), input.end(), 0); + std::fill(output.begin(), output.end(), 0); + + test(q, input, output, sycl::plus<>(), 0); + test(q, input, output, sycl::minimum<>(), + std::numeric_limits::max()); + test(q, input, output, sycl::maximum<>(), + std::numeric_limits::lowest()); + + test(q, input, output, sycl::plus(), 0); + test(q, input, output, sycl::minimum(), + std::numeric_limits::max()); + test(q, input, output, sycl::maximum(), + std::numeric_limits::lowest()); + +#ifdef SPIRV_1_3 + test(q, input, output, + sycl::multiplies(), 1); + test(q, input, output, sycl::bit_or(), + 0); + test(q, input, output, sycl::bit_xor(), 0); + test(q, input, output, + sycl::bit_and(), ~0); +#endif // SPIRV_1_3 + + std::cout << "Test passed." << std::endl; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.cpp b/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.cpp new file mode 100644 index 0000000000..790eb5030d --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.cpp @@ -0,0 +1,43 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_SubgroupId, __spirv_SubgroupMaxSize, __spirv_SubgroupShuffle* +// on AMD: +// XFAIL: rocm_amd +// +//==------------ shift_left_right.cpp -*- C++ -*----------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "shift_left_right.hpp" + +int main() { + queue Queue; + if (Queue.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + check(Queue); + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.hpp b/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.hpp new file mode 100644 index 0000000000..bad1136fc8 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/shift_left_right.hpp @@ -0,0 +1,143 @@ +//==------- shift_left_right.hpp -*- C++ -*---------------------------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "helpers.hpp" +#include +template class sycl_subgr; + +using namespace cl::sycl; + +// TODO remove this workaround when clang will support correct generation of +// half typename in integration header +struct wa_half; + +// ---- check +template +void check(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer> buf_right(G); + buffer> buf_left(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + accessor acc_right{buf_right, cgh, sycl::read_write}; + accessor acc_left{buf_left, cgh, sycl::read_write}; + accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write}; + + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + vec vwggid(wggid), vsgid(sgid); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + /* Save GID-SGID */ + acc_right[NdItem.get_global_id()] = shift_group_right(SG, vwggid, sgid); + /* Save GID+SGID */ + acc_left[NdItem.get_global_id()] = shift_group_left(SG, vwggid, sgid); + }); + }); + host_accessor acc_right{buf_right, sycl::read_write}; + host_accessor acc_left{buf_left, sycl::read_write}; + host_accessor sgsizeacc{sgsizebuf, sycl::read_write}; + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal_vec(acc_left[j], vec(j + SGid % sg_size), + "shift_group_left"); + } + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal_vec(acc_right[j], vec(j - SGid % sg_size), + "shift_group_right"); + } + + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} + +template void check(queue &Queue, size_t G = 256, size_t L = 64) { + try { + nd_range<1> NdRange(G, L); + buffer buf_right(G); + buffer buf_left(G); + buffer sgsizebuf(1); + Queue.submit([&](handler &cgh) { + accessor acc_right{buf_right, cgh, sycl::read_write}; + accessor acc_left{buf_left, cgh, sycl::read_write}; + accessor sgsizeacc{sgsizebuf, cgh, sycl::read_write}; + cgh.parallel_for>(NdRange, [=](nd_item<1> NdItem) { + ext::oneapi::sub_group SG = NdItem.get_sub_group(); + uint32_t wggid = NdItem.get_global_id(0); + uint32_t sgid = SG.get_group_id().get(0); + if (wggid == 0) + sgsizeacc[0] = SG.get_max_local_range()[0]; + + /* Save GID-SGID */ + acc_right[NdItem.get_global_id()] = shift_group_right(SG, wggid, sgid); + /* Save GID+SGID */ + acc_left[NdItem.get_global_id()] = shift_group_left(SG, wggid, sgid); + }); + }); + host_accessor acc_right{buf_right, sycl::read_write}; + host_accessor acc_left{buf_left, sycl::read_write}; + host_accessor sgsizeacc{sgsizebuf, sycl::read_write}; + + size_t sg_size = sgsizeacc[0]; + int SGid = 0; + int SGLid = 0; + int SGBeginGid = 0; + for (int j = 0; j < G; j++) { + if (j % L % sg_size == 0) { + SGid++; + SGLid = 0; + SGBeginGid = j; + } + if (j % L == 0) { + SGid = 0; + SGLid = 0; + SGBeginGid = j; + } + + /* Value GID+SGID for all element except last SGID in SG*/ + if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) { + exit_if_not_equal(acc_left[j], j + SGid, "shift_group_left"); + } + /* Value GID-SGID for all element except first SGID in SG*/ + if (j % L % sg_size >= SGid) { + exit_if_not_equal(acc_right[j], j - SGid, "shift_group_right"); + } + + SGLid++; + } + } catch (exception e) { + std::cout << "SYCL exception caught: " << e.what(); + exit(1); + } +} diff --git a/SYCL/GroupAlgorithm/SYCL2020/support.h b/SYCL/GroupAlgorithm/SYCL2020/support.h new file mode 100644 index 0000000000..fa5118ea27 --- /dev/null +++ b/SYCL/GroupAlgorithm/SYCL2020/support.h @@ -0,0 +1,23 @@ +#include +using namespace sycl; + +bool isSupportedDevice(device D) { + std::string PlatformName = D.get_platform().get_info(); + if (PlatformName.find("CUDA") != std::string::npos) + return true; + + if (PlatformName.find("Level-Zero") != std::string::npos) + return true; + + if (PlatformName.find("OpenCL") != std::string::npos) { + std::string Version = D.get_info(); + size_t Offset = Version.find("OpenCL"); + if (Offset == std::string::npos) + return false; + Version = Version.substr(Offset + 7, 3); + if (Version >= std::string("2.0")) + return true; + } + + return false; +} diff --git a/SYCL/GroupAlgorithm/back_to_back_collectives.cpp b/SYCL/GroupAlgorithm/back_to_back_collectives.cpp new file mode 100644 index 0000000000..5c5d14fd33 --- /dev/null +++ b/SYCL/GroupAlgorithm/back_to_back_collectives.cpp @@ -0,0 +1,73 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %HOST_RUN_PLACEHOLDER %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out +// +// Missing __spirv_GroupIAdd on AMD: +// XFAIL: rocm_amd + +#include +#include +#include +using namespace cl::sycl; +using namespace cl::sycl::ext::oneapi; + +class back_to_back; + +int main() { + queue q; + if (q.get_device().is_host()) { + std::cout << "Skipping test\n"; + return 0; + } + + // Use max work-group size to maximize chance of race + program prog(q.get_context()); + prog.build_with_kernel_type(); + kernel k = prog.get_kernel(); + device d = q.get_device(); + int N = k.get_info(d); + + std::vector Input(N), Sum(N), EScan(N), IScan(N); + std::iota(Input.begin(), Input.end(), 0); + std::fill(Sum.begin(), Sum.end(), 0); + std::fill(EScan.begin(), EScan.end(), 0); + std::fill(IScan.begin(), IScan.end(), 0); + + { + buffer InputBuf(Input.data(), N); + buffer SumBuf(Sum.data(), N); + buffer EScanBuf(EScan.data(), N); + buffer IScanBuf(IScan.data(), N); + q.submit([&](handler &h) { + auto Input = InputBuf.get_access(h); + auto Sum = SumBuf.get_access(h); + auto EScan = EScanBuf.get_access(h); + auto IScan = IScanBuf.get_access(h); + h.parallel_for(nd_range<1>(N, N), [=](nd_item<1> it) { + size_t i = it.get_global_id(0); + auto g = it.get_group(); + // Loop to increase number of back-to-back calls + for (int r = 0; r < 10; ++r) { + Sum[i] = reduce(g, Input[i], sycl::plus<>()); + EScan[i] = exclusive_scan(g, Input[i], sycl::plus<>()); + IScan[i] = inclusive_scan(g, Input[i], sycl::plus<>()); + } + }); + }); + } + + int sum = 0; + bool passed = true; + for (int i = 0; i < N; ++i) { + passed &= (sum == EScan[i]); + sum += i; + passed &= (sum == IScan[i]); + } + for (int i = 0; i < N; ++i) { + passed &= (sum == Sum[i]); + } + std::cout << "Test passed." << std::endl; + return 0; +} diff --git a/SYCL/Plugin/level_zero_batch_event_status.cpp b/SYCL/Plugin/level_zero_batch_event_status.cpp new file mode 100644 index 0000000000..d98140ac4b --- /dev/null +++ b/SYCL/Plugin/level_zero_batch_event_status.cpp @@ -0,0 +1,108 @@ +// REQUIRES: gpu, level_zero + +// RUN: %clangxx -fsycl -fsycl-unnamed-lambda -fsycl-targets=%sycl_triple %s -o %t.out + +// Set batching to 4 explicitly +// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck %s + +// level_zero_batch_test.cpp +// +// This tests the level zero plugin's kernel batching code. It specifically +// tests that the current batch is submitted when an Event execution status +// request is made. This test uses explicit SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 +// to make sure that the batching is submitted when the piEventGetInfo is +// done, rather than some other dynamic batching criteria. +// +// CHECK: ---> piEnqueueKernelLaunch +// CHECK: ZE ---> zeCommandListAppendLaunchKernel +// Shouldn't have closed until we see a piEventGetInfo +// CHECK-NOT: ZE ---> zeCommandListClose +// CHECK-NOT: ZE ---> zeCommandQueueExecuteCommandLists +// CHECK: ---> piEventGetInfo +// Shouldn't see another piGetEventInfo until after closing command list +// CHECK-NOT: ---> piEventGetInfo +// Look for close and Execute after piEventGetInfo +// CHECK: ZE ---> zeCommandListClose +// CHECK: ZE ---> zeCommandQueueExecuteCommandLists +// CHECK: ---> piEventGetInfo +// CHECK-NOT: piEventsWait +// CHECK: ---> piEnqueueKernelLaunch +// CHECK: ZE ---> zeCommandListAppendLaunchKernel +// CHECK: ---> piEventsWait +// Look for close and Execute after piEventsWait +// CHECK: ZE ---> zeCommandListClose +// CHECK: ZE ---> zeCommandQueueExecuteCommandLists +// CHECK: ---> piEventGetInfo +// No close and execute here, should already have happened. +// CHECK-NOT: ZE ---> zeCommandListClose +// CHECK-NOT: ZE ---> zeCommandQueueExecuteCommandLists +// CHECK-NOT: Test Fail +// CHECK: Test Pass + +#include +#include +#include +#include +#include + +int main(void) { + sycl::default_selector ds{}; + sycl::queue q{ds}; + sycl::vector_class events(10); + + sycl::event ev1 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(events); + cgh.single_task([=] {}); + }); + + bool ev1_completed = false; + int try_count = 0; + while (true) { + auto ev1_status = + ev1.get_info(); + if (ev1_status == sycl::info::event_command_status::complete) { + std::cout << "Ev1 has completed" << std::endl; + ev1_completed = true; + break; + } + + std::cout << "Ev1 has not yet completed: "; + switch (ev1_status) { + case sycl::info::event_command_status::submitted: + std::cout << "submitted"; + break; + case sycl::info::event_command_status::running: + std::cout << "running"; + break; + default: + std::cout << "unrecognized"; + break; + } + std::cout << std::endl; + + std::chrono::milliseconds timespan(300); + std::this_thread::sleep_for(timespan); + + try_count += 1; + if (try_count > 10) { + ev1.wait(); + } + } + assert(ev1_completed); + + sycl::event ev2 = q.submit([&](sycl::handler &cgh) { + cgh.depends_on(events); + cgh.single_task([=] {}); + }); + q.wait(); + + auto ev2_status = ev2.get_info(); + if (ev2_status != sycl::info::event_command_status::complete) { + std::cout << "Test Fail" << std::endl; + exit(1); + } + + std::cout << "Ev2 has completed" << std::endl; + std::cout << "Test Pass" << std::endl; + return 0; +} diff --git a/SYCL/Plugin/level_zero_batch_test.cpp b/SYCL/Plugin/level_zero_batch_test.cpp new file mode 100644 index 0000000000..f834e2bfab --- /dev/null +++ b/SYCL/Plugin/level_zero_batch_test.cpp @@ -0,0 +1,362 @@ +// REQUIRES: gpu, level_zero + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out + +// Set batching to 4 explicitly +// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=4 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB4 %s + +// Set batching to 1 explicitly +// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB1 %s + +// Set batching to 3 explicitly +// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=3 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB3 %s + +// Set batching to 5 explicitly +// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=5 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB5 %s + +// Set batching to 7 explicitly +// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=7 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB7 %s + +// Set batching to 8 explicitly +// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=8 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB8 %s + +// Set batching to 9 explicitly +// RUN: env SYCL_PI_LEVEL_ZERO_BATCH_SIZE=9 SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKB9 %s + +// level_zero_batch_test.cpp +// +// This tests the level zero plugin's kernel batching code. The default +// batching is 4, and exact batch size can be controlled with environment +// variable SYCL_PI_LEVEL_ZEOR+BATCH_SIZE=N. +// This test enqueues 8 kernels and then does a wait. And it does this 3 times. +// Expected output is that for batching =1 you will see zeCommandListClose, +// and zeCommandQueueExecuteCommandLists after every piEnqueueKernelLaunch. +// For batching=3 you will see that after 3rd and 6th enqueues, and then after +// piEventsWait. For 5, after 5th piEnqueue, and then after piEventsWait. For +// 4 you will see these after 4th and 8th Enqueue, and for 8, only after the +// 8th enqueue. And lastly for 9, you will see the Close and Execute calls +// only after the piEventsWait. +// Since the test does this 3 times, this pattern will repeat 2 more times, +// and then the test will print Test Passed 8 times, once for each kernel +// validation check. +// Pattern starts first set of kernel executions. +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB4: ZE ---> zeCommandListClose( +// CKB4: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB5: ZE ---> zeCommandListClose( +// CKB5: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB7: ZE ---> zeCommandListClose( +// CKB7: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB4: ZE ---> zeCommandListClose( +// CKB4: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB8: ZE ---> zeCommandListClose( +// CKB8: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEventsWait( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB5: ZE ---> zeCommandListClose( +// CKB5: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB7: ZE ---> zeCommandListClose( +// CKB7: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB9: ZE ---> zeCommandListClose( +// CKB9: ZE ---> zeCommandQueueExecuteCommandLists( +// Pattern starts 2nd set of kernel executions +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB4: ZE ---> zeCommandListClose( +// CKB4: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB5: ZE ---> zeCommandListClose( +// CKB5: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB7: ZE ---> zeCommandListClose( +// CKB7: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB4: ZE ---> zeCommandListClose( +// CKB4: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB8: ZE ---> zeCommandListClose( +// CKB8: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEventsWait( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB5: ZE ---> zeCommandListClose( +// CKB5: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB7: ZE ---> zeCommandListClose( +// CKB7: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB9: ZE ---> zeCommandListClose( +// CKB9: ZE ---> zeCommandQueueExecuteCommandLists( +// Pattern starts 3rd set of kernel executions +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB4: ZE ---> zeCommandListClose( +// CKB4: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB5: ZE ---> zeCommandListClose( +// CKB5: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB7: ZE ---> zeCommandListClose( +// CKB7: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEnqueueKernelLaunch( +// CKALL: ZE ---> zeCommandListAppendLaunchKernel( +// CKB1: ZE ---> zeCommandListClose( +// CKB1: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB4: ZE ---> zeCommandListClose( +// CKB4: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB8: ZE ---> zeCommandListClose( +// CKB8: ZE ---> zeCommandQueueExecuteCommandLists( +// CKALL: ---> piEventsWait( +// CKB3: ZE ---> zeCommandListClose( +// CKB3: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB5: ZE ---> zeCommandListClose( +// CKB5: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB7: ZE ---> zeCommandListClose( +// CKB7: ZE ---> zeCommandQueueExecuteCommandLists( +// CKB9: ZE ---> zeCommandListClose( +// CKB9: ZE ---> zeCommandQueueExecuteCommandLists( +// Now just check for 8 Test Pass kernel validations. +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass + +#include "CL/sycl.hpp" +#include +#include +#include + +namespace sycl = cl::sycl; + +void validate(uint32_t *result, uint32_t *expect, size_t n) { + int error = 0; + for (int i = 0; i < n; i++) { + if (result[i] != expect[i]) { + error++; + if (error < 10) { + printf("Error: %d, expect: %d\n", result[i], expect[i]); + } + } + } + error > 0 ? printf("Error: %d\n", error) : printf("Test Pass\n"); +} + +int main(int argc, char *argv[]) { + size_t M = 65536; + size_t N = 512 / 4; + size_t AL = M * N * sizeof(uint32_t); + + sycl::queue q(sycl::default_selector{}); + auto ctx = q.get_context(); + auto dev = q.get_device(); + + uint32_t *Y1 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z1 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z2 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z3 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z4 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z5 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z6 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z7 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z8 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + + for (size_t i = 0; i < M * N; i++) { + Y1[i] = i % 255; + } + + memset(Z1, '\0', AL); + memset(Z2, '\0', AL); + memset(Z3, '\0', AL); + memset(Z4, '\0', AL); + memset(Z5, '\0', AL); + memset(Z6, '\0', AL); + memset(Z7, '\0', AL); + memset(Z8, '\0', AL); + + { + for (size_t j = 0; j < 3; j++) { + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z1[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z2[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z3[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z4[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z5[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z6[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z7[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z8[m * N + n] = Y1[m * N + n]; + }); + }); + + q.wait(); + } + } + validate(Y1, Z1, M * N); + validate(Y1, Z2, M * N); + validate(Y1, Z3, M * N); + validate(Y1, Z4, M * N); + validate(Y1, Z5, M * N); + validate(Y1, Z6, M * N); + validate(Y1, Z7, M * N); + validate(Y1, Z8, M * N); + + sycl::free(Y1, ctx); + sycl::free(Z1, ctx); + sycl::free(Z2, ctx); + sycl::free(Z3, ctx); + sycl::free(Z4, ctx); + sycl::free(Z5, ctx); + sycl::free(Z6, ctx); + sycl::free(Z7, ctx); + sycl::free(Z8, ctx); + + return 0; +} diff --git a/SYCL/Plugin/level_zero_dynamic_batch_test.cpp b/SYCL/Plugin/level_zero_dynamic_batch_test.cpp new file mode 100644 index 0000000000..bc07034e05 --- /dev/null +++ b/SYCL/Plugin/level_zero_dynamic_batch_test.cpp @@ -0,0 +1,220 @@ +// REQUIRES: gpu, level_zero + +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out + +// Check that dynamic batching raises/lowers batch size +// RUN: env SYCL_PI_TRACE=2 ZE_DEBUG=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1 | FileCheck --check-prefixes=CKALL,CKDYN %s + +// level_zero_dynamic_batch_test.cpp +// +// This tests the level zero plugin's kernel dynamic batch size adjustment +// code. +// It starts out by enqueing 40 kernels before it does a wait, and it does +// this 5 times. That should cause the dynamic batch size adjustment to +// raise the batch size up 3 times. +// +// Then the test starts enqueueing only 4 kernels before doing a wait, and +// it does that 20 times. That should cause the batch size to +// be lowered to be less than 4. +// +// CKDYN: Raising QueueBatchSize to 5 +// CKDYN: Raising QueueBatchSize to 6 +// CKDYN: Raising QueueBatchSize to 7 +// CKDYN-NOT: Raising QueueBatchSize +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKDYN: Lowering QueueBatchSize to 2 +// CKDYN-NOT: Lowering QueueBatchSize +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass +// CKALL: Test Pass + +#include "CL/sycl.hpp" +#include +#include +#include + +namespace sycl = cl::sycl; + +void validate(uint32_t *result, uint32_t *expect, size_t n) { + int error = 0; + for (int i = 0; i < n; i++) { + if (result[i] != expect[i]) { + error++; + if (error < 10) { + printf("Error: %d, expect: %d\n", result[i], expect[i]); + } + } + } + error > 0 ? printf("Error: %d\n", error) : printf("Test Pass\n"); +} + +int main(int argc, char *argv[]) { + size_t M = 65536; + size_t N = 512 / 4; + size_t AL = M * N * sizeof(uint32_t); + + sycl::queue q(sycl::default_selector{}); + auto ctx = q.get_context(); + auto dev = q.get_device(); + + uint32_t *Y1 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z1 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z2 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z3 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z4 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z5 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z6 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z7 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + uint32_t *Z8 = static_cast(sycl::malloc_shared(AL, dev, ctx)); + + for (size_t i = 0; i < M * N; i++) { + Y1[i] = i % 255; + } + + memset(Z1, '\0', AL); + memset(Z2, '\0', AL); + memset(Z3, '\0', AL); + memset(Z4, '\0', AL); + memset(Z5, '\0', AL); + memset(Z6, '\0', AL); + memset(Z7, '\0', AL); + memset(Z8, '\0', AL); + + for (size_t i = 0; i < 5; i++) { + for (size_t j = 0; j < 5; j++) { + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z1[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z2[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z3[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z4[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z5[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z6[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z7[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z8[m * N + n] = Y1[m * N + n]; + }); + }); + } + q.wait(); + } + + validate(Y1, Z1, M * N); + validate(Y1, Z2, M * N); + validate(Y1, Z3, M * N); + validate(Y1, Z4, M * N); + validate(Y1, Z5, M * N); + validate(Y1, Z6, M * N); + validate(Y1, Z7, M * N); + validate(Y1, Z8, M * N); + + for (size_t i = 0; i < 20; i++) { + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z1[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z2[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z3[m * N + n] = Y1[m * N + n]; + }); + }); + q.submit([&](sycl::handler &h) { + h.parallel_for(sycl::range<2>{M, N}, + [=](sycl::id<2> it) { + const int m = it[0]; + const int n = it[1]; + Z4[m * N + n] = Y1[m * N + n]; + }); + }); + q.wait(); + } + validate(Y1, Z1, M * N); + validate(Y1, Z2, M * N); + validate(Y1, Z3, M * N); + validate(Y1, Z4, M * N); + + sycl::free(Y1, ctx); + sycl::free(Z1, ctx); + sycl::free(Z2, ctx); + sycl::free(Z3, ctx); + sycl::free(Z4, ctx); + sycl::free(Z5, ctx); + sycl::free(Z6, ctx); + sycl::free(Z7, ctx); + sycl::free(Z8, ctx); + + return 0; +} diff --git a/SYCL/Regression/device_num.cpp b/SYCL/Regression/device_num.cpp new file mode 100644 index 0000000000..6fbfc093b8 --- /dev/null +++ b/SYCL/Regression/device_num.cpp @@ -0,0 +1,96 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: env SYCL_DEVICE_FILTER=0 %t.out +// RUN: env SYCL_DEVICE_FILTER=1 %t.out +// RUN: env SYCL_DEVICE_FILTER=2 %t.out +// RUN: env SYCL_DEVICE_FILTER=3 %t.out +// RUN: env SYCL_DEVICE_FILTER=4 %t.out + +// The test is using all available BEs but CUDA machine in CI does not have +// functional OpenCL RT +// UNSUPPORTED: cuda + +#include +#include + +using namespace cl::sycl; +using namespace std; + +void printDeviceType(const device &d) { + string name = d.get_platform().get_info(); + auto DeviceType = d.get_info(); + std::string DeviceTypeName; + + switch (DeviceType) { + case info::device_type::cpu: + DeviceTypeName = "CPU "; + break; + case info::device_type::gpu: + DeviceTypeName = "GPU "; + break; + case info::device_type::host: + DeviceTypeName = "HOST "; + break; + case info::device_type::accelerator: + DeviceTypeName = "ACCELERATOR "; + break; + default: + DeviceTypeName = "UNKNOWN "; + break; + } + std::cout << DeviceTypeName << name << std::endl; +} + +int main() { + const char *envVal = std::getenv("SYCL_DEVICE_FILTER"); + int deviceNum; + std::cout << "SYCL_DEVICE_FILTER=" << envVal << std::endl; + deviceNum = std::atoi(envVal); + + auto devices = device::get_devices(); + if (devices.size() > deviceNum) { + device targetDevice = devices[deviceNum]; + std::cout << "Target Device: "; + printDeviceType(targetDevice); + + { + default_selector ds; + device d = ds.select_device(); + std::cout << "default_selector selected "; + printDeviceType(d); + assert(targetDevice == d && + "The selected device is not the target device specified."); + } + + if (targetDevice.is_gpu()) { + gpu_selector gs; + device d = gs.select_device(); + std::cout << "gpu_selector selected "; + printDeviceType(d); + assert(targetDevice == d && + "The selected device is not the target device specified."); + } else if (targetDevice.is_cpu()) { + cpu_selector cs; + device d = cs.select_device(); + std::cout << "cpu_selector selected "; + printDeviceType(d); + assert(targetDevice == d && + "The selected device is not the target device specified."); + } else if (targetDevice.is_accelerator()) { + accelerator_selector as; + device d = as.select_device(); + std::cout << "accelerator_selector selected "; + printDeviceType(d); + assert(targetDevice == d && + "The selected device is not the target device specified."); + } + // HOST device is always available regardless of SYCL_DEVICE_FILTER + { + host_selector hs; + device d = hs.select_device(); + std::cout << "host_selector selected "; + printDeviceType(d); + assert(d.is_host() && "The selected device is not a host device."); + } + } + return 0; +} diff --git a/SYCL/SpecConstants/2020/host_apis.cpp b/SYCL/SpecConstants/2020/host_apis.cpp new file mode 100644 index 0000000000..c373291938 --- /dev/null +++ b/SYCL/SpecConstants/2020/host_apis.cpp @@ -0,0 +1,105 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %t.out + +// UNSUPPORTED: cuda +// UNSUPPORTED: rocm_nvidia +// UNSUPPORTED: rocm_amd + +#include + +#include + +class Kernel1Name; +class Kernel2Name; + +struct TestStruct { + int a; + int b; +}; + +struct TestStruct2 { + bool a; + long b; +}; + +const static sycl::specialization_id SpecConst1{42}; +const static sycl::specialization_id SpecConst2{42}; +const static sycl::specialization_id SpecConst3{TestStruct{42, 42}}; +const static sycl::specialization_id SpecConst4{42}; +const static sycl::specialization_id SpecConst5{ + TestStruct2{true, 0}}; + +int main() { + sycl::queue Q; + + // No support for host device so far + if (Q.is_host()) + return 0; + + // The code is needed to just have device images in the executable + if (0) { + Q.submit([](sycl::handler &CGH) { CGH.single_task([] {}); }); + Q.submit([](sycl::handler &CGH) { CGH.single_task([] {}); }); + } + + const sycl::context Ctx = Q.get_context(); + const sycl::device Dev = Q.get_device(); + + sycl::kernel_bundle KernelBundle = + sycl::get_kernel_bundle(Ctx, {Dev}); + + assert(KernelBundle.contains_specialization_constants() == true); + assert(KernelBundle.has_specialization_constant() == false); + assert(KernelBundle.has_specialization_constant() == true); + + // Test that unused spec constants are saved. + assert(KernelBundle.get_specialization_constant() == 42); + KernelBundle.set_specialization_constant(1); + assert(KernelBundle.get_specialization_constant() == 1); + + KernelBundle.set_specialization_constant(1); + { + auto ExecBundle = sycl::build(KernelBundle); + assert(ExecBundle.get_specialization_constant() == 1); + assert(ExecBundle.get_specialization_constant() == 1); + sycl::buffer Buf{sycl::range{1}}; + Q.submit([&](sycl::handler &CGH) { + CGH.use_kernel_bundle(ExecBundle); + auto Acc = Buf.get_access(CGH); + CGH.single_task([=](sycl::kernel_handler KH) { + Acc[0] = KH.get_specialization_constant(); + }); + }); + auto Acc = Buf.get_access(); + assert(Acc[0] == 1); + } + + { + sycl::buffer Buf{sycl::range{1}}; + Q.submit([&](sycl::handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.set_specialization_constant(TestStruct{1, 2}); + const auto SC = CGH.get_specialization_constant(); + assert(SC == 42); + CGH.single_task([=](sycl::kernel_handler KH) { + Acc[0] = KH.get_specialization_constant(); + }); + }); + auto Acc = Buf.get_access(); + assert(Acc[0].a == 1 && Acc[0].b == 2); + } + + sycl::kernel_bundle KernelBundle2 = + sycl::get_kernel_bundle(Ctx, {Dev}); + KernelBundle2.set_specialization_constant(TestStruct2{false, 1}); + + sycl::kernel_bundle KernelBundle3 = + sycl::join({KernelBundle2, KernelBundle}); + + assert(KernelBundle3.get_specialization_constant().a == false); + assert(KernelBundle3.get_specialization_constant().b == 1); + assert(KernelBundle3.get_specialization_constant() == 1); + assert(KernelBundle3.get_specialization_constant() == 1); + + return 0; +} diff --git a/SYCL/SpecConstants/2020/kernel_lambda_with_kernel_handler_arg.cpp b/SYCL/SpecConstants/2020/kernel_lambda_with_kernel_handler_arg.cpp new file mode 100644 index 0000000000..dd7a27b8db --- /dev/null +++ b/SYCL/SpecConstants/2020/kernel_lambda_with_kernel_handler_arg.cpp @@ -0,0 +1,111 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// +// Hits an assert in the Lower Work Group Scope Code pass on AMD: +// XFAIL: rocm_amd + +// This test checks all possible scenarios of running single_task, parallel_for +// and parallel_for_work_group to verify that this code compiles and runs +// correctly with user's lambda with and without sycl::kernel_handler argument + +#include + +int main() { + sycl::queue q; + + // single_task w/o kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.single_task([=]() {}); + }); + + // single_task with kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.single_task( + [=](sycl::kernel_handler kh) {}); + }); + + // parallel_for with id and w/o kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(1), [](sycl::id<1> i) {}); + }); + + // parallel_for with id and kernel_handler args + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(1), [](sycl::id<1> i, sycl::kernel_handler kh) {}); + }); + + // parallel_for with item and w/o kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<3>(3, 3, 3), [](sycl::item<3> it) {}); + }); + + // parallel_for with item and kernel_handler args + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<3>(3, 3, 3), + [](sycl::item<3> it, sycl::kernel_handler kh) {}); + }); + + // parallel_for with nd_item and w/o kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(4, 4, 4), sycl::range<3>(2, 2, 2)), + [=](sycl::nd_item<3> item) {}); + }); + + // parallel_for with nd_item and kernel_handler args + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(4, 4, 4), sycl::range<3>(2, 2, 2)), + [=](sycl::nd_item<3> item, sycl::kernel_handler kh) {}); + }); + + // parallel_for with generic lambda w/o kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<3>(3, 3, 3), [](auto it) {}); + }); + + // parallel_for with generic lambda with kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<3>(3, 3, 3), [](auto it, sycl::kernel_handler kh) {}); + }); + + // parallel_for with integral type arg and w/o kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(1), [](int index) {}); + }); + + // parallel_for with integral type and kernel_handler args + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::range<1>(1), [](int index, sycl::kernel_handler kh) {}); + }); + + // parallel_for_work_group without kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for_work_group< + class KernelParallelForWorkGroupWithKernelHandler>( + sycl::range<3>(2, 2, 2), sycl::range<3>(2, 2, 2), + [=](sycl::group<3> myGroup) { + myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {}); + myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {}); + }); + }); + + // parallel_for_work_group with kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for_work_group< + class KernelParallelForWorkGroupWithoutKernelHandler>( + sycl::range<3>(2, 2, 2), sycl::range<3>(2, 2, 2), + [=](sycl::group<3> myGroup, sycl::kernel_handler kh) { + myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {}); + myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {}); + }); + }); +} diff --git a/SYCL/SpecConstants/2020/non_native/Inputs/common.cpp b/SYCL/SpecConstants/2020/non_native/Inputs/common.cpp new file mode 100644 index 0000000000..94bf445f11 --- /dev/null +++ b/SYCL/SpecConstants/2020/non_native/Inputs/common.cpp @@ -0,0 +1,54 @@ +#include + +#include + +class Kernel1Name; +class Kernel2Name; + +struct TestStruct { + int a; + int b; +}; + +const static sycl::specialization_id SpecConst1{42}; +const static sycl::specialization_id SpecConst2{42}; +const static sycl::specialization_id SpecConst3{TestStruct{42, 42}}; +const static sycl::specialization_id SpecConst4{42}; + +int main() { + sycl::queue Q; + + // No support for host device so far + if (Q.is_host()) + return 0; + + { + sycl::buffer Buf{sycl::range{1}}; + Q.submit([&](sycl::handler &CGH) { + CGH.set_specialization_constant(1); + auto Acc = Buf.get_access(CGH); + CGH.single_task([=](sycl::kernel_handler KH) { + Acc[0] = KH.get_specialization_constant(); + }); + }); + auto Acc = Buf.get_access(); + assert(Acc[0] == 1); + } + + { + sycl::buffer Buf{sycl::range{1}}; + Q.submit([&](sycl::handler &CGH) { + auto Acc = Buf.get_access(CGH); + CGH.set_specialization_constant(TestStruct{1, 2}); + const auto SC = CGH.get_specialization_constant(); + assert(SC == 42); + CGH.single_task([=](sycl::kernel_handler KH) { + Acc[0] = KH.get_specialization_constant(); + }); + }); + auto Acc = Buf.get_access(); + assert(Acc[0].a == 1 && Acc[0].b == 2); + } + + return 0; +} diff --git a/SYCL/SpecConstants/2020/non_native/accelerator.cpp b/SYCL/SpecConstants/2020/non_native/accelerator.cpp new file mode 100644 index 0000000000..c11958d10d --- /dev/null +++ b/SYCL/SpecConstants/2020/non_native/accelerator.cpp @@ -0,0 +1,7 @@ +// REQUIRES: aoc, accelerator + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_fpga-unknown-unknown-sycldevice %S/Inputs/common.cpp -o %t.out +// RUN: %ACC_RUN_PLACEHOLDER %t.out + +// This test checks correctness of SYCL2020 non-native specialization constants +// on accelerator device diff --git a/SYCL/SpecConstants/2020/non_native/aot_w_kernel_handler_wo_spec_consts.cpp b/SYCL/SpecConstants/2020/non_native/aot_w_kernel_handler_wo_spec_consts.cpp new file mode 100644 index 0000000000..f0a71e5110 --- /dev/null +++ b/SYCL/SpecConstants/2020/non_native/aot_w_kernel_handler_wo_spec_consts.cpp @@ -0,0 +1,37 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out +// +// Hits an assert in the Lower Work Group Scope Code pass on AMD: +// XFAIL: rocm_amd + +// This test checks correctness of compiling and running of application with +// kernel lambdas containing kernel_handler arguments and w/o usage of +// specialization constants in AOT mode + +#include + +int main() { + sycl::queue q; + + q.submit([&](sycl::handler &cgh) { + cgh.single_task( + [=](sycl::kernel_handler kh) {}); + }); + + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for( + sycl::nd_range<3>(sycl::range<3>(4, 4, 4), sycl::range<3>(2, 2, 2)), + [=](sycl::nd_item<3> item, sycl::kernel_handler kh) {}); + }); + + // parallel_for_work_group with kernel_handler arg + q.submit([&](sycl::handler &cgh) { + cgh.parallel_for_work_group< + class KernelParallelForWorkGroupWithoutKernelHandler>( + sycl::range<3>(2, 2, 2), sycl::range<3>(2, 2, 2), + [=](sycl::group<3> myGroup, sycl::kernel_handler kh) { + myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {}); + myGroup.parallel_for_work_item([&](sycl::h_item<3> myItem) {}); + }); + }); +} diff --git a/SYCL/SpecConstants/2020/non_native/cpu.cpp b/SYCL/SpecConstants/2020/non_native/cpu.cpp new file mode 100644 index 0000000000..a2c91f1b58 --- /dev/null +++ b/SYCL/SpecConstants/2020/non_native/cpu.cpp @@ -0,0 +1,7 @@ +// REQUIRES: opencl-aot, cpu + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_x86_64-unknown-unknown-sycldevice %S/Inputs/common.cpp -o %t.out +// RUN: %CPU_RUN_PLACEHOLDER %t.out + +// This test checks correctness of SYCL2020 non-native specialization constants +// on CPU device diff --git a/SYCL/SpecConstants/2020/non_native/cuda.cpp b/SYCL/SpecConstants/2020/non_native/cuda.cpp new file mode 100644 index 0000000000..01d0d37e04 --- /dev/null +++ b/SYCL/SpecConstants/2020/non_native/cuda.cpp @@ -0,0 +1,10 @@ +// REQUIRES: cuda + +// RUN: %clangxx -fsycl -fsycl-targets=nvptx64-unknown-unknown-sycldevice %S/Inputs/common.cpp -o %t.out +// RUN: env SYCL_DEVICE_FILTER=cuda %t.out + +// TODO: enable this test then compile-time error in sycl-post-link is fixed +// UNSUPPORTED: cuda + +// This test checks correctness of SYCL2020 non-native specialization constants +// on CUDA device diff --git a/SYCL/SpecConstants/2020/non_native/gpu.cpp b/SYCL/SpecConstants/2020/non_native/gpu.cpp new file mode 100644 index 0000000000..879fc95e94 --- /dev/null +++ b/SYCL/SpecConstants/2020/non_native/gpu.cpp @@ -0,0 +1,13 @@ +// REQUIRES: ocloc, gpu, TEMPORARY_DISABLED +// UNSUPPORTED: cuda +// CUDA is not compatible with SPIR. +// +// UNSUPPORTED: rocm_nvidia +// UNSUPPORTED: rocm_amd +// ROCm is not compatible with SPIR. + +// RUN: %clangxx -fsycl -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend=spir64_gen-unknown-unknown-sycldevice "-device *" %S/Inputs/common.cpp -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +// This test checks correctness of SYCL2020 non-native specialization constants +// on GPU device