Skip to content
This repository has been archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL][Fusion] Kernel Fusion support for CUDA backend #1683

Open
wants to merge 5 commits into
base: intel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/abort_fusion.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test fusion being aborted: Different scenarios causing the JIT compiler
Expand Down
10 changes: 6 additions & 4 deletions SYCL/KernelFusion/abort_internalization.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\
// RUN: %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER
// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\
// RUN: %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test incomplete internalization: Different scenarios causing the JIT compiler
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/abort_internalization_stored_ptr.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized"
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized"
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test pointers being stored are not internalized.
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/barrier_local_internalization.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with local internalization and a combination of kernels
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/buffer_internalization.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with private internalization specified on the
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/cancel_fusion.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test cancel fusion
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/complete_fusion.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion without any internalization
Expand Down
2 changes: 1 addition & 1 deletion SYCL/KernelFusion/device_info_descriptor.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// XFAIL: cuda || hip
// XFAIL: hip
// REQUIRES: fusion

// Test correct return from device information descriptor.
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/diamond_shape.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with private internalization specified on the
Expand Down
111 changes: 111 additions & 0 deletions SYCL/KernelFusion/diamond_shape_local.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with local internalization specified on the
// accessors for a combination of four kernels, forming a diamond-like shape and
// repeating one of the kernels.

#include <sycl/sycl.hpp>

using namespace sycl;

struct AddKernel {
accessor<int, 1> accIn1;
accessor<int, 1> accIn2;
accessor<int, 1> accOut;

void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
};

int main() {
constexpr size_t dataSize = 512;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp1[dataSize],
tmp2[dataSize], tmp3[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp1[i] = -1;
tmp2[i] = -1;
tmp3[i] = -1;
out[i] = -1;
}

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

{
buffer<int> bIn1{in1, range{dataSize}};
buffer<int> bIn2{in2, range{dataSize}};
buffer<int> bIn3{in3, range{dataSize}};
buffer<int> bTmp1{
tmp1,
range{dataSize},
{sycl::ext::codeplay::experimental::property::promote_local{}}};
buffer<int> bTmp2{
tmp2,
range{dataSize},
{sycl::ext::codeplay::experimental::property::promote_local{}}};
buffer<int> bTmp3{
tmp3,
range{dataSize},
{sycl::ext::codeplay::experimental::property::promote_local{}}};
buffer<int> bOut{out, range{dataSize}};

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access(cgh);
auto accIn2 = bIn2.get_access(cgh);
auto accTmp1 = bTmp1.get_access(cgh);
cgh.parallel_for<AddKernel>(nd_range<1>{{dataSize}, {16}},
AddKernel{accIn1, accIn2, accTmp1});
});

q.submit([&](handler &cgh) {
auto accTmp1 = bTmp1.get_access(cgh);
auto accIn3 = bIn3.get_access(cgh);
auto accTmp2 = bTmp2.get_access(cgh);
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
});

q.submit([&](handler &cgh) {
auto accTmp1 = bTmp1.get_access(cgh);
auto accTmp3 = bTmp3.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
});

q.submit([&](handler &cgh) {
auto accTmp2 = bTmp2.get_access(cgh);
auto accTmp3 = bTmp3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<AddKernel>(nd_range<1>{{dataSize}, {16}},
AddKernel{accTmp2, accTmp3, accOut});
});

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");
}

// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (20 * i * i + i * 25) && "Computation error");
assert(tmp1[i] == -1 && "tmp1 not internalized");
assert(tmp2[i] == -1 && "tmp2 not internalized");
assert(tmp3[i] == -1 && "tmp3 not internalized");
}

return 0;
}
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/event_wait_cancel.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test validity of events after cancel_fusion.
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/event_wait_complete.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test validity of events after complete_fusion.
Expand Down
78 changes: 78 additions & 0 deletions SYCL/KernelFusion/existing_local_accessor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with local internalization and an local accessor that
// already exists in one of the input kernels.

#include <sycl/sycl.hpp>

using namespace sycl;

int main() {
constexpr size_t dataSize = 512;
int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];

for (size_t i = 0; i < dataSize; ++i) {
in1[i] = i * 2;
in2[i] = i * 3;
in3[i] = i * 4;
tmp[i] = -1;
out[i] = -1;
}

queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};

{
buffer<int> bIn1{in1, range{dataSize}};
buffer<int> bIn2{in2, range{dataSize}};
buffer<int> bIn3{in3, range{dataSize}};
buffer<int> bTmp{tmp, range{dataSize}};
buffer<int> bOut{out, range{dataSize}};

ext::codeplay::experimental::fusion_wrapper fw{q};
fw.start_fusion();

assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");

q.submit([&](handler &cgh) {
auto accIn1 = bIn1.get_access(cgh);
auto accIn2 = bIn2.get_access(cgh);
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
local_accessor<int> accLocal{16, cgh};
cgh.parallel_for<class KernelOne>(
nd_range<1>{{dataSize}, {16}}, [=](nd_item<1> i) {
size_t globalIdx = i.get_global_linear_id();
size_t localIdx = i.get_local_linear_id();
accLocal[localIdx] = accIn2[globalIdx];
accTmp[globalIdx] = accIn1[globalIdx] + accLocal[localIdx];
});
});

q.submit([&](handler &cgh) {
auto accTmp = bTmp.get_access(
cgh, sycl::ext::codeplay::experimental::property::promote_local{});
auto accIn3 = bIn3.get_access(cgh);
auto accOut = bOut.get_access(cgh);
cgh.parallel_for<class KernelTwo>(
nd_range<1>{{dataSize}, {16}},
[=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
});

fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});

assert(!fw.is_in_fusion_mode() &&
"Queue should not be in fusion mode anymore");
}

// Check the results
for (size_t i = 0; i < dataSize; ++i) {
assert(out[i] == (20 * i * i) && "Computation error");
assert(tmp[i] == -1 && "Not internalized");
}

return 0;
}
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/internal_explicit_dependency.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion where one kernel in the fusion list specifies an
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/internalize_array_wrapper.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test internalization of a nested array type.
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/internalize_deep.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with internalization of a deep struct type.
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/internalize_multi_ptr.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with private internalization specified on the
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/internalize_vec.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with internalization of a struct type.
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/internalize_vfunc.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with private internalization specified on the
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/jit_caching.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
// RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
// RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test caching for JIT fused kernels. Also test for debug messages being
Expand Down
4 changes: 2 additions & 2 deletions SYCL/KernelFusion/local_internalization.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// UNSUPPORTED: cuda || hip
// UNSUPPORTED: hip
// REQUIRES: fusion

// Test complete fusion with local internalization specified on the
Expand Down
Loading