intel · sommerlukas · Jan 26, 2023 · Feb 14, 2023 · Feb 15, 2023 · Feb 16, 2023
@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test fusion being aborted: Different scenarios causing the JIT compiler

@@ -1,9 +1,11 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
+// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\
+// RUN: %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER
-// RUN: env SYCL_ENABLE_FUSION_CACHING=0 SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
+// RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0\
+// RUN: %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test incomplete internalization: Different scenarios causing the JIT compiler

@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized"
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "Computation error" --implicit-check-not "Internalized"
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test pointers being stored are not internalized.

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with local internalization and a combination of kernels

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test cancel fusion

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion without any internalization

@@ -1,7 +1,7 @@
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// XFAIL: cuda || hip
+// XFAIL: hip
 // REQUIRES: fusion
 
 // Test correct return from device information descriptor.

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the

@@ -0,0 +1,111 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// REQUIRES: fusion
+
+// Test complete fusion with local internalization specified on the
+// buffers for a combination of four kernels, forming a diamond-like shape and
+// repeating one of the kernels.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+struct AddKernel { // Kernel functor: element-wise sum of two accessors.
+  accessor<int, 1> accIn1; // First addend.
+  accessor<int, 1> accIn2; // Second addend.
+  accessor<int, 1> accOut; // Receives accIn1[i] + accIn2[i].
+
+  void operator()(id<1> i) const { accOut[i] = accIn1[i] + accIn2[i]; }
+};
+
+int main() { // Submits four kernels in fusion mode, then validates host data.
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp1[dataSize],
+      tmp2[dataSize], tmp3[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp1[i] = -1; // Sentinel; the final checks expect it to remain -1.
+    tmp2[i] = -1; // Sentinel, as for tmp1.
+    tmp3[i] = -1; // Sentinel, as for tmp1.
+    out[i] = -1; // Written by the last kernel.
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  { // Buffers live only in this scope; results are checked after it ends.
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp1{ // Intermediate buffer created with promote_local.
+        tmp1,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bTmp2{ // Intermediate buffer created with promote_local.
+        tmp2,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bTmp3{ // Intermediate buffer created with promote_local.
+        tmp3,
+        range{dataSize},
+        {sycl::ext::codeplay::experimental::property::promote_local{}}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) { // Kernel 1: tmp1 = in1 + in2.
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp1 = bTmp1.get_access(cgh);
+      cgh.parallel_for<AddKernel>(nd_range<1>{{dataSize}, {16}},
+                                  AddKernel{accIn1, accIn2, accTmp1});
+    });
+
+    q.submit([&](handler &cgh) { // Kernel 2: tmp2 = tmp1 * in3.
+      auto accTmp1 = bTmp1.get_access(cgh);
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accTmp2 = bTmp2.get_access(cgh);
+      cgh.parallel_for<class KernelOne>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accTmp2[i] = accTmp1[i] * accIn3[i]; });
+    });
+
+    q.submit([&](handler &cgh) { // Kernel 3: tmp3 = tmp1 * 5.
+      auto accTmp1 = bTmp1.get_access(cgh);
+      auto accTmp3 = bTmp3.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accTmp3[i] = accTmp1[i] * 5; });
+    });
+
+    q.submit([&](handler &cgh) { // Kernel 4: out = tmp2 + tmp3 (AddKernel again).
+      auto accTmp2 = bTmp2.get_access(cgh);
+      auto accTmp3 = bTmp3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<AddKernel>(nd_range<1>{{dataSize}, {16}},
+                                  AddKernel{accTmp2, accTmp3, accOut});
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i + i * 25) && "Computation error"); // (2i + 3i) * 4i + (2i + 3i) * 5
+    assert(tmp1[i] == -1 && "tmp1 not internalized"); // Sentinel must survive.
+    assert(tmp2[i] == -1 && "tmp2 not internalized"); // Sentinel must survive.
+    assert(tmp3[i] == -1 && "tmp3 not internalized"); // Sentinel must survive.
+  }
+
+  return 0;
+}
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test validity of events after cancel_fusion.

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test validity of events after complete_fusion.

@@ -0,0 +1,78 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// UNSUPPORTED: hip
+// REQUIRES: fusion
+
+// Test complete fusion with local internalization and a local accessor that
+// already exists in one of the input kernels.
+
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main() { // Submits two kernels in fusion mode, then validates host data.
+  constexpr size_t dataSize = 512;
+  int in1[dataSize], in2[dataSize], in3[dataSize], tmp[dataSize], out[dataSize];
+
+  for (size_t i = 0; i < dataSize; ++i) {
+    in1[i] = i * 2;
+    in2[i] = i * 3;
+    in3[i] = i * 4;
+    tmp[i] = -1; // Sentinel; the final check expects it to remain -1.
+    out[i] = -1; // Written by the second kernel.
+  }
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+
+  { // Buffers live only in this scope; results are checked after it ends.
+    buffer<int> bIn1{in1, range{dataSize}};
+    buffer<int> bIn2{in2, range{dataSize}};
+    buffer<int> bIn3{in3, range{dataSize}};
+    buffer<int> bTmp{tmp, range{dataSize}};
+    buffer<int> bOut{out, range{dataSize}};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    q.submit([&](handler &cgh) { // Kernel 1: tmp = in1 + in2, in2 staged locally.
+      auto accIn1 = bIn1.get_access(cgh);
+      auto accIn2 = bIn2.get_access(cgh);
+      auto accTmp = bTmp.get_access( // promote_local requested on the accessor.
+          cgh, sycl::ext::codeplay::experimental::property::promote_local{});
+      local_accessor<int> accLocal{16, cgh}; // 16 elements, same as the local range.
+      cgh.parallel_for<class KernelOne>(
+          nd_range<1>{{dataSize}, {16}}, [=](nd_item<1> i) {
+            size_t globalIdx = i.get_global_linear_id();
+            size_t localIdx = i.get_local_linear_id();
+            accLocal[localIdx] = accIn2[globalIdx]; // Each item writes its own slot.
+            accTmp[globalIdx] = accIn1[globalIdx] + accLocal[localIdx];
+          });
+    });
+
+    q.submit([&](handler &cgh) { // Kernel 2: out = tmp * in3.
+      auto accTmp = bTmp.get_access( // promote_local requested on the accessor.
+          cgh, sycl::ext::codeplay::experimental::property::promote_local{});
+      auto accIn3 = bIn3.get_access(cgh);
+      auto accOut = bOut.get_access(cgh);
+      cgh.parallel_for<class KernelTwo>(
+          nd_range<1>{{dataSize}, {16}},
+          [=](id<1> i) { accOut[i] = accTmp[i] * accIn3[i]; });
+    });
+
+    fw.complete_fusion({ext::codeplay::experimental::property::no_barriers{}});
+
+    assert(!fw.is_in_fusion_mode() &&
+           "Queue should not be in fusion mode anymore");
+  }
+
+  // Check the results
+  for (size_t i = 0; i < dataSize; ++i) {
+    assert(out[i] == (20 * i * i) && "Computation error"); // (2i + 3i) * 4i
+    assert(tmp[i] == -1 && "Not internalized"); // Sentinel must survive.
+  }
+
+  return 0;
+}
@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion where one kernel in the fusion list specifies an

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test internalization of a nested array type.

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with internalization of a deep struct type.

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with internalization of a struct type.

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with private internalization specified on the

@@ -1,9 +1,9 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %CPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %CPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %GPU_RUN_PLACEHOLDER %t.out 2>&1\
 // RUN: %GPU_CHECK_PLACEHOLDER --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test caching for JIT fused kernels. Also test for debug messages being

@@ -1,7 +1,7 @@
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-embed-ir %s -o %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// UNSUPPORTED: cuda || hip
+// UNSUPPORTED: hip
 // REQUIRES: fusion
 
 // Test complete fusion with local internalization specified on the