From 1a1413f98d0c523bc58de5f566d32ff9e1d3fbce Mon Sep 17 00:00:00 2001
From: Ewan Crawford <ewan@codeplay.com>
Date: Fri, 18 Nov 2022 16:27:54 +0000
Subject: [PATCH] [SYCL] Repro for reduction fail

* Add RUN lines to the tests so that they are picked up by LIT (the RUN lines only compile each test)
* clang-format existing tests, and other minor cleanups
* Add `graph-explicit-reduction.cpp`, which reproduces the failure from https://github.com/reble/llvm/issues/24: a reduction is submitted outside the graph building API on a queue created with the `sycl::ext::oneapi::property::queue::lazy_execution` property
---
 sycl/test/graph/graph-explicit-dotp.cpp      | 149 ++++++++++---------
 sycl/test/graph/graph-explicit-reduction.cpp |  37 +++++
 sycl/test/graph/graph-explicit-simple.cpp    |  73 +++++----
 3 files changed, 149 insertions(+), 110 deletions(-)
 create mode 100644 sycl/test/graph/graph-explicit-reduction.cpp
diff --git a/sycl/test/graph/graph-explicit-dotp.cpp b/sycl/test/graph/graph-explicit-dotp.cpp
index 8525b04904b23..c32a839c919e9 100644
--- a/sycl/test/graph/graph-explicit-dotp.cpp
+++ b/sycl/test/graph/graph-explicit-dotp.cpp
@@ -1,94 +1,97 @@
-#include <iostream>
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 #include <CL/sycl.hpp>
+#include <iostream>
 
 #include <sycl/ext/oneapi/experimental/graph.hpp>
 
 const size_t n = 10;
 
 float host_gold_result() {
-    float alpha = 1.0f;
-    float beta = 2.0f;
-    float gamma = 3.0f;
-    
-    float sum = 0.0f;
-    
-    for(size_t i = 0; i < n; ++i) {
-        sum += (alpha * 1.0f + beta * 2.0f)
-             * (gamma * 3.0f + beta * 2.0f);
-    }
-    
-    return sum;
+  float alpha = 1.0f;
+  float beta = 2.0f;
+  float gamma = 3.0f;
+
+  float sum = 0.0f;
+
+  for (size_t i = 0; i < n; ++i) {
+    sum += (alpha * 1.0f + beta * 2.0f) * (gamma * 3.0f + beta * 2.0f);
+  }
+
+  return sum;
 }
 
 int main() {
-    float alpha = 1.0f;
-    float beta = 2.0f;
-    float gamma = 3.0f;
-    
-    float *x , *y, *z;
-
-    sycl::property_list properties{
-      sycl::property::queue::in_order(),
-      sycl::ext::oneapi::property::queue::lazy_execution{}
-    };
-
-    sycl::queue q{sycl::gpu_selector_v, properties};
-
-    sycl::ext::oneapi::experimental::command_graph g;
-    
-    float *dotp = sycl::malloc_shared<float>(1, q);
-    
-    x = sycl::malloc_shared<float>(n, q);
-    y = sycl::malloc_shared<float>(n, q);
-    z = sycl::malloc_shared<float>(n, q);
-    
-    /* init data on the device */
-    auto n_i = g.add([&](sycl::handler &h) {
-        h.parallel_for(n, [=](sycl::id<1> it){
-            const size_t i = it[0];
-            x[i] = 1.0f;
-            y[i] = 2.0f;
-            z[i] = 3.0f;
-        });
+  float alpha = 1.0f;
+  float beta = 2.0f;
+  float gamma = 3.0f;
+
+  sycl::property_list properties{
+      sycl::property::queue::in_order{},
+      sycl::ext::oneapi::property::queue::lazy_execution{}};
+
+  sycl::queue q{sycl::gpu_selector_v, properties};
+
+  sycl::ext::oneapi::experimental::command_graph g;
+
+  float *dotp = sycl::malloc_shared<float>(1, q);
+
+  float *x = sycl::malloc_shared<float>(n, q);
+  float *y = sycl::malloc_shared<float>(n, q);
+  float *z = sycl::malloc_shared<float>(n, q);
+
+  /* init data on the device */
+  auto n_i = g.add([&](sycl::handler &h) {
+    h.parallel_for(n, [=](sycl::id<1> it) {
+      const size_t i = it[0];
+      x[i] = 1.0f;
+      y[i] = 2.0f;
+      z[i] = 3.0f;
     });
+  });
 
-    auto node_a = g.add([&](sycl::handler &h) {
+  auto node_a = g.add(
+      [&](sycl::handler &h) {
         h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
-            const size_t i = it[0];
-            x[i] = alpha * x[i] + beta * y[i];
+          const size_t i = it[0];
+          x[i] = alpha * x[i] + beta * y[i];
         });
-    }, {n_i});
-    
-    auto node_b = g.add([&](sycl::handler &h) {
+      },
+      {n_i});
+
+  auto node_b = g.add(
+      [&](sycl::handler &h) {
         h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> it) {
-            const size_t i = it[0];
-            z[i] = gamma * z[i] + beta * y[i];
+          const size_t i = it[0];
+          z[i] = gamma * z[i] + beta * y[i];
         });
-    }, {n_i});
+      },
+      {n_i});
 
-    auto node_c = g.add([&](sycl::handler &h) {
+  auto node_c = g.add(
+      [&](sycl::handler &h) {
         h.parallel_for(sycl::range<1>{n},
                        sycl::reduction(dotp, 0.0f, std::plus()),
                        [=](sycl::id<1> it, auto &sum) {
-                           const size_t i = it[0];
-                           sum += x[i] * z[i];
+                         const size_t i = it[0];
+                         sum += x[i] * z[i];
                        });
-    }, {node_a, node_b});
-
-    auto exec_graph = g.finalize(q.get_context());
-    
-    exec_graph.exec_and_wait(q);
-
-    if (*dotp != host_gold_result()) {
-      std::cout << "Error unexpected result!\n";
-    }
-
-    sycl::free(dotp, q);
-    sycl::free(x, q);
-    sycl::free(y, q);
-    sycl::free(z, q);
-    
-    std::cout << "done.\n";
-    
-    return 0;
-}
\ No newline at end of file
+      },
+      {node_a, node_b});
+
+  auto exec_graph = g.finalize(q.get_context());
+
+  exec_graph.exec_and_wait(q);
+
+  if (*dotp != host_gold_result()) {
+    std::cout << "Error unexpected result!\n";
+  }
+
+  sycl::free(dotp, q);
+  sycl::free(x, q);
+  sycl::free(y, q);
+  sycl::free(z, q);
+
+  std::cout << "done.\n";
+
+  return 0;
+}
diff --git a/sycl/test/graph/graph-explicit-reduction.cpp b/sycl/test/graph/graph-explicit-reduction.cpp
new file mode 100644
index 0000000000000..9a2788079570c
--- /dev/null
+++ b/sycl/test/graph/graph-explicit-reduction.cpp
@@ -0,0 +1,37 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+#include <CL/sycl.hpp>
+#include <iostream>
+
+#include <sycl/ext/oneapi/experimental/graph.hpp>
+
+int main() { // Repro: reduction submitted directly to a lazy_execution queue.
+  sycl::property_list properties{
+      sycl::property::queue::in_order{},
+      sycl::ext::oneapi::property::queue::lazy_execution{}};
+
+  sycl::queue q{sycl::gpu_selector_v, properties};
+
+  sycl::ext::oneapi::experimental::command_graph g; // not referenced below
+
+  const size_t n = 10;
+  float *input = sycl::malloc_shared<float>(n, q);
+  float *output = sycl::malloc_shared<float>(1, q); // single reduction result
+  for (size_t i = 0; i < n; i++) {
+    input[i] = i; // host fills input with 0 .. n-1
+  }
+
+  auto e = q.submit([&](sycl::handler &h) { // q.submit, not g.add
+    h.parallel_for(sycl::range<1>{n},
+                   sycl::reduction(output, 0.0f, std::plus()),
+                   [=](sycl::id<1> idx, auto &sum) { sum += input[idx]; });
+  });
+
+  e.wait(); // block on the event returned by submit
+
+  sycl::free(input, q);
+  sycl::free(output, q); // NOTE(review): *output is never read or checked
+
+  std::cout << "done\n";
+
+  return 0;
+}
diff --git a/sycl/test/graph/graph-explicit-simple.cpp b/sycl/test/graph/graph-explicit-simple.cpp
index 339d5b6e76150..d2f0098322172 100644
--- a/sycl/test/graph/graph-explicit-simple.cpp
+++ b/sycl/test/graph/graph-explicit-simple.cpp
@@ -1,43 +1,42 @@
-#include <iostream>
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 #include <CL/sycl.hpp>
+#include <iostream>
 
 #include <sycl/ext/oneapi/experimental/graph.hpp>
 
-const size_t n = 10;
-
 int main() {
-    
-    sycl::property_list properties{
-      sycl::property::queue::in_order(),
-      sycl::ext::oneapi::property::queue::lazy_execution{}
-    };
-
-    //sycl::gpu_selector device_selector;
-    
-    sycl::queue q{sycl::gpu_selector_v, properties};
-    
-    //sycl::queue copy_q{};
-    
-    sycl::ext::oneapi::experimental::command_graph g;
-    
-    float *arr = sycl::malloc_shared<float>(n, q);
-    
-    g.add(
-        [&](sycl::handler& h){
-        h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx){size_t i = idx; arr[i]=1; });
+
+  sycl::property_list properties{
+      sycl::property::queue::in_order{},
+      sycl::ext::oneapi::property::queue::lazy_execution{}};
+
+  sycl::queue q{sycl::gpu_selector_v, properties};
+
+  sycl::ext::oneapi::experimental::command_graph g;
+
+  const size_t n = 10;
+  float *arr = sycl::malloc_shared<float>(n, q);
+
+  g.add([&](sycl::handler &h) {
+    h.parallel_for(sycl::range<1>{n}, [=](sycl::id<1> idx) {
+      size_t i = idx;
+      arr[i] = 1;
     });
-    
-    auto result_before_exec1 = arr[0];
-    
-    auto exec_graph = g.finalize(q.get_context());
-    
-    auto result_before_exec2 = arr[0];
-    
-    exec_graph.exec_and_wait(q);
-    
-    auto result = arr[0];
-    
-    std::cout << "done.\n";
-
-    return 0;  
-}
\ No newline at end of file
+  });
+
+  auto result_before_exec1 = arr[0];
+
+  auto exec_graph = g.finalize(q.get_context());
+
+  auto result_before_exec2 = arr[0];
+
+  exec_graph.exec_and_wait(q);
+
+  auto result = arr[0];
+
+  sycl::free(arr, q);
+
+  std::cout << "done.\n";
+
+  return 0;
+}