From d59bfddca8c9863daa29f8259b79e1e9ce4f2f69 Mon Sep 17 00:00:00 2001
From: Ruslan Arutyunyan <ruslan.arutyunyan@intel.com>
Date: Fri, 19 Jun 2020 15:51:27 +0300
Subject: [PATCH 1/2] [SYCL] Implement braced-init-list or a number as range
 for queue::parallel_for

Modification:
    Provide three separate queue::parallel_for overloads (taking range<1>,
range<2> and range<3>) so that the range argument can be implicitly
constructed from a number or a braced-init-list
    Add tests for queue::parallel_for calls with generic lambdas

Signed-off-by: Ruslan Arutyunyan <ruslan.arutyunyan@intel.com>
---
 sycl/include/CL/sycl/queue.hpp                | 71 +++++++++++++---
 sycl/test/basic_tests/{ => queue}/queue.cpp   |  0
 .../queue/queue_parallel_for_generic.cpp      | 72 +++++++++++++++++
 .../queue/queue_parallel_for_interface.cpp    | 81 +++++++++++++++++++
 4 files changed, 215 insertions(+), 9 deletions(-)
 rename sycl/test/basic_tests/{ => queue}/queue.cpp (100%)
 create mode 100644 sycl/test/basic_tests/queue/queue_parallel_for_generic.cpp
 create mode 100644 sycl/test/basic_tests/queue/queue_parallel_for_interface.cpp
diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp
index 77ce1fab86474..2e65114c87b34 100644
--- a/sycl/include/CL/sycl/queue.hpp
+++ b/sycl/include/CL/sycl/queue.hpp
@@ -432,10 +432,9 @@ class __SYCL_EXPORT queue {
   /// \param NumWorkItems is a range that specifies the work space of the kernel
   /// \param KernelFunc is the Kernel functor or lambda
   /// \param CodeLoc contains the code location of user code
-  template <typename KernelName = detail::auto_name, typename KernelType,
-            int Dims>
+  template <typename KernelName = detail::auto_name, typename KernelType>
   event parallel_for(
-      range<Dims> NumWorkItems, KernelType KernelFunc
+      range<1> NumWorkItems, KernelType KernelFunc
 #ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
       ,
       const detail::code_location &CodeLoc = detail::code_location::current()
@@ -444,12 +443,47 @@ class __SYCL_EXPORT queue {
 #ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
     const detail::code_location &CodeLoc = {};
 #endif
-    return submit(
-        [&](handler &CGH) {
-          CGH.template parallel_for<KernelName, KernelType>(NumWorkItems,
-                                                            KernelFunc);
-        },
-        CodeLoc);
+    return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc);
+  }
+
+  /// parallel_for version with a kernel represented as a lambda + range that
+  /// specifies global size only.
+  ///
+  /// \param NumWorkItems is a range that specifies the work space of the kernel
+  /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
+  template <typename KernelName = detail::auto_name, typename KernelType>
+  event parallel_for(
+      range<2> NumWorkItems, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc);
+  }
+
+  /// parallel_for version with a kernel represented as a lambda + range that
+  /// specifies global size only.
+  ///
+  /// \param NumWorkItems is a range that specifies the work space of the kernel
+  /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
+  template <typename KernelName = detail::auto_name, typename KernelType>
+  event parallel_for(
+      range<3> NumWorkItems, KernelType KernelFunc
+#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
+      ,
+      const detail::code_location &CodeLoc = detail::code_location::current()
+#endif
+  ) {
+#ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
+    const detail::code_location &CodeLoc = {};
+#endif
+    return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range that
@@ -716,6 +750,25 @@ class __SYCL_EXPORT queue {
   /// A template-free version of submit.
   event submit_impl(function_class<void(handler &)> CGH, queue secondQueue,
                     const detail::code_location &CodeLoc);
+
+  /// parallel_for_impl with a kernel represented as a lambda + range that
+  /// specifies global size only.
+  ///
+  /// \param NumWorkItems is a range that specifies the work space of the kernel
+  /// \param KernelFunc is the Kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims>
+  event parallel_for_impl(
+      range<Dims> NumWorkItems, KernelType KernelFunc,
+      const detail::code_location &CodeLoc = detail::code_location::current()) {
+    return submit(
+        [&](handler &CGH) {
+          CGH.template parallel_for<KernelName, KernelType>(NumWorkItems,
+                                                            KernelFunc);
+        },
+        CodeLoc);
+  }
 };
 
 } // namespace sycl
diff --git a/sycl/test/basic_tests/queue.cpp b/sycl/test/basic_tests/queue/queue.cpp
similarity index 100%
rename from sycl/test/basic_tests/queue.cpp
rename to sycl/test/basic_tests/queue/queue.cpp
diff --git a/sycl/test/basic_tests/queue/queue_parallel_for_generic.cpp b/sycl/test/basic_tests/queue/queue_parallel_for_generic.cpp
new file mode 100644
index 0000000000000..5ce67bd01ac4a
--- /dev/null
+++ b/sycl/test/basic_tests/queue/queue_parallel_for_generic.cpp
@@ -0,0 +1,72 @@
+// UNSUPPORTED: cuda
+// CUDA does not support unnamed lambdas.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-unnamed-lambda %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+//==- queue_parallel_for_generic.cpp - SYCL queue parallel_for generic lambda -=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <type_traits>
+
+int main() {
+  sycl::queue q{};
+  auto dev = q.get_device();
+  auto ctx = q.get_context();
+  constexpr int N = 8;
+
+  if (dev.get_info<sycl::info::device::usm_shared_allocations>()) {
+    auto A = static_cast<int *>(sycl::malloc_shared(N * sizeof(int), dev, ctx));
+
+    for (int i = 0; i < N; i++) {
+      A[i] = 1;
+    }
+
+    q.parallel_for(N, [=](auto i) {
+      static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                    "lambda arg type is unexpected");
+      A[i]++;
+    });
+
+    q.parallel_for<class Foo>({N}, [=](auto i) {
+      static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                    "lambda arg type is unexpected");
+      A[i]++;
+    });
+
+    sycl::id<1> offset(0);
+    q.parallel_for<class Baz>(sycl::range<1>{N}, offset, [=](auto i) {
+      static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                    "lambda arg type is unexpected");
+      A[i]++;
+    });
+
+    sycl::nd_range<1> NDR(sycl::range<1>{N}, sycl::range<1>{2});
+    q.parallel_for<class NDFoo>(NDR, [=](auto nd_i) {
+      static_assert(std::is_same<decltype(nd_i), sycl::nd_item<1>>::value,
+                    "lambda arg type is unexpected");
+      auto i = nd_i.get_global_id(0);
+      A[i]++;
+    });
+
+    q.wait();
+
+    for (int i = 0; i < N; i++) {
+      if (A[i] != 5)
+        return 1;
+    }
+    sycl::free(A, ctx);
+  }
+
+  return 0;
+}
diff --git a/sycl/test/basic_tests/queue/queue_parallel_for_interface.cpp b/sycl/test/basic_tests/queue/queue_parallel_for_interface.cpp
new file mode 100644
index 0000000000000..3bc2b64306534
--- /dev/null
+++ b/sycl/test/basic_tests/queue/queue_parallel_for_interface.cpp
@@ -0,0 +1,81 @@
+// UNSUPPORTED: cuda
+// CUDA does not support unnamed lambdas.
+//
+// RUN: %clangxx -fsycl -fsyntax-only -fsycl-unnamed-lambda %s -o %t.out
+
+//==- queue_parallel_for_generic.cpp - SYCL queue parallel_for interface test -=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------------===//
+
+#include <CL/sycl.hpp>
+#include <iostream>
+#include <type_traits>
+
+template <std::size_t... Is>
+void test_range_impl(sycl::queue q, std::index_sequence<Is...>,
+                     sycl::range<sizeof...(Is)> *) {
+  constexpr auto dims = sizeof...(Is);
+
+  q.parallel_for(sycl::range<dims>{Is...}, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<dims>>::value,
+                  "lambda arg type is unexpected");
+  });
+}
+
+template <std::size_t... Is>
+void test_range_impl(sycl::queue q, std::index_sequence<Is...>,
+                     sycl::nd_range<sizeof...(Is)> *) {
+  constexpr auto dims = sizeof...(Is);
+
+  sycl::nd_range<dims> ndr{sycl::range<dims>{Is...}, sycl::range<dims>{Is...}};
+  q.parallel_for(ndr, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::nd_item<dims>>::value,
+                  "lambda arg type is unexpected");
+  });
+}
+
+template <template <int> class Range, std::size_t Dims>
+void test_range(sycl::queue q) {
+  test_range_impl(q, std::make_index_sequence<Dims>{},
+                  static_cast<Range<Dims> *>(nullptr));
+}
+
+void test_number_braced_init_list(sycl::queue q) {
+  constexpr auto n = 1;
+  q.parallel_for(n, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                  "lambda arg type is unexpected");
+  });
+
+  q.parallel_for({n}, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                  "lambda arg type is unexpected");
+  });
+
+  q.parallel_for({n, n}, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<2>>::value,
+                  "lambda arg type is unexpected");
+  });
+
+  q.parallel_for({n, n, n}, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<3>>::value,
+                  "lambda arg type is unexpected");
+  });
+}
+
+int main() {
+  sycl::queue q{};
+
+  test_number_braced_init_list(q);
+
+  test_range<sycl::range, 1>(q);
+  test_range<sycl::range, 2>(q);
+  test_range<sycl::range, 3>(q);
+  test_range<sycl::nd_range, 1>(q);
+  test_range<sycl::nd_range, 2>(q);
+  test_range<sycl::nd_range, 3>(q);
+}

From 70e0b2bd194049ce678b0e9616cb209c18331f2a Mon Sep 17 00:00:00 2001
From: Ruslan Arutyunyan <ruslan.arutyunyan@intel.com>
Date: Mon, 22 Jun 2020 15:47:20 +0300
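Subject: [PATCH 2/2] Address review comments

Forward KernelName to parallel_for_impl from the range<1>, range<2> and
range<3> overloads of queue::parallel_for.
Give the kernels in the new tests explicit names and drop
-fsycl-unnamed-lambda from their RUN lines.
Mark queue_parallel_for_generic.cpp as XFAIL on CUDA and turn the USM
capability check into an early return.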

Signed-off-by: Ruslan Arutyunyan <ruslan.arutyunyan@intel.com>
---
 sycl/include/CL/sycl/queue.hpp                |  6 +-
 .../queue/queue_parallel_for_generic.cpp      | 80 ++++++++++---------
 .../queue/queue_parallel_for_interface.cpp    | 41 +++++-----
 3 files changed, 63 insertions(+), 64 deletions(-)

diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp
index 2e65114c87b34..20ff09eee99c7 100644
--- a/sycl/include/CL/sycl/queue.hpp
+++ b/sycl/include/CL/sycl/queue.hpp
@@ -443,7 +443,7 @@ class __SYCL_EXPORT queue {
 #ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
     const detail::code_location &CodeLoc = {};
 #endif
-    return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc);
+    return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range that
@@ -463,7 +463,7 @@ class __SYCL_EXPORT queue {
 #ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
     const detail::code_location &CodeLoc = {};
 #endif
-    return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc);
+    return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range that
@@ -483,7 +483,7 @@ class __SYCL_EXPORT queue {
 #ifdef DISABLE_SYCL_INSTRUMENTATION_METADATA
     const detail::code_location &CodeLoc = {};
 #endif
-    return parallel_for_impl(NumWorkItems, KernelFunc, CodeLoc);
+    return parallel_for_impl<KernelName>(NumWorkItems, KernelFunc, CodeLoc);
   }
 
   /// parallel_for version with a kernel represented as a lambda + range that
diff --git a/sycl/test/basic_tests/queue/queue_parallel_for_generic.cpp b/sycl/test/basic_tests/queue/queue_parallel_for_generic.cpp
index 5ce67bd01ac4a..7af89c5357c3a 100644
--- a/sycl/test/basic_tests/queue/queue_parallel_for_generic.cpp
+++ b/sycl/test/basic_tests/queue/queue_parallel_for_generic.cpp
@@ -1,7 +1,9 @@
-// UNSUPPORTED: cuda
-// CUDA does not support unnamed lambdas.
+// XFAIL: cuda
+// piextUSM*Alloc functions for CUDA are not behaving as described in
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/USM.adoc
+// https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/USM/cl_intel_unified_shared_memory.asciidoc
 //
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -fsycl-unnamed-lambda %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
@@ -25,48 +27,48 @@ int main() {
   auto ctx = q.get_context();
   constexpr int N = 8;
 
-  if (dev.get_info<sycl::info::device::usm_shared_allocations>()) {
-    auto A = static_cast<int *>(sycl::malloc_shared(N * sizeof(int), dev, ctx));
+  if (!dev.get_info<sycl::info::device::usm_shared_allocations>()) {
+    return 0;
+  }
+
+  auto A = static_cast<int *>(sycl::malloc_shared(N * sizeof(int), dev, ctx));
 
-    for (int i = 0; i < N; i++) {
-      A[i] = 1;
-    }
+  for (int i = 0; i < N; i++) {
+    A[i] = 1;
+  }
 
-    q.parallel_for(N, [=](auto i) {
-      static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
-                    "lambda arg type is unexpected");
-      A[i]++;
-    });
+  q.parallel_for<class Bar>(N, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                  "lambda arg type is unexpected");
+    A[i]++;
+  });
 
-    q.parallel_for<class Foo>({N}, [=](auto i) {
-      static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
-                    "lambda arg type is unexpected");
-      A[i]++;
-    });
+  q.parallel_for<class Foo>({N}, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                  "lambda arg type is unexpected");
+    A[i]++;
+  });
 
-    sycl::id<1> offset(0);
-    q.parallel_for<class Baz>(sycl::range<1>{N}, offset, [=](auto i) {
-      static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
-                    "lambda arg type is unexpected");
-      A[i]++;
-    });
+  sycl::id<1> offset(0);
+  q.parallel_for<class Baz>(sycl::range<1>{N}, offset, [=](auto i) {
+    static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
+                  "lambda arg type is unexpected");
+    A[i]++;
+  });
 
-    sycl::nd_range<1> NDR(sycl::range<1>{N}, sycl::range<1>{2});
-    q.parallel_for<class NDFoo>(NDR, [=](auto nd_i) {
-      static_assert(std::is_same<decltype(nd_i), sycl::nd_item<1>>::value,
-                    "lambda arg type is unexpected");
-      auto i = nd_i.get_global_id(0);
-      A[i]++;
-    });
+  sycl::nd_range<1> NDR(sycl::range<1>{N}, sycl::range<1>{2});
+  q.parallel_for<class NDFoo>(NDR, [=](auto nd_i) {
+    static_assert(std::is_same<decltype(nd_i), sycl::nd_item<1>>::value,
+                  "lambda arg type is unexpected");
+    auto i = nd_i.get_global_id(0);
+    A[i]++;
+  });
 
-    q.wait();
+  q.wait();
 
-    for (int i = 0; i < N; i++) {
-      if (A[i] != 5)
-        return 1;
-    }
-    sycl::free(A, ctx);
+  for (int i = 0; i < N; i++) {
+    if (A[i] != 5)
+      return 1;
   }
-
-  return 0;
+  sycl::free(A, ctx);
 }
diff --git a/sycl/test/basic_tests/queue/queue_parallel_for_interface.cpp b/sycl/test/basic_tests/queue/queue_parallel_for_interface.cpp
index 3bc2b64306534..38554d8d5f5f1 100644
--- a/sycl/test/basic_tests/queue/queue_parallel_for_interface.cpp
+++ b/sycl/test/basic_tests/queue/queue_parallel_for_interface.cpp
@@ -1,7 +1,4 @@
-// UNSUPPORTED: cuda
-// CUDA does not support unnamed lambdas.
-//
-// RUN: %clangxx -fsycl -fsyntax-only -fsycl-unnamed-lambda %s -o %t.out
+// RUN: %clangxx -fsycl -fsyntax-only %s -o %t.out
 
 //==- queue_parallel_for_generic.cpp - SYCL queue parallel_for interface test -=//
 //
@@ -15,53 +12,53 @@
 #include <iostream>
 #include <type_traits>
 
-template <std::size_t... Is>
+template <typename KernelName, std::size_t... Is>
 void test_range_impl(sycl::queue q, std::index_sequence<Is...>,
                      sycl::range<sizeof...(Is)> *) {
   constexpr auto dims = sizeof...(Is);
 
-  q.parallel_for(sycl::range<dims>{Is...}, [=](auto i) {
+  q.parallel_for<KernelName>(sycl::range<dims>{Is...}, [=](auto i) {
     static_assert(std::is_same<decltype(i), sycl::item<dims>>::value,
                   "lambda arg type is unexpected");
   });
 }
 
-template <std::size_t... Is>
+template <typename KernelName, std::size_t... Is>
 void test_range_impl(sycl::queue q, std::index_sequence<Is...>,
                      sycl::nd_range<sizeof...(Is)> *) {
   constexpr auto dims = sizeof...(Is);
 
   sycl::nd_range<dims> ndr{sycl::range<dims>{Is...}, sycl::range<dims>{Is...}};
-  q.parallel_for(ndr, [=](auto i) {
+  q.parallel_for<KernelName>(ndr, [=](auto i) {
     static_assert(std::is_same<decltype(i), sycl::nd_item<dims>>::value,
                   "lambda arg type is unexpected");
   });
 }
 
-template <template <int> class Range, std::size_t Dims>
+template <typename KernelName, template <int> class Range, std::size_t Dims>
 void test_range(sycl::queue q) {
-  test_range_impl(q, std::make_index_sequence<Dims>{},
-                  static_cast<Range<Dims> *>(nullptr));
+  test_range_impl<KernelName>(q, std::make_index_sequence<Dims>{},
+                              static_cast<Range<Dims> *>(nullptr));
 }
 
 void test_number_braced_init_list(sycl::queue q) {
   constexpr auto n = 1;
-  q.parallel_for(n, [=](auto i) {
+  q.parallel_for<class Number>(n, [=](auto i) {
     static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
                   "lambda arg type is unexpected");
   });
 
-  q.parallel_for({n}, [=](auto i) {
+  q.parallel_for<class BracedInitList1>({n}, [=](auto i) {
     static_assert(std::is_same<decltype(i), sycl::item<1>>::value,
                   "lambda arg type is unexpected");
   });
 
-  q.parallel_for({n, n}, [=](auto i) {
+  q.parallel_for<class BracedInitList2>({n, n}, [=](auto i) {
     static_assert(std::is_same<decltype(i), sycl::item<2>>::value,
                   "lambda arg type is unexpected");
   });
 
-  q.parallel_for({n, n, n}, [=](auto i) {
+  q.parallel_for<class BracedInitList3>({n, n, n}, [=](auto i) {
     static_assert(std::is_same<decltype(i), sycl::item<3>>::value,
                   "lambda arg type is unexpected");
   });
@@ -70,12 +67,12 @@ void test_number_braced_init_list(sycl::queue q) {
 int main() {
   sycl::queue q{};
 
-  test_number_braced_init_list(q);
+  test_range<class test_range1, sycl::range, 1>(q);
+  test_range<class test_range2, sycl::range, 2>(q);
+  test_range<class test_range3, sycl::range, 3>(q);
+  test_range<class test_nd_range1, sycl::nd_range, 1>(q);
+  test_range<class test_nd_range2, sycl::nd_range, 2>(q);
+  test_range<class test_nd_range3, sycl::nd_range, 3>(q);
 
-  test_range<sycl::range, 1>(q);
-  test_range<sycl::range, 2>(q);
-  test_range<sycl::range, 3>(q);
-  test_range<sycl::nd_range, 1>(q);
-  test_range<sycl::nd_range, 2>(q);
-  test_range<sycl::nd_range, 3>(q);
+  test_number_braced_init_list(q);
 }