
Adding Separate OpenMP Offloading Backend to libcxx/include/__algorithm/pstl_backends #66968

Open: wants to merge 3 commits into main
Conversation

@AntonRydahl (Contributor) commented Sep 21, 2023

To address some of the good feedback from the recent request for comments, I have added a separate OpenMP offloading backend in libcxx/include/__algorithm/pstl_backends.

To begin with, I have added std::for_each, std::transform, std::transform_reduce, and std::fill.

I know there are still a lot of things missing, such as LIT tests, but I think we should try to agree on the project structure first.
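
For context, a minimal usage sketch of the kind of call this backend targets (illustrative only; the container and values are hypothetical, and std::execution::par_unseq is the policy that this patch dispatches to the GPU backend):

  #include <algorithm>
  #include <execution>
  #include <vector>

  int main() {
    std::vector<int> v(1024, 0);
    // With the OpenMP offload backend enabled, these calls may run on the device.
    std::fill(std::execution::par_unseq, v.begin(), v.end(), 42);
    std::for_each(std::execution::par_unseq, v.begin(), v.end(), [](int& x) { x += 1; });
    return 0;
  }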

@llvmbot added the libc++ (C++ Standard Library. Not GNU libstdc++. Not libc++abi.) label Sep 21, 2023
@llvmbot (Member) commented Sep 21, 2023

@llvm/pr-subscribers-github-workflow
@llvm/pr-subscribers-libcxxabi

@llvm/pr-subscribers-libcxx

Changes

To address some of the good feedback from the recent request for comments, I have added a separate OpenMP offloading backend in libcxx/include/__algorithm/pstl_backends.

To begin with, I have only added std::for_each and std::fill.

I know there are still a lot of things missing, such as LIT tests, but I think we should try to agree on the project structure first.


Full diff: https://github.com/llvm/llvm-project/pull/66968.diff

9 Files Affected:

  • (modified) libcxx/CMakeLists.txt (+14)
  • (modified) libcxx/include/CMakeLists.txt (+5)
  • (modified) libcxx/include/__algorithm/pstl_backend.h (+8)
  • (added) libcxx/include/__algorithm/pstl_backends/gpu_backend.h (+21)
  • (added) libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h (+33)
  • (added) libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h (+59)
  • (added) libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h (+59)
  • (added) libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h (+94)
  • (modified) libcxx/include/__config_site.in (+1)
diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index bb2898b799bcef9..43d2a448de79584 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -290,6 +290,8 @@ option(LIBCXX_HAS_WIN32_THREAD_API "Ignore auto-detection and force use of win32
 option(LIBCXX_HAS_EXTERNAL_THREAD_API
   "Build libc++ with an externalized threading API.
    This option may only be set to ON when LIBCXX_ENABLE_THREADS=ON." OFF)
+option(LIBCXX_ENABLE_GPU_OFFLOAD 
+  "Build libc++ with support for GPU offload" OFF)
 
 if (LIBCXX_ENABLE_THREADS)
   set(LIBCXX_PSTL_CPU_BACKEND "std_thread" CACHE STRING "Which PSTL CPU backend to use")
@@ -297,6 +299,14 @@ else()
   set(LIBCXX_PSTL_CPU_BACKEND "serial" CACHE STRING "Which PSTL CPU backend to use")
 endif()
 
+if (NOT DEFINED LIBCXX_PSTL_GPU_BACKEND)
+  if (${LIBCXX_ENABLE_GPU_OFFLOAD})
+    set(LIBCXX_PSTL_GPU_BACKEND "omp_offload" CACHE STRING "Which PSTL GPU backend to use")
+  else()
+    set(LIBCXX_PSTL_GPU_BACKEND "none" CACHE STRING "Which PSTL GPU backend to use")
+  endif()
+endif()
+
 # Misc options ----------------------------------------------------------------
 # FIXME: Turn -pedantic back ON. It is currently off because it warns
 # about #include_next which is used everywhere.
@@ -809,6 +819,10 @@ else()
                        Valid backends are: serial, std_thread and libdispatch")
 endif()
 
+if (LIBCXX_PSTL_GPU_BACKEND STREQUAL "omp_offload")
+  config_define(1 _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+endif()
+
 if (LIBCXX_ABI_DEFINES)
   set(abi_defines)
   foreach (abi_define ${LIBCXX_ABI_DEFINES})
diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt
index 2ec755236dbaee2..a3d72df61a86dde 100644
--- a/libcxx/include/CMakeLists.txt
+++ b/libcxx/include/CMakeLists.txt
@@ -85,6 +85,11 @@ set(files
   __algorithm/pstl_backends/cpu_backends/thread.h
   __algorithm/pstl_backends/cpu_backends/transform.h
   __algorithm/pstl_backends/cpu_backends/transform_reduce.h
+  __algorithm/pstl_backends/gpu_backend.h
+  __algorithm/pstl_backends/gpu_backends/backend.h
+  __algorithm/pstl_backends/gpu_backends/fill.h
+  __algorithm/pstl_backends/gpu_backends/for_each.h
+  __algorithm/pstl_backends/gpu_backends/omp_offload.h
   __algorithm/pstl_copy.h
   __algorithm/pstl_count.h
   __algorithm/pstl_fill.h
diff --git a/libcxx/include/__algorithm/pstl_backend.h b/libcxx/include/__algorithm/pstl_backend.h
index 93372f019031b63..47f5191b48517ba 100644
--- a/libcxx/include/__algorithm/pstl_backend.h
+++ b/libcxx/include/__algorithm/pstl_backend.h
@@ -10,6 +10,7 @@
 #define _LIBCPP___ALGORITHM_PSTL_BACKEND_H
 
 #include <__algorithm/pstl_backends/cpu_backend.h>
+#include <__algorithm/pstl_backends/gpu_backend.h>
 #include <__config>
 #include <execution>
 
@@ -179,10 +180,17 @@ struct __select_backend<std::execution::parallel_policy> {
   using type = __cpu_backend_tag;
 };
 
+#    if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+template <>
+struct __select_backend<std::execution::parallel_unsequenced_policy> {
+  using type = __gpu_backend_tag;
+};
+#    else
 template <>
 struct __select_backend<std::execution::parallel_unsequenced_policy> {
   using type = __cpu_backend_tag;
 };
+#    endif
 
 #  else
 
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
new file mode 100644
index 000000000000000..7237036156a1bf3
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backend.h
@@ -0,0 +1,21 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
+
+#include <__config>
+
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/fill.h>
+#  include <__algorithm/pstl_backends/gpu_backends/for_each.h>
+#endif
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
new file mode 100644
index 000000000000000..a8b400afbb94d9d
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/backend.h
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
+
+#include <__config>
+#include <cstddef>
+
+#if defined(_LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD)
+#  include <__algorithm/pstl_backends/gpu_backends/omp_offload.h>
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+struct __gpu_backend_tag {};
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKEND_BACKEND_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
new file mode 100644
index 000000000000000..32926da87e2a083
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/fill.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
+
+#include <__algorithm/fill.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_fill(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value) {
+  // It is only safe to execute fill on the GPU, if the execution policy is
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_val_1(__first, __last - __first, __value);
+  }
+  // Else if the execution policy is parallel, we execute fill on the CPU instead
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      __par_backend::__parallel_for(
+          __first, __last, [&__value](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_fill<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __value);
+          });
+    });
+    // Else we execute fill in serial
+  } else {
+    std::fill(__first, __last, __value);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FILL_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
new file mode 100644
index 000000000000000..14de2af8e4a15c6
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/for_each.h
@@ -0,0 +1,59 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
+
+#include <__algorithm/for_each.h>
+#include <__algorithm/pstl_backends/cpu_backends/backend.h>
+#include <__algorithm/pstl_backends/gpu_backends/backend.h>
+#include <__config>
+#include <__iterator/concepts.h>
+#include <__type_traits/is_execution_policy.h>
+#include <__utility/terminate_on_exception.h>
+#include <stdio.h>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Functor>
+_LIBCPP_HIDE_FROM_ABI void
+__pstl_for_each(__gpu_backend_tag, _ForwardIterator __first, _ForwardIterator __last, _Functor __func) {
+  // It is only safe to execute for_each on the GPU, if the execution policy is
+  // parallel unsequenced, as it is the only execution policy prohibiting throwing
+  // exceptions and allowing SIMD instructions
+  if constexpr (__is_unsequenced_execution_policy_v<_ExecutionPolicy> &&
+                __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__par_backend::__parallel_for_simd_1(__first, __last - __first, __func);
+  }
+  // Else if the execution policy is parallel, we execute for_each on the CPU instead
+  else if constexpr (__is_parallel_execution_policy_v<_ExecutionPolicy> &&
+                     __has_random_access_iterator_category_or_concept<_ForwardIterator>::value) {
+    std::__terminate_on_exception([&] {
+      std::__par_backend::__parallel_for(
+          __first, __last, [__func](_ForwardIterator __brick_first, _ForwardIterator __brick_last) {
+            std::__pstl_for_each<__remove_parallel_policy_t<_ExecutionPolicy>>(
+                __cpu_backend_tag{}, __brick_first, __brick_last, __func);
+          });
+    });
+    // Else we execute for_each in serial
+  } else {
+    std::for_each(__first, __last, __func);
+  }
+}
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_GPU_BACKNEDS_FOR_EACH_H
diff --git a/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
new file mode 100644
index 000000000000000..4baa4e7f65859d1
--- /dev/null
+++ b/libcxx/include/__algorithm/pstl_backends/gpu_backends/omp_offload.h
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+#define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
+
+#include <__assert>
+#include <__config>
+#include <__utility/move.h>
+#include <cstddef>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+_LIBCPP_PUSH_MACROS
+#include <__undef_macros>
+
+#if !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+namespace __par_backend {
+inline namespace __omp_gpu_backend {
+
+// In OpenMP, we need to extract the pointer for the underlying data for data
+// structures like std::vector and std::array to be able to map the data to the
+// device.
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(T p) {
+  return p;
+}
+
+template <typename T>
+_LIBCPP_HIDE_FROM_ABI inline T __omp_extract_base_ptr(std::__wrap_iter<T> w) {
+  std::pointer_traits<std::__wrap_iter<T>> PT;
+  return PT.to_address(w);
+}
+
+// Applying function or lambda in a loop
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator
+__omp_parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n])
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __f(__first[__i]);
+
+  return __first + __n;
+}
+
+// Extracting the underlying pointer
+
+template <class _Iterator, class _DifferenceType, class _Function>
+_LIBCPP_HIDE_FROM_ABI _Iterator __parallel_for_simd_1(_Iterator __first, _DifferenceType __n, _Function __f) noexcept {
+  __omp_parallel_for_simd_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __f);
+  return __first + __n;
+}
+
+// Assigning a value in a loop
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index
+__omp_parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+#  pragma omp target teams distribute parallel for simd map(tofrom : __first[0 : __n]) map(to : __value)
+  for (_DifferenceType __i = 0; __i < __n; ++__i)
+    __first[__i] = __value;
+
+  return __first + __n;
+}
+
+template <class _Index, class _DifferenceType, class _Tp>
+_LIBCPP_HIDE_FROM_ABI _Index
+__parallel_for_simd_val_1(_Index __first, _DifferenceType __n, const _Tp& __value) noexcept {
+  __omp_parallel_for_simd_val_1(__omp_gpu_backend::__omp_extract_base_ptr(__first), __n, __value);
+  return __first + __n;
+}
+
+} // namespace __omp_gpu_backend
+} // namespace __par_backend
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // !defined(_LIBCPP_HAS_NO_INCOMPLETE_PSTL) && _LIBCPP_STD_VER >= 17
+
+_LIBCPP_POP_MACROS
+
+#endif // _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_OMP_OFFLOAD_H
diff --git a/libcxx/include/__config_site.in b/libcxx/include/__config_site.in
index c85cbcd02c441b9..e0edddce3afc3ff 100644
--- a/libcxx/include/__config_site.in
+++ b/libcxx/include/__config_site.in
@@ -34,6 +34,7 @@
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_SERIAL
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_THREAD
 #cmakedefine _LIBCPP_PSTL_CPU_BACKEND_LIBDISPATCH
+#cmakedefine _LIBCPP_PSTL_GPU_BACKEND_OMP_OFFLOAD
 
 // Hardening.
 #cmakedefine01 _LIBCPP_ENABLE_HARDENED_MODE_DEFAULT

@AntonRydahl (Contributor, Author):

Do we want to enable GPU offloading when compiling libcxx, for instance, by using #cmakedefine instead of #define, or should the user be able to switch it on and off when using the library?

@AntonRydahl (Contributor, Author):

I made it depend on CMake options only.
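
For illustration, a hypothetical stand-alone configuration using the CMake options added in this patch (the generator, source and build paths are placeholders; only LIBCXX_ENABLE_GPU_OFFLOAD and LIBCXX_PSTL_GPU_BACKEND come from the diff above):

  cmake -G Ninja -S runtimes -B build \
    -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" \
    -DLIBCXX_ENABLE_GPU_OFFLOAD=ON \
    -DLIBCXX_PSTL_GPU_BACKEND=omp_offload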

@AntonRydahl marked this pull request as ready for review September 22, 2023 00:09
@AntonRydahl requested a review from a team as a code owner September 22, 2023 00:09
@github-actions (bot) commented Oct 2, 2023

✅ With the latest revision this PR passed the C/C++ code formatter.

@AntonRydahl force-pushed the libcxx_pstl_omp_offload_backend branch from a5f7752 to 96adadf on October 2, 2023 20:25
@ldionne (Member) left a comment:

Thanks a lot for the patch! I think this is really great and I'm glad we're paving the way for new backends being added in the library. Since this is the first such backend, there are some things to change, as we discussed during our live review, but I definitely see a path forward.

Note to self: did not have time to review omp_offload.h yet, will do next time.

@@ -0,0 +1,349 @@
//===----------------------------------------------------------------------===//
Member:

Not attached to this line:

We need to discuss how exceptions are handled when we offload to the GPU. @AntonRydahl mentioned that throw was compiled to __builtin_trap(). This is going to both fail our tests and make us non-conforming. Our preference would be that the OpenMP backend doesn't try to offload if there are exceptions in the code path we're considering offloading.

Needs more discussion.
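
To make the concern concrete, a small illustrative example of the kind of user code being discussed (hypothetical; per the comments above, the throw-expression in the callable would currently be lowered to a trap on the device, which is why the preference is to not offload such code paths):

  #include <algorithm>
  #include <execution>
  #include <stdexcept>
  #include <vector>

  void scale(std::vector<int>& v) {
    std::for_each(std::execution::par_unseq, v.begin(), v.end(), [](int& x) {
      if (x < 0)
        throw std::runtime_error("negative element"); // exception on the offloaded path
      x *= 2;
    });
  }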

Contributor Author:

I have written a section about this in the libc++ documentation. Do you think the description is sufficiently detailed?

// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
Member:

You can actually remove this header entirely until you have an implementation for it, since it'll be implemented using std::find_if. Note that if you run into issues while doing that, it's a pre-existing PSTL bug.

Contributor Author:

I think there is unfortunately a bug in the PSTL logic. I cannot compile libcxx if I remove the overlay header files for any_of, find_if, or stable_sort.

template <class _T1, class _T2, class _T3>
struct _LIBCPP_HIDE_FROM_ABI __is_supported_reduction : std::false_type {};

# define __PSTL_IS_SUPPORTED_REDUCTION(funname) \
Member:

There's something trying to emerge here. We already have libcxx/include/__type_traits/predicate_traits.h and libcxx/include/__type_traits/operation_traits.h. IMO we could unify those into something like:

template <>
struct __desugars_to<__equal_to, std::equal_to<>> : true_type {};

template <>
struct __desugars_to<ranges::equal_to, std::equal_to<>> : true_type {};

template <>
struct __desugars_to<ranges::plus, std::plus<>> : std::true_type {};

// etc...

Then, this becomes simply:

template <class _Func>
struct __is_supported_reduction : bool_constant<
    __desugars_to<_Func, std::minus<>>::value ||
    __desugars_to<_Func, std::multiplies<>>::value ||
    __desugars_to<_Func, std::logical_and<>>::value ||
    __desugars_to<_Func, std::logical_or<>>::value ||
    __desugars_to<_Func, std::bit_and<>>::value ||
    __desugars_to<_Func, std::bit_or<>>::value ||
    __desugars_to<_Func, std::bit_xor<>>::value
> {};

This would mandate some prior refactoring, but it would be quite nice.

Contributor Author:

To not make this PR even bigger, I have added a refactor to use __desugars_to here: #68642

I hope that is what you meant. 😄

ldionne added a commit to ldionne/llvm-project that referenced this pull request Oct 4, 2023
…TION_POINT

The _LIBCPP_PSTL_CUSTOMIZATION_POINT macro was assuming that the policy
was called _RawPolicy and referencing it by name. It happened to always
work but this was definitely accidental and an oversight in the original
implementation. This patch fixes that by passing the policy to the macro
explicitly. Noticed while reviewing llvm#66968.
ldionne added a commit that referenced this pull request Oct 4, 2023
…TION_POINT (#68238)

The _LIBCPP_PSTL_CUSTOMIZATION_POINT macro was assuming that the policy
was called _RawPolicy and referencing it by name. It happened to always
work but this was definitely accidental and an oversight in the original
implementation. This patch fixes that by passing the policy to the macro
explicitly. Noticed while reviewing #66968.
@philnik777 added the pstl (Issues related to the C++17 Parallel STL) label Oct 6, 2023
@EricWF (Member) commented Oct 9, 2023

@philnik777 Last we spoke, the design work around the customization points was still ongoing. Have we locked it in now?

@philnik777 (Contributor):

@philnik777 Last we spoke, the design work around the customization points was still ongoing. Have we locked it in now?

Kind-of, but not really. We'll probably get more insights while working on this. We're working closely with @AntonRydahl and others to get this backend into libc++.

@ldionne (Member) left a comment:

This is starting to look really good! Note to self: still need to dive into the implementation of most algorithms, but we discussed for_each and offloading conditions in some details.

ldionne pushed a commit that referenced this pull request Oct 14, 2023
This PR addresses a smaller detail discussed in the code review for
#66968. Currently, some
functions in the `libc++` PSTL CPU backend have been appended with a
digit to indicate the number of input iterator arguments. However, there
is no need to change the name for each version as overloading can be
used instead. This PR will make the naming more consistent in the
CPU and the proposed OpenMP backend.
Comment on lines 57 to 58
// parallel unsequenced, as it is the only execution policy prohibiting throwing
// exceptions and allowing SIMD instructions
Member:

This comment is not technically correct, since exceptions can be thrown from user code even when par_unseq is used.

And unseq also allows for SIMD. So I think we need to explain that we don't have sequencing between the teams (thanks @jdoerfert !).

Also, I think it might be worth promoting this comment to openmp.h since it is not specific for for_each.

blueboxd pushed a commit to blueboxd/libcxx that referenced this pull request Oct 17, 2023
This PR addresses a smaller detail discussed in the code review for
llvm/llvm-project#66968. Currently, some
functions in the `libc++` PSTL CPU backend have been appended with a
digit to indicate the number of input iterator arguments. However, there
is no need to change the name for each version as overloading can be
used instead. This PR will make the naming more consistent in the
CPU and the proposed OpenMP backend.

NOKEYCHECK=True
GitOrigin-RevId: f2b79ed9c6c858426b15a0374103ab901b5b2ef3
@AntonRydahl requested a review from a team as a code owner October 26, 2023 05:52
@AntonRydahl (Contributor, Author) commented Oct 26, 2023

When adding the logic to error out when OpenMP is not enabled, I had to add a bit of CMake logic to allow installing the library without getting that same error. On the good side, this also made it possible to enable the OpenMP toolchain when running the tests, if the OpenMP backend was selected.

I found that a few of the tests failed because I had not handled cases where the iterator's value type is not trivially copyable; for instance, some of the tests may throw exceptions in the copy constructor. The commit I pushed today makes those tests pass while still allowing my local tests to be executed on the GPU.

What do you think of this change?
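
As a rough illustration of the guard being described, here is a standalone sketch using only standard traits (it is not the actual patch; the function name is hypothetical and the pragma mirrors the one used in omp_offload.h above):

  #include <algorithm>
  #include <iterator>
  #include <type_traits>
  #include <vector>

  // Only dispatch to the device loop when the iterator is a raw pointer and the
  // value type is trivially copyable; otherwise fall back to a plain host fill.
  template <class Iterator, class Tp>
  void fill_maybe_offload(Iterator first, Iterator last, const Tp& value) {
    using value_type = typename std::iterator_traits<Iterator>::value_type;
    if constexpr (std::is_pointer_v<Iterator> && std::is_trivially_copyable_v<value_type>) {
      auto n = last - first;
  #pragma omp target teams distribute parallel for simd map(tofrom : first[0 : n]) map(to : value)
      for (decltype(n) i = 0; i < n; ++i)
        first[i] = value;
    } else {
      // Value types with user-defined (possibly throwing) copy constructors stay on the host.
      std::fill(first, last, value);
    }
  }

  int main() {
    std::vector<int> v(16);
    fill_maybe_offload(v.data(), v.data() + v.size(), 7);
  }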

@AntonRydahl (Contributor, Author):

I have also added a number of tests that verify that vectors of primitive data types are offloaded to the GPU. They use omp_is_initial_device() to test whether the loop bodies run on the host or the device.

If there are no available devices, the new tests simply don't run:

  if (omp_get_num_devices() < 1)
    return 0;

Also, I have introduced a new Lit feature so that the tests only run when the OpenMP PSTL backend is enabled:

// REQUIRES: openmp_pstl_backend
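
A condensed sketch of what such a test might look like (illustrative; the real tests in this PR may differ in structure and assertions):

  // REQUIRES: openmp_pstl_backend
  #include <algorithm>
  #include <cassert>
  #include <execution>
  #include <omp.h>
  #include <vector>

  int main() {
    // Skip when no offload device is available, as described above.
    if (omp_get_num_devices() < 1)
      return 0;

    std::vector<int> ran_on_device(1024, 0);
    std::for_each(std::execution::par_unseq, ran_on_device.begin(), ran_on_device.end(),
                  [](int& x) { x = omp_is_initial_device() ? 0 : 1; });

    // Every element should have been written from the device.
    assert(std::all_of(ran_on_device.begin(), ran_on_device.end(), [](int x) { return x == 1; }));
    return 0;
  }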

@jdoerfert (Member):

@AntonRydahl I tried to rebase this but there are conflicts that I cannot easily resolve.
Can you give it a try please?

@ldionne force-pushed the libcxx_pstl_omp_offload_backend branch from 2ecd90a to f87dfc5 on June 13, 2024 19:21
@llvmbot added the libc++abi (C++ Runtime Library. Not libc++.) and github:workflow labels Jun 13, 2024
@ldionne (Member) commented Jun 13, 2024

@jdoerfert Rebase done. There will probably be many small issues because I couldn't build this locally, but I basically resolved the rebasing issues and reformulated the backend on top of the new dispatching mechanism.

@ldionne force-pushed the libcxx_pstl_omp_offload_backend branch from f87dfc5 to 9594623 on June 13, 2024 19:35
@AntonRydahl (Contributor, Author):

Thank you so, so much for spending time fixing this, @ldionne! I will re-run the tests and see if I can replicate the errors from the build bot.

@ldionne force-pushed the libcxx_pstl_omp_offload_backend branch from 9594623 to 736262f on June 14, 2024 12:48
@jdoerfert (Member):

Thanks a lot!

Bots seem clean, right?

FWIW, I can (re-)run stuff on linux machines, at least try. We also have people, incl. a GSOC student who will pick this up now.

@AntonRydahl (Contributor, Author):

I think we need to wait and see if stage3 (generic-pstl-openmp, libcxx-runners-8-set) passes.

@ldionne (Member) commented Jun 14, 2024

@AntonRydahl I don't think we ever had a clean run of the OpenMP bot, did we? IMO the testing + CI story for the OpenMP backend is the only thing blocking this PR from being merged, everything else (like improving the backend) can and should be done in separate PRs so we can capture progress here.

@AntonRydahl force-pushed the libcxx_pstl_omp_offload_backend branch from 736262f to 10d408f on June 18, 2024 17:15
@AntonRydahl (Contributor, Author):

The tests should now be able to locate libomptarget.so. Let's see what other failures we get from the build bot.

@ldionne (Member) left a comment:

You should rebase on top of main, that will solve the issues you're seeing on the macOS builders.

__libcpp_is_contiguous_iterator<_ForwardOutIterator>::value && is_trivially_copyable_v<_ValueType>) {
return std::__rewrap_iter(
__outit,
__omp_transform(
Member:

Suggested change:
- __omp_transform(
+ __pstl::__omp_transform(

@xevor11 commented Aug 15, 2024

I just pushed header files for ranges shift_left and shift_right for review. I was wondering whether the directives and the argument types are correct, and whether there is a specific format to follow. To test the files in libcxx, would we have to run the llvm-lit executable in the openmp test directory?

@VedantParanjape (Contributor) commented Nov 14, 2024

Thanks for the patch! I want to contribute to the development of OpenMP offloading of std algorithms. Has this patch been LGTM'd, and is it safe to base my work on top of it?

@jdoerfert (Member):

You should rebase on top of main, that will solve the issues you're seeing on the macOS builders.

@xevor11 can you rebase this on top of main, as @ldionne mentioned?
Also, let's split your additions off into a new PR so we can merge this first.

@xevor11 commented Nov 14, 2024 via email

@xevor11 commented Nov 14, 2024

For all the conflicting files I just went with the changes made in upstream, since there were no modifications made on my end within these directories or files:

CONFLICT (content): Merge conflict in libcxx/docs/UserDocumentation.rst
CONFLICT (content): Merge conflict in libcxx/docs/VendorDocumentation.rst
CONFLICT (content): Merge conflict in libcxx/include/__pstl/backend.h
CONFLICT (content): Merge conflict in libcxx/include/__pstl/backend_fwd.h
CONFLICT (content): Merge conflict in libcxx/include/module.modulemap
CONFLICT (content): Merge conflict in libcxx/utils/ci/run-buildbot
CONFLICT (content): Merge conflict in libcxx/utils/libcxx/test/features.py

I have rebased on top of main and will continue the rest of the work in a different PR, as requested.

@jdoerfert (Member):

(quotes @xevor11's comment above)

Did you update the PR after the rebase, or created a new one?

@xevor11 commented Nov 14, 2024

(quotes the exchange above)

The PR was not updated; I'll create a new one.

@vidsinghal (Contributor) commented Dec 8, 2024

Hello @AntonRydahl, I am having a difficult time compiling the tests in this PR.

I built this PR's code with the following command:

cmake -G Ninja -DCMAKE_BUILD_TYPE=Release \
          -DCMAKE_C_COMPILER=gcc \
          -DCMAKE_CXX_COMPILER=g++ \
          -DCMAKE_INSTALL_PREFIX="$INSTALLDIR" \
          -DLIBCXX_ENABLE_WERROR=YES \
          -DLIBCXXABI_ENABLE_WERROR=YES \
          -DLIBUNWIND_ENABLE_WERROR=YES \
          -DLIBCXX_ENABLE_CLANG_TIDY=ON \
          -DLLVM_LIT_ARGS="-sv --xunit-xml-output test-results.xml --timeout=1500 --time-tests" \
          -DLLVM_ENABLE_PROJECTS="clang;lld;" -DCLANG_DEFAULT_LINKER="lld" -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind;offload;pstl;openmp" -DLIBCXX_PSTL_BACKEND="openmp" -DLLVM_TARGETS_TO_BUILD="X86;AMDGPU" \
          -DPSTL_PARALLEL_BACKEND="omp" -DLIBCXX_CXX_ABI=libcxxabi -DLIBCXX_ENABLE_THREADS=ON \
                $CLANG_ROOT/llvm-project/llvm

I am trying to compile one of the tests with the following command:

clang++  -Wl,-rpath,../clang/build/lib  -I ../clang/build/projects/runtimes/src/  -I ../clang/build/runtimes/runtimes-bins/openmp/runtime/src/  -fopenmp -fexperimental-library -fopenmp-targets=amdgcn-amd-amdhsa test.cpp -o test

However, it throws errors about TBB, which is interesting since I specifically compiled for the OpenMP backend.

ld.lld: error: undefined symbol: tbb::interface7::internal::isolate_within_arena(tbb::interface7::internal::delegate_base&, long)
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(__pstl::execution::v1::parallel_unsequenced_policy const& tbb::interface7::internal::isolate_impl<void, void __pstl::__tbb_backend::__parallel_for<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0)::'lambda'() const>(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>&))

ld.lld: error: undefined symbol: tbb::task_group_context::~task_group_context()
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::interface9::internal::start_for<tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>>, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)>, tbb::auto_partitioner const>::run(tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>> const&, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)> const&, tbb::auto_partitioner const&))
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::interface9::internal::start_for<tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>>, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)>, tbb::auto_partitioner const>::run(tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>> const&, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)> const&, tbb::auto_partitioner const&))

ld.lld: error: undefined symbol: tbb::task_group_context::init()
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::task_group_context::task_group_context(tbb::task_group_context::kind_type, unsigned long))

ld.lld: error: undefined symbol: tbb::internal::allocate_root_with_context_proxy::allocate(unsigned long) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(operator new(unsigned long, tbb::internal::allocate_root_with_context_proxy const&))

ld.lld: error: undefined symbol: tbb::internal::allocate_root_with_context_proxy::free(tbb::task&) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(operator delete(void*, tbb::internal::allocate_root_with_context_proxy const&))

ld.lld: error: undefined symbol: vtable for tbb::task
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::task::task())
>>> the vtable symbol may be undefined because the class is missing its key function (see https://lld.llvm.org/missingkeyfunction)

ld.lld: error: undefined symbol: tbb::internal::get_initial_auto_partitioner_divisor()
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::interface9::internal::adaptive_mode<tbb::interface9::internal::auto_partition_type>::adaptive_mode())

ld.lld: error: undefined symbol: tbb::internal::allocate_child_proxy::allocate(unsigned long) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::interface9::internal::allocate_sibling(tbb::task*, unsigned long))

ld.lld: error: undefined symbol: tbb::internal::allocate_continuation_proxy::allocate(unsigned long) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(operator new(unsigned long, tbb::internal::allocate_continuation_proxy const&))

ld.lld: error: undefined symbol: tbb::internal::allocate_continuation_proxy::free(tbb::task&) const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(operator delete(void*, tbb::internal::allocate_continuation_proxy const&))

ld.lld: error: undefined symbol: tbb::task_group_context::is_group_execution_cancelled() const
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(tbb::task::is_cancelled() const)

ld.lld: error: undefined symbol: typeinfo for tbb::task
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(typeinfo for tbb::interface9::internal::start_for<tbb::blocked_range<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>>, __pstl::__tbb_backend::__parallel_for_body<__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, void __pstl::__internal::__pattern_walk1<__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>>(__pstl::execution::v1::parallel_unsequenced_policy const&, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, main::$_0, std::integral_constant<bool, true>, std::integral_constant<bool, true>)::'lambda'()::operator()() const::'lambda'(__gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>, __gnu_cxx::__normal_iterator<int*, std::vector<int, std::allocator<int>>>)>, tbb::auto_partitioner const>)
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(typeinfo for tbb::interface9::internal::flag_task)

ld.lld: error: undefined symbol: tbb::task::note_affinity(unsigned short)
>>> referenced by test5.cpp
>>>               /var/tmp/singhal2/test5-0eaed1.o:(vtable for tbb::interface9::internal::flag_task)
/usr/bin/clang-linker-wrapper: error: 'ld.lld' failed
clang++: error: linker command failed with exit code 1 (use -v to see invocation)

I can fix this error if I add -ltbb to the end of the compile command, but then nothing is ever offloaded. Although, if I add -save-temps, I do see files compiled for the AMD GPU target.

Do you have any clue what is happening here?

@VedantParanjape (Contributor):

(quotes @vidsinghal's comment above in full)

Ping! Could someone help with this?

@jdoerfert (Member):

I get this error because the host pstl is included.
Following your steps and adding -H to the clang call shows:

  .... /opt/rh/gcc-toolset-12/root/usr/lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/pstl/parallel_backend_utils.h
  .... /opt/rh/gcc-toolset-12/root/usr/lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/pstl/parallel_backend.h
  ..... /opt/rh/gcc-toolset-12/root/usr/lib/gcc/x86_64-redhat-linux/12/../../../../include/c++/12/pstl/parallel_backend_tbb.h
  ...... /usr/include/tbb/blocked_range.h
  ....... /usr/include/tbb/tbb_stddef.h
  ........ /usr/include/tbb/tbb_config.h

Adding -stdlib=libc++ (and potentially the rpath /lib/x86_64-unknown-linux-gnu/) makes it compile fine. It crashes at runtime due to a missing symbol, but it compiles fine.

@jdoerfert (Member):

It crashes at runtime due to a missing symbol, but it compiles fine.

For now, use -fno-exceptions for the compilation, otherwise the host and device symbols are out of whack. That will be resolved in the future but is required now and needs to be documented.
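
Putting the flags from this thread together, a hypothetical compile command for the reproducer above might look like this (the rpath and offload target are placeholders that depend on the local build and hardware):

  clang++ -stdlib=libc++ -fexperimental-library -fno-exceptions \
    -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa \
    -Wl,-rpath,../clang/build/lib \
    test.cpp -o test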

@h-vetinari (Contributor):

This PR was rebased in #122180, for anyone curious.
