celeritas-project · sethrj · Jan 19, 2025 · Jan 14, 2025 · Jul 26, 2024 · Jul 26, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -343,7 +343,14 @@ if(CELERITAS_USE_MPI)
 endif()
 
 if(CELERITAS_USE_OpenMP)
-  find_package(OpenMP REQUIRED)
+  if(NOT OpenMP_FOUND)
+    find_package(OpenMP REQUIRED)
+  endif()
+  if(OpenMP_VERSION AND (OpenMP_VERSION VERSION_LESS "3.0"))
+    message(WARNING
+      "OpenMP version ${OpenMP_VERSION} may fail to compile"
+    )
+  endif()
 endif()
 
 if(CELERITAS_USE_Perfetto)

diff --git a/app/celer-sim/celer-sim.cc b/app/celer-sim/celer-sim.cc
@@ -129,17 +129,19 @@ void run(std::istream* is, std::shared_ptr<OutputRegistry> output)
         CELER_LOG(status) << "Transporting " << run_stream.num_events()
                           << " on " << num_streams << " threads";
         MultiExceptionHandler capture_exception;
+        size_type const num_events = run_stream.num_events();
 #if CELERITAS_OPENMP == CELERITAS_OPENMP_EVENT
 #    pragma omp parallel for
 #endif
-        for (size_type event = 0; event < run_stream.num_events(); ++event)
+        for (size_type event = 0; event < num_events; ++event)
         {
             activate_device_local();
 
             // Run a single event on a single thread
             TransporterResult event_result;
             CELER_TRY_HANDLE(event_result = run_stream(
-                                 StreamId(get_openmp_thread()), EventId(event)),
+                                 id_cast<StreamId>(get_openmp_thread()),
+                                 id_cast<EventId>(event)),
                              capture_exception);
             if (run_input->transporter_result)
             {

diff --git a/cmake/CeleritasOptionUtils.cmake b/cmake/CeleritasOptionUtils.cmake
@@ -281,9 +281,9 @@ endfunction()
 #-----------------------------------------------------------------------------#
 
 function(celeritas_error_incompatible_option  msg var new_value)
-  message(SEND_ERROR "Invalid setting ${var}=${${var}}: ${msg}
-    Possible fix: cmake -D${var}=${new_value} ${CMAKE_BINARY_DIR}"
-  )
+  message(SEND_ERROR "Invalid setting ${var}=${${var}}: ${msg}")
+  message(WARNING "Setting ${var}=${new_value} for next build")
+  set(${var} "${new_value}" CACHE STRING "Set automatically: ${msg}" FORCE)
 endfunction()
 
 #-----------------------------------------------------------------------------#

diff --git a/scripts/cmake-presets/ci-windows-github.json b/scripts/cmake-presets/ci-windows-github.json
@@ -38,8 +38,7 @@
       "inherits": ["base"],
       "displayName": "fast build with testing and JSON",
       "cacheVariables": {
-        "CELERITAS_BUILD_TESTS": {"type": "BOOL", "value": "ON"},
-        "CELERITAS_USE_JSON": {"type": "BOOL", "value": "ON"}
+        "CELERITAS_BUILD_TESTS": {"type": "BOOL", "value": "ON"}
       }
     },
     {

diff --git a/src/corecel/math/Atomics.hh b/src/corecel/math/Atomics.hh
@@ -2,9 +2,16 @@
 // Copyright Celeritas contributors: see top-level COPYRIGHT file for details
 // SPDX-License-Identifier: (Apache-2.0 OR MIT)
 //---------------------------------------------------------------------------//
-//! \file corecel/math/Atomics.hh
-//! \brief Atomics for use in kernel code (CUDA/HIP/OpenMP).
-//---------------------------------------------------------------------------//
+/*!
+ * \file corecel/math/Atomics.hh
+ * \brief Atomics for use in kernel code (CUDA/HIP/OpenMP).
+ *
+ * \note On CPU, these functions assume the atomic add is being done in
+ * with \em track-level parallelism rather than \em event-level because these
+ * utilities are meant for "kernel" code. Multiple independent events
+ * must \em not use these functions to simultaneously modify shared data.
+ *
+ * ---------------------------------------------------------------------------*/
 #pragma once
 
 #include "corecel/Assert.hh"
@@ -13,18 +20,23 @@
 
 #include "Algorithms.hh"
 
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
+#    error "Celeritas requires CUDA arch 6.0 (P100) or greater"
+#endif
+
+#if defined(_OPENMP) && CELERITAS_OPENMP == CELERITAS_OPENMP_TRACK
+//! Capture the subsequent expression as an OpenMP atomic
+#    define CELER_CAPTURE_IF_OPENMP_TRACK _Pragma("omp atomic capture")
+#else
+//! Simply scope the next block
+#    define CELER_CAPTURE_IF_OPENMP_TRACK
+#endif
+
 namespace celeritas
 {
 //---------------------------------------------------------------------------//
 /*!
  * Add to a value, returning the original value.
- *
- * Note that on CPU, this assumes the atomic add is being done in with \em
- * track-level parallelism rather than \em event-level because these utilities
- * are meant for "kernel" code.
- *
- * \warning Multiple events must not use this function to simultaneously modify
- * shared data.
  */
 template<class T>
 CELER_FORCEINLINE_FUNCTION T atomic_add(T* address, T value)
@@ -34,9 +46,7 @@ CELER_FORCEINLINE_FUNCTION T atomic_add(T* address, T value)
 #else
     CELER_EXPECT(address);
     T initial;
-#    if defined(_OPENMP) && CELERITAS_OPENMP == CELERITAS_OPENMP_TRACK
-#        pragma omp atomic capture
-#    endif
+    CELER_CAPTURE_IF_OPENMP_TRACK
     {
         initial = *address;
         *address += value;
@@ -45,33 +55,6 @@ CELER_FORCEINLINE_FUNCTION T atomic_add(T* address, T value)
 #endif
 }
 
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
-//---------------------------------------------------------------------------//
-/*!
- * Atomic addition specialization for double-precision on older platforms.
- *
- * From CUDA C Programming guide v10.1 p127
- */
-inline __device__ double atomic_add(double* address, double val)
-{
-    CELER_EXPECT(address);
-    ull_int* address_as_ull = reinterpret_cast<ull_int*>(address);
-    ull_int old = *address_as_ull;
-    ull_int assumed;
-    do
-    {
-        assumed = old;
-        old = atomicCAS(
-            address_as_ull,
-            assumed,
-            __double_as_longlong(val + __longlong_as_double(assumed)));
-        // Note: uses integer comparison to avoid hang in case of NaN (since
-        // NaN != NaN)
-    } while (assumed != old);
-    return __longlong_as_double(old);
-}
-#endif
-
 //---------------------------------------------------------------------------//
 /*!
  * Set the value to the minimum of the actual and given, returning old.
@@ -84,9 +67,7 @@ CELER_FORCEINLINE_FUNCTION T atomic_min(T* address, T value)
 #else
     CELER_EXPECT(address);
     T initial;
-#    if defined(_OPENMP) && CELERITAS_OPENMP == CELERITAS_OPENMP_TRACK
-#        pragma omp atomic capture
-#    endif
+    CELER_CAPTURE_IF_OPENMP_TRACK
     {
         initial = *address;
         *address = celeritas::min(initial, value);
@@ -107,9 +88,7 @@ CELER_FORCEINLINE_FUNCTION T atomic_max(T* address, T value)
 #else
     CELER_EXPECT(address);
     T initial;
-#    if defined(_OPENMP) && CELERITAS_OPENMP == CELERITAS_OPENMP_TRACK
-#        pragma omp atomic capture
-#    endif
+    CELER_CAPTURE_IF_OPENMP_TRACK
     {
         initial = *address;
         *address = celeritas::max(initial, value);
@@ -118,28 +97,5 @@ CELER_FORCEINLINE_FUNCTION T atomic_max(T* address, T value)
 #endif
 }
 
-#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 300)
-//---------------------------------------------------------------------------//
-/*!
- * Software emulation of atomic max for older systems.
- *
- * This is a modification of the "software double-precision add" algorithm.
- * TODO: combine this algorithm with the atomic_add and genericize on operation
- * if we ever need to implement the atomics for other types.
- */
-inline __device__ ull_int atomic_max(ull_int* address, ull_int val)
-{
-    CELER_EXPECT(address);
-    ull_int old = *address;
-    ull_int assumed;
-    do
-    {
-        assumed = old;
-        old = atomicCAS(address, assumed, celeritas::max(val, assumed));
-    } while (assumed != old);
-    return old;
-}
-#endif
-
 //---------------------------------------------------------------------------//
 }  // namespace celeritas