Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Require less ancient CUDA architecture and OpenMP implementation #1578

Merged
merged 19 commits into from
Jan 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,14 @@ if(CELERITAS_USE_MPI)
endif()

if(CELERITAS_USE_OpenMP)
find_package(OpenMP REQUIRED)
if(NOT OpenMP_FOUND)
find_package(OpenMP REQUIRED)
endif()
if(OpenMP_VERSION AND (OpenMP_VERSION VERSION_LESS "3.0"))
message(WARNING
"OpenMP version ${OpenMP_VERSION} may fail to compile"
)
endif()
endif()

if(CELERITAS_USE_Perfetto)
Expand Down
6 changes: 4 additions & 2 deletions app/celer-sim/celer-sim.cc
Original file line number Diff line number Diff line change
Expand Up @@ -129,17 +129,19 @@ void run(std::istream* is, std::shared_ptr<OutputRegistry> output)
CELER_LOG(status) << "Transporting " << run_stream.num_events()
<< " on " << num_streams << " threads";
MultiExceptionHandler capture_exception;
size_type const num_events = run_stream.num_events();
#if CELERITAS_OPENMP == CELERITAS_OPENMP_EVENT
# pragma omp parallel for
#endif
for (size_type event = 0; event < run_stream.num_events(); ++event)
for (size_type event = 0; event < num_events; ++event)
{
activate_device_local();

// Run a single event on a single thread
TransporterResult event_result;
CELER_TRY_HANDLE(event_result = run_stream(
StreamId(get_openmp_thread()), EventId(event)),
id_cast<StreamId>(get_openmp_thread()),
id_cast<EventId>(event)),
capture_exception);
if (run_input->transporter_result)
{
Expand Down
6 changes: 3 additions & 3 deletions cmake/CeleritasOptionUtils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -281,9 +281,9 @@ endfunction()
#-----------------------------------------------------------------------------#

function(celeritas_error_incompatible_option msg var new_value)
message(SEND_ERROR "Invalid setting ${var}=${${var}}: ${msg}
Possible fix: cmake -D${var}=${new_value} ${CMAKE_BINARY_DIR}"
)
message(SEND_ERROR "Invalid setting ${var}=${${var}}: ${msg}")
message(WARNING "Setting ${var}=${new_value} for next build")
set(${var} "${new_value}" CACHE STRING "Set automatically: ${msg}" FORCE)
endfunction()

#-----------------------------------------------------------------------------#
Expand Down
3 changes: 1 addition & 2 deletions scripts/cmake-presets/ci-windows-github.json
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,7 @@
"inherits": ["base"],
"displayName": "fast build with testing and JSON",
"cacheVariables": {
"CELERITAS_BUILD_TESTS": {"type": "BOOL", "value": "ON"},
"CELERITAS_USE_JSON": {"type": "BOOL", "value": "ON"}
sethrj marked this conversation as resolved.
Show resolved Hide resolved
"CELERITAS_BUILD_TESTS": {"type": "BOOL", "value": "ON"}
}
},
{
Expand Down
94 changes: 25 additions & 69 deletions src/corecel/math/Atomics.hh
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,16 @@
// Copyright Celeritas contributors: see top-level COPYRIGHT file for details
// SPDX-License-Identifier: (Apache-2.0 OR MIT)
//---------------------------------------------------------------------------//
//! \file corecel/math/Atomics.hh
//! \brief Atomics for use in kernel code (CUDA/HIP/OpenMP).
//---------------------------------------------------------------------------//
/*!
* \file corecel/math/Atomics.hh
* \brief Atomics for use in kernel code (CUDA/HIP/OpenMP).
*
* \note On CPU, these functions assume the atomic add is being done in
* with \em track-level parallelism rather than \em event-level because these
* utilities are meant for "kernel" code. Multiple independent events
* must \em not use these functions to simultaneously modify shared data.
*
* ---------------------------------------------------------------------------*/
#pragma once

#include "corecel/Assert.hh"
Expand All @@ -13,18 +20,23 @@

#include "Algorithms.hh"

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
# error "Celeritas requires CUDA arch 6.0 (P100) or greater"
#endif

#if defined(_OPENMP) && CELERITAS_OPENMP == CELERITAS_OPENMP_TRACK
//! Capture the subsequent expression as an OpenMP atomic
# define CELER_CAPTURE_IF_OPENMP_TRACK _Pragma("omp atomic capture")
#else
//! Simply scope the next block
# define CELER_CAPTURE_IF_OPENMP_TRACK
#endif

namespace celeritas
{
//---------------------------------------------------------------------------//
/*!
* Add to a value, returning the original value.
*
* Note that on CPU, this assumes the atomic add is being done in with \em
* track-level parallelism rather than \em event-level because these utilities
* are meant for "kernel" code.
*
* \warning Multiple events must not use this function to simultaneously modify
* shared data.
*/
template<class T>
CELER_FORCEINLINE_FUNCTION T atomic_add(T* address, T value)
Expand All @@ -34,9 +46,7 @@ CELER_FORCEINLINE_FUNCTION T atomic_add(T* address, T value)
#else
CELER_EXPECT(address);
T initial;
# if defined(_OPENMP) && CELERITAS_OPENMP == CELERITAS_OPENMP_TRACK
# pragma omp atomic capture
# endif
CELER_CAPTURE_IF_OPENMP_TRACK
{
initial = *address;
*address += value;
Expand All @@ -45,33 +55,6 @@ CELER_FORCEINLINE_FUNCTION T atomic_add(T* address, T value)
#endif
}

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600)
//---------------------------------------------------------------------------//
/*!
* Atomic addition specialization for double-precision on older platforms.
*
* From CUDA C Programming guide v10.1 p127
*/
inline __device__ double atomic_add(double* address, double val)
{
CELER_EXPECT(address);
ull_int* address_as_ull = reinterpret_cast<ull_int*>(address);
ull_int old = *address_as_ull;
ull_int assumed;
do
{
assumed = old;
old = atomicCAS(
address_as_ull,
assumed,
__double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since
// NaN != NaN)
} while (assumed != old);
return __longlong_as_double(old);
}
#endif

//---------------------------------------------------------------------------//
/*!
* Set the value to the minimum of the actual and given, returning old.
Expand All @@ -84,9 +67,7 @@ CELER_FORCEINLINE_FUNCTION T atomic_min(T* address, T value)
#else
CELER_EXPECT(address);
T initial;
# if defined(_OPENMP) && CELERITAS_OPENMP == CELERITAS_OPENMP_TRACK
# pragma omp atomic capture
# endif
CELER_CAPTURE_IF_OPENMP_TRACK
{
initial = *address;
*address = celeritas::min(initial, value);
Expand All @@ -107,9 +88,7 @@ CELER_FORCEINLINE_FUNCTION T atomic_max(T* address, T value)
#else
CELER_EXPECT(address);
T initial;
# if defined(_OPENMP) && CELERITAS_OPENMP == CELERITAS_OPENMP_TRACK
# pragma omp atomic capture
# endif
CELER_CAPTURE_IF_OPENMP_TRACK
{
initial = *address;
*address = celeritas::max(initial, value);
Expand All @@ -118,28 +97,5 @@ CELER_FORCEINLINE_FUNCTION T atomic_max(T* address, T value)
#endif
}

#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ <= 300)
//---------------------------------------------------------------------------//
/*!
* Software emulation of atomic max for older systems.
*
* This is a modification of the "software double-precision add" algorithm.
* TODO: combine this algorithm with the atomic_add and genericize on operation
* if we ever need to implement the atomics for other types.
*/
inline __device__ ull_int atomic_max(ull_int* address, ull_int val)
{
CELER_EXPECT(address);
ull_int old = *address;
ull_int assumed;
do
{
assumed = old;
old = atomicCAS(address, assumed, celeritas::max(val, assumed));
} while (assumed != old);
return old;
}
#endif

//---------------------------------------------------------------------------//
} // namespace celeritas
Loading