Fix resetting of CUDA streams when running through accel #927

Merged (10 commits, Sep 7, 2023)
58 changes: 22 additions & 36 deletions src/accel/SharedParams.cc
@@ -161,8 +161,23 @@ SharedParams::SharedParams(SetupOptions const& options)
     ScopedMem record_mem("SharedParams.construct");
     ScopedTimeLog scoped_time;
 
-    // Initialize device and other "global" data
-    SharedParams::initialize_device(options);
+    // Initialize CUDA (CUDA environment variables control the preferred
+    // device)
+    celeritas::activate_device();
+
+    if (celeritas::device() && CELERITAS_CORE_GEO == CELERITAS_CORE_GEO_VECGEOM)
+    {
+        // Heap size must be set before creating VecGeom device instance; and
+        // let's just set the stack size as well
+        if (options.cuda_stack_size > 0)
+        {
+            celeritas::set_cuda_stack_size(options.cuda_stack_size);
+        }
+        if (options.cuda_heap_size > 0)
+        {
+            celeritas::set_cuda_heap_size(options.cuda_heap_size);
+        }
+    }
 
     // Construct core data
     this->initialize_core(options);
@@ -200,11 +215,9 @@ SharedParams::SharedParams(SetupOptions const& options)
  * properties) in single-thread mode has "thread" storage in a multithreaded
  * application. It must be initialized on all threads.
  */
-void SharedParams::InitializeWorker(SetupOptions const& options)
+void SharedParams::InitializeWorker(SetupOptions const&)
 {
-    CELER_LOG_LOCAL(status) << "Initializing worker thread";
-    ScopedTimeLog scoped_time;
-    return SharedParams::initialize_device(options);
+    celeritas::activate_device_local();
 }
 
 //---------------------------------------------------------------------------//
@@ -225,37 +238,10 @@ void SharedParams::Finalize()
     CELER_LOG_LOCAL(debug) << "Resetting shared parameters";
     *this = {};
 
-    CELER_ENSURE(!*this);
-}
-
-//---------------------------------------------------------------------------//
-/*!
- * Initialize GPU device on each thread.
- *
- * This is thread safe and must be called from every worker thread.
- */
-void SharedParams::initialize_device(SetupOptions const& options)
-{
-    if (Device::num_devices() == 0)
-    {
-        // No GPU is enabled so no global initialization is needed
-        return;
-    }
-
-    // Initialize CUDA (you'll need to use CUDA environment variables to
-    // control the preferred device)
-    celeritas::activate_device(Device{0});
-
-    // Heap size must be set before creating VecGeom device instance; and
-    // let's just set the stack size as well
-    if (options.cuda_stack_size > 0)
-    {
-        celeritas::set_cuda_stack_size(options.cuda_stack_size);
-    }
-    if (options.cuda_heap_size > 0)
-    {
-        celeritas::set_cuda_heap_size(options.cuda_heap_size);
-    }
-}
+    // Reset streams before the static destructor does
+    celeritas::device().create_streams(0);
+
+    CELER_ENSURE(!*this);
+}
 
 //---------------------------------------------------------------------------//
1 change: 0 additions & 1 deletion src/accel/SharedParams.hh
@@ -108,7 +108,6 @@ class SharedParams

     //// HELPER FUNCTIONS ////
 
-    static void initialize_device(SetupOptions const& options);
     void initialize_core(SetupOptions const& options);
     void try_output() const;
 };
11 changes: 10 additions & 1 deletion src/celeritas/global/detail/ActionSequence.cc
@@ -16,11 +16,14 @@
#include "corecel/Types.hh"
#include "corecel/cont/EnumArray.hh"
#include "corecel/cont/Range.hh"
#include "corecel/sys/Device.hh"
#include "corecel/sys/ScopedProfiling.hh"
#include "corecel/sys/Stopwatch.hh"
#include "corecel/sys/Stream.hh"
#include "celeritas/global/ActionInterface.hh"

#include "../ActionRegistry.hh"
#include "../CoreState.hh"

namespace celeritas
{
@@ -90,6 +93,12 @@ void ActionSequence::begin_run(CoreParams const& params, CoreState<M>& state)
 template<MemSpace M>
 void ActionSequence::execute(CoreParams const& params, CoreState<M>& state)
 {
+    [[maybe_unused]] Stream::StreamT stream = nullptr;
+    if (M == MemSpace::device && options_.sync)
+    {
+        stream = celeritas::device().stream(state.stream_id()).get();
+    }
+
     ScopedProfiling profile_this{"step"};
     if (M == MemSpace::host || options_.sync)
     {
@@ -101,7 +110,7 @@ void ActionSequence::execute(CoreParams const& params, CoreState<M>& state)
             actions_[i]->execute(params, state);
             if (M == MemSpace::device)
             {
-                CELER_DEVICE_CALL_PREFIX(DeviceSynchronize());
+                CELER_DEVICE_CALL_PREFIX(StreamSynchronize(stream));

Contributor:
Have you looked at how this impacts performance? I recall having actually seen slightly better performance with a device sync than without when running with multiple streams, and doing a quick test just now (celer-sim with cms2018+field+msc, 8 events/threads) I see an even bigger improvement with the stream sync (like 1.2x faster). Curious if you see something similar, because it's not obvious to me why this might be.

Contributor:
Interesting: I don't see a difference (tested with 32 threads and 32/64 events); sync doesn't affect the wall time.

Contributor:
Weird... these are the timing results I'm getting with no sync:

real	3m58.476s
user	7m30.713s
sys	0m2.825s
$ jq '.result["runner"]["time"]' no-sync.out.json 
{
  "actions": {},
  "setup": 31.052430465,
  "steps": [],
  "total": 206.873054614
}

and with sync:

real	3m2.136s
user	14m19.799s
sys	0m3.653s
$ jq '.result["runner"]["time"]' sync.out.json 
{
  "actions": {},
  "setup": 30.891581001,
  "steps": [],
  "total": 150.659916742
}

and the input I'm using, with OMP_NUM_THREADS=8:

JSON input:

{
  "geometry_filename": "/home/alund/celeritas_project/regression/input/cms2018.gdml",
  "physics_filename": "/home/alund/celeritas_project/regression/input/cms2018.gdml",
  "primary_gen_options": {
    "seed": 0,
    "direction": {"distribution": "isotropic", "params": []},
    "energy": 10000,
    "num_events": 8,
    "pdg": 11,
    "position": [0, 0, 0],
    "primaries_per_event": 1300
  },
  "geant_options": {
    "eloss_fluctuation": false,
    "em_bins_per_decade": 56,
    "msc": "urban_extended",
    "physics": "em_basic"
  },
  "mag_field": [0.0, 0.0, 1.0],
  "initializer_capacity": 8388608,
  "max_events": 8,
  "num_track_slots": 131072,
  "max_steps": 1000000,
  "secondary_stack_factor": 3.0,
  "seed": 20220904,
  "merge_events": false,
  "sync": false,
  "use_device": true
}

Member Author:
@amandalund Interesting that the user time doubled even though the real time decreased. Is this on consumer hardware or HPC? It could be that the sync is causing the CPU threads to spinlock rather than context switch while waiting...
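
One way to test the spinlock hypothesis is to request blocking synchronization from the CUDA runtime and compare the user times. A minimal sketch, not part of this PR (the flag must be set on each host thread before its CUDA context is created):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    // With cudaDeviceScheduleBlockingSync, a host thread waiting in
    // cudaStreamSynchronize/cudaDeviceSynchronize blocks on an OS
    // primitive instead of busy-waiting; if the doubled user time above
    // is spin-waiting, it should drop under this flag.
    if (cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) != cudaSuccess)
    {
        std::fprintf(stderr, "failed to set device flags\n");
        return 1;
    }

    // ... create streams, launch kernels, and synchronize as usual ...
    cudaDeviceSynchronize();
    return 0;
}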

Contributor:
Yeah I thought that was odd... this is on HPC (Intel Xeon Gold 6152 CPU 22c 2.10GHz + NVIDIA Tesla V100 SXM2 w/32GB HBM2).

Member Author:
It could also be some weird interaction between OpenMP and CUDA? Even though CUDA nominally supports OpenMP, the latter prohibits any kind of thread interaction outside of OpenMP, and CUDA is definitely using pthreads under the hood. Maybe that's why @esseivaju didn't see any difference: he's not using OpenMP? Still, I would expect the net effect to be a slowdown rather than a speedup.

Contributor:
Interesting, I also see it with celer-g4.
no sync:

real	2m7.980s
user	3m43.651s
sys	0m1.806s

sync:

real	1m46.656s
user	8m40.084s
sys	0m2.068s

with G4FORCENUMBEROFTHREADS=8 and input:

JSON input:

{
  "geometry_file": "/home/alund/celeritas_project/regression/input/cms2018.gdml",
  "output_file": "cms2018-celer-g4.out.json",
  "primary_options": {
    "seed": 0,
    "direction": {"distribution": "isotropic", "params": []},
    "energy": 10000,
    "num_events": 8,
    "pdg": 11,
    "position": [0, 0, 0],
    "primaries_per_event": 1300
  },
  "physics_list": "geant_physics_list",
  "physics_options": {
    "eloss_fluctuation": false,
    "em_bins_per_decade": 56,
    "msc": "urban_extended",
    "physics": "em_basic"
  },
  "field_type": "uniform",
  "field": [0.0, 0.0, 1.0],
  "write_sd_hits": false,
  "initializer_capacity": 8388608,
  "max_events": 128,
  "num_track_slots": 131072,
  "max_steps": 1000000,
  "secondary_stack_factor": 3.0,
  "seed": 20220904,
  "sync": true
}

Contributor:
Here’s a relevant question with an answer that I think gives a nice example/explanation as to why we might see better performance with stream synchronization when using async copies with pageable memory: https://forums.developer.nvidia.com/t/performances-of-multi-thread-vs-multi-process-with-mps/64236/2
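
The short version, as a sketch (the function and buffer names here are illustrative, not Celeritas code): an "async" copy into pageable host memory must be staged by the driver and can block the calling thread, while a copy into pinned memory is queued immediately and only the explicit stream synchronization waits.

#include <cstddef>
#include <vector>
#include <cuda_runtime.h>

void copy_back(float const* d_src, std::size_t n, cudaStream_t stream)
{
    // Pageable destination: the copy is staged through a driver buffer and
    // may synchronize the calling thread, serializing multithreaded runs.
    std::vector<float> pageable(n);
    cudaMemcpyAsync(pageable.data(), d_src, n * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);

    // Pinned destination: the copy is truly asynchronous; only the stream
    // synchronization below waits for it to complete.
    float* pinned = nullptr;
    cudaHostAlloc(&pinned, n * sizeof(float), cudaHostAllocDefault);
    cudaMemcpyAsync(pinned, d_src, n * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);

    cudaStreamSynchronize(stream);
    cudaFreeHost(pinned);
}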

Contributor:
@amandalund #910 is still kinda broken (it only compiles with CUDA), but I've done some profiling on it and it significantly reduces the number of async memcpys to pageable memory. Before, I had many ~25 kB transfers; now the only async memcpys to pageable memory left come from Thrust (exclusive_scan_counts, remove_if_alive in TrackInitAlgorithms, and copy_if in DetectorSteps) and are 4 B each (probably the return value).

If you can compile that branch, I'd be curious to know whether it helps reduce the timing for the non-sync version.
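
For context, a minimal sketch of the kind of pinned-memory host allocator such a branch might introduce (the class name is hypothetical; this is an assumption about the approach, not #910's actual API):

#include <cstddef>
#include <new>
#include <vector>
#include <cuda_runtime.h>

// std::allocator-compatible allocator backed by page-locked (pinned) memory,
// so host vectors used as cudaMemcpyAsync destinations avoid driver staging.
template<class T>
struct PinnedAllocator
{
    using value_type = T;

    T* allocate(std::size_t n)
    {
        void* p = nullptr;
        if (cudaHostAlloc(&p, n * sizeof(T), cudaHostAllocDefault)
            != cudaSuccess)
        {
            throw std::bad_alloc{};
        }
        return static_cast<T*>(p);
    }

    void deallocate(T* p, std::size_t) noexcept { cudaFreeHost(p); }
};

template<class T, class U>
bool operator==(PinnedAllocator<T> const&, PinnedAllocator<U> const&)
{
    return true;
}

// Usage: a host staging buffer that async copies can target without blocking
using PinnedFloatVec = std::vector<float, PinnedAllocator<float>>;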

Contributor:
@esseivaju I tried out the pinned allocator branch, but it looks like the time is still about the same as with no stream synchronization.

             }
             accum_time_[i] += get_time();
         }
15 changes: 12 additions & 3 deletions src/celeritas/track/detail/TrackSortUtils.cu
@@ -234,17 +234,26 @@ void count_tracks_per_action(
                  start + offsets.size(),
                  ThreadId{});
     CELER_DEVICE_CHECK_ERROR();
+    auto* stream = celeritas::device().stream(states.stream_id).get();
     CELER_LAUNCH_KERNEL(tracks_per_action,
                         celeritas::device().default_block_size(),
                         states.size(),
-                        celeritas::device().stream(states.stream_id).get(),
+                        stream,
                         states,
                         offsets,
                         states.size(),
                         order);
 
     Span<ThreadId> sout = out[AllItems<ThreadId, MemSpace::host>{}];
-    Copier<ThreadId, MemSpace::host> copy_to_host{sout};
-    copy_to_host(MemSpace::device, offsets);
+    CELER_DEVICE_CALL_PREFIX(
+        MemcpyAsync(sout.data(),
+                    offsets.data(),
+                    offsets.size() * sizeof(ThreadId),
+                    CELER_DEVICE_PREFIX(MemcpyDeviceToHost),
+                    stream));
+
+    // Copies must be complete before backfilling
+    CELER_DEVICE_CALL_PREFIX(StreamSynchronize(stream));
     backfill_action_count(sout, states.size());
 }
 }
21 changes: 17 additions & 4 deletions src/celeritas/user/DetectorSteps.cu
@@ -12,6 +12,8 @@
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include "corecel/Assert.hh"
+#include "corecel/Macros.hh"
 #include "corecel/data/Collection.hh"
 #include "corecel/data/Copier.hh"
 #include "corecel/sys/Device.hh"
@@ -120,7 +122,10 @@ struct HasDetector

 //---------------------------------------------------------------------------//
 template<class T>
-void copy_field(std::vector<T>* dst, StateRef<T> const& src, size_type num_valid)
+void copy_field(std::vector<T>* dst,
+                StateRef<T> const& src,
+                size_type num_valid,
+                Stream::StreamT stream)
 {
     if (src.empty() || num_valid == 0)
     {
@@ -131,8 +136,12 @@ void copy_field(std::vector<T>* dst, StateRef<T> const& src, size_type num_valid
     dst->resize(num_valid);
 
     // Copy all items from valid threads
-    Copier<T, MemSpace::host> copy{{dst->data(), num_valid}};
-    copy(MemSpace::device, {src.data().get(), num_valid});
+    CELER_DEVICE_CALL_PREFIX(
+        MemcpyAsync(dst->data(),
+                    src.data().get(),
+                    num_valid * sizeof(T),
+                    CELER_DEVICE_PREFIX(MemcpyDeviceToHost),
+                    stream));
 }
 
 //---------------------------------------------------------------------------//
@@ -166,8 +175,9 @@ void copy_steps<MemSpace::device>(
     gather_step(state, num_valid);
 
     // Resize and copy if the fields are present
+    auto* stream = celeritas::device().stream(state.stream_id).get();
 #define DS_ASSIGN(FIELD) \
-    copy_field(&(output->FIELD), state.scratch.FIELD, num_valid)
+    copy_field(&(output->FIELD), state.scratch.FIELD, num_valid, stream)
 
     DS_ASSIGN(detector);
     DS_ASSIGN(track_id);
@@ -188,6 +198,9 @@ void copy_steps<MemSpace::device>(
     DS_ASSIGN(energy_deposition);
 #undef DS_ASSIGN
 
+    // Copies must be complete before returning
+    CELER_DEVICE_CALL_PREFIX(StreamSynchronize(stream));
+
     CELER_ENSURE(output->detector.size() == num_valid);
     CELER_ENSURE(output->track_id.size() == num_valid);
 }
51 changes: 23 additions & 28 deletions src/corecel/sys/Device.cc
@@ -34,13 +34,6 @@ namespace
 {
 //---------------------------------------------------------------------------//
 // HELPER FUNCTIONS
-//---------------------------------------------------------------------------//
-std::mutex& device_setter_mutex()
-{
-    static std::mutex m;
-    return m;
-}
-
 //---------------------------------------------------------------------------//
 /*!
  * Active CUDA device for Celeritas calls on the local process.
@@ -51,9 +44,9 @@
  * and
  * https://github.com/celeritas-project/celeritas/pull/149#discussion_r578000062
  *
- * We might need to add a "thread_local" annotation corresponding to a
- * multithreaded celeritas option. This class will always be thread safe to
- * read (if the instance isn't being modified by other threads).
+ * The device should be *activated* by the main thread, and \c
+ * activate_device_local should be called on other threads to set up the
+ * local CUDA context.
  */
 Device& global_device()
 {
@@ -148,6 +141,8 @@ Device::Device(int id) : id_{id}, streams_{new detail::StreamStorage{}}
 {
     CELER_EXPECT(id >= 0 && id < Device::num_devices());
 
+    CELER_LOG_LOCAL(debug) << "Constructing device ID " << id;
+
     unsigned int max_threads_per_block = 0;
 #if CELER_USE_DEVICE
 # if CELERITAS_USE_CUDA
@@ -242,8 +237,8 @@ Device::Device(int id) : id_{id}, streams_{new detail::StreamStorage{}}
  */
 StreamId::size_type Device::num_streams() const
 {
-    CELER_EXPECT(streams_);
-
+    if (!streams_)
+        return 0;
     return streams_->size();
 }

@@ -287,19 +282,23 @@ Device const& device()

 //---------------------------------------------------------------------------//
 /*!
- * Activate the given device.
+ * Activate the global celeritas device.
  *
  * The given device must be set (true result) unless no device has yet been
  * enabled -- this allows \c make_device to create "null" devices
  * when CUDA is disabled.
  *
- * \note This function is thread safe, and even though the global device is
- * shared across threads, it should be called from each thread to correctly
- * initialize CUDA.
+ * This function may be called once only, because the global device propagates
+ * into local states (e.g. where memory is allocated) all over Celeritas.
  */
 void activate_device(Device&& device)
 {
-    CELER_EXPECT(device || !global_device());
+    static std::mutex m;
+    std::lock_guard<std::mutex> scoped_lock{m};
+    Device& d = global_device();
+    CELER_VALIDATE(!d,
+                   << "celeritas::activate_device may be called only once per "
+                      "application");
 
     if (!device)
         return;
@@ -308,14 +307,8 @@ void activate_device(Device&& device)
                            << device.device_id() << " of "
                            << Device::num_devices();
     ScopedTimeLog scoped_time(&self_logger(), 1.0);
-    Device& d = global_device();
-    {
-        // Lock *after* getting the pointer to the global_device, because
-        // the global_device function (in debug mode) also uses this lock.
-        std::lock_guard<std::mutex> scoped_lock{device_setter_mutex()};
-        CELER_DEVICE_CALL_PREFIX(SetDevice(device.device_id()));
-        d = std::move(device);
-    }
+    CELER_DEVICE_CALL_PREFIX(SetDevice(device.device_id()));
+    d = std::move(device);
 
     // Call cudaFree to wake up the device, making other timers more accurate
     CELER_DEVICE_CALL_PREFIX(Free(nullptr));
@@ -353,13 +346,15 @@ void activate_device(MpiCommunicator const& comm)
  * See
  * https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs
  *
- * \pre activate_device was called to set \c device()
+ * \pre activate_device was called or no device is intended to be used
  */
 void activate_device_local()
 {
-    if (device())
+    Device& d = global_device();
+    if (d)
     {
-        CELER_DEVICE_CALL_PREFIX(SetDevice(device().device_id()));
+        CELER_LOG_LOCAL(debug) << "Activating device " << d.device_id();
+        CELER_DEVICE_CALL_PREFIX(SetDevice(d.device_id()));
     }
 }

9 changes: 9 additions & 0 deletions src/corecel/sys/Stream.cc
@@ -67,6 +67,13 @@ void AsyncMemoryResource<Pointer>::do_deallocate([[maybe_unused]] pointer p,
 Stream::Stream() : memory_resource_(stream_)
 {
     CELER_DEVICE_CALL_PREFIX(StreamCreate(&stream_));
+#if CUDART_VERSION >= 12000
+    unsigned long long stream_id = -1;
+    CELER_CUDA_CALL(cudaStreamGetId(stream_, &stream_id));
+    CELER_LOG_LOCAL(debug) << "Created stream ID " << stream_id;
+#else
+    CELER_LOG_LOCAL(debug) << "Created stream " << static_cast<void*>(stream_);
+#endif
 }
 
 //---------------------------------------------------------------------------//
@@ -80,6 +87,8 @@ Stream::~Stream()
     try
     {
         CELER_DEVICE_CALL_PREFIX(StreamDestroy(stream_));
+        CELER_LOG_LOCAL(debug)
+            << "Destroyed stream " << static_cast<void*>(stream_);
     }
     catch (RuntimeError const& e)
     {
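
Stepping back, the bug this PR fixes is an ordering problem: the global Device owns static-lifetime stream storage, so if its destructor runs during static teardown, the CUDA runtime may already be gone when the streams are destroyed. Finalize() therefore resets the streams explicitly, via celeritas::device().create_streams(0), while the runtime is still alive. A distilled sketch of the pattern, with hypothetical names in place of the Celeritas types:

#include <cstddef>
#include <vector>
#include <cuda_runtime.h>

// Static-lifetime stream storage, analogous to the global Device's stream
// vector: if it is first destroyed during static teardown, the CUDA runtime
// may already have shut down and cudaStreamDestroy can fail.
struct StreamRegistry
{
    std::vector<cudaStream_t> streams;

    void resize(std::size_t n)
    {
        // Destroy existing streams, then create n fresh ones
        for (cudaStream_t s : streams)
        {
            cudaStreamDestroy(s);
        }
        streams.assign(n, nullptr);
        for (cudaStream_t& s : streams)
        {
            cudaStreamCreate(&s);
        }
    }

    ~StreamRegistry() { this->resize(0); }
};

StreamRegistry& registry()
{
    static StreamRegistry r;  // destroyed during static teardown: too late
    return r;
}

void finalize()
{
    // Destroy streams while the runtime is still alive, mirroring the
    // celeritas::device().create_streams(0) call added to Finalize()
    registry().resize(0);
}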