Fix resetting of CUDA streams when running through accel #927

Merged (10 commits, Sep 7, 2023)
58 changes: 22 additions & 36 deletions src/accel/SharedParams.cc
@@ -161,8 +161,23 @@ SharedParams::SharedParams(SetupOptions const& options)
     ScopedMem record_mem("SharedParams.construct");
     ScopedTimeLog scoped_time;
 
-    // Initialize device and other "global" data
-    SharedParams::initialize_device(options);
+    // Initialize CUDA (CUDA environment variables control the preferred
+    // device)
+    celeritas::activate_device();
+
+    if (celeritas::device() && CELERITAS_CORE_GEO == CELERITAS_CORE_GEO_VECGEOM)
+    {
+        // Heap size must be set before creating VecGeom device instance; and
+        // let's just set the stack size as well
+        if (options.cuda_stack_size > 0)
+        {
+            celeritas::set_cuda_stack_size(options.cuda_stack_size);
+        }
+        if (options.cuda_heap_size > 0)
+        {
+            celeritas::set_cuda_heap_size(options.cuda_heap_size);
+        }
+    }
 
     // Construct core data
     this->initialize_core(options);
@@ -200,11 +215,9 @@ SharedParams::SharedParams(SetupOptions const& options)
  * properties) in single-thread mode has "thread" storage in a multithreaded
  * application. It must be initialized on all threads.
  */
-void SharedParams::InitializeWorker(SetupOptions const& options)
+void SharedParams::InitializeWorker(SetupOptions const&)
 {
-    CELER_LOG_LOCAL(status) << "Initializing worker thread";
-    ScopedTimeLog scoped_time;
-    return SharedParams::initialize_device(options);
+    celeritas::activate_device_local();
 }
 
 //---------------------------------------------------------------------------//
@@ -225,37 +238,10 @@ void SharedParams::Finalize()
     CELER_LOG_LOCAL(debug) << "Resetting shared parameters";
     *this = {};
 
-    CELER_ENSURE(!*this);
-}
-
-//---------------------------------------------------------------------------//
-/*!
- * Initialize GPU device on each thread.
- *
- * This is thread safe and must be called from every worker thread.
- */
-void SharedParams::initialize_device(SetupOptions const& options)
-{
-    if (Device::num_devices() == 0)
-    {
-        // No GPU is enabled so no global initialization is needed
-        return;
-    }
-
-    // Initialize CUDA (you'll need to use CUDA environment variables to
-    // control the preferred device)
-    celeritas::activate_device(Device{0});
-
-    // Heap size must be set before creating VecGeom device instance; and
-    // let's just set the stack size as well
-    if (options.cuda_stack_size > 0)
-    {
-        celeritas::set_cuda_stack_size(options.cuda_stack_size);
-    }
-    if (options.cuda_heap_size > 0)
-    {
-        celeritas::set_cuda_heap_size(options.cuda_heap_size);
-    }
-}
+    // Reset streams before the static destructor does
+    celeritas::device().create_streams(0);
+
+    CELER_ENSURE(!*this);
+}
 
 //---------------------------------------------------------------------------//
1 change: 0 additions & 1 deletion src/accel/SharedParams.hh
@@ -108,7 +108,6 @@ class SharedParams

     //// HELPER FUNCTIONS ////
 
-    static void initialize_device(SetupOptions const& options);
     void initialize_core(SetupOptions const& options);
     void try_output() const;
 };
11 changes: 10 additions & 1 deletion src/celeritas/global/detail/ActionSequence.cc
@@ -16,11 +16,14 @@
#include "corecel/Types.hh"
#include "corecel/cont/EnumArray.hh"
#include "corecel/cont/Range.hh"
#include "corecel/sys/Device.hh"
#include "corecel/sys/ScopedProfiling.hh"
#include "corecel/sys/Stopwatch.hh"
#include "corecel/sys/Stream.hh"
#include "celeritas/global/ActionInterface.hh"

#include "../ActionRegistry.hh"
#include "../CoreState.hh"

namespace celeritas
{
@@ -90,6 +93,12 @@ void ActionSequence::begin_run(CoreParams const& params, CoreState<M>& state)
 template<MemSpace M>
 void ActionSequence::execute(CoreParams const& params, CoreState<M>& state)
 {
+    [[maybe_unused]] Stream::StreamT stream = nullptr;
+    if (M == MemSpace::device && options_.sync)
+    {
+        stream = celeritas::device().stream(state.stream_id()).get();
+    }
+
     ScopedProfiling profile_this{"step"};
     if (M == MemSpace::host || options_.sync)
     {
@@ -101,7 +110,7 @@ void ActionSequence::execute(CoreParams const& params, CoreState<M>& state)
             actions_[i]->execute(params, state);
             if (M == MemSpace::device)
             {
-                CELER_DEVICE_CALL_PREFIX(DeviceSynchronize());
+                CELER_DEVICE_CALL_PREFIX(StreamSynchronize(stream));

Contributor:
Have you looked at how this impacts performance? I recall having actually seen slightly better performance with a device sync than without when running with multiple streams, and doing a quick test just now (celer-sim with cms2018+field+msc, 8 events/threads) I see an even bigger improvement with the stream sync (like 1.2x faster). Curious if you see something similar, because it's not obvious to me why this might be.

Contributor:
Interesting: I don't see a difference (tested with 32 threads and 32/64 events); sync doesn't affect the wall time.

Contributor:
Weird... these are the timing results I'm getting with no sync:

real	3m58.476s
user	7m30.713s
sys	0m2.825s
$ jq '.result["runner"]["time"]' no-sync.out.json 
{
  "actions": {},
  "setup": 31.052430465,
  "steps": [],
  "total": 206.873054614
}

and with sync:

real	3m2.136s
user	14m19.799s
sys	0m3.653s
$ jq '.result["runner"]["time"]' sync.out.json 
{
  "actions": {},
  "setup": 30.891581001,
  "steps": [],
  "total": 150.659916742
}

and the input I'm using, with OMP_NUM_THREADS=8:

JSON input:

{
  "geometry_filename": "/home/alund/celeritas_project/regression/input/cms2018.gdml",
  "physics_filename": "/home/alund/celeritas_project/regression/input/cms2018.gdml",
  "primary_gen_options": {
    "seed": 0,
    "direction": {"distribution": "isotropic", "params": []},
    "energy": 10000,
    "num_events": 8,
    "pdg": 11,
    "position": [0, 0, 0],
    "primaries_per_event": 1300
  },
  "geant_options": {
    "eloss_fluctuation": false,
    "em_bins_per_decade": 56,
    "msc": "urban_extended",
    "physics": "em_basic"
  },
  "mag_field": [0.0, 0.0, 1.0],
  "initializer_capacity": 8388608,
  "max_events": 8,
  "num_track_slots": 131072,
  "max_steps": 1000000,
  "secondary_stack_factor": 3.0,
  "seed": 20220904,
  "merge_events": false,
  "sync": false,
  "use_device": true
}

Member Author:
@amandalund Interesting that the user time doubled even though the real time decreased. Is this on consumer hardware or HPC? It could be that the sync is causing the CPU threads to spinlock rather than context switch while waiting...
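
One way to test the spinlock hypothesis is to request blocking synchronization from the CUDA runtime and compare the user times. A minimal sketch, not part of this PR (the flag must be set on each host thread before its CUDA context is created):

#include <cstdio>
#include <cuda_runtime.h>

int main()
{
    // With cudaDeviceScheduleBlockingSync, a host thread waiting in
    // cudaStreamSynchronize/cudaDeviceSynchronize blocks on an OS
    // primitive instead of busy-waiting; if the doubled user time above
    // is spin-waiting, it should drop under this flag.
    if (cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync) != cudaSuccess)
    {
        std::fprintf(stderr, "failed to set device flags\n");
        return 1;
    }

    // ... create streams, launch kernels, and synchronize as usual ...
    cudaDeviceSynchronize();
    return 0;
}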

Contributor:
Yeah I thought that was odd... this is on HPC (Intel Xeon Gold 6152 CPU 22c 2.10GHz + NVIDIA Tesla V100 SXM2 w/32GB HBM2).

Member Author:
It could also be some weird interaction between OpenMP and CUDA? Even though CUDA nominally supports OpenMP, the latter prohibits any kind of thread interaction outside of OpenMP, and CUDA is definitely using pthreads under the hood. Maybe that's why @esseivaju didn't see any difference: he's not using OpenMP? Still, I would expect the net effect to be a slowdown rather than a speedup.

Contributor:
Interesting, I also see it with celer-g4.
no sync:

real	2m7.980s
user	3m43.651s
sys	0m1.806s

sync:

real	1m46.656s
user	8m40.084s
sys	0m2.068s

with G4FORCENUMBEROFTHREADS=8 and input:

JSON input:

{
  "geometry_file": "/home/alund/celeritas_project/regression/input/cms2018.gdml",
  "output_file": "cms2018-celer-g4.out.json",
  "primary_options": {
    "seed": 0,
    "direction": {"distribution": "isotropic", "params": []},
    "energy": 10000,
    "num_events": 8,
    "pdg": 11,
    "position": [0, 0, 0],
    "primaries_per_event": 1300
  },
  "physics_list": "geant_physics_list",
  "physics_options": {
    "eloss_fluctuation": false,
    "em_bins_per_decade": 56,
    "msc": "urban_extended",
    "physics": "em_basic"
  },
  "field_type": "uniform",
  "field": [0.0, 0.0, 1.0],
  "write_sd_hits": false,
  "initializer_capacity": 8388608,
  "max_events": 128,
  "num_track_slots": 131072,
  "max_steps": 1000000,
  "secondary_stack_factor": 3.0,
  "seed": 20220904,
  "sync": true
}

Contributor:
Here’s a relevant question with an answer that I think gives a nice example/explanation as to why we might see better performance with stream synchronization when using async copies with pageable memory: https://forums.developer.nvidia.com/t/performances-of-multi-thread-vs-multi-process-with-mps/64236/2
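
The short version, as a sketch (the function and buffer names here are illustrative, not Celeritas code): an "async" copy into pageable host memory must be staged by the driver and can block the calling thread, while a copy into pinned memory is queued immediately and only the explicit stream synchronization waits.

#include <cstddef>
#include <vector>
#include <cuda_runtime.h>

void copy_back(float const* d_src, std::size_t n, cudaStream_t stream)
{
    // Pageable destination: the copy is staged through a driver buffer and
    // may synchronize the calling thread, serializing multithreaded runs.
    std::vector<float> pageable(n);
    cudaMemcpyAsync(pageable.data(), d_src, n * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);

    // Pinned destination: the copy is truly asynchronous; only the stream
    // synchronization below waits for it to complete.
    float* pinned = nullptr;
    cudaHostAlloc(&pinned, n * sizeof(float), cudaHostAllocDefault);
    cudaMemcpyAsync(pinned, d_src, n * sizeof(float),
                    cudaMemcpyDeviceToHost, stream);

    cudaStreamSynchronize(stream);
    cudaFreeHost(pinned);
}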

Contributor:
@amandalund #910 is still kinda broken (it only compiles with CUDA), but I've done some profiling on it and it significantly reduces the number of async memcpys to pageable memory. Before, I had many ~25 kB transfers; now the only async memcpys to pageable memory left come from Thrust (exclusive_scan_counts, remove_if_alive in TrackInitAlgorithms, and copy_if in DetectorSteps) and are 4 B each (probably the return value).

If you can compile that branch, I'd be curious to know whether it helps reduce the timing for the non-sync version.
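
For context, a minimal sketch of the kind of pinned-memory host allocator such a branch might introduce (the class name is hypothetical; this is an assumption about the approach, not #910's actual API):

#include <cstddef>
#include <new>
#include <vector>
#include <cuda_runtime.h>

// std::allocator-compatible allocator backed by page-locked (pinned) memory,
// so host vectors used as cudaMemcpyAsync destinations avoid driver staging.
template<class T>
struct PinnedAllocator
{
    using value_type = T;

    T* allocate(std::size_t n)
    {
        void* p = nullptr;
        if (cudaHostAlloc(&p, n * sizeof(T), cudaHostAllocDefault)
            != cudaSuccess)
        {
            throw std::bad_alloc{};
        }
        return static_cast<T*>(p);
    }

    void deallocate(T* p, std::size_t) noexcept { cudaFreeHost(p); }
};

template<class T, class U>
bool operator==(PinnedAllocator<T> const&, PinnedAllocator<U> const&)
{
    return true;
}

// Usage: a host staging buffer that async copies can target without blocking
using PinnedFloatVec = std::vector<float, PinnedAllocator<float>>;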

Contributor:
@esseivaju I tried out the pinned allocator branch, but it looks like the time is still about the same as with no stream synchronization.

             }
             accum_time_[i] += get_time();
         }
15 changes: 12 additions & 3 deletions src/celeritas/track/detail/TrackSortUtils.cu
@@ -234,17 +234,26 @@ void count_tracks_per_action(
                  start + offsets.size(),
                  ThreadId{});
     CELER_DEVICE_CHECK_ERROR();
+    auto* stream = celeritas::device().stream(states.stream_id).get();
     CELER_LAUNCH_KERNEL(tracks_per_action,
                         celeritas::device().default_block_size(),
                         states.size(),
-                        celeritas::device().stream(states.stream_id).get(),
+                        stream,
                         states,
                         offsets,
                         states.size(),
                         order);
 
     Span<ThreadId> sout = out[AllItems<ThreadId, MemSpace::host>{}];
-    Copier<ThreadId, MemSpace::host> copy_to_host{sout};
-    copy_to_host(MemSpace::device, offsets);
+    CELER_DEVICE_CALL_PREFIX(
+        MemcpyAsync(sout.data(),
+                    offsets.data(),
+                    offsets.size() * sizeof(ThreadId),
+                    CELER_DEVICE_PREFIX(MemcpyDeviceToHost),
+                    stream));
+
+    // Copies must be complete before backfilling
+    CELER_DEVICE_CALL_PREFIX(StreamSynchronize(stream));
     backfill_action_count(sout, states.size());
 }
 }
21 changes: 17 additions & 4 deletions src/celeritas/user/DetectorSteps.cu
@@ -12,6 +12,8 @@
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
 
+#include "corecel/Assert.hh"
+#include "corecel/Macros.hh"
 #include "corecel/data/Collection.hh"
 #include "corecel/data/Copier.hh"
 #include "corecel/sys/Device.hh"
@@ -120,7 +122,10 @@ struct HasDetector

 //---------------------------------------------------------------------------//
 template<class T>
-void copy_field(std::vector<T>* dst, StateRef<T> const& src, size_type num_valid)
+void copy_field(std::vector<T>* dst,
+                StateRef<T> const& src,
+                size_type num_valid,
+                Stream::StreamT stream)
 {
     if (src.empty() || num_valid == 0)
     {
@@ -131,8 +136,12 @@ void copy_field(std::vector<T>* dst, StateRef<T> const& src, size_type num_valid
     dst->resize(num_valid);
 
     // Copy all items from valid threads
-    Copier<T, MemSpace::host> copy{{dst->data(), num_valid}};
-    copy(MemSpace::device, {src.data().get(), num_valid});
+    CELER_DEVICE_CALL_PREFIX(
+        MemcpyAsync(dst->data(),
+                    src.data().get(),
+                    num_valid * sizeof(T),
+                    CELER_DEVICE_PREFIX(MemcpyDeviceToHost),
+                    stream));
 }
 
 //---------------------------------------------------------------------------//
@@ -166,8 +175,9 @@ void copy_steps<MemSpace::device>(
     gather_step(state, num_valid);
 
     // Resize and copy if the fields are present
+    auto* stream = celeritas::device().stream(state.stream_id).get();
 #define DS_ASSIGN(FIELD) \
-    copy_field(&(output->FIELD), state.scratch.FIELD, num_valid)
+    copy_field(&(output->FIELD), state.scratch.FIELD, num_valid, stream)
 
     DS_ASSIGN(detector);
     DS_ASSIGN(track_id);
@@ -188,6 +198,9 @@ void copy_steps<MemSpace::device>(
     DS_ASSIGN(energy_deposition);
 #undef DS_ASSIGN
 
+    // Copies must be complete before returning
+    CELER_DEVICE_CALL_PREFIX(StreamSynchronize(stream));
+
     CELER_ENSURE(output->detector.size() == num_valid);
     CELER_ENSURE(output->track_id.size() == num_valid);
 }
51 changes: 23 additions & 28 deletions src/corecel/sys/Device.cc
@@ -34,13 +34,6 @@ namespace
 {
 //---------------------------------------------------------------------------//
 // HELPER FUNCTIONS
-//---------------------------------------------------------------------------//
-std::mutex& device_setter_mutex()
-{
-    static std::mutex m;
-    return m;
-}
-
 //---------------------------------------------------------------------------//
 /*!
  * Active CUDA device for Celeritas calls on the local process.
@@ -51,9 +44,9 @@
  * and
  * https://github.com/celeritas-project/celeritas/pull/149#discussion_r578000062
  *
- * We might need to add a "thread_local" annotation corresponding to a
- * multithreaded celeritas option. This class will always be thread safe to
- * read (if the instance isn't being modified by other threads).
+ * The device should be *activated* by the main thread, and \c
+ * activate_device_local should be called on other threads to set up the
+ * local CUDA context.
  */
 Device& global_device()
 {
@@ -148,6 +141,8 @@ Device::Device(int id) : id_{id}, streams_{new detail::StreamStorage{}}
 {
     CELER_EXPECT(id >= 0 && id < Device::num_devices());
 
+    CELER_LOG_LOCAL(debug) << "Constructing device ID " << id;
+
     unsigned int max_threads_per_block = 0;
 #if CELER_USE_DEVICE
 # if CELERITAS_USE_CUDA
@@ -242,8 +237,8 @@ Device::Device(int id) : id_{id}, streams_{new detail::StreamStorage{}}
  */
 StreamId::size_type Device::num_streams() const
 {
-    CELER_EXPECT(streams_);
-
+    if (!streams_)
+        return 0;
     return streams_->size();
 }

@@ -287,19 +282,23 @@ Device const& device()

 //---------------------------------------------------------------------------//
 /*!
- * Activate the given device.
+ * Activate the global celeritas device.
  *
  * The given device must be set (true result) unless no device has yet been
  * enabled -- this allows \c make_device to create "null" devices
  * when CUDA is disabled.
  *
- * \note This function is thread safe, and even though the global device is
- * shared across threads, it should be called from each thread to correctly
- * initialize CUDA.
+ * This function may be called once only, because the global device propagates
+ * into local states (e.g. where memory is allocated) all over Celeritas.
  */
 void activate_device(Device&& device)
 {
-    CELER_EXPECT(device || !global_device());
+    static std::mutex m;
+    std::lock_guard<std::mutex> scoped_lock{m};
+    Device& d = global_device();
+    CELER_VALIDATE(!d,
+                   << "celeritas::activate_device may be called only once per "
+                      "application");
 
     if (!device)
         return;
@@ -308,14 +307,8 @@ void activate_device(Device&& device)
                            << device.device_id() << " of "
                            << Device::num_devices();
     ScopedTimeLog scoped_time(&self_logger(), 1.0);
-    Device& d = global_device();
-    {
-        // Lock *after* getting the pointer to the global_device, because
-        // the global_device function (in debug mode) also uses this lock.
-        std::lock_guard<std::mutex> scoped_lock{device_setter_mutex()};
-        CELER_DEVICE_CALL_PREFIX(SetDevice(device.device_id()));
-        d = std::move(device);
-    }
+    CELER_DEVICE_CALL_PREFIX(SetDevice(device.device_id()));
+    d = std::move(device);
 
     // Call cudaFree to wake up the device, making other timers more accurate
     CELER_DEVICE_CALL_PREFIX(Free(nullptr));
@@ -353,13 +346,15 @@ void activate_device(MpiCommunicator const& comm)
  * See
  * https://developer.nvidia.com/blog/cuda-pro-tip-always-set-current-device-avoid-multithreading-bugs
  *
- * \pre activate_device was called to set \c device()
+ * \pre activate_device was called or no device is intended to be used
  */
 void activate_device_local()
 {
-    if (device())
+    Device& d = global_device();
+    if (d)
     {
-        CELER_DEVICE_CALL_PREFIX(SetDevice(device().device_id()));
+        CELER_LOG_LOCAL(debug) << "Activating device " << d.device_id();
+        CELER_DEVICE_CALL_PREFIX(SetDevice(d.device_id()));
     }
 }

9 changes: 9 additions & 0 deletions src/corecel/sys/Stream.cc
@@ -67,6 +67,13 @@ void AsyncMemoryResource<Pointer>::do_deallocate([[maybe_unused]] pointer p,
 Stream::Stream() : memory_resource_(stream_)
 {
     CELER_DEVICE_CALL_PREFIX(StreamCreate(&stream_));
+#if CUDART_VERSION >= 12000
+    unsigned long long stream_id = -1;
+    CELER_CUDA_CALL(cudaStreamGetId(stream_, &stream_id));
+    CELER_LOG_LOCAL(debug) << "Created stream ID " << stream_id;
+#else
+    CELER_LOG_LOCAL(debug) << "Created stream " << static_cast<void*>(stream_);
+#endif
 }
 
 //---------------------------------------------------------------------------//
@@ -80,6 +87,8 @@ Stream::~Stream()
     try
     {
         CELER_DEVICE_CALL_PREFIX(StreamDestroy(stream_));
+        CELER_LOG_LOCAL(debug)
+            << "Destroyed stream " << static_cast<void*>(stream_);
     }
     catch (RuntimeError const& e)
     {
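
Stepping back, the bug this PR fixes is an ordering problem: the global Device owns static-lifetime stream storage, so if its destructor runs during static teardown, the CUDA runtime may already be gone when the streams are destroyed. Finalize() therefore resets the streams explicitly, via celeritas::device().create_streams(0), while the runtime is still alive. A distilled sketch of the pattern, with hypothetical names in place of the Celeritas types:

#include <cstddef>
#include <vector>
#include <cuda_runtime.h>

// Static-lifetime stream storage, analogous to the global Device's stream
// vector: if it is first destroyed during static teardown, the CUDA runtime
// may already have shut down and cudaStreamDestroy can fail.
struct StreamRegistry
{
    std::vector<cudaStream_t> streams;

    void resize(std::size_t n)
    {
        // Destroy existing streams, then create n fresh ones
        for (cudaStream_t s : streams)
        {
            cudaStreamDestroy(s);
        }
        streams.assign(n, nullptr);
        for (cudaStream_t& s : streams)
        {
            cudaStreamCreate(&s);
        }
    }

    ~StreamRegistry() { this->resize(0); }
};

StreamRegistry& registry()
{
    static StreamRegistry r;  // destroyed during static teardown: too late
    return r;
}

void finalize()
{
    // Destroy streams while the runtime is still alive, mirroring the
    // celeritas::device().create_streams(0) call added to Finalize()
    registry().resize(0);
}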