From e971e91cf904611f69730c1d4a4e40059f73498f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 26 May 2022 11:27:58 +0100 Subject: [PATCH 1/5] improve performance of queue.wait() --- sycl/plugins/cuda/pi_cuda.cpp | 2 +- sycl/plugins/cuda/pi_cuda.hpp | 27 ++++++++++++++++++++++++++- 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp index f9d05af6ac8e2..8db51e3a25d31 100644 --- a/sycl/plugins/cuda/pi_cuda.cpp +++ b/sycl/plugins/cuda/pi_cuda.cpp @@ -2393,7 +2393,7 @@ pi_result cuda_piQueueFinish(pi_queue command_queue) { nullptr); // need PI_ERROR_INVALID_EXTERNAL_HANDLE error code ScopedContext active(command_queue->get_context()); - command_queue->for_each_stream([&result](CUstream s) { + command_queue->sync_streams([&result](CUstream s) { result = PI_CHECK_ERROR(cuStreamSynchronize(s)); }); diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index d253794094817..3587ef229917d 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -391,6 +391,8 @@ struct _pi_queue { std::atomic_uint32_t transfer_stream_idx_; unsigned int num_compute_streams_; unsigned int num_transfer_streams_; + unsigned int last_sync_compute_streams_; + unsigned int last_sync_transfer_streams_; unsigned int flags_; std::mutex compute_stream_mutex_; std::mutex transfer_stream_mutex_; @@ -403,7 +405,7 @@ struct _pi_queue { transfer_streams_{std::move(transfer_streams)}, context_{context}, device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0}, - num_compute_streams_{0}, num_transfer_streams_{0}, flags_(flags) { + num_compute_streams_{0}, num_transfer_streams_{0}, last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, flags_(flags) { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); } @@ -439,6 +441,29 @@ struct _pi_queue { } } } + + template void sync_streams(T &&f) { + { + std::lock_guard compute_guard(compute_stream_mutex_); + unsigned int start = last_sync_compute_streams_; + unsigned int size = static_cast(compute_streams_.size()); + unsigned int end = num_compute_streams_ < size ? num_compute_streams_ : compute_stream_idx_.load() % size; + for (unsigned int i = start; i != end;(++i < size) ? i : (i=0)) { + f(compute_streams_[i]); + } + last_sync_compute_streams_ = end; + } + { + std::lock_guard transfer_guard(transfer_stream_mutex_); + unsigned int start = last_sync_transfer_streams_; + unsigned int size = static_cast(transfer_streams_.size()); + unsigned int end = num_transfer_streams_ < size ? num_transfer_streams_ : transfer_stream_idx_.load() % size; + for (unsigned int i = start; i != end; (++i < size) ? i : (i=0)) { + f(transfer_streams_[i]); + } + last_sync_transfer_streams_ = end; + } + } _pi_context *get_context() const { return context_; }; From ac209146bf25dfe94560e95b49fc02bd581199a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 26 May 2022 13:33:57 +0100 Subject: [PATCH 2/5] bugfix --- sycl/plugins/cuda/pi_cuda.hpp | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 3587ef229917d..c84dade4b2acc 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -445,23 +445,35 @@ struct _pi_queue { template void sync_streams(T &&f) { { std::lock_guard compute_guard(compute_stream_mutex_); - unsigned int start = last_sync_compute_streams_; unsigned int size = static_cast(compute_streams_.size()); + unsigned int start = last_sync_compute_streams_; unsigned int end = num_compute_streams_ < size ? num_compute_streams_ : compute_stream_idx_.load() % size; - for (unsigned int i = start; i != end;(++i < size) ? i : (i=0)) { - f(compute_streams_[i]); + if(size==1 && end==0){ + f(compute_streams_[0]); + } + else{ + for (unsigned int i = start; i != end;(++i < size) ? i : (i=0)) { + f(compute_streams_[i]); + } + last_sync_compute_streams_ = end; } - last_sync_compute_streams_ = end; } { std::lock_guard transfer_guard(transfer_stream_mutex_); - unsigned int start = last_sync_transfer_streams_; unsigned int size = static_cast(transfer_streams_.size()); - unsigned int end = num_transfer_streams_ < size ? num_transfer_streams_ : transfer_stream_idx_.load() % size; - for (unsigned int i = start; i != end; (++i < size) ? i : (i=0)) { - f(transfer_streams_[i]); + if(size>0){ + unsigned int start = last_sync_transfer_streams_; + unsigned int end = num_transfer_streams_ < size ? num_transfer_streams_ : transfer_stream_idx_.load() % size; + if(size==1 && end==0){ + f(transfer_streams_[0]); + } + else{ + for (unsigned int i = start; i != end; (++i < size) ? i : (i=0)) { + f(transfer_streams_[i]); + } + last_sync_transfer_streams_ = end; + } } - last_sync_transfer_streams_ = end; } } From 08d2af15b38918133a2cad2b65d7638ac2eb9905 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Thu, 26 May 2022 13:34:16 +0100 Subject: [PATCH 3/5] format --- sycl/plugins/cuda/pi_cuda.hpp | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index c84dade4b2acc..3cb1c64b615e4 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -405,7 +405,9 @@ struct _pi_queue { transfer_streams_{std::move(transfer_streams)}, context_{context}, device_{device}, properties_{properties}, refCount_{1}, eventCount_{0}, compute_stream_idx_{0}, transfer_stream_idx_{0}, - num_compute_streams_{0}, num_transfer_streams_{0}, last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, flags_(flags) { + num_compute_streams_{0}, num_transfer_streams_{0}, + last_sync_compute_streams_{0}, last_sync_transfer_streams_{0}, + flags_(flags) { cuda_piContextRetain(context_); cuda_piDeviceRetain(device_); } @@ -441,18 +443,19 @@ struct _pi_queue { } } } - + template void sync_streams(T &&f) { { std::lock_guard compute_guard(compute_stream_mutex_); unsigned int size = static_cast(compute_streams_.size()); unsigned int start = last_sync_compute_streams_; - unsigned int end = num_compute_streams_ < size ? num_compute_streams_ : compute_stream_idx_.load() % size; - if(size==1 && end==0){ + unsigned int end = num_compute_streams_ < size + ? num_compute_streams_ + : compute_stream_idx_.load() % size; + if (size == 1 && end == 0) { f(compute_streams_[0]); - } - else{ - for (unsigned int i = start; i != end;(++i < size) ? i : (i=0)) { + } else { + for (unsigned int i = start; i != end; (++i < size) ? i : (i = 0)) { f(compute_streams_[i]); } last_sync_compute_streams_ = end; @@ -461,14 +464,15 @@ struct _pi_queue { { std::lock_guard transfer_guard(transfer_stream_mutex_); unsigned int size = static_cast(transfer_streams_.size()); - if(size>0){ + if (size > 0) { unsigned int start = last_sync_transfer_streams_; - unsigned int end = num_transfer_streams_ < size ? num_transfer_streams_ : transfer_stream_idx_.load() % size; - if(size==1 && end==0){ + unsigned int end = num_transfer_streams_ < size + ? num_transfer_streams_ + : transfer_stream_idx_.load() % size; + if (size == 1 && end == 0) { f(transfer_streams_[0]); - } - else{ - for (unsigned int i = start; i != end; (++i < size) ? i : (i=0)) { + } else { + for (unsigned int i = start; i != end; (++i < size) ? i : (i = 0)) { f(transfer_streams_[i]); } last_sync_transfer_streams_ = end; From 732fedc4dee10fd55a250c2dfb5ef141391645f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 27 May 2022 07:31:07 +0100 Subject: [PATCH 4/5] addressed review comment --- sycl/plugins/cuda/pi_cuda.hpp | 47 +++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 3cb1c64b615e4..4d91335f3af54 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -445,37 +445,52 @@ struct _pi_queue { } template void sync_streams(T &&f) { + auto sync = [&f](const std::vector& streams, unsigned int start, unsigned int stop){ + for (unsigned int i = start; i < stop; i++) { + f(streams[i]); + } + }; { - std::lock_guard compute_guard(compute_stream_mutex_); unsigned int size = static_cast(compute_streams_.size()); + std::lock_guard compute_guard(compute_stream_mutex_); unsigned int start = last_sync_compute_streams_; unsigned int end = num_compute_streams_ < size ? num_compute_streams_ - : compute_stream_idx_.load() % size; - if (size == 1 && end == 0) { - f(compute_streams_[0]); - } else { - for (unsigned int i = start; i != end; (++i < size) ? i : (i = 0)) { - f(compute_streams_[i]); + : compute_stream_idx_.load(); + last_sync_compute_streams_ = end; + if(end - start >= size){ + sync(compute_streams_, 0, size); + } else{ + start %= size; + end %= size; + if(start < end){ + sync(compute_streams_, start, end); + } else{ + sync(compute_streams_, start, size); + sync(compute_streams_, 0, end); } - last_sync_compute_streams_ = end; } } { - std::lock_guard transfer_guard(transfer_stream_mutex_); unsigned int size = static_cast(transfer_streams_.size()); if (size > 0) { + std::lock_guard transfer_guard(transfer_stream_mutex_); unsigned int start = last_sync_transfer_streams_; unsigned int end = num_transfer_streams_ < size ? num_transfer_streams_ - : transfer_stream_idx_.load() % size; - if (size == 1 && end == 0) { - f(transfer_streams_[0]); - } else { - for (unsigned int i = start; i != end; (++i < size) ? i : (i = 0)) { - f(transfer_streams_[i]); + : transfer_stream_idx_.load(); + last_sync_transfer_streams_ = end; + if(end - start >= size){ + sync(transfer_streams_, 0, size); + } else{ + start %= size; + end %= size; + if(start < end){ + sync(transfer_streams_, start, end); + } else{ + sync(transfer_streams_, start, size); + sync(transfer_streams_, 0, end); } - last_sync_transfer_streams_ = end; } } } From 5d4cf4a672bb7ab3a7bc687b2c97bcee03ebe775 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tadej=20Ciglari=C4=8D?= Date: Fri, 27 May 2022 07:44:39 +0100 Subject: [PATCH 5/5] format --- sycl/plugins/cuda/pi_cuda.hpp | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/sycl/plugins/cuda/pi_cuda.hpp b/sycl/plugins/cuda/pi_cuda.hpp index 4d91335f3af54..e9a87a6378c7d 100644 --- a/sycl/plugins/cuda/pi_cuda.hpp +++ b/sycl/plugins/cuda/pi_cuda.hpp @@ -445,7 +445,8 @@ struct _pi_queue { } template void sync_streams(T &&f) { - auto sync = [&f](const std::vector& streams, unsigned int start, unsigned int stop){ + auto sync = [&f](const std::vector &streams, unsigned int start, + unsigned int stop) { for (unsigned int i = start; i < stop; i++) { f(streams[i]); } @@ -458,14 +459,14 @@ struct _pi_queue { ? num_compute_streams_ : compute_stream_idx_.load(); last_sync_compute_streams_ = end; - if(end - start >= size){ + if (end - start >= size) { sync(compute_streams_, 0, size); - } else{ + } else { start %= size; end %= size; - if(start < end){ + if (start < end) { sync(compute_streams_, start, end); - } else{ + } else { sync(compute_streams_, start, size); sync(compute_streams_, 0, end); } @@ -480,14 +481,14 @@ struct _pi_queue { ? num_transfer_streams_ : transfer_stream_idx_.load(); last_sync_transfer_streams_ = end; - if(end - start >= size){ + if (end - start >= size) { sync(transfer_streams_, 0, size); - } else{ + } else { start %= size; end %= size; - if(start < end){ + if (start < end) { sync(transfer_streams_, start, end); - } else{ + } else { sync(transfer_streams_, start, size); sync(transfer_streams_, 0, end); }