Skip to content

Commit 0749e4f

Browse files
[SYCL][L0] Rework how we maintain per-thread queue groups (#8896)
Signed-off-by: Sergey V Maslov <sergey.v.maslov@intel.com>
1 parent 0c74bbb commit 0749e4f

File tree

2 files changed

+58
-43
lines changed

2 files changed

+58
-43
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 14 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@
2222
#include <string>
2323
#include <sycl/detail/pi.h>
2424
#include <sycl/detail/spinlock.hpp>
25-
#include <thread>
2625
#include <utility>
2726

2827
#include <zet_api.h>
@@ -932,10 +931,7 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
932931
ComputeQueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
933932
ComputeQueueGroup.ZeQueues.size(), CommandListMap.end());
934933
}
935-
936-
// Thread id will be used to create separate queue groups per thread.
937-
auto TID = std::this_thread::get_id();
938-
ComputeQueueGroupsByTID.insert({TID, ComputeQueueGroup});
934+
ComputeQueueGroupsByTID.set(ComputeQueueGroup);
939935

940936
// Copy group initialization.
941937
pi_queue_group_t CopyQueueGroup{this, queue_type::MainCopy};
@@ -961,7 +957,7 @@ _pi_queue::_pi_queue(std::vector<ze_command_queue_handle_t> &ComputeQueues,
961957
}
962958
}
963959
}
964-
CopyQueueGroupsByTID.insert({TID, CopyQueueGroup});
960+
CopyQueueGroupsByTID.set(CopyQueueGroup);
965961

966962
// Initialize compute/copy command batches.
967963
ComputeCommandBatch.OpenCommandList = CommandListMap.end();
@@ -1259,24 +1255,7 @@ pi_result _pi_context::getAvailableCommandList(
12591255

12601256
_pi_queue::pi_queue_group_t &_pi_queue::getQueueGroup(bool UseCopyEngine) {
12611257
auto &Map = (UseCopyEngine ? CopyQueueGroupsByTID : ComputeQueueGroupsByTID);
1262-
auto &InitialGroup = Map.begin()->second;
1263-
1264-
// Check if thread-specific immediate commandlists are requested.
1265-
if (Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue) {
1266-
// Thread id is used to create separate imm cmdlists per thread.
1267-
auto Result = Map.insert({std::this_thread::get_id(), InitialGroup});
1268-
auto &QueueGroupRef = Result.first->second;
1269-
// If an entry for this thread does not exist, create an entry.
1270-
if (Result.second) {
1271-
// Create space for immediate commandlists, which are created on demand.
1272-
QueueGroupRef.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
1273-
InitialGroup.ZeQueues.size(), CommandListMap.end());
1274-
}
1275-
return QueueGroupRef;
1276-
}
1277-
1278-
// If not PerThreadPerQueue then use the groups from Queue creation time.
1279-
return InitialGroup;
1258+
return Map.get();
12801259
}
12811260

12821261
// Helper function to create a new command-list to this queue and associated
@@ -2545,13 +2524,13 @@ pi_result piextQueueCreate(pi_context Context, pi_device Device,
25452524
// At this point only the thread creating the queue will have associated
25462525
// command-lists. Other threads have not accessed the queue yet. So we can
25472526
// only warmup the initial thread's command-lists.
2548-
auto InitialGroup = Q->ComputeQueueGroupsByTID.begin()->second;
2549-
PI_CALL(warmupQueueGroup(false, InitialGroup.UpperIndex -
2550-
InitialGroup.LowerIndex + 1));
2527+
auto QueueGroup = Q->ComputeQueueGroupsByTID.get();
2528+
PI_CALL(warmupQueueGroup(false, QueueGroup.UpperIndex -
2529+
QueueGroup.LowerIndex + 1));
25512530
if (Q->useCopyEngine()) {
2552-
auto InitialGroup = Q->CopyQueueGroupsByTID.begin()->second;
2553-
PI_CALL(warmupQueueGroup(true, InitialGroup.UpperIndex -
2554-
InitialGroup.LowerIndex + 1));
2531+
auto QueueGroup = Q->CopyQueueGroupsByTID.get();
2532+
PI_CALL(warmupQueueGroup(true, QueueGroup.UpperIndex -
2533+
QueueGroup.LowerIndex + 1));
25552534
}
25562535
// TODO: warmup event pools. Both host-visible and device-only.
25572536
}
@@ -2859,14 +2838,9 @@ pi_result piextQueueGetNativeHandle(pi_queue Queue,
28592838
auto ZeQueue = pi_cast<ze_command_queue_handle_t *>(NativeHandle);
28602839

28612840
// Extract a Level Zero compute queue handle from the given PI queue
2841+
auto &QueueGroup = Queue->getQueueGroup(false /*compute*/);
28622842
uint32_t QueueGroupOrdinalUnused;
2863-
auto TID = std::this_thread::get_id();
2864-
auto &InitialGroup = Queue->ComputeQueueGroupsByTID.begin()->second;
2865-
const auto &Result =
2866-
Queue->ComputeQueueGroupsByTID.insert({TID, InitialGroup});
2867-
auto &ComputeQueueGroupRef = Result.first->second;
2868-
2869-
*ZeQueue = ComputeQueueGroupRef.getZeQueue(&QueueGroupOrdinalUnused);
2843+
*ZeQueue = QueueGroup.getZeQueue(&QueueGroupOrdinalUnused);
28702844
return PI_SUCCESS;
28712845
}
28722846

@@ -5586,10 +5560,9 @@ pi_result piEnqueueEventsWaitWithBarrier(pi_queue Queue,
55865560
std::vector<pi_command_list_ptr_t> CmdLists;
55875561

55885562
// There must be at least one L0 queue.
5589-
auto &InitialComputeGroup = Queue->ComputeQueueGroupsByTID.begin()->second;
5590-
auto &InitialCopyGroup = Queue->CopyQueueGroupsByTID.begin()->second;
5591-
PI_ASSERT(!InitialComputeGroup.ZeQueues.empty() ||
5592-
!InitialCopyGroup.ZeQueues.empty(),
5563+
auto &ComputeGroup = Queue->ComputeQueueGroupsByTID.get();
5564+
auto &CopyGroup = Queue->CopyQueueGroupsByTID.get();
5565+
PI_ASSERT(!ComputeGroup.ZeQueues.empty() || !CopyGroup.ZeQueues.empty(),
55935566
PI_ERROR_INVALID_QUEUE);
55945567

55955568
size_t NumQueues = 0;

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 44 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -468,16 +468,58 @@ struct _pi_queue : _pi_object {
468468
uint32_t NextIndex{0};
469469
};
470470

471+
// Helper class to facilitate per-thread queue groups
472+
// We maintain a hashtable of queue groups if requested to do them per-thread.
473+
// Otherwise it is just a single entry used for all threads.
474+
struct pi_queue_group_by_tid_t
475+
: public std::unordered_map<std::thread::id, pi_queue_group_t> {
476+
bool PerThread = false;
477+
478+
// Returns thread id if doing per-thread, or a generic id that represents
479+
// all the threads.
480+
std::thread::id tid() const {
481+
return PerThread ? std::this_thread::get_id() : std::thread::id();
482+
}
483+
484+
// Make the specified queue group be the master
485+
void set(const pi_queue_group_t &QueueGroup) {
486+
const auto &Device = QueueGroup.Queue->Device;
487+
PerThread = Device->ImmCommandListUsed == _pi_device::PerThreadPerQueue;
488+
assert(empty());
489+
insert({tid(), QueueGroup});
490+
}
491+
492+
// Get a queue group to use for this thread
493+
pi_queue_group_t &get() {
494+
assert(!empty());
495+
auto It = find(tid());
496+
if (It != end()) {
497+
return It->second;
498+
}
499+
// Add new queue group for this thread initialized from a master entry.
500+
auto QueueGroup = begin()->second;
501+
// Create space for queues and immediate commandlists, which are created
502+
// on demand.
503+
QueueGroup.ZeQueues = std::vector<ze_command_queue_handle_t>(
504+
QueueGroup.ZeQueues.size(), nullptr);
505+
QueueGroup.ImmCmdLists = std::vector<pi_command_list_ptr_t>(
506+
QueueGroup.ZeQueues.size(), QueueGroup.Queue->CommandListMap.end());
507+
508+
std::tie(It, std::ignore) = insert({tid(), QueueGroup});
509+
return It->second;
510+
}
511+
};
512+
471513
// A map of compute groups containing compute queue handles, one per thread.
472514
// When a queue is accessed from multiple host threads, a separate queue group
473515
// is created for each thread. The key used for mapping is the thread ID.
474-
std::unordered_map<std::thread::id, pi_queue_group_t> ComputeQueueGroupsByTID;
516+
pi_queue_group_by_tid_t ComputeQueueGroupsByTID;
475517

476518
// A group containing copy queue handles. The main copy engine, if available,
477519
// comes first followed by link copy engines, if available.
478520
// When a queue is accessed from multiple host threads, a separate queue group
479521
// is created for each thread. The key used for mapping is the thread ID.
480-
std::unordered_map<std::thread::id, pi_queue_group_t> CopyQueueGroupsByTID;
522+
pi_queue_group_by_tid_t CopyQueueGroupsByTID;
481523

482524
// Wait for all commandlists associated with this Queue to finish operations.
483525
pi_result synchronize();

0 commit comments

Comments
 (0)