diff --git a/sycl/doc/EnvironmentVariables.md b/sycl/doc/EnvironmentVariables.md
index d8eebfcd4ebcd..86b508f67dac9 100755
--- a/sycl/doc/EnvironmentVariables.md
+++ b/sycl/doc/EnvironmentVariables.md
@@ -148,7 +148,6 @@ variables in production code.
 | Environment variable | Values | Description |
 | -------------------- | ------ | ----------- |
 | `SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE` | Integer | A single-threaded app has an opportunity to enable this mode to avoid overhead from mutex locking in the Level Zero plugin. A value greater than 0 enables single thread mode. A value of 0 disables single thread mode. The default is 0. |
-| `SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE` | Positive integer | Maximum number of oneAPI Level Zero Command lists that can be allocated with no reuse before throwing an "out of resources" error. Default is 20000, threshold may be increased based on resource availabilty and workload demand. |
 | `SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR` | [EnableBuffers][;[MaxPoolSize][;[host\|device\|shared:][MaxPoolableSize][,[Capacity][,SlabMinSize]]]...] | EnableBuffers enables pooling for SYCL buffers, default 0, set to 1 to enable. MaxPoolSize is the maximum size of the pool, default 0. MemType is host, device or shared. Other parameters are values specified as positive integers with optional K, M or G suffix. MaxPoolableSize is the maximum allocation size that may be pooled, default 0 for host and shared, 32KB for device. Capacity is the number of allocations in each size range freed by the program but retained in the pool for reallocation, default 0. Size ranges follow this pattern: 64, 96, 128, 192, and so on, i.e., powers of 2, with one range in between. SlabMinSize is the minimum allocation size, 64KB for host and device, 2MB for shared. Example: SYCL_PI_LEVEL_ZERO_USM_ALLOCATOR=1;32M;host:1M,4,64K;device:1M,4,64K;shared:0,0,2M|
 | `SYCL_PI_LEVEL_ZERO_BATCH_SIZE` | Integer | Sets a preferred number of compute commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
 | `SYCL_PI_LEVEL_ZERO_COPY_BATCH_SIZE` | Integer | Sets a preferred number of copy commands to batch into a command list before executing the command list. A value of 0 causes the batch size to be adjusted dynamically. A value greater than 0 specifies fixed size batching, with the batch size set to the specified value. The default is 0. |
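The retained `SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE` row above is the knob that the new `pi_mutex`/`pi_shared_mutex` wrappers introduced later in this patch key off. As a hedged illustration (POSIX `setenv`; any mechanism that exports the variable before plugin initialization works equally well), a known single-threaded app could opt in like this:

```cpp
// Illustrative only: opting a single-threaded app into
// SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE. Not plugin code.
#include <cstdlib>

int main() {
  // Must be set before the first SYCL/Level Zero call; the plugin reads
  // and caches the value once at startup.
  setenv("SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE", "1", /*overwrite=*/1);

  // ... create the sycl::queue and submit work from this one thread only ...
  return 0;
}
```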
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index ff77cfd78abcb..78c6e2e75efb6 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -444,7 +444,7 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
                                             size_t &Index, bool HostVisible,
                                             bool ProfilingEnabled) {
   // Lock while updating event pool machinery.
-  std::lock_guard Lock(ZeEventPoolCacheMutex);
+  std::scoped_lock Lock(ZeEventPoolCacheMutex);

   std::list<ze_event_pool_handle_t> *ZePoolCache =
       getZeEventPoolCache(HostVisible, ProfilingEnabled);

@@ -474,8 +474,9 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
     zePrint("ze_event_pool_desc_t flags set to: %d\n", ZeEventPoolDesc.flags);

     std::vector<ze_device_handle_t> ZeDevices;
-    std::for_each(Devices.begin(), Devices.end(),
-                  [&](pi_device &D) { ZeDevices.push_back(D->ZeDevice); });
+    std::for_each(Devices.begin(), Devices.end(), [&](const pi_device &D) {
+      ZeDevices.push_back(D->ZeDevice);
+    });

     ZE_CALL(zeEventPoolCreate, (ZeContext, &ZeEventPoolDesc, ZeDevices.size(),
                                 &ZeDevices[0], ZePool));
@@ -789,6 +790,34 @@ pi_result _pi_device::initialize(int SubSubDeviceOrdinal,
   return PI_SUCCESS;
 }

+pi_device _pi_context::getRootDevice() const {
+  assert(Devices.size() > 0);
+
+  if (Devices.size() == 1)
+    return Devices[0];
+
+  // Check if we have a context with sub-devices of the same device (the
+  // context may include the root device itself as well).
+  pi_device ContextRootDevice =
+      Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0];
+
+  // For a context with sub-sub-devices, the ContextRootDevice might still
+  // not be the root device.
+  // Check whether the ContextRootDevice is a sub-device or the root device.
+  if (ContextRootDevice->isSubDevice()) {
+    ContextRootDevice = ContextRootDevice->RootDevice;
+  }
+
+  for (auto &Device : Devices) {
+    if ((!Device->RootDevice && Device != ContextRootDevice) ||
+        (Device->RootDevice && Device->RootDevice != ContextRootDevice)) {
+      ContextRootDevice = nullptr;
+      break;
+    }
+  }
+  return ContextRootDevice;
+}
+
 pi_result _pi_context::initialize() {
   // Create the immediate command list to be used for initializations
   // Created as synchronous so level-zero performs implicit synchronization and
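For readers skimming the hunk above, here is a self-contained sketch of the same root-device computation over a toy `Device` type (hypothetical names, one level of partitioning for brevity); it is illustrative only, not plugin code:

```cpp
#include <cassert>
#include <iostream>
#include <vector>

// Toy stand-in for _pi_device: a device optionally points at its parent.
struct Device {
  Device *RootDevice = nullptr; // null for a root device
  bool isSubDevice() const { return RootDevice != nullptr; }
};

// Mirrors _pi_context::getRootDevice(): the common root of all devices in a
// context, or nullptr if the devices do not share one root.
Device *getRootDevice(const std::vector<Device *> &Devices) {
  assert(!Devices.empty());
  if (Devices.size() == 1)
    return Devices[0];
  Device *Root = Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0];
  if (Root->isSubDevice()) // Devices[0] was partitioned more than once
    Root = Root->RootDevice;
  for (auto *D : Devices) {
    if ((!D->RootDevice && D != Root) ||
        (D->RootDevice && D->RootDevice != Root))
      return nullptr; // devices belong to different roots
  }
  return Root;
}

int main() {
  Device Root, Sub1{&Root}, Sub2{&Root}, OtherRoot;
  std::cout << (getRootDevice({&Root, &Sub1, &Sub2}) == &Root) << "\n"; // 1
  std::cout << (getRootDevice({&Sub1, &OtherRoot}) == nullptr) << "\n"; // 1
}
```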
@@ -816,7 +845,7 @@ pi_result _pi_context::finalize() {
   // There could be some memory that may have not been deallocated.
   // For example, event pool caches would be still alive.
   {
-    std::lock_guard Lock(ZeEventPoolCacheMutex);
+    std::scoped_lock Lock(ZeEventPoolCacheMutex);
     for (auto &ZePoolCache : ZeEventPoolCache) {
       for (auto &ZePool : ZePoolCache)
         ZE_CALL(zeEventPoolDestroy, (ZePool));
@@ -827,23 +856,18 @@ pi_result _pi_context::finalize() {
   // Destroy the command list used for initializations
   ZE_CALL(zeCommandListDestroy, (ZeCommandListInit));

-  // Adjust the number of command lists created on this platform.
-  auto Platform = Devices[0]->Platform;
-
-  std::lock_guard Lock(ZeCommandListCacheMutex);
+  std::scoped_lock Lock(ZeCommandListCacheMutex);
   for (auto &List : ZeComputeCommandListCache) {
     for (ze_command_list_handle_t &ZeCommandList : List.second) {
       if (ZeCommandList)
         ZE_CALL(zeCommandListDestroy, (ZeCommandList));
     }
-    Platform->ZeGlobalCommandListCount -= List.second.size();
   }
   for (auto &List : ZeCopyCommandListCache) {
     for (ze_command_list_handle_t &ZeCommandList : List.second) {
       if (ZeCommandList)
         ZE_CALL(zeCommandListDestroy, (ZeCommandList));
     }
-    Platform->ZeGlobalCommandListCount -= List.second.size();
   }
   return PI_SUCCESS;
 }
@@ -865,10 +889,6 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
                             bool MakeAvailable,
                             std::vector<pi_event> &EventListToCleanup) {
   bool UseCopyEngine = CommandList->second.isCopy(this);
-  auto &ZeCommandListCache =
-      UseCopyEngine
-          ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice]
-          : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice];

   // Immediate commandlists do not have an associated fence.
   if (CommandList->second.ZeFence != nullptr) {
@@ -890,33 +910,17 @@ _pi_queue::resetCommandList(pi_command_list_ptr_t CommandList,
   // Standard commandlists move in and out of the cache as they are recycled.
   // Immediate commandlists are always available.
   if (CommandList->second.ZeFence != nullptr && MakeAvailable) {
-    std::lock_guard lock(this->Context->ZeCommandListCacheMutex);
+    std::scoped_lock Lock(this->Context->ZeCommandListCacheMutex);
+    auto &ZeCommandListCache =
+        UseCopyEngine
+            ? this->Context->ZeCopyCommandListCache[this->Device->ZeDevice]
+            : this->Context->ZeComputeCommandListCache[this->Device->ZeDevice];
     ZeCommandListCache.push_back(CommandList->first);
   }

   return PI_SUCCESS;
 }

-// Maximum Number of Command Lists that can be created.
-// This Value is initialized to 20000, but can be changed by the user
-// thru the environment variable SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE
-// ie SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE =10000.
-static const int ZeMaxCommandListCacheSize = [] {
-  const char *CommandListCacheSize =
-      std::getenv("SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE");
-  pi_uint32 CommandListCacheSizeValue;
-  try {
-    CommandListCacheSizeValue =
-        CommandListCacheSize ? std::stoi(CommandListCacheSize) : 20000;
-  } catch (std::exception const &) {
-    zePrint(
-        "SYCL_PI_LEVEL_ZERO_MAX_COMMAND_LIST_CACHE: invalid value provided, "
-        "default set.\n");
-    CommandListCacheSizeValue = 20000;
-  }
-  return CommandListCacheSizeValue;
-}();
-
 // Configuration of the command-list batching.
 typedef struct CommandListBatchConfig {
   // Default value of 0. This specifies to use dynamic batch size adjustment.
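A note on the recurring `std::lock_guard` → `std::scoped_lock` swap throughout this patch: for a single mutex the two are interchangeable, but `std::scoped_lock` can also acquire several mutexes atomically via the deadlock-avoiding `std::lock` algorithm, so standardizing on it keeps single- and multi-mutex call sites uniform. A minimal sketch with hypothetical mutexes:

```cpp
#include <mutex>

std::mutex CacheMutex;   // hypothetical
std::mutex CounterMutex; // hypothetical

void singleMutex() {
  // For one mutex, scoped_lock and lock_guard behave identically.
  std::scoped_lock Lock(CacheMutex);
  // ... touch the cache ...
}

void twoMutexes() {
  // scoped_lock locks both without deadlock, regardless of the order in
  // which other threads acquire them (std::lock under the hood).
  std::scoped_lock Lock(CacheMutex, CounterMutex);
  // ... touch both structures ...
}

int main() {
  singleMutex();
  twoMutexes();
}
```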
@@ -1202,7 +1206,7 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
   {
     // Make sure to acquire the lock before checking the size, or there
     // will be a race condition.
-    std::lock_guard lock(Queue->Context->ZeCommandListCacheMutex);
+    std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex);
     // Under mutex since operator[] does insertion on the first usage for every
     // unique ZeDevice.
     auto &ZeCommandListCache =
@@ -1241,34 +1245,28 @@ _pi_context::getAvailableCommandList(pi_queue Queue,
   }

   // If there are no available command lists nor signalled command lists, then
-  // we must create another command list if we have not exceed the maximum
-  // command lists we can create.
+  // we must create another command list.
   // Once created, this command list & fence are added to the command list fence
   // map.
-  if (Queue->Device->Platform->ZeGlobalCommandListCount <
-      ZeMaxCommandListCacheSize) {
-    ze_command_list_handle_t ZeCommandList;
-    ze_fence_handle_t ZeFence;
+  ze_command_list_handle_t ZeCommandList;
+  ze_fence_handle_t ZeFence;

-    uint32_t QueueGroupOrdinal;
-    auto &ZeCommandQueue =
-        Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal);
+  uint32_t QueueGroupOrdinal;
+  auto &ZeCommandQueue =
+      Queue->getQueueGroup(UseCopyEngine).getZeQueue(&QueueGroupOrdinal);

-    ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
-    ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
+  ZeStruct<ze_command_list_desc_t> ZeCommandListDesc;
+  ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;

-    ZE_CALL(zeCommandListCreate,
-            (Queue->Context->ZeContext, Queue->Device->ZeDevice,
-             &ZeCommandListDesc, &ZeCommandList));
-    // Increments the total number of command lists created on this platform.
-    Queue->Device->Platform->ZeGlobalCommandListCount++;
+  ZE_CALL(zeCommandListCreate,
+          (Queue->Context->ZeContext, Queue->Device->ZeDevice,
+           &ZeCommandListDesc, &ZeCommandList));

-    ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
-    std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert(
-        std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
-            ZeCommandList, {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal}));
-    pi_result = PI_SUCCESS;
-  }
+  ZE_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
+  std::tie(CommandList, std::ignore) = Queue->CommandListMap.insert(
+      std::pair<ze_command_list_handle_t, pi_command_list_info_t>(
+          ZeCommandList, {ZeFence, true, ZeCommandQueue, QueueGroupOrdinal}));
+  pi_result = PI_SUCCESS;

   return pi_result;
 }
@@ -1413,8 +1411,8 @@ pi_result _pi_queue::executeCommandList(pi_command_list_ptr_t CommandList,
   // unique_lock destructor at the end of the function will unlock the mutex
   // if it was locked (which happens only if IndirectAccessTrackingEnabled is
   // true).
-  std::unique_lock<std::mutex> ContextsLock(Device->Platform->ContextsMutex,
-                                            std::defer_lock);
+  std::unique_lock<pi_shared_mutex> ContextsLock(
+      Device->Platform->ContextsMutex, std::defer_lock);

   if (IndirectAccessTrackingEnabled) {
     // We are going to submit kernels for execution. If indirect access flag is
@@ -1622,11 +1620,11 @@ pi_command_list_ptr_t &_pi_queue::pi_queue_group_t::getImmCmdList() {
   // Add this commandlist to the cache so it can be destroyed as part of
   // piQueueReleaseInternal
   auto QueueType = Type;
+  std::scoped_lock Lock(Queue->Context->ZeCommandListCacheMutex);
   auto &ZeCommandListCache =
       QueueType == queue_type::Compute
           ? Queue->Context->ZeComputeCommandListCache[Queue->Device->ZeDevice]
           : Queue->Context->ZeCopyCommandListCache[Queue->Device->ZeDevice];
-  std::lock_guard lock(Queue->Context->ZeCommandListCacheMutex);
   ZeCommandListCache.push_back(ZeCommandList);

   return ImmCmdLists[Index];
@@ -1832,7 +1830,7 @@ pi_result _pi_ze_event_list_t::collectEventsForReleaseAndDestroyPiZeEventList(
   {
     // acquire the lock and copy fields locally
     // Lock automatically releases when this goes out of scope.
-    std::lock_guard lock(this->PiZeEventListMutex);
+    std::scoped_lock lock(this->PiZeEventListMutex);
     LocLength = Length;
     LocZeEventList = ZeEventList;
@@ -2213,6 +2211,7 @@ pi_device _pi_platform::getDeviceFromNativeHandle(ze_device_handle_t ZeDevice) {
   // mapping from L0 device handle to PI device assumed in this function. Until
   // Level-Zero adds unique ze_device_handle_t for sub-sub-devices, here we
   // filter out PI sub-sub-devices.
+  std::shared_lock Lock(PiDevicesCacheMutex);
   auto it = std::find_if(PiDevicesCache.begin(), PiDevicesCache.end(),
                          [&](std::unique_ptr<_pi_device> &D) {
                            return D.get()->ZeDevice == ZeDevice &&
@@ -2238,6 +2237,7 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,

   // Filter available devices based on input DeviceType.
   std::vector<pi_device> MatchedDevices;
+  std::shared_lock Lock(Platform->PiDevicesCacheMutex);
   for (auto &D : Platform->PiDevicesCache) {
     // Only ever return root-devices from piDevicesGet, but the
     // devices cache also keeps sub-devices.
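The two hunks above make device-cache lookups take a shared (reader) lock, while the populate path in the next hunk takes the lock exclusively before mutating the cache. A minimal sketch of that reader/writer split (hypothetical cache; the plugin itself uses `pi_shared_mutex` and `std::scoped_lock` on the writer side):

```cpp
#include <shared_mutex>
#include <vector>

std::shared_mutex CacheMutex; // hypothetical
std::vector<int> Cache;       // hypothetical cached entries
bool Populated = false;

int lookup(size_t I) {
  // Many readers may hold the lock concurrently.
  std::shared_lock<std::shared_mutex> Lock(CacheMutex);
  return Cache[I];
}

void populateIfNeeded() {
  // The writer gets exclusive access; readers are blocked while we fill.
  std::unique_lock<std::shared_mutex> Lock(CacheMutex);
  if (!Populated) {
    Cache = {1, 2, 3};
    Populated = true;
  }
}

int main() {
  populateIfNeeded();
  return lookup(1); // 2
}
```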
@@ -2291,7 +2291,7 @@ pi_result piDevicesGet(pi_platform Platform, pi_device_type DeviceType,

 // Check the device cache and load it if necessary.
 pi_result _pi_platform::populateDeviceCacheIfNeeded() {
-  std::lock_guard Lock(PiDevicesCacheMutex);
+  std::scoped_lock Lock(PiDevicesCacheMutex);

   if (DeviceCachePopulated) {
     return PI_SUCCESS;
@@ -3096,7 +3096,7 @@ pi_result piContextCreate(const pi_context_properties *Properties,
     *RetContext = new _pi_context(ZeContext, NumDevices, Devices, true);
     (*RetContext)->initialize();
     if (IndirectAccessTrackingEnabled) {
-      std::lock_guard Lock(Platform->ContextsMutex);
+      std::scoped_lock Lock(Platform->ContextsMutex);
       Platform->Contexts.push_back(*RetContext);
     }
   } catch (const std::bad_alloc &) {
@@ -3114,6 +3114,7 @@ pi_result piContextGetInfo(pi_context Context, pi_context_info ParamName,

   PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);

+  std::shared_lock Lock(Context->Mutex);
   ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
   switch (ParamName) {
   case PI_CONTEXT_INFO_DEVICES:
@@ -3225,8 +3226,8 @@ pi_result ContextReleaseHelper(pi_context Context) {

 pi_result piContextRelease(pi_context Context) {
   pi_platform Plt = Context->getPlatform();
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
+  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
+                                                 std::defer_lock);
   if (IndirectAccessTrackingEnabled)
     ContextsLock.lock();

@@ -3289,6 +3290,7 @@ pi_result piQueueGetInfo(pi_queue Queue, pi_queue_info ParamName,

   PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);

+  std::shared_lock Lock(Queue->Mutex);
   ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);
   // TODO: consider support for queue properties and size
   switch (ParamName) {
@@ -3490,7 +3492,7 @@ pi_result piextQueueGetNativeHandle(pi_queue Queue,
   PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);

   // Lock automatically releases when this goes out of scope.
-  std::scoped_lock lock(Queue->Mutex);
+  std::shared_lock lock(Queue->Mutex);

   auto ZeQueue = pi_cast<ze_command_queue_handle_t *>(NativeHandle);
   // Extract the Level Zero compute queue handle from the given PI queue
@@ -3529,8 +3531,8 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
 static pi_result ZeDeviceMemAllocHelper(void **ResultPtr, pi_context Context,
                                         pi_device Device, size_t Size) {
   pi_platform Plt = Device->Platform;
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
+  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
+                                                 std::defer_lock);
   if (IndirectAccessTrackingEnabled) {
     // Lock the mutex which is guarding contexts container in the platform.
     // This prevents new kernels from being submitted in any context while
@@ -3564,8 +3566,8 @@ static pi_result ZeDeviceMemAllocHelper(void **ResultPtr, pi_context Context,
 static pi_result ZeHostMemAllocHelper(void **ResultPtr, pi_context Context,
                                       size_t Size) {
   pi_platform Plt = Context->getPlatform();
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
+  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
+                                                 std::defer_lock);
   if (IndirectAccessTrackingEnabled) {
     // Lock the mutex which is guarding contexts container in the platform.
     // This prevents new kernels from being submitted in any context while
@@ -3669,6 +3671,9 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
       memcpy(ZeHandleDst, HostPtr, Size);
     } else {
       // Initialize the buffer synchronously with immediate offload
+      // zeCommandListAppendMemoryCopy must not be called from simultaneous
+      // threads with the same command list handle, so we need an exclusive lock.
+      std::scoped_lock Lock(Context->ImmediateCommandListMutex);
       ZE_CALL(zeCommandListAppendMemoryCopy,
               (Context->ZeCommandListInit, ZeHandleDst, HostPtr, Size, nullptr,
                0, nullptr));
@@ -3690,6 +3695,7 @@ pi_result piMemGetInfo(pi_mem Mem, pi_mem_info ParamName, size_t ParamValueSize,
   // piMemImageGetInfo must be used for images
   PI_ASSERT(!Mem->isImage(), PI_ERROR_INVALID_VALUE);

+  std::shared_lock Lock(Mem->Mutex);
   ReturnHelper ReturnValue(ParamValueSize, ParamValue, ParamValueSizeRet);

   switch (ParamName) {
@@ -3720,8 +3726,8 @@ pi_result piMemRetain(pi_mem Mem) {
 static pi_result ZeMemFreeHelper(pi_context Context, void *Ptr,
                                  bool OwnZeMemHandle = true) {
   pi_platform Plt = Context->getPlatform();
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
+  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
+                                                 std::defer_lock);
   if (IndirectAccessTrackingEnabled) {
     ContextsLock.lock();
     auto It = Context->MemAllocs.find(Ptr);
@@ -3903,6 +3909,8 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,
   ZeImageDesc.arraylevels = pi_cast<uint32_t>(ImageDesc->image_array_size);
   ZeImageDesc.miplevels = ImageDesc->num_mip_levels;

+  std::shared_lock Lock(Context->Mutex);
+
   // Currently we have the "0" device in context with mutliple root devices to
   // own the image.
   // TODO: Implement explicit copying for acessing the image from other devices
@@ -3922,7 +3930,11 @@ pi_result piMemImageCreate(pi_context Context, pi_mem_flags Flags,

   if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
       (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
-    // Initialize image synchronously with immediate offload
+    // Initialize image synchronously with immediate offload.
+    // zeCommandListAppendImageCopyFromMemory must not be called from
+    // simultaneous threads with the same command list handle, so we need
+    // an exclusive lock.
+    std::scoped_lock Lock(Context->ImmediateCommandListMutex);
     ZE_CALL(zeCommandListAppendImageCopyFromMemory,
             (Context->ZeCommandListInit, ZeHImage, HostPtr, nullptr, nullptr,
              0, nullptr));
@@ -3953,6 +3965,8 @@ pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
   PI_ASSERT(NativeHandle, PI_ERROR_INVALID_VALUE);
   PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);

+  std::shared_lock Lock(Context->Mutex);
+
   // Get base of the allocation
   void *Base;
   size_t Size;
@@ -3993,8 +4007,8 @@ pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
                            ownNativeHandle);

   pi_platform Plt = Context->getPlatform();
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
+  std::unique_lock<pi_shared_mutex> ContextsLock(Plt->ContextsMutex,
+                                                 std::defer_lock);
   if (IndirectAccessTrackingEnabled) {
     // We need to keep track of all memory allocations in the context
     ContextsLock.lock();
@@ -4026,6 +4040,10 @@ pi_result piextMemCreateWithNativeHandle(pi_native_handle NativeHandle,
       // created device allocation.
       char *ZeHandleDst;
       PI_CALL(Buffer->getZeHandle(ZeHandleDst, _pi_mem::write_only));
+
+      // zeCommandListAppendMemoryCopy must not be called from simultaneous
+      // threads with the same command list handle, so we need an exclusive lock.
+      std::scoped_lock Lock(Context->ImmediateCommandListMutex);
       ZE_CALL(zeCommandListAppendMemoryCopy,
               (Context->ZeCommandListInit, ZeHandleDst, Ptr, Size, nullptr, 0,
                nullptr));
@@ -4956,7 +4974,7 @@ pi_result piKernelRelease(pi_kernel Kernel) {
     // memory can be deallocated and context can be removed from container in
     // the platform. That's why we need to lock a mutex here.
     pi_platform Plt = Kernel->Program->Context->getPlatform();
-    std::lock_guard ContextsLock(Plt->ContextsMutex);
+    std::scoped_lock ContextsLock(Plt->ContextsMutex);

     if (--Kernel->SubmissionsCount == 0) {
       // Kernel is not submitted for execution, release referenced memory
@@ -5121,7 +5139,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
   // If using immediate commandlists then gathering of indirect
   // references and appending to the queue (which means submission)
   // must be done together.
-  std::unique_lock<std::mutex> ContextsLock(
+  std::unique_lock<pi_shared_mutex> ContextsLock(
       Queue->Device->Platform->ContextsMutex, std::defer_lock);
   // We are going to submit kernels for execution. If indirect access flag is
   // set for a kernel then we need to make a snapshot of existing memory
@@ -5708,6 +5726,8 @@ pi_result piSamplerCreate(pi_context Context,
   PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
   PI_ASSERT(RetSampler, PI_ERROR_INVALID_VALUE);

+  std::shared_lock Lock(Context->Mutex);
+
   // Have the "0" device in context to own the sampler. Rely on Level-Zero
   // drivers to perform migration as necessary for sharing it across multiple
   // devices in the context.
@@ -7385,18 +7405,25 @@ pi_result piextUSMDeviceAlloc(void **ResultPtr, pi_context Context,
     return PI_ERROR_INVALID_VALUE;

   pi_platform Plt = Device->Platform;
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
+
+  // If indirect access tracking is enabled then lock the mutex which is
+  // guarding the contexts container in the platform. This prevents new kernels
+  // from being submitted in any context while we are in the process of
+  // allocating memory; this is needed to properly capture allocations by
+  // kernels with indirect access. This lock also protects access to the
+  // context's data structures. If indirect access tracking is not enabled then
+  // lock the context mutex to protect access to the context's data structures.
+  std::shared_lock ContextLock(Context->Mutex, std::defer_lock);
+  std::unique_lock IndirectAccessTrackingLock(Plt->ContextsMutex,
+                                              std::defer_lock);
   if (IndirectAccessTrackingEnabled) {
-    // Lock the mutex which is guarding contexts container in the platform.
-    // This prevents new kernels from being submitted in any context while we
-    // are in the process of allocating a memory, this is needed to properly
-    // capture allocations by kernels with indirect access.
-    ContextsLock.lock();
+    IndirectAccessTrackingLock.lock();
     // We are going to defer memory release if there are kernels with indirect
     // access, that is why explicitly retain context to be sure that it is
     // released after all memory allocations in this context are released.
     PI_CALL(piContextRetain(Context));
+  } else {
+    ContextLock.lock();
   }

   if (!UseUSMAllocator ||
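Two idioms for "take lock A or lock B depending on a runtime flag" appear in these USM paths: a pair of deferred locks of different types, only one of which is taken (above), and a ternary over `std::scoped_lock` prvalues (next hunk), which is legal because both branches have the same type and C++17 guaranteed copy elision constructs the otherwise-immovable lock in place. A condensed sketch of both, with a hypothetical flag and mutexes:

```cpp
#include <mutex>
#include <shared_mutex>

bool TrackingEnabled = false;    // hypothetical runtime flag
std::shared_mutex PlatformMutex; // hypothetical
std::shared_mutex ContextMutex;  // hypothetical

void deferLockStyle() {
  // Construct both locks unlocked, then lock exactly one of them.
  std::shared_lock<std::shared_mutex> Shared(ContextMutex, std::defer_lock);
  std::unique_lock<std::shared_mutex> Exclusive(PlatformMutex, std::defer_lock);
  if (TrackingEnabled)
    Exclusive.lock();
  else
    Shared.lock();
  // ... allocate ...
} // whichever lock was taken is released here

void ternaryStyle() {
  // Both branches are prvalues of the same type, so no move is required.
  auto Lock = TrackingEnabled ? std::scoped_lock(PlatformMutex)
                              : std::scoped_lock(ContextMutex);
  // ... free ...
}

int main() {
  deferLockStyle();
  ternaryStyle();
}
```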
@@ -7457,14 +7484,19 @@ pi_result piextUSMSharedAlloc(void **ResultPtr, pi_context Context,
     return PI_ERROR_INVALID_VALUE;

   pi_platform Plt = Device->Platform;
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
+
+  // If indirect access tracking is enabled then lock the mutex which is
+  // guarding the contexts container in the platform. This prevents new kernels
+  // from being submitted in any context while we are in the process of
+  // allocating memory; this is needed to properly capture allocations by
+  // kernels with indirect access. This lock also protects access to the
+  // context's data structures. If indirect access tracking is not enabled then
+  // lock the context mutex to protect access to the context's data structures.
+  auto Lock = IndirectAccessTrackingEnabled
+                  ? std::scoped_lock(Plt->ContextsMutex)
+                  : std::scoped_lock(Context->Mutex);
+
   if (IndirectAccessTrackingEnabled) {
-    // Lock the mutex which is guarding contexts container in the platform.
-    // This prevents new kernels from being submitted in any context while we
-    // are in the process of allocating a memory, this is needed to properly
-    // capture allocations by kernels with indirect access.
-    ContextsLock.lock();
     // We are going to defer memory release if there are kernels with indirect
     // access, that is why explicitly retain context to be sure that it is
     // released after all memory allocations in this context are released.
@@ -7523,18 +7555,24 @@ pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
     return PI_ERROR_INVALID_VALUE;

   pi_platform Plt = Context->getPlatform();
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
+  // If indirect access tracking is enabled then lock the mutex which is
+  // guarding the contexts container in the platform. This prevents new kernels
+  // from being submitted in any context while we are in the process of
+  // allocating memory; this is needed to properly capture allocations by
+  // kernels with indirect access. This lock also protects access to the
+  // context's data structures. If indirect access tracking is not enabled then
+  // lock the context mutex to protect access to the context's data structures.
+  std::shared_lock ContextLock(Context->Mutex, std::defer_lock);
+  std::unique_lock IndirectAccessTrackingLock(Plt->ContextsMutex,
+                                              std::defer_lock);
   if (IndirectAccessTrackingEnabled) {
-    // Lock the mutex which is guarding contexts container in the platform.
-    // This prevents new kernels from being submitted in any context while we
-    // are in the process of allocating a memory, this is needed to properly
-    // capture allocations by kernels with indirect access.
-    ContextsLock.lock();
+    IndirectAccessTrackingLock.lock();
     // We are going to defer memory release if there are kernels with indirect
     // access, that is why explicitly retain context to be sure that it is
     // released after all memory allocations in this context are released.
     PI_CALL(piContextRetain(Context));
+  } else {
+    ContextLock.lock();
   }

   if (!UseUSMAllocator ||
@@ -7579,6 +7617,8 @@ pi_result piextUSMHostAlloc(void **ResultPtr, pi_context Context,
 // container with contexts because deallocating the memory can turn RefCount of
 // a context to 0 and as a result the context being removed from the list of
 // tracked contexts.
+// If indirect access tracking is not enabled then the caller must lock the
+// Context mutex.
 static pi_result USMFreeHelper(pi_context Context, void *Ptr,
                                bool OwnZeMemHandle) {
   if (IndirectAccessTrackingEnabled) {
@@ -7694,10 +7734,11 @@ static pi_result USMFreeHelper(pi_context Context, void *Ptr,

 pi_result piextUSMFree(pi_context Context, void *Ptr) {
   pi_platform Plt = Context->getPlatform();
-  std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                            std::defer_lock);
-  if (IndirectAccessTrackingEnabled)
-    ContextsLock.lock();
+
+  auto Lock = IndirectAccessTrackingEnabled
+                  ? std::scoped_lock(Plt->ContextsMutex)
+                  : std::scoped_lock(Context->Mutex);
+
   return USMFreeHelper(Context, Ptr, true /* OwnZeMemHandle */);
 }

@@ -8286,6 +8327,9 @@ pi_result _pi_buffer::getZeHandle(char *&ZeHandle, access_mode_t AccessMode,
         // instead of maintaining a separate allocation and performing this
         // explciit copy.
         //
+        // zeCommandListAppendMemoryCopy must not be called from simultaneous
+        // threads with the same command list handle, so we need an exclusive lock.
+        std::scoped_lock Lock(Context->ImmediateCommandListMutex);
         ZE_CALL(zeCommandListAppendMemoryCopy,
                 (Context->ZeCommandListInit, ZeHandle /* Dst */, ZeHandleSrc,
                  Size, nullptr, 0, nullptr));
@@ -8320,10 +8364,9 @@ pi_result _pi_buffer::free() {
     break;
   case allocation_t::free: {
     pi_platform Plt = Context->getPlatform();
-    std::unique_lock<std::mutex> ContextsLock(Plt->ContextsMutex,
-                                              std::defer_lock);
-    if (IndirectAccessTrackingEnabled)
-      ContextsLock.lock();
+    auto Lock = IndirectAccessTrackingEnabled
+                    ? std::scoped_lock(Plt->ContextsMutex)
+                    : std::scoped_lock(Context->Mutex);

     PI_CALL(USMFreeHelper(Context, ZeHandle, true));
     break;
diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp
index 61df5920c81d5..bde07c1d5cd31 100644
--- a/sycl/plugins/level_zero/pi_level_zero.hpp
+++ b/sycl/plugins/level_zero/pi_level_zero.hpp
@@ -174,32 +174,6 @@ template <class T> struct ZesStruct : public T {
   }
 };

-// The wrapper for immutable Level-Zero data.
-// The data is initialized only once at first access (via ->) with the
-// initialization function provided in Init. All subsequent access to
-// the data just returns the already stored data.
-//
-template <class T> struct ZeCache : private T {
-  // The initialization function takes a reference to the data
-  // it is going to initialize, since it is private here in
-  // order to disallow access other than through "->".
-  //
-  typedef std::function<void(T &)> InitFunctionType;
-  InitFunctionType Compute{nullptr};
-  bool Computed{false};
-
-  ZeCache() : T{} {}
-
-  // Access to the fields of the original T data structure.
-  T *operator->() {
-    if (!Computed) {
-      Compute(*this);
-      Computed = true;
-    }
-    return this;
-  }
-};
-
 // A single-threaded app has an opportunity to enable this mode to avoid
 // overhead from mutex locking. Default value is 0 which means that single
 // thread mode is disabled.
@@ -239,6 +213,50 @@ class pi_shared_mutex : public std::shared_mutex {
   }
 };

+// Class which acts like std::mutex if SingleThreadMode variable is not set.
+// If SingleThreadMode variable is set then mutex operations are turned into
+// no-ops.
+class pi_mutex : public std::mutex {
+public:
+  void lock() {
+    if (!SingleThreadMode)
+      std::mutex::lock();
+  }
+  bool try_lock() { return SingleThreadMode ? true : std::mutex::try_lock(); }
+  void unlock() {
+    if (!SingleThreadMode)
+      std::mutex::unlock();
+  }
+};
+
+// The wrapper for immutable Level-Zero data.
+// The data is initialized only once at first access (via ->) with the
+// initialization function provided in Init. All subsequent access to
+// the data just returns the already stored data.
+//
+template <class T> struct ZeCache : private T {
+  // The initialization function takes a reference to the data
+  // it is going to initialize, since it is private here in
+  // order to disallow access other than through "->".
+  //
+  typedef std::function<void(T &)> InitFunctionType;
+  InitFunctionType Compute{nullptr};
+  bool Computed{false};
+  pi_mutex ZeCacheMutex;
+
+  ZeCache() : T{} {}
+
+  // Access to the fields of the original T data structure.
+  T *operator->() {
+    std::unique_lock Lock(ZeCacheMutex);
+    if (!Computed) {
+      Compute(*this);
+      Computed = true;
+    }
+    return this;
+  }
+};
+
 // This wrapper around std::atomic is created to limit operations with reference
 // counter and to make allowed operations more transparent in terms of
 // thread-safety in the plugin. increment() and load() operations do not need a
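The relocated `ZeCache` now takes its new `pi_mutex` member before running `Compute`, so concurrent first accesses cannot run the initializer twice. A stripped-down, Level-Zero-free sketch of the same wrapper (toy `Props` type, plain `std::mutex`):

```cpp
#include <functional>
#include <iostream>
#include <mutex>

struct Props { int Value = 0; }; // toy stand-in for a L0 properties struct

template <class T> struct Cache : private T {
  std::function<void(T &)> Compute;
  bool Computed = false;
  std::mutex M;

  // The first caller runs Compute under the lock; later callers (and
  // concurrent racers) only ever see the fully initialized data.
  T *operator->() {
    std::unique_lock<std::mutex> Lock(M);
    if (!Computed) {
      Compute(*this);
      Computed = true;
    }
    return this;
  }
};

int main() {
  Cache<Props> C;
  C.Compute = [](Props &P) { P.Value = 42; }; // an expensive query in real code
  std::cout << C->Value << "\n";              // triggers Compute once: 42
  std::cout << C->Value << "\n";              // already cached: 42
}
```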
@@ -342,7 +360,7 @@ struct _pi_platform {

   // Cache pi_devices for reuse
   std::vector<std::unique_ptr<_pi_device>> PiDevicesCache;
-  std::mutex PiDevicesCacheMutex;
+  pi_shared_mutex PiDevicesCacheMutex;
   bool DeviceCachePopulated = false;

   // Check the device cache and load it if necessary.
@@ -352,17 +370,13 @@ struct _pi_platform {
   // If not found, then nullptr is returned.
   pi_device getDeviceFromNativeHandle(ze_device_handle_t);

-  // Current number of L0 Command Lists created on this platform.
-  // this number must not exceed ZeMaxCommandListCache.
-  std::atomic ZeGlobalCommandListCount{0};
-
   // Keep track of all contexts in the platform. This is needed to manage
   // a lifetime of memory allocations in each context when there are kernels
   // with indirect access.
   // TODO: should be deleted when memory isolation in the context is implemented
   // in the driver.
   std::list<pi_context> Contexts;
-  std::mutex ContextsMutex;
+  pi_shared_mutex ContextsMutex;
 };

 // Implements memory allocation via L0 RT for USM allocator interface.
@@ -492,7 +506,9 @@ struct _pi_device : _pi_object {
                        int SubSubDeviceOrdinal = -1);

   // Level Zero device handle.
-  ze_device_handle_t ZeDevice;
+  // This field is only set at _pi_device creation time, and cannot change.
+  // Therefore it can be accessed without holding a lock on this _pi_device.
+  const ze_device_handle_t ZeDevice;

   // Keep the subdevices that are partitioned from this pi_device for reuse
   // The order of sub-devices in this vector is repeated from the
@@ -501,10 +517,15 @@ struct _pi_device : _pi_object {
   std::vector<pi_device> SubDevices;

   // PI platform to which this device belongs.
-  pi_platform Platform;
+  // This field is only set at _pi_device creation time, and cannot change.
+  // Therefore it can be accessed without holding a lock on this _pi_device.
+  const pi_platform Platform;

   // Root-device of a sub-device, null if this is not a sub-device.
-  pi_device RootDevice;
+  // This field is only set at _pi_device creation time, and cannot change.
+  // Therefore it can be accessed without holding a lock on this _pi_device.
+  const pi_device RootDevice;
+
   bool isSubDevice() { return RootDevice != nullptr; }

   // Cache of the immutable device properties.
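Marking these members `const` is what justifies the new comments: data written once during construction and never again can be read from any thread without locking. A tiny sketch of the idiom (toy type):

```cpp
#include <thread>

struct Device {
  // Set once in the constructor; safe to read from any thread without a
  // lock, because no thread can ever write to it afterwards.
  const int Handle;
  explicit Device(int H) : Handle(H) {}
};

int main() {
  Device D(42);
  // Concurrent readers need no synchronization for const, post-construction
  // data. (Mutable state would still need a mutex.)
  std::thread T([&] { (void)D.Handle; });
  (void)D.Handle;
  T.join();
}
```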
@@ -560,8 +581,9 @@ typedef pi_command_list_map_t::iterator pi_command_list_ptr_t;
 struct _pi_context : _pi_object {
   _pi_context(ze_context_handle_t ZeContext, pi_uint32 NumDevices,
               const pi_device *Devs, bool OwnZeContext)
-      : ZeContext{ZeContext}, OwnZeContext{OwnZeContext},
-        Devices{Devs, Devs + NumDevices}, ZeCommandListInit{nullptr} {
+      : ZeContext{ZeContext},
+        OwnZeContext{OwnZeContext}, Devices{Devs, Devs + NumDevices},
+        SingleRootDevice(getRootDevice()), ZeCommandListInit{nullptr} {
     // NOTE: one must additionally call initialize() to complete
     // PI context creation.

@@ -587,31 +609,6 @@ struct _pi_context : _pi_object {
     HostMemAllocContext = std::make_unique<USMAllocContext>(
         std::unique_ptr<SystemMemory>(new USMHostMemoryAlloc(this)));

-    if (NumDevices == 1) {
-      SingleRootDevice = Devices[0];
-    } else {
-
-      // Check if we have context with subdevices of the same device (context
-      // may include root device itself as well)
-      SingleRootDevice =
-          Devices[0]->RootDevice ? Devices[0]->RootDevice : Devices[0];
-
-      // For context with sub subdevices, the SingleRootDevice might still
-      // not be the root device.
-      // Check whether the SingleRootDevice is the subdevice or root device.
-      if (SingleRootDevice->isSubDevice()) {
-        SingleRootDevice = SingleRootDevice->RootDevice;
-      }
-
-      for (auto &Device : Devices) {
-        if ((!Device->RootDevice && Device != SingleRootDevice) ||
-            (Device->RootDevice && Device->RootDevice != SingleRootDevice)) {
-          SingleRootDevice = nullptr;
-          break;
-        }
-      }
-    }
-
     // We may allocate memory to this root device so create allocators.
     if (SingleRootDevice &&
         DeviceMemAllocContexts.find(SingleRootDevice) ==
             DeviceMemAllocContexts.end()) {
@@ -637,18 +634,24 @@ struct _pi_context : _pi_object {

   // A L0 context handle is primarily used during creation and management of
   // resources that may be used by multiple devices.
-  ze_context_handle_t ZeContext;
+  // This field is only set at _pi_context creation time, and cannot change.
+  // Therefore it can be accessed without holding a lock on this _pi_context.
+  const ze_context_handle_t ZeContext;

   // Indicates if we own the ZeContext or it came from interop that
   // asked to not transfer the ownership to SYCL RT.
   bool OwnZeContext;

   // Keep the PI devices this PI context was created for.
-  std::vector<pi_device> Devices;
+  // This field is only set at _pi_context creation time, and cannot change.
+  // Therefore it can be accessed without holding a lock on this _pi_context.
+  const std::vector<pi_device> Devices;

   // If context contains one device or sub-devices of the same device, we want
   // to save this device.
-  pi_device SingleRootDevice = nullptr;
+  // This field is only set at _pi_context creation time, and cannot change.
+  // Therefore it can be accessed without holding a lock on this _pi_context.
+  const pi_device SingleRootDevice = nullptr;

   // Immediate Level Zero command list for the device in this context, to be
   // used for initializations. To be created as:
@@ -660,9 +663,14 @@ struct _pi_context : _pi_object {
   // support of the multiple devices per context will be added.
   ze_command_list_handle_t ZeCommandListInit;

+  // Mutex for the immediate command list. Per the Level Zero spec, memory copy
+  // operations submitted to an immediate command list are not allowed to be
+  // called from simultaneous threads.
+  pi_mutex ImmediateCommandListMutex;
+
   // Mutex Lock for the Command List Cache. This lock is used to control both
   // compute and copy command list caches.
-  std::mutex ZeCommandListCacheMutex;
+  pi_mutex ZeCommandListCacheMutex;
   // Cache of all currently available/completed command/copy lists.
   // Note that command-list can only be re-used on the same device.
   //
@@ -733,6 +741,15 @@ struct _pi_context : _pi_object {
   std::unordered_map<void *, MemAllocRecord> MemAllocs;

 private:
+  // If the context contains one device then return this device.
+  // If the context contains sub-devices of the same device, then return their
+  // parent device. Return nullptr if the context consists of several devices
+  // which are not sub-devices of the same device. We call the returned device
+  // the root device of the context.
+  // TODO: get rid of this when contexts with multiple devices are supported
+  // for images.
+  pi_device getRootDevice() const;
+
   // Following member variables are used to manage assignment of events
   // to event pools.
   //
@@ -770,7 +787,7 @@ struct _pi_context : _pi_object {

   // Mutex to control operations on event pool caches and the helper maps
   // holding the current pool usage counts.
-  std::mutex ZeEventPoolCacheMutex;
+  pi_mutex ZeEventPoolCacheMutex;
 };

 struct _pi_queue : _pi_object {
@@ -1203,7 +1220,7 @@ struct _pi_ze_event_list_t {
   // when an event is initially created. However, it might be
   // possible to have multiple threads racing to destroy the list,
   // so this will be used to make list destruction thread-safe.
-  std::mutex PiZeEventListMutex;
+  pi_mutex PiZeEventListMutex;

   // Initialize this using the array of events in EventList, and retain
   // all the pi_events in the created data structure.
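Taken together, the header changes route most plugin locking through `pi_mutex`/`pi_shared_mutex`, whose operations collapse to no-ops when `SYCL_PI_LEVEL_ZERO_SINGLE_THREAD_MODE` is set. A self-contained sketch of that pattern (a compile-time toy flag stands in for the plugin's env-var-derived `SingleThreadMode`; note the wrapper must be used through the derived type, since the overriding members are non-virtual):

```cpp
#include <iostream>
#include <mutex>

// Toy stand-in for the plugin's env-var-derived SingleThreadMode flag.
const bool SingleThreadMode = true;

// Same shape as pi_mutex: every lock/unlock becomes a no-op when the app
// has promised it is single-threaded, removing locking overhead.
class MaybeMutex : public std::mutex {
public:
  void lock() {
    if (!SingleThreadMode)
      std::mutex::lock();
  }
  bool try_lock() { return SingleThreadMode ? true : std::mutex::try_lock(); }
  void unlock() {
    if (!SingleThreadMode)
      std::mutex::unlock();
  }
};

int main() {
  MaybeMutex M;
  // scoped_lock calls MaybeMutex::lock/unlock, which do nothing here, so
  // this "critical section" costs only two predictable branches.
  std::scoped_lock<MaybeMutex> Lock(M);
  std::cout << "single-thread mode: " << std::boolalpha << SingleThreadMode
            << "\n";
}
```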