diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
old mode 100755
new mode 100644
index 0009202557f53..583f811aa01af
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -99,6 +99,16 @@ static const bool UseMultipleCmdlistBarriers = [] {
   return std::stoi(UseMultipleCmdlistBarriersFlag) > 0;
 }();
 
+// This is an experimental option that allows disabling caching of events in
+// the context.
+static const bool DisableEventsCaching = [] {
+  const char *DisableEventsCachingFlag =
+      std::getenv("SYCL_PI_LEVEL_ZERO_DISABLE_EVENTS_CACHING");
+  if (!DisableEventsCachingFlag)
+    return false;
+  return std::stoi(DisableEventsCachingFlag) != 0;
+}();
+
 // This class encapsulates actions taken along with a call to Level Zero API.
 class ZeCall {
 private:
@@ -468,10 +478,18 @@ _pi_context::getFreeSlotInExistingOrNewPool(ze_event_pool_handle_t &Pool,
   std::list<ze_event_pool_handle_t> *ZePoolCache =
       getZeEventPoolCache(HostVisible, ProfilingEnabled);
 
-  // Remove full pool from the cache.
   if (!ZePoolCache->empty()) {
     if (NumEventsAvailableInEventPool[ZePoolCache->front()] == 0) {
-      ZePoolCache->erase(ZePoolCache->begin());
+      if (DisableEventsCaching) {
+        // Remove full pool from the cache if events caching is disabled.
+        ZePoolCache->erase(ZePoolCache->begin());
+      } else {
+        // If event caching is enabled then we don't destroy events so there is
+        // no need to remove pool from the cache and add it back when it has
+        // available slots. Just keep it in the tail of the cache so that all
+        // pools can be destroyed during context destruction.
+        ZePoolCache->push_front(nullptr);
+      }
     }
   }
   if (ZePoolCache->empty()) {
@@ -868,7 +886,18 @@ pi_result _pi_context::initialize() {
 pi_result _pi_context::finalize() {
   // This function is called when pi_context is deallocated, piContextRelease.
   // There could be some memory that may have not been deallocated.
-  // For example, event pool caches would be still alive.
+  // For example, event and event pool caches would be still alive.
+
+  if (!DisableEventsCaching) {
+    std::scoped_lock Lock(EventCacheMutex);
+    for (auto &EventCache : EventCaches) {
+      for (auto Event : EventCache) {
+        ZE_CALL(zeEventDestroy, (Event->ZeEvent));
+        delete Event;
+      }
+      EventCache.clear();
+    }
+  }
   {
     std::scoped_lock Lock(ZeEventPoolCacheMutex);
     for (auto &ZePoolCache : ZeEventPoolCache) {
@@ -5430,6 +5459,42 @@ _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) {
   return PI_SUCCESS;
 }
 
+pi_result _pi_event::reset() {
+  Queue = nullptr;
+  CleanedUp = false;
+  Completed = false;
+  CommandData = nullptr;
+  CommandType = PI_COMMAND_TYPE_USER;
+  WaitList = {};
+  RefCount.reset();
+
+  if (!isHostVisible())
+    HostVisibleEvent = nullptr;
+
+  ZE_CALL(zeEventHostReset, (ZeEvent));
+  return PI_SUCCESS;
+}
+
+pi_event _pi_context::getEventFromCache(bool HostVisible, bool WithProfiling) {
+  std::scoped_lock Lock(EventCacheMutex);
+  auto Cache = getEventCache(HostVisible, WithProfiling);
+  if (Cache->empty())
+    return nullptr;
+
+  auto It = Cache->begin();
+  pi_event Event = *It;
+  Cache->erase(It);
+  return Event;
+}
+
+void _pi_context::addEventToCache(pi_event Event) {
+  std::scoped_lock Lock(EventCacheMutex);
+  auto Cache =
+      getEventCache(Event->isHostVisible(), Event->isProfilingEnabled());
+  Event->reset();
+  Cache->emplace_back(Event);
+}
+
 // Helper function for creating a PI event.
 // The "Queue" argument specifies the PI queue where a command is submitted.
// The "HostVisible" argument specifies if event needs to be allocated from @@ -5437,17 +5502,24 @@ _pi_event::getOrCreateHostVisibleEvent(ze_event_handle_t &ZeHostVisibleEvent) { // static pi_result EventCreate(pi_context Context, pi_queue Queue, bool HostVisible, pi_event *RetEvent) { - bool ProfilingEnabled = !Queue || (Queue->Properties & PI_QUEUE_PROFILING_ENABLE) != 0; - size_t Index = 0; + if (auto CachedEvent = + Context->getEventFromCache(HostVisible, ProfilingEnabled)) { + *RetEvent = CachedEvent; + return PI_SUCCESS; + } + + ze_event_handle_t ZeEvent; ze_event_pool_handle_t ZeEventPool = {}; + + size_t Index = 0; + if (auto Res = Context->getFreeSlotInExistingOrNewPool( ZeEventPool, Index, HostVisible, ProfilingEnabled)) return Res; - ze_event_handle_t ZeEvent; ZeStruct ZeEventDesc; ZeEventDesc.index = Index; ZeEventDesc.wait = 0; @@ -5456,9 +5528,9 @@ static pi_result EventCreate(pi_context Context, pi_queue Queue, ZeEventDesc.signal = ZE_EVENT_SCOPE_FLAG_HOST; } else { // - // Set the scope to "device" for every event. This is sufficient for global - // device access and peer device access. If needed to be seen on the host - // we are doing special handling, see EventsScope options. + // Set the scope to "device" for every event. This is sufficient for + // global device access and peer device access. If needed to be seen on + // the host we are doing special handling, see EventsScope options. // // TODO: see if "sub-device" (ZE_EVENT_SCOPE_FLAG_SUBDEVICE) can better be // used in some circumstances. @@ -5819,7 +5891,12 @@ pi_result piEventRelease(pi_event Event) { Event->CommandData = nullptr; } if (Event->OwnZeEvent) { - ZE_CALL(zeEventDestroy, (Event->ZeEvent)); + if (DisableEventsCaching) { + ZE_CALL(zeEventDestroy, (Event->ZeEvent)); + auto Context = Event->Context; + if (auto Res = Context->decrementUnreleasedEventsInPool(Event)) + return Res; + } } // It is possible that host-visible event was never created. // In case it was check if that's different from this same event @@ -5829,10 +5906,6 @@ pi_result piEventRelease(pi_event Event) { PI_CALL(piEventRelease(Event->HostVisibleEvent)); } - auto Context = Event->Context; - if (auto Res = Context->decrementUnreleasedEventsInPool(Event)) - return Res; - // We intentionally incremented the reference counter when an event is // created so that we can avoid pi_queue is released before the associated // pi_event is released. Here we have to decrement it so pi_queue @@ -5840,7 +5913,12 @@ pi_result piEventRelease(pi_event Event) { if (Event->Queue) { PI_CALL(piQueueReleaseInternal(Event->Queue)); } - delete Event; + + if (DisableEventsCaching || !Event->OwnZeEvent) { + delete Event; + } else { + Event->Context->addEventToCache(Event); + } return PI_SUCCESS; } diff --git a/sycl/plugins/level_zero/pi_level_zero.hpp b/sycl/plugins/level_zero/pi_level_zero.hpp index 9a96cb9360d3c..8cf66f7b26bc5 100644 --- a/sycl/plugins/level_zero/pi_level_zero.hpp +++ b/sycl/plugins/level_zero/pi_level_zero.hpp @@ -274,7 +274,10 @@ template struct ZeCache : private T { // thread can reach ref count equal to zero, i.e. only a single thread can pass // through this check. struct ReferenceCounter { - ReferenceCounter(pi_uint32 InitVal) : RefCount{InitVal} {} + ReferenceCounter() : RefCount{1} {} + + // Reset the counter to the initial value. + void reset() { RefCount = 1; } // Used when retaining an object. 
   void increment() { RefCount++; }
@@ -306,7 +309,7 @@ struct ReferenceCounter {
 
 // Base class to store common data
 struct _pi_object {
-  _pi_object() : RefCount{1} {}
+  _pi_object() : RefCount{} {}
 
   // Level Zero doesn't do the reference counting, so we have to do.
   // Must be atomic to prevent data race when incrementing/decrementing.
@@ -750,6 +753,12 @@ struct _pi_context : _pi_object {
   // when kernel has finished execution.
   std::unordered_map<void *, MemAllocRecord> MemAllocs;
 
+  // Get pi_event from cache.
+  pi_event getEventFromCache(bool HostVisible, bool WithProfiling);
+
+  // Add pi_event to cache.
+  void addEventToCache(pi_event);
+
 private:
   // If context contains one device then return this device.
   // If context contains sub-devices of the same device, then return this parent
@@ -798,6 +807,20 @@ struct _pi_context : _pi_object {
   // Mutex to control operations on event pool caches and the helper maps
   // holding the current pool usage counts.
   pi_mutex ZeEventPoolCacheMutex;
+
+  // Mutex to control operations on event caches.
+  pi_mutex EventCacheMutex;
+
+  // Caches for events.
+  std::vector<std::list<pi_event>> EventCaches{4};
+
+  // Get the cache of events for a provided scope and profiling mode.
+  auto getEventCache(bool HostVisible, bool WithProfiling) {
+    if (HostVisible)
+      return WithProfiling ? &EventCaches[0] : &EventCaches[1];
+    else
+      return WithProfiling ? &EventCaches[2] : &EventCaches[3];
+  }
 };
 
 struct _pi_queue : _pi_object {
@@ -1350,6 +1373,9 @@ struct _pi_event : _pi_object {
   // L0 event (if any) is not guranteed to have been signalled, or
   // being visible to the host at all.
   bool Completed = {false};
+
+  // Reset _pi_event object.
+  pi_result reset();
 };
 
 struct _pi_program : _pi_object {
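For readers following the caching scheme introduced by this patch: the context keeps four event free-lists keyed by the (host-visible, profiling-enabled) pair, hands out a cached event at creation time when one is available, and on release resets the event and returns it to the matching list instead of destroying it. The standalone sketch below is a minimal model of that flow outside the plugin; Event, EventCache, getOrCreate, and release are illustrative names only (not pi_* or Level Zero APIs), and the mutex that guards the real caches is omitted for brevity.

// Standalone model of the four-bucket event cache added above.
// All types and names here are illustrative; none are plugin or L0 types.
#include <cassert>
#include <list>
#include <vector>

struct Event {
  bool HostVisible = false;
  bool ProfilingEnabled = false;
  int RefCount = 1;

  // Mirrors the idea of _pi_event::reset(): return the object to a
  // freshly-created state so it can be handed out again.
  void reset() { RefCount = 1; }
};

class EventCache {
  // Index 0: host-visible + profiling, 1: host-visible only,
  // 2: device-scope + profiling, 3: device-scope only.
  // Same bucket layout as getEventCache() in the patch.
  std::vector<std::list<Event *>> Caches{4};

  std::list<Event *> &bucket(bool HostVisible, bool WithProfiling) {
    if (HostVisible)
      return WithProfiling ? Caches[0] : Caches[1];
    return WithProfiling ? Caches[2] : Caches[3];
  }

public:
  // Reuse a cached event when one is available, otherwise allocate a new one.
  Event *getOrCreate(bool HostVisible, bool WithProfiling) {
    auto &C = bucket(HostVisible, WithProfiling);
    if (!C.empty()) {
      Event *E = C.front();
      C.pop_front();
      return E;
    }
    return new Event{HostVisible, WithProfiling};
  }

  // On release, reset and recycle the event instead of deleting it.
  void release(Event *E) {
    E->reset();
    bucket(E->HostVisible, E->ProfilingEnabled).push_back(E);
  }

  // Analogous to _pi_context::finalize(): destroy everything the cache
  // kept alive.
  ~EventCache() {
    for (auto &C : Caches)
      for (Event *E : C)
        delete E;
  }
};

int main() {
  EventCache Cache;
  Event *A = Cache.getOrCreate(/*HostVisible=*/true, /*WithProfiling=*/false);
  Cache.release(A);
  // A second request with the same properties reuses the same object.
  Event *B = Cache.getOrCreate(true, false);
  assert(A == B);
  Cache.release(B);
  return 0;
}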