diff --git a/sycl/include/sycl/buffer.hpp b/sycl/include/sycl/buffer.hpp index 5e115c4f3643a..4810285ce642f 100644 --- a/sycl/include/sycl/buffer.hpp +++ b/sycl/include/sycl/buffer.hpp @@ -125,6 +125,8 @@ class __SYCL_EXPORT buffer_plain { size_t getSize() const; + void handleRelease() const; + std::shared_ptr impl; }; @@ -466,7 +468,7 @@ class buffer : public detail::buffer_plain, buffer &operator=(buffer &&rhs) = default; - ~buffer() = default; + ~buffer() { buffer_plain::handleRelease(); } bool operator==(const buffer &rhs) const { return impl == rhs.impl; } diff --git a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp index 2d3e78edae683..fa26c99c96024 100644 --- a/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp +++ b/sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp @@ -1381,8 +1381,40 @@ pi_result piKernelRelease(pi_kernel) { DIE_NO_IMPLEMENTATION; } pi_result piEventCreate(pi_context, pi_event *) { DIE_NO_IMPLEMENTATION; } -pi_result piEventGetInfo(pi_event, pi_event_info, size_t, void *, size_t *) { - DIE_NO_IMPLEMENTATION; +pi_result piEventGetInfo(pi_event Event, pi_event_info ParamName, + size_t ParamValueSize, void *ParamValue, + size_t *ParamValueSizeRet) { + if (ParamName != PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) { + DIE_NO_IMPLEMENTATION; + } + + auto CheckAndFillStatus = [&](const cm_support::CM_STATUS &State) { + pi_int32 Result = PI_EVENT_RUNNING; + if (State == cm_support::CM_STATUS_FINISHED) + Result = PI_EVENT_COMPLETE; + if (ParamValue) { + if (ParamValueSize < sizeof(Result)) + return PI_ERROR_INVALID_VALUE; + *static_cast(ParamValue) = Result; + } + if (ParamValueSizeRet) { + *ParamValueSizeRet = sizeof(Result); + } + return PI_SUCCESS; + }; + // Dummy event is already completed ones done by CM. + if (Event->IsDummyEvent) + return CheckAndFillStatus(cm_support::CM_STATUS_FINISHED); + + if (Event->CmEventPtr == nullptr) + return PI_ERROR_INVALID_EVENT; + + cm_support::CM_STATUS Status; + int32_t Result = Event->CmEventPtr->GetStatus(Status); + if (Result != cm_support::CM_SUCCESS) + return PI_ERROR_COMMAND_EXECUTION_FAILURE; + + return CheckAndFillStatus(Status); } pi_result piEventGetProfilingInfo(pi_event Event, pi_profiling_info ParamName, diff --git a/sycl/source/buffer.cpp b/sycl/source/buffer.cpp index 823e1a19b338f..6e5e682b9429a 100644 --- a/sycl/source/buffer.cpp +++ b/sycl/source/buffer.cpp @@ -121,6 +121,13 @@ void buffer_plain::addOrReplaceAccessorProperties( size_t buffer_plain::getSize() const { return impl->getSizeInBytes(); } +void buffer_plain::handleRelease() const { + // Try to detach memory object only if impl is going to be released. + // Buffer copy will have pointer to the same impl. 
+ if (impl.use_count() == 1) + impl->detachMemoryObject(impl); +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 9ea1eac1867ca..48c235fc9b096 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -80,17 +80,19 @@ void event_impl::waitInternal() { void event_impl::setComplete() { if (MHostEvent || !MEvent) { - std::unique_lock lock(MMutex); + { + std::unique_lock lock(MMutex); #ifndef NDEBUG - int Expected = HES_NotComplete; - int Desired = HES_Complete; + int Expected = HES_NotComplete; + int Desired = HES_Complete; - bool Succeeded = MState.compare_exchange_strong(Expected, Desired); + bool Succeeded = MState.compare_exchange_strong(Expected, Desired); - assert(Succeeded && "Unexpected state of event"); + assert(Succeeded && "Unexpected state of event"); #else - MState.store(static_cast(HES_Complete)); + MState.store(static_cast(HES_Complete)); #endif + } cv.notify_all(); return; } @@ -144,8 +146,8 @@ event_impl::event_impl(RT::PiEvent Event, const context &SyclContext) } event_impl::event_impl(const QueueImplPtr &Queue) - : MQueue{Queue}, MIsProfilingEnabled{Queue->is_host() || - Queue->MIsProfilingEnabled} { + : MQueue{Queue}, + MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled} { this->setContextImpl(Queue->getContextImplPtr()); if (Queue->is_host()) { @@ -429,6 +431,11 @@ void event_impl::cleanDepEventsThroughOneLevel() { } } +bool event_impl::isCompleted() { + return get_info() == + info::event_command_status::complete; +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index e689270c5abe8..fc49a0ffc639c 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -243,6 +243,8 @@ class event_impl { /// state. bool isInitialized() const noexcept { return MIsInitialized; } + bool isCompleted(); + void attachEventToComplete(const EventImplPtr &Event) { std::lock_guard Lock(MMutex); MPostCompleteEvents.push_back(Event); diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp index 874ded5862906..c2b60ef1de043 100644 --- a/sycl/source/detail/global_handler.cpp +++ b/sycl/source/detail/global_handler.cpp @@ -27,6 +27,36 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { + +// Utility class to track references on an object. +// Currently used for the Scheduler and created as a thread_local object. +// The original idea is to track usage of the Scheduler from the main and other +// user threads - they increment MCounter - while our thread_pool threads use the +// Scheduler without adding an extra reference; the MModifyCounter member controls this. +template class ObjectUsageCounter { +public: + ObjectUsageCounter(std::unique_ptr &Obj, bool ModifyCounter) + : MModifyCounter(ModifyCounter), MObj(Obj) { + if (MModifyCounter) + MCounter++; + } + ~ObjectUsageCounter() { + if (!MModifyCounter) + return; + + MCounter--; + if (!MCounter && MObj) + MObj->releaseResources(); + } + +private: + static std::atomic_uint MCounter; + bool MModifyCounter; + std::unique_ptr &MObj; +}; +template +std::atomic_uint ObjectUsageCounter::MCounter{0}; + using LockGuard = std::lock_guard; GlobalHandler::GlobalHandler() = default; @@ -47,7 +77,24 @@ T &GlobalHandler::getOrCreate(InstWithLock &IWL, Types...
Args) { return *IWL.Inst; } -Scheduler &GlobalHandler::getScheduler() { return getOrCreate(MScheduler); } +void GlobalHandler::attachScheduler(Scheduler *Scheduler) { + // The method is used in unit tests only. Do not protect with lock since + // releaseResources will cause dead lock due to host queue release + if (MScheduler.Inst) + MScheduler.Inst->releaseResources(); + MScheduler.Inst.reset(Scheduler); +} + +Scheduler &GlobalHandler::getScheduler() { + getOrCreate(MScheduler); + registerSchedulerUsage(); + return *MScheduler.Inst; +} + +void GlobalHandler::registerSchedulerUsage(bool ModifyCounter) { + thread_local ObjectUsageCounter SchedulerCounter(MScheduler.Inst, + ModifyCounter); +} ProgramManager &GlobalHandler::getProgramManager() { return getOrCreate(MProgramManager); @@ -141,9 +188,18 @@ void GlobalHandler::unloadPlugins() { GlobalHandler::instance().getPlugins().clear(); } +void GlobalHandler::drainThreadPool() { + if (MHostTaskThreadPool.Inst) + MHostTaskThreadPool.Inst->drain(); +} + void shutdown() { // Ensure neither host task is working so that no default context is accessed // upon its release + + if (GlobalHandler::instance().MScheduler.Inst) + GlobalHandler::instance().MScheduler.Inst->releaseResources(); + if (GlobalHandler::instance().MHostTaskThreadPool.Inst) GlobalHandler::instance().MHostTaskThreadPool.Inst->finishAndWait(); diff --git a/sycl/source/detail/global_handler.hpp b/sycl/source/detail/global_handler.hpp index 50e0a0f93ef3d..cecd3ab09792a 100644 --- a/sycl/source/detail/global_handler.hpp +++ b/sycl/source/detail/global_handler.hpp @@ -54,6 +54,7 @@ class GlobalHandler { GlobalHandler(const GlobalHandler &) = delete; GlobalHandler(GlobalHandler &&) = delete; + void registerSchedulerUsage(bool ModifyCounter = true); Scheduler &getScheduler(); ProgramManager &getProgramManager(); Sync &getSync(); @@ -74,6 +75,10 @@ class GlobalHandler { static void registerDefaultContextReleaseHandler(); void unloadPlugins(); + void drainThreadPool(); + + // For testing purposes only + void attachScheduler(Scheduler *Scheduler); private: friend void releaseDefaultContexts(); diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 336835d6198be..a8ba574c9e731 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -26,6 +26,18 @@ namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { +bool Scheduler::checkLeavesCompletion(MemObjRecord *Record) { + for (Command *Cmd : Record->MReadLeaves) { + if (!Cmd->getEvent()->isCompleted()) + return false; + } + for (Command *Cmd : Record->MWriteLeaves) { + if (!Cmd->getEvent()->isCompleted()) + return false; + } + return true; +} + void Scheduler::waitForRecordToFinish(MemObjRecord *Record, ReadLockT &GraphReadLock) { #ifdef XPTI_ENABLE_INSTRUMENTATION @@ -256,7 +268,8 @@ void Scheduler::cleanupFinishedCommands(const EventImplPtr &FinishedEvent) { deallocateStreams(StreamsToDeallocate); } -void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) { +bool Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj, + bool StrictLock) { // We are going to traverse a graph of finished commands. 
Gather stream // objects from these commands if any and deallocate buffers for these stream // objects, this is needed to guarantee that streamed data is printed and @@ -269,23 +282,25 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) { std::vector> AuxResourcesToDeallocate; { - MemObjRecord *Record = nullptr; + MemObjRecord *Record = MGraphBuilder.getMemObjRecord(MemObj); + if (!Record) + // No operations were performed on the mem object + return true; { // This only needs a shared mutex as it only involves enqueueing and // awaiting for events - ReadLockT Lock = acquireReadLock(); - - Record = MGraphBuilder.getMemObjRecord(MemObj); - if (!Record) - // No operations were performed on the mem object - return; - + ReadLockT Lock = StrictLock ? ReadLockT(MGraphLock) + : ReadLockT(MGraphLock, std::try_to_lock); + if (!Lock.owns_lock()) + return false; waitForRecordToFinish(Record, Lock); } - { - WriteLockT Lock = acquireWriteLock(); + WriteLockT Lock = StrictLock ? acquireWriteLock() + : WriteLockT(MGraphLock, std::try_to_lock); + if (!Lock.owns_lock()) + return false; MGraphBuilder.decrementLeafCountersForRecord(Record); MGraphBuilder.cleanupCommandsForRecord(Record, StreamsToDeallocate, AuxResourcesToDeallocate); @@ -293,6 +308,7 @@ void Scheduler::removeMemoryObject(detail::SYCLMemObjI *MemObj) { } } deallocateStreams(StreamsToDeallocate); + return true; } EventImplPtr Scheduler::addHostAccessor(Requirement *Req) { @@ -406,10 +422,29 @@ Scheduler::~Scheduler() { "not all resources were released. Please be sure that all kernels " "have synchronization points.\n\n"); } - // There might be some commands scheduled for post enqueue cleanup that - // haven't been freed because of the graph mutex being locked at the time, - // clean them up now. + DefaultHostQueue.reset(); +} + +void Scheduler::releaseResources() { + if (DefaultHostQueue) { + DefaultHostQueue->wait(); + } + GlobalHandler::instance().drainThreadPool(); + + // There might be some commands scheduled for post enqueue cleanup that + // haven't been freed because of the graph mutex being locked at the time, + // clean them up now. cleanupCommands({}); + + // We need a loop here since new objects may need to be added to the deferred + // mem objects storage during cleanup. A known example: we clean up the existing + // deferred mem objects under the write lock; in the process we clean up commands + // related to the record, a command may hold the last reference to a queue_impl, + // so ~queue_impl is called and the buffer used for assert support (created with + // a size only, so all conditions for deferred release are satisfied) is added to + // the deferred mem obj storage. Without the loop we would end up with a leak. 
+ while (!isDeferredMemObjectsEmpty()) + cleanupDeferredMemObjects(BlockingT::BLOCKING); } MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) { @@ -417,8 +452,9 @@ MemObjRecord *Scheduler::getMemObjRecord(const Requirement *const Req) { } void Scheduler::cleanupCommands(const std::vector &Cmds) { - if (Cmds.empty()) - { + cleanupDeferredMemObjects(BlockingT::NON_BLOCKING); + + if (Cmds.empty()) { std::lock_guard Lock{MDeferredCleanupMutex}; if (MDeferredCleanupCommands.empty()) return; @@ -472,6 +508,67 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd, Command *BlockingCmd) { cleanupCommands(ToCleanUp); } +void Scheduler::deferMemObjRelease(const std::shared_ptr &MemObj) { + { + std::lock_guard Lock{MDeferredMemReleaseMutex}; + MDeferredMemObjRelease.push_back(MemObj); + } + cleanupDeferredMemObjects(BlockingT::NON_BLOCKING); +} + +inline bool Scheduler::isDeferredMemObjectsEmpty() { + std::lock_guard Lock{MDeferredMemReleaseMutex}; + return MDeferredMemObjRelease.empty(); +} + +void Scheduler::cleanupDeferredMemObjects(BlockingT Blocking) { + if (isDeferredMemObjectsEmpty()) + return; + if (Blocking == BlockingT::BLOCKING) { + std::vector> TempStorage; + { + std::lock_guard LockDef{MDeferredMemReleaseMutex}; + MDeferredMemObjRelease.swap(TempStorage); + } + // if any objects in TempStorage exist - it is leaving scope and being + // deleted + } + + std::vector> ObjsReadyToRelease; + { + // Lock is needed for checkLeavesCompletion - if walks through Record leaves + ReadLockT Lock = ReadLockT(MGraphLock, std::try_to_lock); + if (Lock.owns_lock()) { + // Not expected that Blocking == true will be used in parallel with + // adding MemObj to storage, no such scenario. + std::lock_guard LockDef{MDeferredMemReleaseMutex}; + auto MemObjIt = MDeferredMemObjRelease.begin(); + while (MemObjIt != MDeferredMemObjRelease.end()) { + MemObjRecord *Record = MGraphBuilder.getMemObjRecord((*MemObjIt).get()); + if (!checkLeavesCompletion(Record)) { + MemObjIt++; + continue; + } + ObjsReadyToRelease.push_back(*MemObjIt); + MemObjIt = MDeferredMemObjRelease.erase(MemObjIt); + } + } + } + auto ReleaseCandidateIt = ObjsReadyToRelease.begin(); + while (ReleaseCandidateIt != ObjsReadyToRelease.end()) { + if (!removeMemoryObject(ReleaseCandidateIt->get(), false)) + break; + ReleaseCandidateIt = ObjsReadyToRelease.erase(ReleaseCandidateIt); + } + if (!ObjsReadyToRelease.empty()) { + std::lock_guard LockDef{MDeferredMemReleaseMutex}; + MDeferredMemObjRelease.insert( + MDeferredMemObjRelease.end(), + std::make_move_iterator(ObjsReadyToRelease.begin()), + std::make_move_iterator(ObjsReadyToRelease.end())); + } +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index b6c4fb94be809..7f6e08998b7fd 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -174,7 +174,6 @@ class MockScheduler; namespace sycl { __SYCL_INLINE_VER_NAMESPACE(_V1) { namespace detail { - class queue_impl; class event_impl; class context_impl; @@ -397,7 +396,14 @@ class Scheduler { /// This member function is used by \ref buffer and \ref image. /// /// \param MemObj is a memory object that points to the buffer being removed. - void removeMemoryObject(detail::SYCLMemObjI *MemObj); + /// \param StrictLock WA, is a flag used to identify if strict read and write + /// lock are allowed or not. 
Default value is always applied in buffer_impl + /// destructor. StrictLock == false is introduced for + /// cleanupDeferredMemObjects to avoid blocking mem object release that may + /// lead to dead lock. \return WA, true if all release action completed and we + /// could delete memory object, false otherwise, most possible reason to + /// receive false - fail to obtain write lock. + bool removeMemoryObject(detail::SYCLMemObjI *MemObj, bool StrictLock = true); /// Removes finished non-leaf non-alloca commands from the subgraph (assuming /// that all its commands have been waited for). @@ -445,8 +451,12 @@ class Scheduler { static MemObjRecord *getMemObjRecord(const Requirement *const Req); + void deferMemObjRelease(const std::shared_ptr &MemObj); + Scheduler(); ~Scheduler(); + void releaseResources(); + bool isDeferredMemObjectsEmpty(); protected: using RWLockT = std::shared_timed_mutex; @@ -483,6 +493,9 @@ class Scheduler { static void enqueueLeavesOfReqUnlocked(const Requirement *const Req, std::vector &ToCleanUp); + // May lock graph with read and write modes during execution. + void cleanupDeferredMemObjects(BlockingT Blocking); + /// Graph builder class. /// /// The graph builder provides means to change an existing graph (e.g. add @@ -785,6 +798,7 @@ class Scheduler { /// GraphReadLock will be unlocked/locked as needed. Upon return from the /// function, GraphReadLock will be left in locked state. void waitForRecordToFinish(MemObjRecord *Record, ReadLockT &GraphReadLock); + bool checkLeavesCompletion(MemObjRecord *Record); GraphBuilder MGraphBuilder; RWLockT MGraphLock; @@ -792,6 +806,9 @@ class Scheduler { std::vector MDeferredCleanupCommands; std::mutex MDeferredCleanupMutex; + std::vector> MDeferredMemObjRelease; + std::mutex MDeferredMemReleaseMutex; + QueueImplPtr DefaultHostQueue; friend class Command; diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 3636e5d5e0ac6..2acd31259a40b 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -31,7 +31,7 @@ SYCLMemObjT::SYCLMemObjT(pi_native_handle MemObject, const context &SyclContext, MInteropContext(detail::getSyclObjImpl(SyclContext)), MOpenCLInterop(true), MHostPtrReadOnly(false), MNeedWriteBack(true), MUserPtr(nullptr), MShadowCopy(nullptr), MUploadDataFunctor(nullptr), - MSharedPtrStorage(nullptr) { + MSharedPtrStorage(nullptr), MHostPtrProvided(true) { if (MInteropContext->is_host()) throw sycl::invalid_parameter_error( "Creation of interoperability memory object using host context is " @@ -90,8 +90,13 @@ void SYCLMemObjT::updateHostMemory() { // If we're attached to a memory record, process the deletion of the memory // record. We may get detached before we do this. - if (MRecord) - Scheduler::getInstance().removeMemoryObject(this); + if (MRecord) { + bool Result = Scheduler::getInstance().removeMemoryObject(this); + std::ignore = Result; // for no assert build + assert( + Result && + "removeMemoryObject should not return false in mem object destructor"); + } releaseHostMem(MShadowCopy); if (MOpenCLInterop) { @@ -147,6 +152,18 @@ void SYCLMemObjT::determineHostPtr(const ContextImplPtr &Context, } else HostPtrReadOnly = false; } + +void SYCLMemObjT::detachMemoryObject( + const std::shared_ptr &Self) const { + // Check MRecord without read lock because at this point we expect that no + // commands that operate on the buffer can be created. 
MRecord is nullptr on + // buffer creation and is set to a meaningful + // value only if an operation on the buffer was submitted inside an addCG call. addCG is + // called from queue::submit, so buffer destruction cannot overlap with it. + if (MRecord && !MHostPtrProvided) + Scheduler::getInstance().deferMemObjRelease(Self); +} + } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index 54098bbc019df..8ee0bc10bc9e3 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -57,7 +57,8 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { MInteropContext(nullptr), MInteropMemObject(nullptr), MOpenCLInterop(false), MHostPtrReadOnly(false), MNeedWriteBack(true), MSizeInBytes(SizeInBytes), MUserPtr(nullptr), MShadowCopy(nullptr), - MUploadDataFunctor(nullptr), MSharedPtrStorage(nullptr) {} + MUploadDataFunctor(nullptr), MSharedPtrStorage(nullptr), + MHostPtrProvided(false) {} SYCLMemObjT(const property_list &Props, std::unique_ptr Allocator) @@ -134,6 +135,7 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { updateHostMemory(FinalData); } }; + MHostPtrProvided = true; } void set_final_data( @@ -144,6 +146,7 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { MUploadDataFunctor = [FinalDataFunc, UpdateFunc]() { FinalDataFunc(UpdateFunc); }; + MHostPtrProvided = true; } protected: @@ -169,6 +172,7 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { } void handleHostData(void *HostPtr, const size_t RequiredAlign) { + MHostPtrProvided = true; if (!MHostPtrReadOnly && HostPtr) { set_final_data([HostPtr](const std::function &F) { F(HostPtr); @@ -192,6 +196,7 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { void handleHostData(const std::shared_ptr &HostPtr, const size_t RequiredAlign, bool IsConstPtr) { + MHostPtrProvided = true; MSharedPtrStorage = HostPtr; MHostPtrReadOnly = IsConstPtr; if (HostPtr) { @@ -252,6 +257,8 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { bool isHostPointerReadOnly() const { return MHostPtrReadOnly; } + void detachMemoryObject(const std::shared_ptr &Self) const; + protected: // An allocateMem helper that determines which host ptr to use void determineHostPtr(const ContextImplPtr &Context, bool InitFromUserData, @@ -287,8 +294,12 @@ class __SYCL_EXPORT SYCLMemObjT : public SYCLMemObjI { // Field which holds user's shared_ptr in case of memory object is created // using constructor with shared_ptr. std::shared_ptr MSharedPtrStorage; + // Field to identify whether the dtor does not need to be blocking. + // Checking MUploadDataFunctor alone is not enough to determine this: when + // we have a read-only HostPtr, MUploadDataFunctor is empty but delayed release + // still must not be allowed. 
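To make the effect of the MHostPtrProvided flag concrete, the sketch below contrasts the two buffer flavours the patch distinguishes from the user's point of view. It is an illustration only, assuming a working SYCL device and DPC++'s default unnamed-lambda mode; the sizes, values, and variable names are arbitrary.

#include <sycl/sycl.hpp>
#include <vector>

int main() {
  sycl::queue Q;

  {
    // Size-only buffer: no host pointer and no final-data target, so the
    // destructor may defer the release instead of blocking on the kernel.
    sycl::buffer<int, 1> DeviceOnlyBuf{sycl::range<1>{1024}};
    Q.submit([&](sycl::handler &CGH) {
      sycl::accessor Acc{DeviceOnlyBuf, CGH, sycl::write_only};
      CGH.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> I) { Acc[I] = 42; });
    });
  } // May return before the kernel finishes; release is deferred.

  std::vector<int> HostData(1024, 0);
  {
    // Buffer created from a host pointer: MHostPtrProvided is set, so the
    // destructor still blocks until the data is written back to HostData.
    sycl::buffer<int, 1> WriteBackBuf{HostData.data(), sycl::range<1>{1024}};
    Q.submit([&](sycl::handler &CGH) {
      sycl::accessor Acc{WriteBackBuf, CGH, sycl::write_only};
      CGH.parallel_for(sycl::range<1>{1024}, [=](sycl::id<1> I) { Acc[I] = 7; });
    });
  } // Blocking destructor: HostData is guaranteed to hold the results here.

  Q.wait();
  return 0;
}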
+ bool MHostPtrProvided; }; - } // namespace detail } // __SYCL_INLINE_VER_NAMESPACE(_V1) } // namespace sycl diff --git a/sycl/source/detail/thread_pool.hpp b/sycl/source/detail/thread_pool.hpp index 2ffbe8a0bd52a..798573a40eca9 100644 --- a/sycl/source/detail/thread_pool.hpp +++ b/sycl/source/detail/thread_pool.hpp @@ -30,10 +30,11 @@ class ThreadPool { std::mutex MJobQueueMutex; std::condition_variable MDoSmthOrStop; std::atomic_bool MStop; + std::atomic_uint MJobsInPool; void worker() { + GlobalHandler::instance().registerSchedulerUsage(/*ModifyCounter*/ false); std::unique_lock Lock(MJobQueueMutex); - while (true) { MDoSmthOrStop.wait( Lock, [this]() { return !MJobQueue.empty() || MStop.load(); }); @@ -48,6 +49,8 @@ class ThreadPool { Job(); Lock.lock(); + + MJobsInPool--; } } @@ -55,12 +58,18 @@ class ThreadPool { MLaunchedThreads.reserve(MThreadCount); MStop.store(false); + MJobsInPool.store(0); for (size_t Idx = 0; Idx < MThreadCount; ++Idx) MLaunchedThreads.emplace_back([this] { worker(); }); } public: + void drain() { + while (MJobsInPool != 0) + std::this_thread::yield(); + } + ThreadPool(unsigned int ThreadCount = 1) : MThreadCount(ThreadCount) { start(); } @@ -82,7 +91,7 @@ class ThreadPool { std::lock_guard Lock(MJobQueueMutex); MJobQueue.emplace([F = std::move(Func)]() { F(); }); } - + MJobsInPool++; MDoSmthOrStop.notify_one(); } @@ -91,7 +100,7 @@ class ThreadPool { std::lock_guard Lock(MJobQueueMutex); MJobQueue.emplace(Func); } - + MJobsInPool++; MDoSmthOrStop.notify_one(); } }; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 133d4d4ac3a6e..ea2bf0a3c5e38 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -4052,6 +4052,7 @@ _ZNK4sycl3_V15queue9getNativeEv _ZNK4sycl3_V16ONEAPI15filter_selector13select_deviceEv _ZNK4sycl3_V16ONEAPI15filter_selector5resetEv _ZNK4sycl3_V16ONEAPI15filter_selectorclERKNS0_6deviceE +_ZNK4sycl3_V16detail11SYCLMemObjT18detachMemoryObjectERKSt10shared_ptrIS2_E _ZNK4sycl3_V16detail11SYCLMemObjT9getPluginEv _ZNK4sycl3_V16detail11SYCLMemObjT9isInteropEv _ZNK4sycl3_V16detail11buffer_impl15getNativeVectorENS0_7backendE @@ -4132,6 +4133,7 @@ _ZNK4sycl3_V16detail12buffer_plain12has_propertyINS0_8property6noinitEEEbv _ZNK4sycl3_V16detail12buffer_plain12has_propertyINS0_8property7context4cuda19use_primary_contextEEEbv _ZNK4sycl3_V16detail12buffer_plain12has_propertyINS0_8property7no_initEEEbv _ZNK4sycl3_V16detail12buffer_plain12has_propertyINS0_8property9reduction22initialize_to_identityEEEbv +_ZNK4sycl3_V16detail12buffer_plain13handleReleaseEv _ZNK4sycl3_V16detail12buffer_plain15getNativeVectorENS0_7backendE _ZNK4sycl3_V16detail12buffer_plain22get_allocator_internalEv _ZNK4sycl3_V16detail12buffer_plain7getSizeEv diff --git a/sycl/unittests/buffer/BufferDestructionCheck.cpp b/sycl/unittests/buffer/BufferDestructionCheck.cpp new file mode 100644 index 0000000000000..0abe2918aea36 --- /dev/null +++ b/sycl/unittests/buffer/BufferDestructionCheck.cpp @@ -0,0 +1,331 @@ +//==- BufferDestructionCheck.cpp --- check delayed destruction of buffer --==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include +#include + +#include + +#include +#include +#include +#include + +#include "../scheduler/SchedulerTestUtils.hpp" + +class MockCmdWithReleaseTracking : public MockCommand { +public: + MockCmdWithReleaseTracking( + sycl::detail::QueueImplPtr Queue, sycl::detail::Requirement Req, + sycl::detail::Command::CommandType Type = sycl::detail::Command::RUN_CG) + : MockCommand(Queue, Req, Type){}; + MockCmdWithReleaseTracking( + sycl::detail::QueueImplPtr Queue, + sycl::detail::Command::CommandType Type = sycl::detail::Command::RUN_CG) + : MockCommand(Queue, Type){}; + ~MockCmdWithReleaseTracking() { Release(); } + MOCK_METHOD0(Release, void()); +}; + +class BufferDestructionCheck : public ::testing::Test { +public: + BufferDestructionCheck() : Mock{}, Plt{Mock.getPlatform()} {} + +protected: + void SetUp() override { + if (Plt.is_host()) { + std::cout << "Not run due to host-only environment\n"; + GTEST_SKIP(); + } + MockSchedulerPtr = new MockScheduler(); + sycl::detail::GlobalHandler::instance().attachScheduler( + dynamic_cast(MockSchedulerPtr)); + } + void TearDown() override { + sycl::detail::GlobalHandler::instance().attachScheduler(NULL); + } + + template + MockCmdWithReleaseTracking *addCommandToBuffer(Buffer &Buf, sycl::queue &Q) { + sycl::detail::Requirement MockReq = getMockRequirement(Buf); + std::vector AuxCmds; + sycl::detail::MemObjRecord *Rec = MockSchedulerPtr->getOrInsertMemObjRecord( + sycl::detail::getSyclObjImpl(Q), &MockReq, AuxCmds); + MockCmdWithReleaseTracking *MockCmd = new MockCmdWithReleaseTracking( + sycl::detail::getSyclObjImpl(Q), MockReq); + std::vector ToEnqueue; + MockSchedulerPtr->addNodeToLeaves(Rec, MockCmd, sycl::access::mode::write, + ToEnqueue); + // we do not want to enqueue commands, just keep not enqueued and not + // completed, otherwise check is not possible + return MockCmd; + } + +protected: + sycl::unittest::PiMock Mock; + sycl::platform Plt; + MockScheduler *MockSchedulerPtr; +}; + +TEST_F(BufferDestructionCheck, BufferWithSizeOnlyDefault) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + sycl::detail::buffer_impl *RawBufferImplPtr = NULL; + { + sycl::buffer Buf(1); + std::shared_ptr BufImpl = + sycl::detail::getSyclObjImpl(Buf); + RawBufferImplPtr = BufImpl.get(); + MockCmd = addCommandToBuffer(Buf, Q); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 1u); + EXPECT_EQ(MockSchedulerPtr->MDeferredMemObjRelease[0].get(), + RawBufferImplPtr); + EXPECT_CALL(*MockCmd, Release).Times(1); +} + +TEST_F(BufferDestructionCheck, BufferWithSizeOnlyDefaultSetFinalData) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + { + int FinalData = 0; + sycl::buffer Buf(1); + Buf.set_final_data(&FinalData); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 0u); +} + +TEST_F(BufferDestructionCheck, BufferWithSizeOnlyNonDefaultAllocator) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + sycl::detail::buffer_impl *RawBufferImplPtr = NULL; + { + using AllocatorTypeTest = + sycl::usm_allocator; + 
AllocatorTypeTest allocator(Q); + sycl::buffer Buf(1, allocator); + std::shared_ptr BufImpl = + sycl::detail::getSyclObjImpl(Buf); + RawBufferImplPtr = BufImpl.get(); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 1u); + EXPECT_EQ(MockSchedulerPtr->MDeferredMemObjRelease[0].get(), + RawBufferImplPtr); +} + +TEST_F(BufferDestructionCheck, BufferWithSizeOnlyDefaultAllocator) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + sycl::detail::buffer_impl *RawBufferImplPtr = NULL; + { + using AllocatorTypeTest = sycl::buffer_allocator; + AllocatorTypeTest allocator; + sycl::buffer Buf(1, allocator); + std::shared_ptr BufImpl = + sycl::detail::getSyclObjImpl(Buf); + RawBufferImplPtr = BufImpl.get(); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 1u); + EXPECT_EQ(MockSchedulerPtr->MDeferredMemObjRelease[0].get(), + RawBufferImplPtr); +} + +TEST_F(BufferDestructionCheck, BufferWithRawHostPtr) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + { + int InitialVal = 8; + sycl::buffer Buf(&InitialVal, 1); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 0u); +} + +TEST_F(BufferDestructionCheck, BufferWithRawHostPtrWithNonDefaultAllocator) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + { + int InitialVal = 8; + using AllocatorTypeTest = + sycl::usm_allocator; + AllocatorTypeTest allocator(Q); + sycl::buffer Buf(&InitialVal, 1, allocator); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 0u); +} + +TEST_F(BufferDestructionCheck, BufferWithConstRawHostPtr) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + { + const int InitialVal = 8; + sycl::buffer Buf(&InitialVal, 1); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 0u); +} + +TEST_F(BufferDestructionCheck, BufferWithContainer) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + { + std::vector data{3, 4}; + sycl::buffer Buf(data); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 0u); +} + +TEST_F(BufferDestructionCheck, BufferWithSharedPtr) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + { + std::shared_ptr InitialVal(new int(5)); + sycl::buffer Buf(InitialVal, 1); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 0u); +} + +TEST_F(BufferDestructionCheck, BufferWithSharedPtrArray) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + 
MockCmdWithReleaseTracking *MockCmd = NULL; + { + std::shared_ptr InitialVal(new int[2]); + sycl::buffer Buf(InitialVal, 1); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 0u); +} + +TEST_F(BufferDestructionCheck, BufferWithIterators) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + MockCmdWithReleaseTracking *MockCmd = NULL; + sycl::detail::buffer_impl *RawBufferImplPtr = NULL; + { + std::vector data{3, 4}; + sycl::buffer Buf(data.begin(), data.end()); + std::shared_ptr BufImpl = + sycl::detail::getSyclObjImpl(Buf); + RawBufferImplPtr = BufImpl.get(); + MockCmd = addCommandToBuffer(Buf, Q); + EXPECT_CALL(*MockCmd, Release).Times(1); + } + ASSERT_EQ(MockSchedulerPtr->MDeferredMemObjRelease.size(), 1u); + EXPECT_EQ(MockSchedulerPtr->MDeferredMemObjRelease[0].get(), + RawBufferImplPtr); +} + +std::map ExpectedEventStatus; +pi_result getEventInfoFunc(pi_event Event, pi_event_info PName, size_t PVSize, + void *PV, size_t *PVSizeRet) { + EXPECT_EQ(PName, PI_EVENT_INFO_COMMAND_EXECUTION_STATUS) + << "Unknown param name"; + // could not use assert here + EXPECT_EQ(PVSize, 4u); + auto it = ExpectedEventStatus.find(Event); + if (it != ExpectedEventStatus.end()) { + *(static_cast(PV)) = it->second; + return PI_SUCCESS; + } else + return PI_ERROR_INVALID_OPERATION; +} + +TEST_F(BufferDestructionCheck, ReadyToReleaseLogic) { + sycl::context Context{Plt}; + sycl::queue Q = sycl::queue{Context, sycl::default_selector{}}; + + sycl::buffer Buf(1); + sycl::detail::Requirement MockReq = getMockRequirement(Buf); + std::vector AuxCmds; + sycl::detail::MemObjRecord *Rec = MockSchedulerPtr->getOrInsertMemObjRecord( + sycl::detail::getSyclObjImpl(Q), &MockReq, AuxCmds); + + std::shared_ptr CtxImpl = + sycl::detail::getSyclObjImpl(Context); + MockCmdWithReleaseTracking *ReadCmd = nullptr; + MockCmdWithReleaseTracking *WriteCmd = nullptr; + ReadCmd = + new MockCmdWithReleaseTracking(sycl::detail::getSyclObjImpl(Q), MockReq); + ReadCmd->getEvent()->getHandleRef() = + createDummyHandle(); // just assign to be able to use mock + WriteCmd = + new MockCmdWithReleaseTracking(sycl::detail::getSyclObjImpl(Q), MockReq); + WriteCmd->getEvent()->getHandleRef() = + createDummyHandle(); // just assign to be able to use mock + ReadCmd->MEnqueueStatus = sycl::detail::EnqueueResultT::SyclEnqueueSuccess; + WriteCmd->MEnqueueStatus = sycl::detail::EnqueueResultT::SyclEnqueueSuccess; + + std::vector ToCleanUp; + std::vector ToEnqueue; + MockSchedulerPtr->addNodeToLeaves(Rec, ReadCmd, sycl::access::mode::read, + ToEnqueue); + MockSchedulerPtr->addNodeToLeaves(Rec, WriteCmd, sycl::access::mode::write, + ToEnqueue); + + Mock.redefine(getEventInfoFunc); + testing::InSequence S; + + ExpectedEventStatus[ReadCmd->getEvent()->getHandleRef()] = PI_EVENT_SUBMITTED; + ExpectedEventStatus[WriteCmd->getEvent()->getHandleRef()] = + PI_EVENT_SUBMITTED; + + EXPECT_FALSE(MockSchedulerPtr->checkLeavesCompletion(Rec)); + + ExpectedEventStatus[ReadCmd->getEvent()->getHandleRef()] = PI_EVENT_COMPLETE; + ExpectedEventStatus[WriteCmd->getEvent()->getHandleRef()] = + PI_EVENT_SUBMITTED; + + EXPECT_FALSE(MockSchedulerPtr->checkLeavesCompletion(Rec)); + + ExpectedEventStatus[ReadCmd->getEvent()->getHandleRef()] = PI_EVENT_COMPLETE; + ExpectedEventStatus[WriteCmd->getEvent()->getHandleRef()] = PI_EVENT_COMPLETE; + EXPECT_TRUE(MockSchedulerPtr->checkLeavesCompletion(Rec)); + // previous expect_call is still 
valid and will generate failure if we receive a + call here, no need for extra limitation + EXPECT_CALL(*ReadCmd, Release).Times(1); + EXPECT_CALL(*WriteCmd, Release).Times(1); +} \ No newline at end of file diff --git a/sycl/unittests/buffer/CMakeLists.txt b/sycl/unittests/buffer/CMakeLists.txt index f5dabae23f6df..137efe06cc555 100644 --- a/sycl/unittests/buffer/CMakeLists.txt +++ b/sycl/unittests/buffer/CMakeLists.txt @@ -1,4 +1,5 @@ add_sycl_unittest(BufferTests OBJECT BufferLocation.cpp Image.cpp + BufferDestructionCheck.cpp ) diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp index 94e35bcdebd6a..4fc077bb1dad2 100644 --- a/sycl/unittests/scheduler/LeafLimit.cpp +++ b/sycl/unittests/scheduler/LeafLimit.cpp @@ -9,6 +9,7 @@ #include "SchedulerTest.hpp" #include "SchedulerTestUtils.hpp" +#include #include #include #include @@ -87,4 +88,8 @@ TEST_F(SchedulerTest, LeafLimit) { EXPECT_TRUE(std::any_of( NewestLeaf->MDeps.begin(), NewestLeaf->MDeps.end(), [&](const detail::DepDesc &DD) { return DD.MDepCommand == OldestLeaf; })); + MS.cleanupCommandsForRecord(Rec); + auto MemObj = static_cast( detail::getSyclObjImpl(Buf).get()); + MS.removeRecordForMemObj(MemObj); } diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 02226e30fa938..b588602af26d7 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -103,7 +103,9 @@ class MockScheduler : public sycl::detail::Scheduler { public: using sycl::detail::Scheduler::addCG; using sycl::detail::Scheduler::addCopyBack; + using sycl::detail::Scheduler::checkLeavesCompletion; using sycl::detail::Scheduler::cleanupCommands; + using sycl::detail::Scheduler::MDeferredMemObjRelease; sycl::detail::MemObjRecord * getOrInsertMemObjRecord(const sycl::detail::QueueImplPtr &Queue,
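The ObjectUsageCounter added to global_handler.cpp earlier in this patch combines a thread_local guard per thread with a shared atomic counter, so that the last user thread to exit releases the Scheduler's resources while thread-pool workers never keep it alive. The standalone sketch below illustrates the same pattern outside the SYCL runtime; the Resource type, function names, and the printout are made up for the example.

#include <atomic>
#include <iostream>
#include <memory>
#include <thread>

struct Resource {
  void releaseResources() { std::cout << "resources released\n"; }
};

std::unique_ptr<Resource> GlobalResource = std::make_unique<Resource>();

// One guard object is created per thread; only "counting" threads bump the
// shared counter, and the last of them to exit triggers the release.
class UsageCounter {
public:
  UsageCounter(std::unique_ptr<Resource> &Obj, bool ModifyCounter)
      : MModifyCounter(ModifyCounter), MObj(Obj) {
    if (MModifyCounter)
      ++MCounter;
  }
  ~UsageCounter() {
    if (!MModifyCounter)
      return;
    if (--MCounter == 0 && MObj)
      MObj->releaseResources();
  }

private:
  static std::atomic_uint MCounter;
  bool MModifyCounter;
  std::unique_ptr<Resource> &MObj;
};
std::atomic_uint UsageCounter::MCounter{0};

void useResource(bool IsUserThread) {
  // Constructed on first use in each thread, destroyed at thread exit.
  thread_local UsageCounter Guard(GlobalResource, IsUserThread);
  // ... work with *GlobalResource ...
}

int main() {
  std::thread User([] { useResource(/*IsUserThread=*/true); });
  std::thread Worker([] { useResource(/*IsUserThread=*/false); });
  User.join();   // "resources released" is printed when the user thread exits.
  Worker.join();
  return 0;
}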