diff --git a/source/adapters/hip/context.cpp b/source/adapters/hip/context.cpp
index 8298d513d8..73ac777edb 100644
--- a/source/adapters/hip/context.cpp
+++ b/source/adapters/hip/context.cpp
@@ -40,15 +40,13 @@ ur_context_handle_t_::getOwningURPool(umf_memory_pool_t *UMFPool) {
 UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(
     uint32_t DeviceCount, const ur_device_handle_t *phDevices,
     const ur_context_properties_t *, ur_context_handle_t *phContext) {
-  std::ignore = DeviceCount;
-  assert(DeviceCount == 1);
   ur_result_t RetErr = UR_RESULT_SUCCESS;
 
   std::unique_ptr<ur_context_handle_t_> ContextPtr{nullptr};
   try {
     // Create a scoped context.
     ContextPtr = std::unique_ptr<ur_context_handle_t_>(
-        new ur_context_handle_t_{*phDevices});
+        new ur_context_handle_t_{phDevices, DeviceCount});
 
     static std::once_flag InitFlag;
     std::call_once(
@@ -78,9 +76,9 @@ urContextGetInfo(ur_context_handle_t hContext, ur_context_info_t propName,
 
   switch (uint32_t{propName}) {
   case UR_CONTEXT_INFO_NUM_DEVICES:
-    return ReturnValue(1);
+    return ReturnValue(static_cast<uint32_t>(hContext->Devices.size()));
   case UR_CONTEXT_INFO_DEVICES:
-    return ReturnValue(hContext->getDevice());
+    return ReturnValue(hContext->getDevices());
   case UR_CONTEXT_INFO_REFERENCE_COUNT:
     return ReturnValue(hContext->getReferenceCount());
   case UR_CONTEXT_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES:
@@ -124,8 +122,10 @@ urContextRetain(ur_context_handle_t hContext) {
 
 UR_APIEXPORT ur_result_t UR_APICALL urContextGetNativeHandle(
     ur_context_handle_t hContext, ur_native_handle_t *phNativeContext) {
+  // FIXME: this entry point has been deprecated in the SYCL RT and should be
+  // changed to unsupported once the deprecation period has elapsed
   *phNativeContext = reinterpret_cast<ur_native_handle_t>(
-      hContext->getDevice()->getNativeContext());
+      hContext->getDevices()[0]->getNativeContext());
   return UR_RESULT_SUCCESS;
 }
diff --git a/source/adapters/hip/context.hpp b/source/adapters/hip/context.hpp
index d5eb2e1df8..69d4df9b6d 100644
--- a/source/adapters/hip/context.hpp
+++ b/source/adapters/hip/context.hpp
@@ -28,26 +28,26 @@ typedef void (*ur_context_extended_deleter_t)(void *UserData);
 ///
 /// One of the main differences between the UR API and the HIP driver API is
 /// that the second modifies the state of the threads by assigning
-/// `hipCtx_t` objects to threads. `hipCtx_t` objects store data associated
+/// \c hipCtx_t objects to threads. \c hipCtx_t objects store data associated
 /// with a given device and control access to said device from the user side.
 /// UR API contexts are objects that are passed to functions, and not bound
 /// to threads.
-/// The ur_context_handle_t_ object doesn't implement this behavior. It only
-/// holds the HIP context data. The RAII object \ref ScopedContext implements
-/// the active context behavior.
 ///
-/// Primary vs UserDefined context
+/// Since the \c ur_context_handle_t can contain multiple devices, and a \c
+/// hipCtx_t refers to only a single device, the \c hipCtx_t is more tightly
+/// coupled to a \c ur_device_handle_t than a \c ur_context_handle_t. In order
+/// to remove some ambiguities about the different semantics of
+/// \c ur_context_handle_t and native \c hipCtx_t, we access the native
+/// \c hipCtx_t solely through the \c ur_device_handle_t class, by using the
+/// object \ref ScopedContext, which sets the active device (by setting the
+/// active native \c hipCtx_t).
 ///
-/// HIP has two different types of context, the Primary context,
-/// which is usable by all threads on a given process for a given device, and
-/// the aforementioned custom contexts.
-/// The HIP documentation, and performance analysis, suggest using the Primary
-/// context whenever possible. The Primary context is also used by the HIP
-/// Runtime API. For UR applications to interop with HIP Runtime API, they have
-/// to use the primary context - and make that active in the thread. The
-/// `ur_context_handle_t_` object can be constructed with a `kind` parameter
-/// that allows to construct a Primary or `UserDefined` context, so that
-/// the UR object interface is always the same.
+/// Primary vs User-defined \c hipCtx_t
+///
+/// HIP has two different types of \c hipCtx_t: the Primary context, which is
+/// usable by all threads on a given process for a given device, and the
+/// aforementioned custom \c hipCtx_t objects. The HIP documentation, confirmed
+/// by performance analysis, suggests using the Primary context whenever
+/// possible.
 ///
 /// Destructor callback
 ///
@@ -57,6 +57,16 @@ typedef void (*ur_context_extended_deleter_t)(void *UserData);
 /// See proposal for details.
 /// https://github.com/codeplaysoftware/standards-proposals/blob/master/extended-context-destruction/index.md
 ///
+/// <b>Memory Management for Devices in a Context</b>
+///
+/// A \c ur_mem_handle_t is associated with a \c ur_context_handle_t_, which
+/// may refer to multiple devices. Therefore the \c ur_mem_handle_t must
+/// handle a native allocation for each device in the context. UR is
+/// responsible for automatically handling event dependencies for kernels
+/// writing to or reading from the same \c ur_mem_handle_t and migrating memory
+/// between native allocations for devices in the same \c ur_context_handle_t_
+/// if necessary.
+///
 struct ur_context_handle_t_ {
 
   struct deleter_data {
@@ -68,15 +78,22 @@ struct ur_context_handle_t_ {
 
   using native_type = hipCtx_t;
 
-  ur_device_handle_t DeviceId;
+  std::vector<ur_device_handle_t> Devices;
+
   std::atomic_uint32_t RefCount;
 
-  ur_context_handle_t_(ur_device_handle_t DevId)
-      : DeviceId{DevId}, RefCount{1} {
-    urDeviceRetain(DeviceId);
+  ur_context_handle_t_(const ur_device_handle_t *Devs, uint32_t NumDevices)
+      : Devices{Devs, Devs + NumDevices}, RefCount{1} {
+    for (auto &Dev : Devices) {
+      urDeviceRetain(Dev);
+    }
   };
 
-  ~ur_context_handle_t_() { urDeviceRelease(DeviceId); }
+  ~ur_context_handle_t_() {
+    for (auto &Dev : Devices) {
+      urDeviceRelease(Dev);
+    }
+  }
 
   void invokeExtendedDeleters() {
     std::lock_guard<std::mutex> Guard(Mutex);
@@ -91,7 +108,9 @@ struct ur_context_handle_t_ {
     ExtendedDeleters.emplace_back(deleter_data{Function, UserData});
   }
 
-  ur_device_handle_t getDevice() const noexcept { return DeviceId; }
+  const std::vector<ur_device_handle_t> &getDevices() const noexcept {
+    return Devices;
+  }
 
   uint32_t incrementReferenceCount() noexcept { return ++RefCount; }
diff --git a/source/adapters/hip/device.hpp b/source/adapters/hip/device.hpp
index 83cc2ee954..bea2c46fb5 100644
--- a/source/adapters/hip/device.hpp
+++ b/source/adapters/hip/device.hpp
@@ -25,12 +25,13 @@ struct ur_device_handle_t_ {
   std::atomic_uint32_t RefCount;
   ur_platform_handle_t Platform;
   hipCtx_t HIPContext;
+  uint32_t DeviceIndex;
 
 public:
   ur_device_handle_t_(native_type HipDevice, hipCtx_t Context,
-                      ur_platform_handle_t Platform)
+                      ur_platform_handle_t Platform, uint32_t DeviceIndex)
       : HIPDevice(HipDevice), RefCount{1}, Platform(Platform),
-        HIPContext(Context) {}
+        HIPContext(Context), DeviceIndex(DeviceIndex) {}
 
   ~ur_device_handle_t_() {
     UR_CHECK_ERROR(hipDevicePrimaryCtxRelease(HIPDevice));
@@ -42,7 +43,11 @@ struct ur_device_handle_t_ {
 
   ur_platform_handle_t getPlatform() const noexcept { return Platform; };
 
-  hipCtx_t getNativeContext() { return HIPContext; };
+  hipCtx_t getNativeContext() const noexcept { return HIPContext; };
+
+  // Returns the index of the device relative to the other devices in the same
+  // platform
+  uint32_t getIndex() const noexcept { return DeviceIndex; };
 };
 
 int getAttribute(ur_device_handle_t Device, hipDeviceAttribute_t Attribute);
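The \ref ScopedContext guard referenced throughout these files is not part of this patch. As a rough sketch of the pattern (illustrative only; the adapter's real guard lives elsewhere in the tree and may differ in detail), an RAII wrapper over the HIP driver-style context API could look like this:

#include <hip/hip_runtime.h>

// Illustrative RAII guard: makes a device's native hipCtx_t current for the
// lifetime of the object and restores the previous context on destruction.
// The desired context would come from ur_device_handle_t_::getNativeContext();
// error handling is elided for brevity.
class ScopedContextSketch {
  hipCtx_t Previous{nullptr};

public:
  explicit ScopedContextSketch(hipCtx_t Desired) {
    (void)hipCtxGetCurrent(&Previous);
    if (Previous != Desired) {
      (void)hipCtxSetCurrent(Desired);
    }
  }
  // Restore whatever was active before, so callers are unaffected.
  ~ScopedContextSketch() { (void)hipCtxSetCurrent(Previous); }
};

Because getNativeContext() is now const and noexcept, such a guard can be constructed directly from a device handle without extra ceremony.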
diff --git a/source/adapters/hip/enqueue.cpp b/source/adapters/hip/enqueue.cpp
index 1a73618c77..078d3ae399 100644
--- a/source/adapters/hip/enqueue.cpp
+++ b/source/adapters/hip/enqueue.cpp
@@ -36,19 +36,18 @@ static size_t imageElementByteSize(hipArray_Format ArrayFormat) {
   return 0;
 }
 
-ur_result_t enqueueEventsWait(ur_queue_handle_t CommandQueue,
-                              hipStream_t Stream, uint32_t NumEventsInWaitList,
+ur_result_t enqueueEventsWait(ur_queue_handle_t, hipStream_t Stream,
+                              uint32_t NumEventsInWaitList,
                               const ur_event_handle_t *EventWaitList) {
   if (!EventWaitList) {
     return UR_RESULT_SUCCESS;
   }
   try {
-    ScopedContext Active(CommandQueue->getDevice());
-
     auto Result = forLatestEvents(
         EventWaitList, NumEventsInWaitList,
         [Stream](ur_event_handle_t Event) -> ur_result_t {
-          if (Event->getStream() == Stream) {
+          ScopedContext Active(Event->getDevice());
+          if (Event->isCompleted() || Event->getStream() == Stream) {
             return UR_RESULT_SUCCESS;
           } else {
             UR_CHECK_ERROR(hipStreamWaitEvent(Stream, Event->get(), 0));
@@ -95,6 +94,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
             UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
   UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0),
             UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+  UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
 
   ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
@@ -102,8 +102,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
   try {
     ScopedContext Active(hQueue->getDevice());
     hipStream_t HIPStream = hQueue->getNextTransferStream();
-    Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
-                               phEventWaitList);
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     if (phEvent) {
       RetImplEvent =
@@ -112,9 +112,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    UR_CHECK_ERROR(hipMemcpyHtoDAsync(
-        std::get<BufferMem>(hBuffer->Mem).getWithOffset(offset),
-        const_cast<void *>(pSrc), size, HIPStream));
+    UR_CHECK_ERROR(
+        hipMemcpyHtoDAsync(std::get<BufferMem>(hBuffer->Mem)
+                               .getPtrWithOffset(hQueue->getDevice(), offset),
+                           const_cast<void *>(pSrc), size, HIPStream));
 
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
@@ -141,15 +142,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
             UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
   UR_ASSERT(!(phEventWaitList != NULL && numEventsInWaitList == 0),
             UR_RESULT_ERROR_INVALID_EVENT_WAIT_LIST);
+  UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
 
-  ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
+  ur_lock MemoryMigrationLock{hBuffer->MemoryMigrationMutex};
+  auto Device = hQueue->getDevice();
+  hipStream_t HIPStream = hQueue->getNextTransferStream();
+
   try {
-    ScopedContext Active(hQueue->getDevice());
-    hipStream_t HIPStream = hQueue->getNextTransferStream();
-    Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
-                               phEventWaitList);
+    // Note that this entry point may be called on a queue that may not be the
+    // last queue to write to the MemBuffer, meaning we must perform the copy
+    // from a different device
+    if (hBuffer->LastEventWritingToMemObj &&
+        hBuffer->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      Device = hBuffer->LastEventWritingToMemObj->getDevice();
+      ScopedContext Active(Device);
+      HIPStream = hipStream_t{0}; // Default stream for different device
+      // We may have to wait for an event on another queue if it is the last
+      // event writing to mem obj
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
+                                       &hBuffer->LastEventWritingToMemObj));
+    }
+
+    ScopedContext Active(Device);
+
+    // Use the default stream if copying from another device
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     if (phEvent) {
       RetImplEvent =
@@ -158,9 +178,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
+    // Copying from the device with the latest version of memory, not
+    // necessarily the device associated with the Queue
     UR_CHECK_ERROR(hipMemcpyDtoHAsync(
-        pDst, std::get<BufferMem>(hBuffer->Mem).getWithOffset(offset), size,
-        HIPStream));
+        pDst,
+        std::get<BufferMem>(hBuffer->Mem).getPtrWithOffset(Device, offset),
+        size, HIPStream));
 
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
@@ -175,9 +198,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(
     }
 
   } catch (ur_result_t err) {
-    Result = err;
+    return err;
   }
-  return Result;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
@@ -190,9 +213,44 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
   UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION);
 
+  std::vector<ur_event_handle_t> DepEvents(
+      phEventWaitList, phEventWaitList + numEventsInWaitList);
+  std::vector<std::pair<ur_mem_handle_t, ur_lock>> MemMigrationLocks;
+
+  // phEventWaitList only contains events that are handed to UR by the SYCL
+  // runtime. However, since UR handles memory dependencies within a context,
+  // we may need to add more events to our list of dependent events if the UR
+  // context contains multiple devices
+  if (hQueue->getContext()->Devices.size() > 1) {
+    MemMigrationLocks.reserve(hKernel->Args.MemObjArgs.size());
+    for (auto &MemArg : hKernel->Args.MemObjArgs) {
+      bool PushBack = false;
+      if (auto MemDepEvent = MemArg.Mem->LastEventWritingToMemObj;
+          MemDepEvent && std::find(DepEvents.begin(), DepEvents.end(),
+                                   MemDepEvent) == DepEvents.end()) {
+        DepEvents.push_back(MemDepEvent);
+        PushBack = true;
+      }
+      if ((MemArg.AccessFlags &
+           (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY)) ||
+          PushBack) {
+        if (std::find_if(MemMigrationLocks.begin(), MemMigrationLocks.end(),
+                         [MemArg](auto &Lock) {
+                           return Lock.first == MemArg.Mem;
+                         }) == MemMigrationLocks.end())
+          MemMigrationLocks.emplace_back(
+              std::pair{MemArg.Mem, ur_lock{MemArg.Mem->MemoryMigrationMutex}});
+      }
+    }
+  }
+
+  // Early exit for zero size range kernel
   if (*pGlobalWorkSize == 0) {
-    return urEnqueueEventsWaitWithBarrier(hQueue, numEventsInWaitList,
-                                          phEventWaitList, phEvent);
+    if (DepEvents.size()) {
+      return urEnqueueEventsWaitWithBarrier(hQueue, DepEvents.size(),
+                                            DepEvents.data(), phEvent);
+    }
+    return UR_RESULT_SUCCESS;
   }
 
   // Set the number of threads per block to the number of threads per warp
@@ -265,8 +323,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
                        numEventsInWaitList, phEventWaitList, Guard,
                        &StreamToken);
     hipFunction_t HIPFunc = hKernel->get();
-    Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
-                               phEventWaitList);
+    if (DepEvents.size()) {
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, DepEvents.size(),
+                                       DepEvents.data()));
+    }
+
+    // For memory migration across devices in the same context
+    if (hQueue->getContext()->Devices.size() > 1) {
+      for (auto &MemArg : hKernel->Args.MemObjArgs) {
+        migrateMemoryToDeviceIfNeeded(MemArg.Mem, hQueue->getDevice());
+      }
+    }
 
     // Set the implicit global offset parameter if kernel has offset variant
     if (hKernel->getWithOffsetParameter()) {
@@ -293,6 +360,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
+    // Once the event has been started we can unlock MemoryMigrationMutex
+    if (hQueue->getContext()->Devices.size() > 1) {
+      for (auto &MemArg : hKernel->Args.MemObjArgs) {
+        // Tell the ur_mem_handle_t that it will need to wait on this kernel
+        // if it has been written to
+        if (phEvent && (MemArg.AccessFlags &
+                        (UR_MEM_FLAG_READ_WRITE | UR_MEM_FLAG_WRITE_ONLY))) {
+          MemArg.Mem->setLastEventWritingToMemObj(RetImplEvent.get());
+        }
+      }
+      // We can release the MemoryMigrationMutexes now
+      MemMigrationLocks.clear();
+    }
+
     // Set local mem max size if env var is present
     static const char *LocalMemSzPtrUR =
         std::getenv("UR_HIP_MAX_LOCAL_MEM_SIZE");
@@ -509,16 +590,32 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
   UR_ASSERT(!(hostSlicePitch != 0 && hostSlicePitch % hostRowPitch != 0),
             UR_RESULT_ERROR_INVALID_SIZE);
 
-  ur_result_t Result = UR_RESULT_SUCCESS;
-  void *DevPtr = std::get<BufferMem>(hBuffer->Mem).getVoid();
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
+  ur_result_t Result = UR_RESULT_SUCCESS;
+  ur_lock MemoryMigrationLock(hBuffer->MemoryMigrationMutex);
+  auto Device = hQueue->getDevice();
+  hipStream_t HIPStream = hQueue->getNextTransferStream();
 
   try {
-    ScopedContext Active(hQueue->getDevice());
-    hipStream_t HIPStream = hQueue->getNextTransferStream();
+    // Note that this entry point may be called on a queue that may not be the
+    // last queue to write to the MemBuffer, meaning we must perform the copy
+    // from a different device
+    if (hBuffer->LastEventWritingToMemObj &&
+        hBuffer->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      Device = hBuffer->LastEventWritingToMemObj->getDevice();
+      ScopedContext Active(Device);
+      HIPStream = hipStream_t{0}; // Default stream for different device
+      // We may have to wait for an event on another queue if it is the last
+      // event writing to mem obj
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
+                                       &hBuffer->LastEventWritingToMemObj));
+    }
 
-    Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
-                               phEventWaitList);
+    ScopedContext Active(Device);
+
+    UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+                                     phEventWaitList));
 
     if (phEvent) {
       RetImplEvent =
@@ -527,10 +624,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferReadRect(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    Result = commonEnqueueMemBufferCopyRect(
+    void *DevPtr = std::get<BufferMem>(hBuffer->Mem).getVoid(Device);
+    UR_CHECK_ERROR(commonEnqueueMemBufferCopyRect(
         HIPStream, region, &DevPtr, hipMemoryTypeDevice, bufferOrigin,
         bufferRowPitch, bufferSlicePitch, pDst, hipMemoryTypeHost, hostOrigin,
-        hostRowPitch, hostSlicePitch);
+        hostRowPitch, hostSlicePitch));
 
     if (phEvent) {
       UR_CHECK_ERROR(RetImplEvent->record());
@@ -558,7 +656,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWriteRect(
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   ur_result_t Result = UR_RESULT_SUCCESS;
-  void *DevPtr = std::get<BufferMem>(hBuffer->Mem).getVoid();
+  void *DevPtr = std::get<BufferMem>(hBuffer->Mem).getVoid(hQueue->getDevice());
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
   try {
@@ -626,8 +724,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopy(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    auto Src = std::get<BufferMem>(hBufferSrc->Mem).getWithOffset(srcOffset);
-    auto Dst = std::get<BufferMem>(hBufferDst->Mem).getWithOffset(dstOffset);
+    auto Src = std::get<BufferMem>(hBufferSrc->Mem)
+                   .getPtrWithOffset(hQueue->getDevice(), srcOffset);
+    auto Dst = std::get<BufferMem>(hBufferDst->Mem)
+                   .getPtrWithOffset(hQueue->getDevice(), dstOffset);
 
     UR_CHECK_ERROR(hipMemcpyDtoDAsync(Dst, Src, size, Stream));
 
@@ -652,8 +752,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   ur_result_t Result = UR_RESULT_SUCCESS;
-  void *SrcPtr = std::get<BufferMem>(hBufferSrc->Mem).getVoid();
-  void *DstPtr = std::get<BufferMem>(hBufferDst->Mem).getVoid();
+  void *SrcPtr =
+      std::get<BufferMem>(hBufferSrc->Mem).getVoid(hQueue->getDevice());
+  void *DstPtr =
+      std::get<BufferMem>(hBufferDst->Mem).getVoid(hQueue->getDevice());
   std::unique_ptr<ur_event_handle_t_> RetImplEvent{nullptr};
 
   try {
@@ -762,7 +864,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferFill(
       UR_CHECK_ERROR(RetImplEvent->start());
     }
 
-    auto DstDevice = std::get<BufferMem>(hBuffer->Mem).getWithOffset(offset);
+    auto DstDevice = std::get<BufferMem>(hBuffer->Mem)
+                         .getPtrWithOffset(hQueue->getDevice(), offset);
     auto N = size / patternSize;
 
     // pattern size in bytes
@@ -882,21 +985,37 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageRead(
     ur_rect_offset_t origin, ur_rect_region_t region, size_t, size_t,
     void *pDst, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
-  UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
 
   ur_result_t Result = UR_RESULT_SUCCESS;
 
+  ur_lock MemoryMigrationLock{hImage->MemoryMigrationMutex};
+  auto Device = hQueue->getDevice();
+  hipStream_t HIPStream = hQueue->getNextTransferStream();
+
   try {
-    ScopedContext Active(hQueue->getDevice());
-    hipStream_t HIPStream = hQueue->getNextTransferStream();
+    // Note that this entry point may be called on a queue that may not be the
+    // last queue to write to the MemBuffer, meaning we must perform the copy
+    // from a different device
+    if (hImage->LastEventWritingToMemObj &&
+        hImage->LastEventWritingToMemObj->getDevice() != hQueue->getDevice()) {
+      Device = hImage->LastEventWritingToMemObj->getDevice();
+      ScopedContext Active(Device);
+      HIPStream = hipStream_t{0}; // Default stream for different device
+      // We may have to wait for an event on another queue if it is the last
+      // event writing to mem obj
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
+                                       &hImage->LastEventWritingToMemObj));
+    }
+
+    ScopedContext Active(Device);
 
     if (phEventWaitList) {
-      Result = enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
-                                 phEventWaitList);
+      UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
+                                       phEventWaitList));
    }
 
-    hipArray *Array = std::get<SurfaceMem>(hImage->Mem).getArray();
+    hipArray *Array = std::get<SurfaceMem>(hImage->Mem).getArray(Device);
 
     hipArray_Format Format;
     size_t NumChannels;
@@ -950,8 +1069,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
     ur_rect_offset_t origin, ur_rect_region_t region, size_t, size_t,
     void *pSrc, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
-  UR_ASSERT(hImage->MemType == ur_mem_handle_t_::Type::Surface,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  UR_ASSERT(hImage->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
 
   ur_result_t Result = UR_RESULT_SUCCESS;
 
@@ -964,7 +1082,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageWrite(
                                  phEventWaitList);
     }
 
-    hipArray *Array = std::get<SurfaceMem>(hImage->Mem).getArray();
+    hipArray *Array =
+        std::get<SurfaceMem>(hImage->Mem).getArray(hQueue->getDevice());
 
     hipArray_Format Format;
     size_t NumChannels;
@@ -1017,10 +1136,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
     ur_rect_offset_t dstOrigin, ur_rect_region_t region,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
-  UR_ASSERT(hImageSrc->MemType == ur_mem_handle_t_::Type::Surface,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
-  UR_ASSERT(hImageDst->MemType == ur_mem_handle_t_::Type::Surface,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  UR_ASSERT(hImageSrc->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  UR_ASSERT(hImageDst->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
   UR_ASSERT(std::get<SurfaceMem>(hImageSrc->Mem).getImageType() ==
                 std::get<SurfaceMem>(hImageDst->Mem).getImageType(),
             UR_RESULT_ERROR_INVALID_MEM_OBJECT);
@@ -1035,12 +1152,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemImageCopy(
                                  phEventWaitList);
     }
 
-    hipArray *SrcArray = std::get<SurfaceMem>(hImageSrc->Mem).getArray();
+    hipArray *SrcArray =
+        std::get<SurfaceMem>(hImageSrc->Mem).getArray(hQueue->getDevice());
     hipArray_Format SrcFormat;
     size_t SrcNumChannels;
     getArrayDesc(SrcArray, SrcFormat, SrcNumChannels);
 
-    hipArray *DstArray = std::get<SurfaceMem>(hImageDst->Mem).getArray();
+    hipArray *DstArray =
+        std::get<SurfaceMem>(hImageDst->Mem).getArray(hQueue->getDevice());
     hipArray_Format DstFormat;
     size_t DstNumChannels;
     getArrayDesc(DstArray, DstFormat, DstNumChannels);
@@ -1101,8 +1220,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferMap(
     ur_map_flags_t mapFlags, size_t offset, size_t size,
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent, void **ppRetMap) {
-  UR_ASSERT(hBuffer->MemType == ur_mem_handle_t_::Type::Buffer,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  UR_ASSERT(hBuffer->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
   auto &BufferImpl = std::get<BufferMem>(hBuffer->Mem);
   UR_ASSERT(offset + size <= BufferImpl.getSize(),
             UR_RESULT_ERROR_INVALID_SIZE);
@@ -1161,8 +1279,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
     uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
     ur_event_handle_t *phEvent) {
   ur_result_t Result = UR_RESULT_SUCCESS;
-  UR_ASSERT(hMem->MemType == ur_mem_handle_t_::Type::Buffer,
-            UR_RESULT_ERROR_INVALID_MEM_OBJECT);
+  UR_ASSERT(hMem->isBuffer(), UR_RESULT_ERROR_INVALID_MEM_OBJECT);
   UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() != nullptr,
             UR_RESULT_ERROR_INVALID_MEM_OBJECT);
   UR_ASSERT(std::get<BufferMem>(hMem->Mem).getMapPtr() == pMappedPtr,
@@ -1302,7 +1419,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
     ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
   void *HIPDevicePtr = const_cast<void *>(pMem);
-  ur_device_handle_t Device = hQueue->getContext()->getDevice();
+  ur_device_handle_t Device = hQueue->getDevice();
 
   // If the device does not support managed memory access, we can't set
   // mem_advise.
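The read entry points above (urEnqueueMemBufferRead, urEnqueueMemBufferReadRect, urEnqueueMemImageRead) all repeat the same decision: service the read from whichever device last wrote the memory object, after waiting on that write's event. A minimal standalone model of that decision, with simplified stand-in types rather than the adapter's handles, is:

// Simplified stand-ins for ur_event_handle_t / ur_mem_handle_t state.
struct FakeEvent {
  int Device; // index of the device whose queue produced this event
};

struct FakeMemObj {
  FakeEvent *LastEventWritingToMemObj = nullptr;
};

// Decide which device services a read enqueued on QueueDevice. If another
// device performed the last write, the copy must come from that device
// (the adapter waits on the event first and uses the default stream there).
int pickSourceDevice(const FakeMemObj &Mem, int QueueDevice) {
  if (Mem.LastEventWritingToMemObj &&
      Mem.LastEventWritingToMemObj->Device != QueueDevice) {
    // Caller would first wait on Mem.LastEventWritingToMemObj here.
    return Mem.LastEventWritingToMemObj->Device;
  }
  return QueueDevice; // the queue's device already holds the freshest data
}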
diff --git a/source/adapters/hip/event.cpp b/source/adapters/hip/event.cpp
index 4871335c9f..2af6c5e910 100644
--- a/source/adapters/hip/event.cpp
+++ b/source/adapters/hip/event.cpp
@@ -193,7 +193,7 @@ urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
   try {
     auto Context = phEventWaitList[0]->getContext();
-    ScopedContext Active(Context->getDevice());
+    ScopedContext Active(phEventWaitList[0]->getDevice());
 
     auto WaitFunc = [Context](ur_event_handle_t Event) -> ur_result_t {
       UR_ASSERT(Event, UR_RESULT_ERROR_INVALID_EVENT);
@@ -292,7 +292,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
   std::unique_ptr<ur_event_handle_t_> event_ptr{hEvent};
   ur_result_t Result = UR_RESULT_ERROR_INVALID_EVENT;
   try {
-    ScopedContext Active(hEvent->getContext()->getDevice());
     Result = hEvent->release();
   } catch (...) {
     Result = UR_RESULT_ERROR_OUT_OF_RESOURCES;
diff --git a/source/adapters/hip/event.hpp b/source/adapters/hip/event.hpp
index bfa05b59d7..ecb995dfbe 100644
--- a/source/adapters/hip/event.hpp
+++ b/source/adapters/hip/event.hpp
@@ -28,6 +28,8 @@ struct ur_event_handle_t_ {
 
   ur_queue_handle_t getQueue() const noexcept { return Queue; }
 
+  ur_device_handle_t getDevice() const noexcept { return Queue->getDevice(); }
+
   hipStream_t getStream() const noexcept { return Stream; }
 
   uint32_t getComputeStreamToken() const noexcept { return StreamToken; }
diff --git a/source/adapters/hip/kernel.cpp b/source/adapters/hip/kernel.cpp
index cc6f4384bc..ec58bafcc6 100644
--- a/source/adapters/hip/kernel.cpp
+++ b/source/adapters/hip/kernel.cpp
@@ -19,7 +19,7 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
   std::unique_ptr<ur_kernel_handle_t_> RetKernel{nullptr};
 
   try {
-    ScopedContext Active(hProgram->getContext()->getDevice());
+    ScopedContext Active(hProgram->getDevice());
 
     hipFunction_t HIPFunc;
     hipError_t KernelError =
@@ -263,9 +263,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer(
   return UR_RESULT_SUCCESS;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(
-    ur_kernel_handle_t hKernel, uint32_t argIndex,
-    const ur_kernel_arg_mem_obj_properties_t *, ur_mem_handle_t hArgValue) {
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex,
+                     const ur_kernel_arg_mem_obj_properties_t *Properties,
+                     ur_mem_handle_t hArgValue) {
   // Below sets kernel arg when zero-sized buffers are handled.
   // In such case the corresponding memory is null.
   if (hArgValue == nullptr) {
@@ -275,8 +276,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(
 
   ur_result_t Result = UR_RESULT_SUCCESS;
   try {
-    if (hArgValue->MemType == ur_mem_handle_t_::Type::Surface) {
-      auto array = std::get<SurfaceMem>(hArgValue->Mem).getArray();
+    auto Device = hKernel->getProgram()->getDevice();
+    hKernel->Args.addMemObjArg(argIndex, hArgValue, Properties->memoryAccess);
+    if (hArgValue->isImage()) {
+      auto array = std::get<SurfaceMem>(hArgValue->Mem).getArray(Device);
       hipArray_Format Format;
       size_t NumChannels;
       getArrayDesc(array, Format, NumChannels);
@@ -288,10 +291,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgMemObj(
                    "uint32, float, and half.");
       }
       hipSurfaceObject_t hipSurf =
-          std::get<SurfaceMem>(hArgValue->Mem).getSurface();
+          std::get<SurfaceMem>(hArgValue->Mem).getSurface(Device);
       hKernel->setKernelArg(argIndex, sizeof(hipSurf), (void *)&hipSurf);
     } else {
-      void *HIPPtr = std::get<BufferMem>(hArgValue->Mem).getVoid();
+      void *HIPPtr = std::get<BufferMem>(hArgValue->Mem).getVoid(Device);
       hKernel->setKernelArg(argIndex, sizeof(void *), (void *)&HIPPtr);
     }
   } catch (ur_result_t Err) {
diff --git a/source/adapters/hip/kernel.hpp b/source/adapters/hip/kernel.hpp
index f13478a69c..83693a3d41 100644
--- a/source/adapters/hip/kernel.hpp
+++ b/source/adapters/hip/kernel.hpp
@@ -57,6 +57,14 @@ struct ur_kernel_handle_t_ {
     args_size_t ParamSizes;
     args_index_t Indices;
     args_size_t OffsetPerIndex;
+    // A struct to keep track of memargs so that we can do dependency analysis
+    // at urEnqueueKernelLaunch
+    struct mem_obj_arg {
+      ur_mem_handle_t_ *Mem;
+      int Index;
+      ur_mem_flags_t AccessFlags;
+    };
+    std::vector<mem_obj_arg> MemObjArgs;
 
     std::uint32_t ImplicitOffsetArgs[3] = {0, 0, 0};
 
@@ -110,6 +118,20 @@ struct ur_kernel_handle_t_ {
                               Size + AlignedLocalOffset - LocalOffset);
     }
 
+    void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) {
+      assert(hMem && "Invalid mem handle");
+      // To avoid redundancy we are not storing mem obj with index i at index
+      // i in the vec of MemObjArgs.
+      for (auto &Arg : MemObjArgs) {
+        if (Arg.Index == Index) {
+          // Overwrite the mem obj with the same index
+          Arg = arguments::mem_obj_arg{hMem, Index, Flags};
+          return;
+        }
+      }
+      MemObjArgs.push_back(arguments::mem_obj_arg{hMem, Index, Flags});
+    }
+
     void setImplicitOffset(size_t Size, std::uint32_t *ImplicitOffset) {
       assert(Size == sizeof(std::uint32_t) * 3);
       std::memcpy(ImplicitOffsetArgs, ImplicitOffset, Size);
@@ -167,10 +189,10 @@ struct ur_kernel_handle_t_ {
 
   const char *getName() const noexcept { return Name.c_str(); }
 
-  /// Get the number of kernel arguments, excluding the implicit global offset.
-  /// Note this only returns the current known number of arguments, not the
-  /// real one required by the kernel, since this cannot be queried from
-  /// the HIP Driver API
+  /// Get the number of kernel arguments, excluding the implicit global
+  /// offset. Note this only returns the current known number of arguments,
+  /// not the real one required by the kernel, since this cannot be queried
+  /// from the HIP Driver API
   uint32_t getNumArgs() const noexcept { return Args.Indices.size() - 1; }
 
   void setKernelArg(int Index, size_t Size, const void *Arg) {
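The overwrite-by-index behaviour of addMemObjArg above can be checked in isolation. The following self-contained rewrite uses simplified stand-in types (MemObjArg here is hypothetical, not the adapter's arguments::mem_obj_arg), and shows that setting the same argument slot twice keeps a single tracking entry:

#include <cassert>
#include <vector>

// Simplified stand-in for the kernel's per-argument mem-obj tracking entry.
struct MemObjArg {
  void *Mem;
  int Index;
  unsigned AccessFlags;
};

void addMemObjArg(std::vector<MemObjArg> &Args, void *Mem, int Index,
                  unsigned Flags) {
  for (auto &Arg : Args) {
    if (Arg.Index == Index) { // same kernel-arg slot: replace, don't append
      Arg = MemObjArg{Mem, Index, Flags};
      return;
    }
  }
  Args.push_back(MemObjArg{Mem, Index, Flags});
}

int main() {
  std::vector<MemObjArg> Args;
  int A, B;
  addMemObjArg(Args, &A, 0, 1);
  addMemObjArg(Args, &A, 1, 1);
  addMemObjArg(Args, &B, 0, 1); // overwrites slot 0 instead of appending
  assert(Args.size() == 2 && Args[0].Mem == &B);
  return 0;
}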
pProperties->pHost : nullptr; BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; - if ((flags & UR_MEM_FLAG_USE_HOST_POINTER) && EnableUseHostPtr) { - UR_CHECK_ERROR(hipHostRegister(pHost, size, hipHostRegisterMapped)); - UR_CHECK_ERROR(hipHostGetDevicePointer(&Ptr, pHost, 0)); AllocMode = BufferMem::AllocMode::UseHostPtr; } else if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - UR_CHECK_ERROR(hipHostMalloc(&pHost, size)); - UR_CHECK_ERROR(hipHostGetDevicePointer(&Ptr, pHost, 0)); + UR_CHECK_ERROR(hipHostMalloc(&HostPtr, size)); AllocMode = BufferMem::AllocMode::AllocHostPtr; - } else { - UR_CHECK_ERROR(hipMalloc(&Ptr, size)); - if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { - AllocMode = BufferMem::AllocMode::CopyIn; - } + } else if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + AllocMode = BufferMem::AllocMode::CopyIn; } - if (Result == UR_RESULT_SUCCESS) { - ur_mem_handle_t parentBuffer = nullptr; - - auto DevPtr = reinterpret_cast(Ptr); - auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, parentBuffer, flags, AllocMode, DevPtr, pHost, size}); - if (URMemObj != nullptr) { - RetMemObj = URMemObj.release(); - if (PerformInitialCopy) { - // Operates on the default stream of the current HIP context. - UR_CHECK_ERROR(hipMemcpyHtoD(DevPtr, pHost, size)); - // Synchronize with default stream implicitly used by hipMemcpyHtoD - // to make buffer data available on device before any other UR call - // uses it. - if (Result == UR_RESULT_SUCCESS) { - hipStream_t defaultStream = 0; - UR_CHECK_ERROR(hipStreamSynchronize(defaultStream)); - } - } - } else { - Result = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + auto URMemObj = std::unique_ptr( + new ur_mem_handle_t_{hContext, flags, AllocMode, HostPtr, size}); + if (URMemObj == nullptr) { + throw UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; + } + + // First allocation will be made at urMemBufferCreate if context only + // has one device + if (PerformInitialCopy && HostPtr) { + // Perform initial copy to every device in context + for (auto &Device : hContext->getDevices()) { + ScopedContext Active(Device); + // getPtr may allocate mem if not already allocated + const auto &Ptr = std::get(URMemObj->Mem).getPtr(Device); + UR_CHECK_ERROR(hipMemcpyHtoD(Ptr, HostPtr, size)); + // TODO check if we can remove this + // Synchronize with default stream implicitly used by cuMemcpyHtoD + // to make buffer data available on device before any other UR + // call uses it. + // hipStream_t defaultStream = 0; + // UR_CHECK_ERROR(hipStreamSynchronize(defaultStream)); } } + RetMemObj = URMemObj.release(); } catch (ur_result_t Err) { Result = Err; } catch (...) { @@ -215,27 +186,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferPartition( auto &BufferImpl = std::get(hBuffer->Mem); UR_ASSERT(((pRegion->origin + pRegion->size) <= BufferImpl.getSize()), UR_RESULT_ERROR_INVALID_BUFFER_SIZE); - // Retained indirectly due to retaining parent buffer below. 
- ur_context_handle_t Context = hBuffer->Context; - BufferMem::AllocMode AllocMode = BufferMem::AllocMode::Classic; - - UR_ASSERT(BufferImpl.Ptr != BufferMem::native_type{0}, - UR_RESULT_ERROR_INVALID_MEM_OBJECT); - BufferMem::native_type Ptr = BufferImpl.getWithOffset(pRegion->origin); - - void *HostPtr = nullptr; - if (BufferImpl.HostPtr) { - HostPtr = static_cast(BufferImpl.HostPtr) + pRegion->origin; + for (auto Device : hBuffer->Context->getDevices()) { + BufferImpl.getPtr(Device); // This is allocating a dev ptr behind the scenes + // which is necessary before SubBuffer partition } ReleaseGuard ReleaseGuard(hBuffer); std::unique_ptr RetMemObj{nullptr}; try { - ScopedContext Active(Context->getDevice()); - - RetMemObj = std::unique_ptr{new ur_mem_handle_t_{ - Context, hBuffer, flags, AllocMode, Ptr, HostPtr, pRegion->size}}; + RetMemObj = std::unique_ptr{ + new ur_mem_handle_t_{hBuffer, pRegion->origin}}; } catch (ur_result_t Err) { *phMem = nullptr; return Err; @@ -258,19 +219,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, UR_ASSERT(MemInfoType <= UR_MEM_INFO_CONTEXT, UR_RESULT_ERROR_INVALID_ENUMERATION); - UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); + // FIXME: Only getting info for the first device in the context. This + // should be fine in general + auto Device = hMemory->getContext()->getDevices()[0]; + ScopedContext Active(Device); - ScopedContext Active(hMemory->getContext()->getDevice()); + UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); switch (MemInfoType) { case UR_MEM_INFO_SIZE: { try { - const auto MemVisitor = [](auto &&Mem) -> size_t { + const auto MemVisitor = [Device](auto &&Mem) -> size_t { using T = std::decay_t; if constexpr (std::is_same_v) { size_t AllocSize = 0; hipDeviceptr_t BasePtr = nullptr; - UR_CHECK_ERROR(hipMemGetAddressRange(&BasePtr, &AllocSize, Mem.Ptr)); + UR_CHECK_ERROR( + hipMemGetAddressRange(&BasePtr, &AllocSize, Mem.getPtr(Device))); return AllocSize; } else if constexpr (std::is_same_v) { #if HIP_VERSION < 50600000 @@ -278,7 +243,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, #else HIP_ARRAY3D_DESCRIPTOR ArrayDescriptor; UR_CHECK_ERROR( - hipArray3DGetDescriptor(&ArrayDescriptor, Mem.getArray())); + hipArray3DGetDescriptor(&ArrayDescriptor, Mem.getArray(Device))); const auto PixelSizeBytes = GetHipFormatPixelSize(ArrayDescriptor.Format) * ArrayDescriptor.NumChannels; @@ -317,30 +282,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, /// \param[out] phNativeMem Set to the native handle of the UR mem object. /// /// \return UR_RESULT_SUCCESS -UR_APIEXPORT ur_result_t UR_APICALL -urMemGetNativeHandle(ur_mem_handle_t hMem, ur_native_handle_t *phNativeMem) { -#if defined(__HIP_PLATFORM_NVIDIA__) - if (sizeof(BufferMem::native_type) > sizeof(ur_native_handle_t)) { - // Check that all the upper bits that cannot be represented by - // ur_native_handle_t are empty. - // NOTE: The following shift might trigger a warning, but the check in the - // if above makes sure that this does not underflow. - BufferMem::native_type UpperBits = std::get(hMem->Mem).get() >> - (sizeof(ur_native_handle_t) * CHAR_BIT); - if (UpperBits) { - // Return an error if any of the remaining bits is non-zero. 
- return UR_RESULT_ERROR_INVALID_MEM_OBJECT; - } - } - *phNativeMem = reinterpret_cast( - std::get(hMem->Mem).get()); -#elif defined(__HIP_PLATFORM_AMD__) - *phNativeMem = reinterpret_cast( - std::get(hMem->Mem).get()); -#else -#error("Must define exactly one of __HIP_PLATFORM_AMD__ or __HIP_PLATFORM_NVIDIA__"); -#endif - return UR_RESULT_SUCCESS; +UR_APIEXPORT ur_result_t UR_APICALL urMemGetNativeHandle(ur_mem_handle_t, + ur_native_handle_t *) { + // FIXME: there is no good way of doing this with a multi device context. + // If we return a single pointer, how would we know which device's allocation + // it should be? + // If we return a vector of pointers, this is OK for read only access but if + // we write to a buffer, how would we know which one had been written to? + // Should unused allocations be updated afterwards? We have no way of knowing + // any of these things in the current API design. + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreateWithNativeHandle( @@ -356,7 +307,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreateWithNativeHandle( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } -/// \TODO Not implemented UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( ur_context_handle_t hContext, ur_mem_flags_t flags, const ur_image_format_t *pImageFormat, const ur_image_desc_t *pImageDesc, @@ -389,145 +339,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageCreate( UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR); } - ur_result_t Result = UR_RESULT_SUCCESS; - // We only support RBGA channel order // TODO: check SYCL CTS and spec. May also have to support BGRA UR_ASSERT(pImageFormat->channelOrder == UR_IMAGE_CHANNEL_ORDER_RGBA, UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION); - // We have to use hipArray3DCreate, which has some caveats. The height and - // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc gives - // a minimum value of 1, so we need to convert the answer. 
- HIP_ARRAY3D_DESCRIPTOR ArrayDesc; - ArrayDesc.NumChannels = 4; // Only support 4 channel image - ArrayDesc.Flags = 0; // No flags required - ArrayDesc.Width = pImageDesc->width; - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - ArrayDesc.Height = 0; - ArrayDesc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = 0; - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - ArrayDesc.Height = pImageDesc->height; - ArrayDesc.Depth = pImageDesc->depth; - } + auto URMemObj = std::unique_ptr( + new ur_mem_handle_t_{hContext, flags, *pImageFormat, *pImageDesc, pHost}); - // We need to get this now in bytes for calculating the total image size later - size_t PixelTypeSizeBytes; - - switch (pImageFormat->channelType) { - - case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: - ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT8; - PixelTypeSizeBytes = 1; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: - ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT8; - PixelTypeSizeBytes = 1; - break; - case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: - ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT16; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: - ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT16; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: - ArrayDesc.Format = HIP_AD_FORMAT_HALF; - PixelTypeSizeBytes = 2; - break; - case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: - ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT32; - PixelTypeSizeBytes = 4; - break; - case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: - ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT32; - PixelTypeSizeBytes = 4; - break; - case UR_IMAGE_CHANNEL_TYPE_FLOAT: - ArrayDesc.Format = HIP_AD_FORMAT_FLOAT; - PixelTypeSizeBytes = 4; - break; - default: - // urMemImageCreate given unsupported image_channel_data_type - return UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR; + if (URMemObj == nullptr) { + return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; } - // When a dimension isn't used image_desc has the size set to 1 - size_t PixelSizeBytes = - PixelTypeSizeBytes * 4; // 4 is the only number of channels we support - size_t ImageSizeBytes = PixelSizeBytes * pImageDesc->width * - pImageDesc->height * pImageDesc->depth; - - ScopedContext Active(hContext->getDevice()); - hipArray *ImageArray; - UR_CHECK_ERROR(hipArray3DCreate(reinterpret_cast(&ImageArray), - &ArrayDesc)); - - try { - if (PerformInitialCopy) { - // We have to use a different copy function for each image dimensionality - if (pImageDesc->type == UR_MEM_TYPE_IMAGE1D) { - UR_CHECK_ERROR(hipMemcpyHtoA(ImageArray, 0, pHost, ImageSizeBytes)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE2D) { - hip_Memcpy2D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; - CpyDesc.srcHost = pHost; - CpyDesc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; - CpyDesc.dstArray = reinterpret_cast(ImageArray); - CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; - CpyDesc.Height = pImageDesc->height; - UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc)); - } else if (pImageDesc->type == UR_MEM_TYPE_IMAGE3D) { - HIP_MEMCPY3D CpyDesc; - memset(&CpyDesc, 0, sizeof(CpyDesc)); - CpyDesc.srcMemoryType = hipMemoryType::hipMemoryTypeHost; - CpyDesc.srcHost = pHost; - CpyDesc.dstMemoryType = hipMemoryType::hipMemoryTypeArray; - CpyDesc.dstArray = reinterpret_cast(ImageArray); - 
CpyDesc.WidthInBytes = PixelSizeBytes * pImageDesc->width; - CpyDesc.Height = pImageDesc->height; - CpyDesc.Depth = pImageDesc->depth; - UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc)); - } + if (PerformInitialCopy) { + for (const auto &Dev : hContext->getDevices()) { + UR_CHECK_ERROR(migrateMemoryToDeviceIfNeeded(URMemObj.get(), Dev)); } - - // HIP_RESOURCE_DESC is a union of different structs, shown here - // We need to fill it as described here to use it for a surface or texture - // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and - // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array - // handle. - // HIP_RESOURCE_DESC::flags must be set to zero - - hipResourceDesc ImageResDesc; - ImageResDesc.res.array.array = ImageArray; - ImageResDesc.resType = hipResourceTypeArray; - - hipSurfaceObject_t Surface; - UR_CHECK_ERROR(hipCreateSurfaceObject(&Surface, &ImageResDesc)); - - auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ - hContext, ImageArray, Surface, flags, pImageDesc->type, pHost}); - - if (URMemObj == nullptr) { - return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY; - } - - *phMem = URMemObj.release(); - } catch (ur_result_t Err) { - UR_CHECK_ERROR(hipFreeArray(ImageArray)); - return Err; - } catch (...) { - UR_CHECK_ERROR(hipFreeArray(ImageArray)); - return UR_RESULT_ERROR_UNKNOWN; } - return Result; + *phMem = URMemObj.release(); + return UR_RESULT_SUCCESS; } UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, @@ -536,14 +366,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemImageGetInfo(ur_mem_handle_t hMemory, void *pPropValue, size_t *pPropSizeRet) { UR_ASSERT(hMemory->isImage(), UR_RESULT_ERROR_INVALID_MEM_OBJECT); - ScopedContext Active(hMemory->getContext()->getDevice()); + // FIXME: only getting infor for first image in ctx + auto Device = hMemory->getContext()->getDevices()[0]; + ScopedContext Active(Device); UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); try { HIP_ARRAY3D_DESCRIPTOR ArrayInfo; #if HIP_VERSION >= 50600000 UR_CHECK_ERROR(hipArray3DGetDescriptor( - &ArrayInfo, std::get(hMemory->Mem).getArray())); + &ArrayInfo, std::get(hMemory->Mem).getArray(Device))); #else return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; #endif @@ -625,3 +457,174 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { hMem->incrementReferenceCount(); return UR_RESULT_SUCCESS; } + +inline ur_result_t +allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t Mem, + const ur_device_handle_t hDevice) { + ScopedContext Active(hDevice); + ur_lock LockGuard(Mem->MemoryAllocationMutex); + + if (Mem->isBuffer()) { + auto &Buffer = std::get(Mem->Mem); + hipDeviceptr_t &DevPtr = Buffer.Ptrs[hDevice->getIndex()]; + + // Allocation has already been made + if (DevPtr != BufferMem::native_type{0}) { + return UR_RESULT_SUCCESS; + } + + if (Buffer.MemAllocMode == BufferMem::AllocMode::AllocHostPtr) { + // Host allocation has already been made + UR_CHECK_ERROR(hipHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0)); + } else if (Buffer.MemAllocMode == BufferMem::AllocMode::UseHostPtr) { + UR_CHECK_ERROR( + hipHostRegister(Buffer.HostPtr, Buffer.Size, hipHostRegisterMapped)); + UR_CHECK_ERROR(hipHostGetDevicePointer(&DevPtr, Buffer.HostPtr, 0)); + } else { + UR_CHECK_ERROR(hipMalloc(&DevPtr, Buffer.Size)); + } + } else { + hipArray *ImageArray; + hipSurfaceObject_t Surface; + try { + auto &Image = std::get(Mem->Mem); + // Allocation has already been made + if (Image.Arrays[hDevice->getIndex()]) { + return UR_RESULT_SUCCESS; + } + 
UR_CHECK_ERROR(hipArray3DCreate( + reinterpret_cast(&ImageArray), &Image.ArrayDesc)); + Image.Arrays[hDevice->getIndex()] = ImageArray; + // HIP_RESOURCE_DESC is a union of different structs, shown here + // We need to fill it as described here to use it for a surface or texture + // HIP_RESOURCE_DESC::resType must be HIP_RESOURCE_TYPE_ARRAY and + // HIP_RESOURCE_DESC::res::array::hArray must be set to a valid HIP array + // handle. + // HIP_RESOURCE_DESC::flags must be set to zero + hipResourceDesc ImageResDesc; + ImageResDesc.res.array.array = ImageArray; + ImageResDesc.resType = hipResourceTypeArray; + + UR_CHECK_ERROR(hipCreateSurfaceObject(&Surface, &ImageResDesc)); + Image.SurfObjs[hDevice->getIndex()] = Surface; + } catch (ur_result_t Err) { + if (ImageArray) { + UR_CHECK_ERROR(hipFreeArray(ImageArray)); + } + return Err; + } catch (...) { + if (ImageArray) { + UR_CHECK_ERROR(hipFreeArray(ImageArray)); + } + return UR_RESULT_ERROR_UNKNOWN; + } + } + return UR_RESULT_SUCCESS; +} + +namespace { +inline ur_result_t migrateBufferToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice) { + auto &Buffer = std::get(Mem->Mem); + if (Mem->LastEventWritingToMemObj == nullptr) { + // Device allocation being initialized from host for the first time + if (Buffer.HostPtr) { + UR_CHECK_ERROR( + hipMemcpyHtoD(Buffer.getPtr(hDevice), Buffer.HostPtr, Buffer.Size)); + } + } else if (Mem->LastEventWritingToMemObj->getDevice() != hDevice) { + UR_CHECK_ERROR( + hipMemcpyDtoD(Buffer.getPtr(hDevice), + Buffer.getPtr(Mem->LastEventWritingToMemObj->getDevice()), + Buffer.Size)); + } + return UR_RESULT_SUCCESS; +} + +inline ur_result_t migrateImageToDevice(ur_mem_handle_t Mem, + ur_device_handle_t hDevice) { + auto &Image = std::get(Mem->Mem); + // When a dimension isn't used image_desc has the size set to 1 + size_t PixelSizeBytes = Image.PixelTypeSizeBytes * + 4; // 4 is the only number of channels we support + size_t ImageSizeBytes = PixelSizeBytes * Image.ImageDesc.width * + Image.ImageDesc.height * Image.ImageDesc.depth; + + hipArray *ImageArray = Image.getArray(hDevice); + + hip_Memcpy2D CpyDesc2D; + HIP_MEMCPY3D CpyDesc3D; + // We have to use a different copy function for each image + // dimensionality + if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + memset(&CpyDesc2D, 0, sizeof(CpyDesc2D)); + CpyDesc2D.srcMemoryType = hipMemoryType::hipMemoryTypeHost; + CpyDesc2D.dstMemoryType = hipMemoryType::hipMemoryTypeArray; + CpyDesc2D.dstArray = reinterpret_cast(ImageArray); + CpyDesc2D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width; + CpyDesc2D.Height = Image.ImageDesc.height; + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { + memset(&CpyDesc3D, 0, sizeof(CpyDesc3D)); + CpyDesc3D.srcMemoryType = hipMemoryType::hipMemoryTypeHost; + CpyDesc3D.dstMemoryType = hipMemoryType::hipMemoryTypeArray; + CpyDesc3D.dstArray = reinterpret_cast(ImageArray); + CpyDesc3D.WidthInBytes = PixelSizeBytes * Image.ImageDesc.width; + CpyDesc3D.Height = Image.ImageDesc.height; + CpyDesc3D.Depth = Image.ImageDesc.depth; + } + + if (Mem->LastEventWritingToMemObj == nullptr) { + if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + UR_CHECK_ERROR( + hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + CpyDesc2D.srcHost = Image.HostPtr; + UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { + CpyDesc3D.srcHost = Image.HostPtr; + UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); + } + } else if 
(Mem->LastEventWritingToMemObj->getDevice() != hDevice) { + if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + // FIXME: 1D memcpy from DtoD going through the host. + UR_CHECK_ERROR(hipMemcpyAtoH( + Image.HostPtr, + Image.getArray(Mem->LastEventWritingToMemObj->getDevice()), + 0 /*srcOffset*/, ImageSizeBytes)); + UR_CHECK_ERROR( + hipMemcpyHtoA(ImageArray, 0, Image.HostPtr, ImageSizeBytes)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + CpyDesc2D.srcArray = + Image.getArray(Mem->LastEventWritingToMemObj->getDevice()); + UR_CHECK_ERROR(hipMemcpyParam2D(&CpyDesc2D)); + } else if (Image.ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { + CpyDesc3D.srcArray = + Image.getArray(Mem->LastEventWritingToMemObj->getDevice()); + UR_CHECK_ERROR(hipDrvMemcpy3D(&CpyDesc3D)); + } + } + return UR_RESULT_SUCCESS; +} +} // namespace + +// If calling this entry point it is necessary to lock the memoryMigrationMutex +// beforehand +ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t Mem, + const ur_device_handle_t hDevice) { + UR_ASSERT(hDevice, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + // Device allocation has already been initialized with most up to date + // data in buffer + if (Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()]) { + return UR_RESULT_SUCCESS; + } + + ScopedContext Active(hDevice); + if (Mem->isBuffer()) { + UR_CHECK_ERROR(migrateBufferToDevice(Mem, hDevice)); + } else { + UR_CHECK_ERROR(migrateImageToDevice(Mem, hDevice)); + } + + Mem->HaveMigratedToDeviceSinceLastWrite[hDevice->getIndex()] = true; + return UR_RESULT_SUCCESS; +} diff --git a/source/adapters/hip/memory.hpp b/source/adapters/hip/memory.hpp index 2732b22a6e..d36b9ee001 100644 --- a/source/adapters/hip/memory.hpp +++ b/source/adapters/hip/memory.hpp @@ -10,18 +10,25 @@ #pragma once #include "common.hpp" +#include "context.hpp" +#include "event.hpp" #include #include +ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); +ur_result_t migrateMemoryToDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); + // Handler for plain, pointer-based HIP allocations struct BufferMem { using native_type = hipDeviceptr_t; // If this allocation is a sub-buffer (i.e., a view on an existing // allocation), this is the pointer to the parent handler structure - ur_mem_handle_t Parent; - // HIP handler for the pointer - native_type Ptr; + ur_mem_handle_t Parent = nullptr; + // Outer mem holding this struct in variant + ur_mem_handle_t OuterMemStruct; /// Pointer associated with this device on the host void *HostPtr; @@ -50,20 +57,44 @@ struct BufferMem { AllocHostPtr } MemAllocMode; - BufferMem(ur_mem_handle_t Parent, AllocMode Mode, hipDeviceptr_t Ptr, - void *HostPtr, size_t Size) - : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, MapSize{0}, - MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE}, - MemAllocMode{Mode} {}; +private: + // Vector of HIP pointers + std::vector Ptrs; + +public: + BufferMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct, + AllocMode Mode, void *HostPtr, size_t Size) + : OuterMemStruct{OuterMemStruct}, HostPtr{HostPtr}, Size{Size}, + MapSize{0}, MapOffset{0}, MapPtr{nullptr}, MapFlags{UR_MAP_FLAG_WRITE}, + MemAllocMode{Mode}, Ptrs(Context->Devices.size(), native_type{0}){}; + + BufferMem(const BufferMem &Buffer) = default; - native_type get() const noexcept { return Ptr; } + // This will allocate memory on device if there isn't already an active + // allocation on the device + native_type getPtr(const ur_device_handle_t Device) 
{ + return getPtrWithOffset(Device, 0); + } + + // This will allocate memory on device with index Index if there isn't already + // an active allocation on the device + native_type getPtrWithOffset(const ur_device_handle_t Device, size_t Offset) { + if (ur_result_t Err = + allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return reinterpret_cast( + reinterpret_cast(Ptrs[Device->getIndex()]) + Offset); + } - native_type getWithOffset(size_t Offset) const noexcept { - return reinterpret_cast(reinterpret_cast(Ptr) + - Offset); + // This will allocate memory on device if there isn't already an active + // allocation on the device + void *getVoid(const ur_device_handle_t Device) { + return reinterpret_cast(getPtrWithOffset(Device, 0)); } - void *getVoid() const noexcept { return reinterpret_cast(Ptr); } + bool isSubBuffer() const noexcept { return Parent != nullptr; } size_t getSize() const noexcept { return Size; } @@ -107,28 +138,240 @@ struct BufferMem { assert(MapPtr != nullptr); return MapFlags; } + + ur_result_t clear() { + if (Parent != nullptr) { + return UR_RESULT_SUCCESS; + } + + switch (MemAllocMode) { + case AllocMode::CopyIn: + case AllocMode::Classic: + for (auto &DevPtr : Ptrs) { + if (DevPtr != native_type{0}) { + UR_CHECK_ERROR(hipFree(DevPtr)); + } + } + break; + case AllocMode::UseHostPtr: + UR_CHECK_ERROR(hipHostUnregister(HostPtr)); + break; + case AllocMode::AllocHostPtr: + UR_CHECK_ERROR(hipFreeHost(HostPtr)); + } + return UR_RESULT_SUCCESS; + } + + friend struct ur_mem_handle_t_; + friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); }; // Handler data for surface object (i.e. Images) struct SurfaceMem { - hipArray *Array; - hipSurfaceObject_t SurfObj; - ur_mem_type_t ImageType; +private: + std::vector Arrays; + std::vector SurfObjs; + +public: + ur_mem_handle_t OuterMemStruct; + + ur_image_format_t ImageFormat; + ur_image_desc_t ImageDesc; + HIP_ARRAY3D_DESCRIPTOR ArrayDesc; + size_t PixelTypeSizeBytes; + void *HostPtr; + + SurfaceMem(ur_context_handle_t Context, ur_mem_handle_t OuterMemStruct, + ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc, + void *HostPtr) + : Arrays(Context->Devices.size(), nullptr), + SurfObjs(Context->Devices.size(), nullptr), + OuterMemStruct{OuterMemStruct}, + ImageFormat{ImageFormat}, ImageDesc{ImageDesc}, HostPtr{HostPtr} { + // We have to use hipArray3DCreate, which has some caveats. The height and + // depth parameters must be set to 0 produce 1D or 2D arrays. image_desc + // gives a minimum value of 1, so we need to convert the answer. 
+ ArrayDesc.NumChannels = 4; // Only support 4 channel image + ArrayDesc.Flags = 0; // No flags required + ArrayDesc.Width = ImageDesc.width; + if (ImageDesc.type == UR_MEM_TYPE_IMAGE1D) { + ArrayDesc.Height = 0; + ArrayDesc.Depth = 0; + } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE2D) { + ArrayDesc.Height = ImageDesc.height; + ArrayDesc.Depth = 0; + } else if (ImageDesc.type == UR_MEM_TYPE_IMAGE3D) { + ArrayDesc.Height = ImageDesc.height; + ArrayDesc.Depth = ImageDesc.depth; + } + + // We need to get PixelTypeSizeBytes for calculating the total image size + // later + switch (ImageFormat.channelType) { + + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT8: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT8: + ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT8; + PixelTypeSizeBytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT8: + ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT8; + PixelTypeSizeBytes = 1; + break; + case UR_IMAGE_CHANNEL_TYPE_UNORM_INT16: + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT16: + ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT16; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT16: + ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT16; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_HALF_FLOAT: + ArrayDesc.Format = HIP_AD_FORMAT_HALF; + PixelTypeSizeBytes = 2; + break; + case UR_IMAGE_CHANNEL_TYPE_UNSIGNED_INT32: + ArrayDesc.Format = HIP_AD_FORMAT_UNSIGNED_INT32; + PixelTypeSizeBytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_SIGNED_INT32: + ArrayDesc.Format = HIP_AD_FORMAT_SIGNED_INT32; + PixelTypeSizeBytes = 4; + break; + case UR_IMAGE_CHANNEL_TYPE_FLOAT: + ArrayDesc.Format = HIP_AD_FORMAT_FLOAT; + PixelTypeSizeBytes = 4; + break; + default: + // urMemImageCreate given unsupported image_channel_data_type + detail::ur::die("Bad image format given to ur_image_ constructor"); + } + } + + // Will allocate a new array on device if not already allocated + hipArray *getArray(const ur_device_handle_t Device) { + if (ur_result_t Err = + allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return Arrays[Device->getIndex()]; + } - SurfaceMem(hipArray *Array, hipSurfaceObject_t Surf, ur_mem_type_t ImageType) - : Array{Array}, SurfObj{Surf}, ImageType{ImageType} {}; + // Will allocate a new surface on device if not already allocated + hipSurfaceObject_t getSurface(const ur_device_handle_t Device) { + if (ur_result_t Err = + allocateMemObjOnDeviceIfNeeded(OuterMemStruct, Device); + Err != UR_RESULT_SUCCESS) { + throw Err; + } + return SurfObjs[Device->getIndex()]; + } - hipArray *getArray() const noexcept { return Array; } + ur_mem_type_t getImageType() const noexcept { return ImageDesc.type; } - hipSurfaceObject_t getSurface() const noexcept { return SurfObj; } + ur_result_t clear() { + for (auto Array : Arrays) { + if (Array) { + UR_CHECK_ERROR(hipFreeArray(Array)); + } + } + for (auto Surf : SurfObjs) { + if (Surf != hipSurfaceObject_t{0}) { + UR_CHECK_ERROR(hipDestroySurfaceObject(Surf)); + } + } + return UR_RESULT_SUCCESS; + } - ur_mem_type_t getImageType() const noexcept { return ImageType; } + friend ur_result_t allocateMemObjOnDeviceIfNeeded(ur_mem_handle_t, + const ur_device_handle_t); }; /// UR Mem mapping to HIP memory allocations, both data and texture/surface. /// \brief Represents non-SVM allocations on the HIP backend. /// Keeps tracks of all mapped regions used for Map/Unmap calls. /// Only one region can be active at the same time per allocation. 
+///
+/// The ur_mem_handle_t is responsible for memory allocation and migration
+/// across devices in the same ur_context_handle_t. If a kernel writes to a
+/// ur_mem_handle_t then the event returned by that write is stored in
+/// LastEventWritingToMemObj. All subsequent operations that want to read
+/// from the ur_mem_handle_t must then wait on the event referring to the
+/// last write.
+///
+/// Since urMemBufferCreate/urMemImageCreate do not take a queue or device
+/// object, only a ur_context_handle_t, at mem obj creation we don't know which
+/// device we must make a native image/allocation on. Therefore no allocations
+/// are made at urMemBufferCreate/urMemImageCreate. Instead, device
+/// images/allocations are made lazily. These allocations are made implicitly
+/// with a call to getPtr/getArray, which will allocate a new allocation/image
/// on the device if needed.
+///
+/// Memory migration between native allocations for devices in the same
+/// ur_context_handle_t will occur at:
+///
+/// 1. urEnqueueKernelLaunch
+/// 2. urEnqueueMem(Buffer|Image)Read(Rect)
+///
+/// Migrations will occur in both cases if the most recent version of the data
+/// is on a different device, marked by LastEventWritingToMemObj->getDevice().
+///
+/// Example trace:
+/// ~~~~~~~~~~~~~~
+///
+/// =====> urContextCreate([device0, device1], ...) // associated with [q0, q1]
+///              -> OUT: hContext
+///
+/// =====> urMemBufferCreate(hContext, ...);
+///              -> No native allocations made
+///              -> OUT: hBuffer
+///
+/// =====> urEnqueueMemBufferWrite(q0, hBuffer, ...);
+///              -> Allocation made on q0, i.e. device0
+///              -> New allocation initialized with host data
+///
+/// =====> urKernelSetArgMemObj(hKernel0, hBuffer, ...);
+///              -> ur_kernel_handle_t associated with a ur_program_handle_t,
+///                 which is in turn unique to a device. So we can set the
+///                 kernel arg with the ptr of the device-specific allocation.
+///              -> hKernel0->getProgram()->getDevice() == device0
+///              -> allocateMemObjOnDeviceIfNeeded(device0);
+///                   -> Native allocation already made on device0, continue.
+///
+/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
+///              -> Suppose that hKernel0 writes to hBuffer.
+///              -> Call hBuffer->setLastEventWritingToMemObj with the return
+///                 event from this operation
+///              -> Enqueue native kernel launch
+///
+/// =====> urKernelSetArgMemObj(hKernel1, hBuffer, ...);
+///              -> hKernel1->getProgram()->getDevice() == device1
+///              -> New allocation will be made on device1 when calling
+///                 getPtr(device1)
+///                   -> No native allocation on device1
+///                   -> Make native allocation on device1
+///
+/// =====> urEnqueueKernelLaunch(q1, hKernel1, ...);
+///              -> Suppose hKernel1 wants to read from hBuffer and not write.
+///              -> migrateMemoryToDeviceIfNeeded(device1);
+///                   -> hBuffer->LastEventWritingToMemObj is not nullptr
+///                   -> Check if memory has been migrated to device1 since the
+///                      last write
+///                        -> Hasn't been migrated
+///                   -> Wait on LastEventWritingToMemObj.
+///                   -> Migrate memory from device0's native allocation to
+///                      device1's native allocation.
+///              -> Enqueue native kernel launch
+///
+/// =====> urEnqueueKernelLaunch(q0, hKernel0, ...);
+///              -> migrateMemoryToDeviceIfNeeded(device0);
+///                   -> hBuffer->LastEventWritingToMemObj refers to an event
+///                      from q0
+///                        -> Migration not necessary
+///              -> Enqueue native kernel launch
+///
 struct ur_mem_handle_t_ {
   // TODO: Move as much shared data up as possible
@@ -140,36 +383,76 @@ struct ur_mem_handle_t_ {
   /// Reference counting of the handler
   std::atomic_uint32_t RefCount;
-  enum class Type { Buffer, Surface } MemType;
 
   // Original mem flags passed
   ur_mem_flags_t MemFlags;
 
+  // If we make a ur_mem_handle_t_ from a native allocation, it can be useful
+  // to associate it with the device that holds the native allocation.
+  ur_device_handle_t DeviceWithNativeAllocation{nullptr};
+
+  // Has the memory been migrated to a device since the last write?
+  std::vector<bool> HaveMigratedToDeviceSinceLastWrite;
+
+  // We should wait on this event prior to migrating memory across allocations
+  // in this ur_mem_handle_t_
+  ur_event_handle_t LastEventWritingToMemObj{nullptr};
+
+  // Enumerates all possible types of accesses.
+  enum access_mode_t { unknown, read_write, read_only, write_only };
+
+  ur_mutex MemoryAllocationMutex; // A mutex for allocations
+  ur_mutex MemoryMigrationMutex;  // A mutex for memory transfers
+
   /// A UR Memory object represents either plain memory allocations ("Buffers"
   /// in OpenCL) or typed allocations ("Images" in OpenCL).
   /// In HIP their API handlers are different. Whereas "Buffers" are allocated
   /// as pointer-like structs, "Images" are stored in Textures or Surfaces.
-  /// This union allows implementation to use either from the same handler.
+  /// This variant allows the implementation to use either from the same
+  /// handler.
   std::variant<BufferMem, SurfaceMem> Mem;
 
-  /// Constructs the UR MEM handler for a non-typed allocation ("buffer")
-  ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_handle_t Parent,
-                   ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode,
-                   hipDeviceptr_t Ptr, void *HostPtr, size_t Size)
-      : Context{Ctxt}, RefCount{1}, MemType{Type::Buffer}, MemFlags{MemFlags},
-        Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} {
-    if (isSubBuffer()) {
-      urMemRetain(std::get<BufferMem>(Mem).Parent);
-    } else {
-      urContextRetain(Context);
+  /// Constructs the UR mem handler for a non-typed allocation ("buffer")
+  ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
+                   BufferMem::AllocMode Mode, void *HostPtr, size_t Size)
+      : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags},
+        HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
+        Mem{std::in_place_type<BufferMem>, Ctxt, this, Mode, HostPtr, Size} {
+    urContextRetain(Context);
+  };
+
+  // Subbuffer constructor
+  ur_mem_handle_t_(ur_mem_handle_t Parent, size_t SubBufferOffset)
+      : Context{Parent->Context}, RefCount{1}, MemFlags{Parent->MemFlags},
+        HaveMigratedToDeviceSinceLastWrite(Parent->Context->Devices.size(),
+                                           false),
+        Mem{BufferMem{std::get<BufferMem>(Parent->Mem)}} {
+    auto &SubBuffer = std::get<BufferMem>(Mem);
+    SubBuffer.Parent = Parent;
+    SubBuffer.OuterMemStruct = this;
+    if (SubBuffer.HostPtr) {
+      SubBuffer.HostPtr =
+          static_cast<char *>(SubBuffer.HostPtr) + SubBufferOffset;
+    }
+    for (auto &DevPtr : SubBuffer.Ptrs) {
+      if (DevPtr) {
+        DevPtr = static_cast<char *>(DevPtr) + SubBufferOffset;
+      }
     }
+    urMemRetain(Parent);
   };
 
-  /// Constructs the UR allocation for an Image object
-  ur_mem_handle_t_(ur_context_handle_t Ctxt, hipArray *Array,
-                   hipSurfaceObject_t Surf, ur_mem_flags_t MemFlags,
-                   ur_mem_type_t ImageType, void *)
-      : Context{Ctxt}, RefCount{1}, MemType{Type::Surface}, MemFlags{MemFlags},
-        Mem{SurfaceMem{Array, Surf, ImageType}} {
+  /// Constructs the UR mem handler for an Image object
+  ur_mem_handle_t_(ur_context_handle_t Ctxt, ur_mem_flags_t MemFlags,
+                   ur_image_format_t ImageFormat, ur_image_desc_t ImageDesc,
+                   void *HostPtr)
+      : Context{Ctxt}, RefCount{1}, MemFlags{MemFlags},
+        HaveMigratedToDeviceSinceLastWrite(Context->Devices.size(), false),
+        Mem{std::in_place_type<SurfaceMem>,
+            Ctxt,
+            this,
+            ImageFormat,
+            ImageDesc,
+            HostPtr} {
     urContextRetain(Context);
   }
@@ -181,13 +464,24 @@ struct ur_mem_handle_t_ {
     urContextRelease(Context);
   }
 
-  bool isBuffer() const noexcept { return MemType == Type::Buffer; }
+  bool isBuffer() const noexcept {
+    return std::holds_alternative<BufferMem>(Mem);
+  }
 
   bool isSubBuffer() const noexcept {
     return (isBuffer() && (std::get<BufferMem>(Mem).Parent != nullptr));
   }
 
-  bool isImage() const noexcept { return MemType == Type::Surface; }
+  bool isImage() const noexcept {
+    return std::holds_alternative<SurfaceMem>(Mem);
+  }
+
+  ur_result_t clear() {
+    if (isBuffer()) {
+      return std::get<BufferMem>(Mem).clear();
+    }
+    return std::get<SurfaceMem>(Mem).clear();
+  }
 
   ur_context_handle_t getContext() const noexcept { return Context; }
 
@@ -196,4 +490,19 @@ struct ur_mem_handle_t_ {
   uint32_t decrementReferenceCount() noexcept { return --RefCount; }
 
   uint32_t getReferenceCount() const noexcept { return RefCount; }
+
+  void setLastEventWritingToMemObj(ur_event_handle_t NewEvent) {
+    assert(NewEvent && "Invalid event!");
+    // This entry point should only ever be called when using a multi-device
+    // context
+    assert(Context->Devices.size() > 1);
+    if (LastEventWritingToMemObj != nullptr) {
+      urEventRelease(LastEventWritingToMemObj);
+    }
+    urEventRetain(NewEvent);
+    LastEventWritingToMemObj = NewEvent;
+    for (const auto &Device : Context->getDevices()) {
+      // The device that just wrote already holds the up-to-date copy
+      HaveMigratedToDeviceSinceLastWrite[Device->getIndex()] =
+          Device == NewEvent->getDevice();
+    }
+  }
 };
diff --git a/source/adapters/hip/platform.cpp b/source/adapters/hip/platform.cpp
index 5f35b55f1f..287f941c30 100644
--- a/source/adapters/hip/platform.cpp
+++ b/source/adapters/hip/platform.cpp
@@ -47,9 +47,6 @@ urPlatformGetInfo(ur_platform_handle_t, ur_platform_info_t propName,
 /// There is only one HIP platform, and it contains all devices on the system.
 /// Triggers the HIP Driver initialization (hipInit) the first time, so this
 /// must be the first UR API called.
-///
-/// However because multiple devices in a context is not currently supported,
-/// place each device in a separate platform.
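+///
+/// A minimal discovery sequence (illustrative only; Adapter is assumed to
+/// come from a prior urAdapterGet call):
+///
+///   uint32_t NumPlatforms = 0;
+///   urPlatformGet(&Adapter, 1, 0, nullptr, &NumPlatforms); // triggers hipInit
+///   std::vector<ur_platform_handle_t> Platforms(NumPlatforms);
+///   urPlatformGet(&Adapter, 1, NumPlatforms, Platforms.data(), nullptr);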
 UR_APIEXPORT ur_result_t UR_APICALL
 urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
               ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) {
@@ -57,7 +54,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
   try {
     static std::once_flag InitFlag;
     static uint32_t NumPlatforms = 1;
-    static std::vector<ur_platform_handle_t_> PlatformIds;
+    static ur_platform_handle_t_ Platform;
 
     UR_ASSERT(phPlatforms || pNumPlatforms, UR_RESULT_ERROR_INVALID_VALUE);
     UR_ASSERT(!phPlatforms || NumEntries > 0, UR_RESULT_ERROR_INVALID_VALUE);
@@ -79,22 +76,18 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
             return;
           }
           try {
-            // make one platform per device
-            NumPlatforms = NumDevices;
-            PlatformIds.resize(NumDevices);
-
-            for (int i = 0; i < NumDevices; ++i) {
+            for (auto i = 0u; i < static_cast<uint32_t>(NumDevices); ++i) {
               hipDevice_t Device;
               UR_CHECK_ERROR(hipDeviceGet(&Device, i));
               hipCtx_t Context;
               UR_CHECK_ERROR(hipDevicePrimaryCtxRetain(&Context, Device));
-              PlatformIds[i].Devices.emplace_back(
-                  new ur_device_handle_t_{Device, Context, &PlatformIds[i]});
+              Platform.Devices.emplace_back(
+                  new ur_device_handle_t_{Device, Context, &Platform, i});
             }
 
             // Setup EvBase
             {
-              ScopedContext Active(PlatformIds.front().Devices.front().get());
+              ScopedContext Active(Platform.Devices.front().get());
               hipEvent_t EvBase;
               UR_CHECK_ERROR(hipEventCreate(&EvBase));
               UR_CHECK_ERROR(hipEventRecord(EvBase, 0));
@@ -103,17 +96,11 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
             }
           } catch (const std::bad_alloc &) {
            // Signal out-of-memory situation
-            for (int i = 0; i < NumDevices; ++i) {
-              PlatformIds[i].Devices.clear();
-            }
-            PlatformIds.clear();
+            Platform.Devices.clear();
             Err = UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
           } catch (ur_result_t CatchErr) {
             // Clear and rethrow to allow retry
-            for (int i = 0; i < NumDevices; ++i) {
-              PlatformIds[i].Devices.clear();
-            }
-            PlatformIds.clear();
+            Platform.Devices.clear();
             Err = CatchErr;
             throw CatchErr;
          } catch (...) {
@@ -128,9 +115,7 @@ urPlatformGet(ur_adapter_handle_t *, uint32_t, uint32_t NumEntries,
   }
 
   if (phPlatforms != nullptr) {
-    for (unsigned i = 0; i < std::min(NumEntries, NumPlatforms); ++i) {
-      phPlatforms[i] = &PlatformIds[i];
-    }
+    *phPlatforms = &Platform;
   }
 
   return Result;
diff --git a/source/adapters/hip/program.cpp b/source/adapters/hip/program.cpp
index 2c71c53208..0cf539602b 100644
--- a/source/adapters/hip/program.cpp
+++ b/source/adapters/hip/program.cpp
@@ -74,14 +74,6 @@ void getCoMgrBuildLog(const amd_comgr_data_set_t BuildDataSet, char *BuildLog,
 } // namespace
 #endif
 
-ur_program_handle_t_::ur_program_handle_t_(ur_context_handle_t Ctxt)
-    : Module{nullptr}, Binary{}, BinarySizeInBytes{0}, RefCount{1},
-      Context{Ctxt} {
-  urContextRetain(Context);
-}
-
-ur_program_handle_t_::~ur_program_handle_t_() { urContextRelease(Context); }
-
 ur_result_t
 ur_program_handle_t_::setMetadata(const ur_program_metadata_t *Metadata,
                                   size_t Length) {
@@ -135,8 +127,8 @@ ur_result_t ur_program_handle_t_::finalizeRelocatable() {
   std::string ISA = "amdgcn-amd-amdhsa--";
 
   hipDeviceProp_t Props;
-  detail::ur::assertion(hipGetDeviceProperties(
-                            &Props, Context->getDevice()->get()) == hipSuccess);
+  detail::ur::assertion(hipGetDeviceProperties(&Props, getDevice()->get()) ==
+                        hipSuccess);
   ISA += Props.gcnArchName;
 
   UR_CHECK_ERROR(amd_comgr_action_info_set_isa_name(Action, ISA.data()));
@@ -222,18 +214,13 @@ ur_result_t getKernelNames(ur_program_handle_t) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
-/// HIP will handle the PTX/HIPBIN binaries internally through hipModule_t
-/// object. So, urProgramCreateWithIL and urProgramCreateWithBinary are
-/// equivalent in terms of HIP adapter. See \ref urProgramCreateWithBinary.
+/// A program must be specific to a device, so this entry point is unsupported
 UR_APIEXPORT ur_result_t UR_APICALL
-urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
-                      size_t length, const ur_program_properties_t *pProperties,
-                      ur_program_handle_t *phProgram) {
-  ur_device_handle_t hDevice = hContext->getDevice();
-  const auto pBinary = reinterpret_cast<const uint8_t *>(pIL);
-
-  return urProgramCreateWithBinary(hContext, hDevice, length, pBinary,
-                                   pProperties, phProgram);
+urProgramCreateWithIL(ur_context_handle_t, const void *, size_t,
+                      const ur_program_properties_t *, ur_program_handle_t *) {
+  detail::ur::die("urProgramCreateWithIL not implemented for HIP adapter,"
+                  " please use urProgramCreateWithBinary instead");
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 
 /// HIP will handle the PTX/HIPBIN binaries internally through a call to
@@ -268,7 +255,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t,
   ur_result_t Result = UR_RESULT_SUCCESS;
 
   try {
-    ScopedContext Active(hProgram->getContext()->getDevice());
+    ScopedContext Active(hProgram->getDevice());
 
     hProgram->buildProgram(pOptions);
 
@@ -340,7 +327,7 @@ urProgramGetInfo(ur_program_handle_t hProgram, ur_program_info_t propName,
   case UR_PROGRAM_INFO_NUM_DEVICES:
     return ReturnValue(1u);
   case UR_PROGRAM_INFO_DEVICES:
-    return ReturnValue(&hProgram->Context->DeviceId, 1);
+    return ReturnValue(hProgram->getDevice(), 1);
   case UR_PROGRAM_INFO_SOURCE:
     return ReturnValue(hProgram->Binary);
   case UR_PROGRAM_INFO_BINARY_SIZES:
@@ -380,7 +367,7 @@ urProgramRelease(ur_program_handle_t hProgram) {
   ur_result_t Result = UR_RESULT_ERROR_INVALID_PROGRAM;
 
   try {
-    ScopedContext Active(hProgram->getContext()->getDevice());
+    ScopedContext Active(hProgram->getDevice());
 
     auto HIPModule = hProgram->get();
     if (HIPModule) {
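+      // hipModuleUnload acts on the currently active HIP context, so the
+      // program's device context must be active here (set by the
+      // ScopedContext above).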
       UR_CHECK_ERROR(hipModuleUnload(HIPModule));
@@ -422,13 +409,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     const uint8_t *pBinary, const ur_program_properties_t *pProperties,
     ur_program_handle_t *phProgram) {
   UR_ASSERT(pBinary != nullptr && size != 0, UR_RESULT_ERROR_INVALID_BINARY);
-  UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
+  UR_ASSERT(std::find(hContext->getDevices().begin(),
+                      hContext->getDevices().end(),
+                      hDevice) != hContext->getDevices().end(),
             UR_RESULT_ERROR_INVALID_CONTEXT);
 
   ur_result_t Result = UR_RESULT_SUCCESS;
 
   std::unique_ptr<ur_program_handle_t_> RetProgram{
-      new ur_program_handle_t_{hContext}};
+      new ur_program_handle_t_{hContext, hDevice}};
 
   // TODO: Set metadata here and use reqd_work_group_size information.
   // See urProgramCreateWithBinary in CUDA adapter.
@@ -469,8 +458,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetFunctionPointer(
     ur_device_handle_t hDevice, ur_program_handle_t hProgram,
     const char *pFunctionName, void **ppFunctionPointer) {
-  // Check if device passed is the same the device bound to the context
-  UR_ASSERT(hDevice == hProgram->getContext()->getDevice(),
-            UR_RESULT_ERROR_INVALID_DEVICE);
+  // Check that the passed device is the device the program was built for
+  UR_ASSERT(hDevice == hProgram->getDevice(), UR_RESULT_ERROR_INVALID_DEVICE);
 
   hipFunction_t Func;
   hipError_t Ret = hipModuleGetFunction(&Func, hProgram->get(), pFunctionName);
diff --git a/source/adapters/hip/program.hpp b/source/adapters/hip/program.hpp
index ff9b68fc92..4b4e5ec878 100644
--- a/source/adapters/hip/program.hpp
+++ b/source/adapters/hip/program.hpp
@@ -23,6 +23,7 @@ struct ur_program_handle_t_ {
   size_t BinarySizeInBytes;
   std::atomic_uint32_t RefCount;
   ur_context_handle_t Context;
+  ur_device_handle_t Device;
   std::string ExecutableCache;
 
   // Metadata
@@ -34,8 +35,17 @@ struct ur_program_handle_t_ {
   std::string BuildOptions;
   ur_program_build_status_t BuildStatus = UR_PROGRAM_BUILD_STATUS_NONE;
 
-  ur_program_handle_t_(ur_context_handle_t Ctxt);
-  ~ur_program_handle_t_();
+  ur_program_handle_t_(ur_context_handle_t Ctxt, ur_device_handle_t Device)
+      : Module{nullptr}, Binary{},
+        BinarySizeInBytes{0}, RefCount{1}, Context{Ctxt}, Device{Device} {
+    urContextRetain(Context);
+    urDeviceRetain(Device);
+  }
+
+  ~ur_program_handle_t_() {
+    urContextRelease(Context);
+    urDeviceRelease(Device);
+  }
 
   ur_result_t setMetadata(const ur_program_metadata_t *Metadata,
                           size_t Length);
@@ -44,6 +54,7 @@ struct ur_program_handle_t_ {
   ur_result_t buildProgram(const char *BuildOptions);
   ur_result_t finalizeRelocatable();
   ur_context_handle_t getContext() const { return Context; };
+  ur_device_handle_t getDevice() const { return Device; };
 
   native_type get() const noexcept { return Module; };
diff --git a/source/adapters/hip/queue.cpp b/source/adapters/hip/queue.cpp
index 910d7cf512..f01fc0e180 100644
--- a/source/adapters/hip/queue.cpp
+++ b/source/adapters/hip/queue.cpp
@@ -110,14 +110,13 @@ hipStream_t ur_queue_handle_t_::getNextTransferStream() {
 UR_APIEXPORT ur_result_t UR_APICALL
 urQueueCreate(ur_context_handle_t hContext, ur_device_handle_t hDevice,
               const ur_queue_properties_t *pProps, ur_queue_handle_t *phQueue) {
+  UR_ASSERT(std::find(hContext->getDevices().begin(),
+                      hContext->getDevices().end(),
+                      hDevice) != hContext->getDevices().end(),
+            UR_RESULT_ERROR_INVALID_CONTEXT);
   try {
     std::unique_ptr<ur_queue_handle_t_> QueueImpl{nullptr};
 
-    if (hContext->getDevice() != hDevice) {
-      *phQueue = nullptr;
-      return UR_RESULT_ERROR_INVALID_DEVICE;
-    }
-
     unsigned int Flags = 0;
 
     const bool IsOutOfOrder =
@@ -198,7 +197,7 @@ UR_APIEXPORT ur_result_t UR_APICALL
 urQueueRelease(ur_queue_handle_t hQueue) {
   if (!hQueue->backendHasOwnership())
     return UR_RESULT_SUCCESS;
 
-  ScopedContext Active(hQueue->getContext()->getDevice());
+  ScopedContext Active(hQueue->getDevice());
 
   hQueue->forEachStream([](hipStream_t S) {
     UR_CHECK_ERROR(hipStreamSynchronize(S));
@@ -219,7 +218,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
 
   try {
-    ScopedContext Active(hQueue->getContext()->getDevice());
+    ScopedContext Active(hQueue->getDevice());
 
     hQueue->syncStreams([&Result](hipStream_t S) {
       UR_CHECK_ERROR(hipStreamSynchronize(S));
@@ -251,7 +250,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) {
 UR_APIEXPORT ur_result_t UR_APICALL
 urQueueGetNativeHandle(ur_queue_handle_t hQueue, ur_queue_native_desc_t *,
                        ur_native_handle_t *phNativeQueue) {
-  ScopedContext Active(hQueue->getContext()->getDevice());
+  ScopedContext Active(hQueue->getDevice());
   *phNativeQueue =
       reinterpret_cast<ur_native_handle_t>(hQueue->getNextComputeStream());
   return UR_RESULT_SUCCESS;
@@ -291,7 +290,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreateWithNativeHandle(
       new ur_queue_handle_t_{std::move(ComputeHIPStreams),
                              std::move(TransferHIPStreams),
                              hContext,
-                             hContext->getDevice(),
+                             hDevice,
                              HIPFlags,
                              Flags,
                              /*backend_owns*/ pProperties->isNativeHandleOwned};
diff --git a/source/adapters/hip/usm.cpp b/source/adapters/hip/usm.cpp
index 7af7401f87..334e0a86c1 100644
--- a/source/adapters/hip/usm.cpp
+++ b/source/adapters/hip/usm.cpp
@@ -66,11 +66,10 @@ urUSMSharedAlloc(ur_context_handle_t hContext, ur_device_handle_t hDevice,
   return umfPoolMallocHelper(hPool, ppMem, size, alignment);
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL USMFreeImpl(ur_context_handle_t hContext,
-                                                void *pMem) {
+UR_APIEXPORT ur_result_t UR_APICALL
+USMFreeImpl([[maybe_unused]] ur_context_handle_t hContext, void *pMem) {
   ur_result_t Result = UR_RESULT_SUCCESS;
   try {
-    ScopedContext Active(hContext->getDevice());
     hipPointerAttribute_t hipPointerAttributeType;
     UR_CHECK_ERROR(hipPointerGetAttributes(&hipPointerAttributeType, pMem));
     unsigned int Type = hipPointerAttributeType.memoryType;
@@ -98,12 +97,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
   }
 }
 
-ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
-                               ur_device_handle_t, ur_usm_device_mem_flags_t *,
-                               size_t Size,
+ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t,
+                               ur_device_handle_t Device,
+                               ur_usm_device_mem_flags_t *, size_t Size,
                                [[maybe_unused]] uint32_t Alignment) {
   try {
-    ScopedContext Active(Context->getDevice());
+    ScopedContext Active(Device);
     UR_CHECK_ERROR(hipMalloc(ResultPtr, Size));
   } catch (ur_result_t Err) {
     return Err;
@@ -113,12 +112,13 @@ ur_result_t USMDeviceAllocImpl(void **ResultPtr, ur_context_handle_t Context,
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
-                               ur_device_handle_t, ur_usm_host_mem_flags_t *,
+ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t,
+                               ur_device_handle_t Device,
+                               ur_usm_host_mem_flags_t *,
                                ur_usm_device_mem_flags_t *, size_t Size,
                                [[maybe_unused]] uint32_t Alignment) {
   try {
-    ScopedContext Active(Context->getDevice());
+    ScopedContext Active(Device);
     UR_CHECK_ERROR(hipMallocManaged(ResultPtr, Size, hipMemAttachGlobal));
   } catch (ur_result_t Err) {
     return Err;
@@ -128,11 +128,11 @@ ur_result_t USMSharedAllocImpl(void **ResultPtr, ur_context_handle_t Context,
   return UR_RESULT_SUCCESS;
 }
 
-ur_result_t USMHostAllocImpl(void **ResultPtr, ur_context_handle_t Context,
+ur_result_t USMHostAllocImpl(void **ResultPtr,
+                             [[maybe_unused]] ur_context_handle_t Context,
                              ur_usm_host_mem_flags_t *, size_t Size,
                              [[maybe_unused]] uint32_t Alignment) {
   try {
-    ScopedContext Active(Context->getDevice());
     UR_CHECK_ERROR(hipHostMalloc(ResultPtr, Size));
   } catch (ur_result_t Err) {
     return Err;
@@ -152,7 +152,6 @@ urUSMGetMemAllocInfo(ur_context_handle_t hContext, const void *pMem,
   UrReturnHelper ReturnValue(propValueSize, pPropValue, pPropValueSizeRet);
 
   try {
-    ScopedContext Active(hContext->getDevice());
     switch (propName) {
     case UR_USM_ALLOC_INFO_TYPE: {
       unsigned int Value;
@@ -346,25 +345,26 @@ ur_usm_pool_handle_t_::ur_usm_pool_handle_t_(ur_context_handle_t Context,
           this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Host])
           .second;
 
-  auto Device = Context->DeviceId;
-  MemProvider =
-      umf::memoryProviderMakeUnique<USMDeviceMemoryProvider>(Context, Device)
-          .second;
-  DeviceMemPool =
-      umf::poolMakeUnique<usm::DisjointPool, 1>(
-          {std::move(MemProvider)},
-          this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device])
-          .second;
-
-  MemProvider =
-      umf::memoryProviderMakeUnique<USMSharedMemoryProvider>(Context, Device)
-          .second;
-  SharedMemPool =
-      umf::poolMakeUnique<usm::DisjointPool, 1>(
-          {std::move(MemProvider)},
-          this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared])
-          .second;
-  Context->addPool(this);
+  for (const auto &Device : Context->getDevices()) {
+    MemProvider =
+        umf::memoryProviderMakeUnique<USMDeviceMemoryProvider>(Context, Device)
+            .second;
+    DeviceMemPool =
+        umf::poolMakeUnique<usm::DisjointPool, 1>(
+            {std::move(MemProvider)},
+            this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Device])
+            .second;
+
+    MemProvider =
+        umf::memoryProviderMakeUnique<USMSharedMemoryProvider>(Context, Device)
+            .second;
+    SharedMemPool =
+        umf::poolMakeUnique<usm::DisjointPool, 1>(
+            {std::move(MemProvider)},
+            this->DisjointPoolConfigs.Configs[usm::DisjointPoolMemType::Shared])
+            .second;
+    Context->addPool(this);
+  }
 }
 
 bool ur_usm_pool_handle_t_::hasUMFPool(umf_memory_pool_t *umf_pool) {
diff --git a/source/ur/ur.hpp b/source/ur/ur.hpp
index 0437d719ba..da5ef0d81f 100644
--- a/source/ur/ur.hpp
+++ b/source/ur/ur.hpp
@@ -106,6 +106,7 @@ class ur_shared_mutex {
 // nop.
 class ur_mutex {
     std::mutex Mutex;
+    friend class ur_lock;
 
   public:
    void lock() {
@@ -121,6 +122,17 @@ class ur_mutex {
     }
 };
 
+class ur_lock {
+    std::unique_lock<std::mutex> Lock;
+
+  public:
+    explicit ur_lock(ur_mutex &Mutex) {
+        if (!SingleThreadMode) {
+            Lock = std::unique_lock<std::mutex>(Mutex.Mutex);
+        }
+    }
+};
+
 /// SpinLock is a synchronization primitive that uses an atomic variable and
 /// causes a thread trying to acquire the lock to wait in a loop while
 /// repeatedly checking if the lock is available.
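+
+// A minimal usage sketch for the new ur_lock guard (illustrative only, not
+// part of this patch):
+//
+//   ur_mutex M;
+//   void update() {
+//       ur_lock Guard{M}; // no-op when SingleThreadMode is set
+//       // ... critical section; the lock is released when Guard goes out
+//       // of scope
+//   }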
diff --git a/test/adapters/hip/test_context.cpp b/test/adapters/hip/test_context.cpp
index 90c28b842f..c58dfc5af7 100644
--- a/test/adapters/hip/test_context.cpp
+++ b/test/adapters/hip/test_context.cpp
@@ -28,7 +28,9 @@ TEST_P(urHipContextTest, ActiveContexts) {
     hipCtx_t hipContext = nullptr;
     ASSERT_SUCCESS_HIP(hipCtxGetCurrent(&hipContext));
     ASSERT_NE(hipContext, nullptr);
-    ASSERT_EQ(hipContext, context->getDevice()->getNativeContext());
+    if (context->getDevices().size() == 1) {
+        ASSERT_EQ(hipContext, context->getDevices()[0]->getNativeContext());
+    }
 
     ASSERT_SUCCESS(urQueueRelease(queue));
     ASSERT_SUCCESS(urContextRelease(context));
@@ -60,7 +62,9 @@ TEST_P(urHipContextTest, ActiveContextsThreads) {
 
         // check that the first context is now the active HIP context
         ASSERT_SUCCESS_HIP(hipCtxGetCurrent(&current));
-        ASSERT_EQ(current, context1->getDevice()->getNativeContext());
+        if (context1->getDevices().size() == 1) {
+            ASSERT_EQ(current, context1->getDevices()[0]->getNativeContext());
+        }
 
         ASSERT_SUCCESS(urQueueRelease(queue));
 
@@ -87,7 +91,9 @@ TEST_P(urHipContextTest, ActiveContextsThreads) {
 
         // check that the second context is now the active HIP context
         ASSERT_SUCCESS_HIP(hipCtxGetCurrent(&current));
-        ASSERT_EQ(current, context2->getDevice()->getNativeContext());
+        if (context2->getDevices().size() == 1) {
+            ASSERT_EQ(current, context2->getDevices()[0]->getNativeContext());
+        }
 
         ASSERT_SUCCESS(urQueueRelease(queue));
     });
diff --git a/test/conformance/context/context_adapter_hip.match b/test/conformance/context/context_adapter_hip.match
index 129b8d392c..82d8d71397 100644
--- a/test/conformance/context/context_adapter_hip.match
+++ b/test/conformance/context/context_adapter_hip.match
@@ -1 +1,2 @@
 urContextCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_
+urContextGetInfoTestWithInfoParam.Success/AMD_HIP_BACKEND___{{.*}}
diff --git a/test/conformance/memory/memory_adapter_hip.match b/test/conformance/memory/memory_adapter_hip.match
index a4ae7d4f8a..02760dcb8a 100644
--- a/test/conformance/memory/memory_adapter_hip.match
+++ b/test/conformance/memory/memory_adapter_hip.match
@@ -1,5 +1,7 @@
-urMemBufferCreateWithNativeHandleTest.Success/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urMemImageCreateTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}_
-{{OPT}}urMemImageGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_IMAGE_INFO_ROW_PITCH
-{{OPT}}urMemImageGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}___UR_IMAGE_INFO_SLICE_PITCH
-{{OPT}}Segmentation fault
+{{OPT}}urMemGetInfoTest.InvalidNullPointerParamValue/AMD_HIP_BACKEND___{{.*}}
+{{OPT}}urMemGetInfoTest.InvalidNullPointerParamValue/AMD_HIP_BACKEND___{{.*}}
+{{OPT}}urMemGetInfoTest.InvalidNullPointerPropSizeRet/AMD_HIP_BACKEND___{{.*}}
+{{OPT}}urMemGetInfoTest.InvalidNullPointerPropSizeRet/AMD_HIP_BACKEND___{{.*}}
+{{OPT}}urMemImageCreateTest.InvalidSize/AMD_HIP_BACKEND___{{.*}}
+{{OPT}}urMemImageGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}
+{{OPT}}urMemImageGetInfoTest.Success/AMD_HIP_BACKEND___{{.*}}