Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 56 additions & 17 deletions source/adapters/hip/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -769,29 +769,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferCopyRect(
return Result;
}

// HIP has no memset functions that allow setting values more than 4 bytes. UR
// API lets you pass an arbitrary "pattern" to the buffer fill, which can be
// more than 4 bytes. We must break up the pattern into 1 byte values, and set
// the buffer using multiple strided calls. The first 4 patterns are set using
// hipMemsetD32Async then all subsequent 1 byte patterns are set using
// hipMemset2DAsync which is called for each pattern.
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
size_t Size, const void *pPattern,
hipDeviceptr_t Ptr) {
// Calculate the number of patterns, stride, number of times the pattern
// needs to be applied, and the number of times the first 32 bit pattern
// needs to be applied.
static inline void memsetRemainPattern(hipStream_t Stream, uint32_t PatternSize,
size_t Size, const void *pPattern,
hipDeviceptr_t Ptr) {

// Calculate the number of patterns, stride and the number of times the
// pattern needs to be applied.
auto NumberOfSteps = PatternSize / sizeof(uint8_t);
auto Pitch = NumberOfSteps * sizeof(uint8_t);
auto Height = Size / NumberOfSteps;
auto Count32 = Size / sizeof(uint32_t);

// Get 4-byte chunk of the pattern and call hipMemsetD32Async
auto Value = *(static_cast<const uint32_t *>(pPattern));
UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));
for (auto step = 4u; step < NumberOfSteps; ++step) {
// take 1 byte of the pattern
Value = *(static_cast<const uint8_t *>(pPattern) + step);
auto Value = *(static_cast<const uint8_t *>(pPattern) + step);

// offset the pointer to the part of the buffer we want to write to
auto OffsetPtr = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(Ptr) +
Expand All @@ -801,6 +791,55 @@ ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
UR_CHECK_ERROR(hipMemset2DAsync(OffsetPtr, Pitch, Value, sizeof(uint8_t),
Height, Stream));
}
}

// HIP has no memset functions that allow setting values more than 4 bytes. UR
// API lets you pass an arbitrary "pattern" to the buffer fill, which can be
// more than 4 bytes. We must break up the pattern into 1 byte values, and set
// the buffer using multiple strided calls. The first 4 patterns are set using
// hipMemsetD32Async then all subsequent 1 byte patterns are set using
// hipMemset2DAsync which is called for each pattern.
ur_result_t commonMemSetLargePattern(hipStream_t Stream, uint32_t PatternSize,
size_t Size, const void *pPattern,
hipDeviceptr_t Ptr) {

// Get 4-byte chunk of the pattern and call hipMemsetD32Async
auto Count32 = Size / sizeof(uint32_t);
auto Value = *(static_cast<const uint32_t *>(pPattern));
UR_CHECK_ERROR(hipMemsetD32Async(Ptr, Value, Count32, Stream));

// There is a bug in ROCm prior to 6.0.0 version which causes hipMemset2D
// to behave incorrectly when acting on host pinned memory.
// In such a case, the memset operation is partially emulated with memcpy.
#if HIP_VERSION_MAJOR < 6
hipPointerAttribute_t ptrAttribs{};
UR_CHECK_ERROR(hipPointerGetAttributes(&ptrAttribs, (const void *)Ptr));

// The hostPointer attribute is non-null also for shared memory allocations.
// To make sure that this workaround only executes for host pinned memory, we
// need to check that isManaged attribute is false.
if (ptrAttribs.hostPointer && !ptrAttribs.isManaged) {
const auto NumOfCopySteps = Size / PatternSize;
const auto Offset = sizeof(uint32_t);
const auto LeftPatternSize = PatternSize - Offset;
const auto OffsetPatternPtr = reinterpret_cast<const void *>(
reinterpret_cast<const uint8_t *>(pPattern) + Offset);

// Loop through the memory area to memset, advancing each time by the
// PatternSize and memcpy the left over pattern bits.
for (uint32_t i = 0; i < NumOfCopySteps; ++i) {
auto OffsetDstPtr = reinterpret_cast<void *>(
reinterpret_cast<uint8_t *>(Ptr) + Offset + i * PatternSize);
UR_CHECK_ERROR(hipMemcpyAsync(OffsetDstPtr, OffsetPatternPtr,
LeftPatternSize, hipMemcpyHostToHost,
Stream));
}
} else {
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
}
#else
memsetRemainPattern(Stream, PatternSize, Size, pPattern, Ptr);
#endif
return UR_RESULT_SUCCESS;
}

Expand Down