diff --git a/source/adapters/opencl/usm.cpp b/source/adapters/opencl/usm.cpp index 3f4382fc0d..09dae5fe45 100644 --- a/source/adapters/opencl/usm.cpp +++ b/source/adapters/opencl/usm.cpp @@ -256,9 +256,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( return UR_RESULT_SUCCESS; } - // OpenCL only supports pattern sizes as large as the largest CL type - // (double16/long16 - 128 bytes), anything larger we need to do on the host - // side and copy it into the target allocation. + // OpenCL only supports pattern sizes which are powers of 2 and are as large + // as the largest CL type (double16/long16 - 128 bytes), anything larger or + // not a power of 2, we need to do on the host side and copy it into the + // target allocation. clHostMemAllocINTEL_fn HostMemAlloc = nullptr; UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext( CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache, @@ -275,14 +276,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill( cl_ext::MemBlockingFreeName, &USMFree)); cl_int ClErr = CL_SUCCESS; - auto HostBuffer = static_cast( - HostMemAlloc(CLContext, nullptr, size, 0, &ClErr)); + auto HostBuffer = + static_cast(HostMemAlloc(CLContext, nullptr, size, 0, &ClErr)); CL_RETURN_ON_FAILURE(ClErr); - auto NumValues = size / sizeof(uint64_t); - auto NumChunks = patternSize / sizeof(uint64_t); - for (size_t i = 0; i < NumValues; i++) { - HostBuffer[i] = static_cast(pPattern)[i % NumChunks]; + auto *End = HostBuffer + size; + for (auto *Iter = HostBuffer; Iter < End; Iter += patternSize) { + std::memcpy(Iter, pPattern, patternSize); } cl_event CopyEvent = nullptr;