Merged
Changes from all commits
23 commits
dfc5abe  [HIP] Fix Kernel Compilation on AMD (veselypeta, Oct 18, 2023)
a2623e2  [HIP] Don't link against device binaries to fix CI (veselypeta, Oct 24, 2023)
66763bf  [HIP] Fix host/device synchronization (veselypeta, Oct 26, 2023)
c7f34f3  [HIP] Make synchronized time optional (veselypeta, Oct 26, 2023)
0ce1654  [HIP][CTS] Fix Device CTS failures (veselypeta, Oct 26, 2023)
c8f5320  [UR] Remove passing queue tests from match file (veselypeta, Oct 26, 2023)
451017f  [HIP] Fix get mem size segfault (veselypeta, Oct 30, 2023)
7377fc0  [OO] Add CTS for urMemGetInfo with images (veselypeta, Oct 30, 2023)
28ff1cb  [UR] Disable Image query on CUDA (veselypeta, Oct 30, 2023)
d1d4f10  [UR] Disable Image query on L0 (veselypeta, Oct 30, 2023)
60c0b81  [UR] Windows doesn't like std::array without template arguments (veselypeta, Oct 30, 2023)
8e7f785  [HIP] Implement urMemImageGetInfo (veselypeta, Oct 30, 2023)
7928a9b  [HIP] Define all UR entry points (veselypeta, Oct 31, 2023)
858938d  Merge branch 'petr/compiler-hip-kernels' into benie/hip-cts-fixes-com… (kbenzie, Nov 2, 2023)
d4e45e1  Merge branch 'petr/fix-hip-sync' into benie/hip-cts-fixes-combined (kbenzie, Nov 2, 2023)
e0e55b9  Merge branch 'petr/cts-device-hip' into benie/hip-cts-fixes-combined (kbenzie, Nov 2, 2023)
dd1e816  Merge branch 'petr/fix-mem-segfault' into benie/hip-cts-fixes-combined (kbenzie, Nov 2, 2023)
0c7b74e  Merge branch 'petr/hip-get-mem-info' into benie/hip-cts-fixes-combined (kbenzie, Nov 2, 2023)
716cd8f  Merge branch 'petr/hip-define-all-entry-points' into benie/hip-cts-fi… (kbenzie, Nov 2, 2023)
08ca4be  [HIP] Update CTS match files (kbenzie, Nov 2, 2023)
8945db4  [CUDA] Update hint functions to only return warnings (Oct 23, 2023)
7ea615a  [CUDA] Add support for binary type query (Oct 4, 2023)
d164792  Restore change from adapters branch (Nov 14, 2023)
4 changes: 3 additions & 1 deletion .github/workflows/cmake.yml
@@ -164,7 +164,7 @@ jobs:
matrix:
adapter: [
{name: CUDA, triplet: nvptx64-nvidia-cuda},
{name: HIP, triplet: spir64}, # should be amdgcn-amdhsa, but build scripts for device binaries are currently broken for this target.
{name: HIP, triplet: amdgcn-amd-amdhsa},
{name: L0, triplet: spir64}
]
build_type: [Debug, Release]
@@ -198,6 +198,8 @@ jobs:
-DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++
-DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib
-DUR_CONFORMANCE_TARGET_TRIPLES=${{matrix.adapter.triplet}}
${{ matrix.adapter.name == 'HIP' && '-DAMD_ARCH=gfx1030' || '' }}
${{ matrix.adapter.name == 'HIP' && '-DUR_HIP_PLATFORM=AMD' || '' }}

- name: Build
# This is so that device binaries can find the sycl runtime library
17 changes: 9 additions & 8 deletions source/adapters/cuda/enqueue.cpp
@@ -121,7 +121,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,

for (auto &UnmappedFlag : UnmappedMemAdviceFlags) {
if (URAdviceFlags & UnmappedFlag) {
throw UR_RESULT_ERROR_INVALID_ENUMERATION;
setErrorMessage("Memory advice ignored because the CUDA backend does not "
"support some of the specified flags",
UR_RESULT_SUCCESS);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}
}

@@ -1355,15 +1358,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
ur_queue_handle_t hQueue, const void *pMem, size_t size,
ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
unsigned int PointerRangeSize = 0;
std::ignore = flags;

size_t PointerRangeSize = 0;
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
ur_device_handle_t Device = hQueue->getContext()->getDevice();

// Certain cuda devices and Windows do not have support for some Unified
// Memory features. cuMemPrefetchAsync requires concurrent memory access
// for managed memory. Therfore, ignore prefetch hint if concurrent managed
// for managed memory. Therefore, ignore prefetch hint if concurrent managed
// memory access is not available.
if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
setErrorMessage("Prefetch hint ignored as device does not support "
@@ -1381,10 +1386,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

// flags is currently unused so fail if set
if (flags != 0)
return UR_RESULT_ERROR_INVALID_VALUE;

ur_result_t Result = UR_RESULT_SUCCESS;
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};

Expand Down Expand Up @@ -1415,7 +1416,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
UR_APIEXPORT ur_result_t UR_APICALL
urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) {
unsigned int PointerRangeSize = 0;
size_t PointerRangeSize = 0;
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
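The hunks above change setCuMemAdvise and urEnqueueUSMPrefetch to record a message with setErrorMessage and return UR_RESULT_ERROR_ADAPTER_SPECIFIC instead of failing hard when a hint cannot be honoured. Below is a minimal caller-side sketch of treating that result as a non-fatal warning; the helper name PrefetchWithFallback is illustrative and not part of this PR, and only the urEnqueueUSMPrefetch signature shown in the diff is relied on.

```cpp
#include <ur_api.h>

// Sketch only: issue a USM prefetch and treat an adapter-specific result
// (hint ignored, message recorded via setErrorMessage) as success.
ur_result_t PrefetchWithFallback(ur_queue_handle_t Queue, const void *Ptr,
                                 size_t Size, ur_event_handle_t *Event) {
  ur_result_t Res =
      urEnqueueUSMPrefetch(Queue, Ptr, Size, /*flags=*/0,
                           /*numEventsInWaitList=*/0,
                           /*phEventWaitList=*/nullptr, Event);
  if (Res == UR_RESULT_ERROR_ADAPTER_SPECIFIC) {
    // The adapter skipped the prefetch hint; safe to continue.
    return UR_RESULT_SUCCESS;
  }
  return Res; // Any other non-success code is a real failure.
}
```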
85 changes: 50 additions & 35 deletions source/adapters/cuda/program.cpp
@@ -165,6 +165,42 @@ ur_result_t getKernelNames(ur_program_handle_t) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
ur_result_t createProgram(ur_context_handle_t hContext,
ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary,
const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext}};

if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
UR_CHECK_ERROR(
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count));
}

auto pBinary_string = reinterpret_cast<const char *>(pBinary);

UR_CHECK_ERROR(RetProgram->setBinary(pBinary_string, size));
*phProgram = RetProgram.release();

return UR_RESULT_SUCCESS;
}

/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
@@ -175,8 +211,8 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
ur_device_handle_t hDevice = hContext->getDevice();
auto pBinary = reinterpret_cast<const uint8_t *>(pIL);

return urProgramCreateWithBinary(hContext, hDevice, length, pBinary,
pProperties, phProgram);
return createProgram(hContext, hDevice, length, pBinary, pProperties,
phProgram);
}

/// CUDA will handle the PTX/CUBIN binaries internally through a call to
@@ -185,7 +221,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
UR_APIEXPORT ur_result_t UR_APICALL
urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
const char *pOptions) {
return urProgramBuild(hContext, hProgram, pOptions);
UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions));
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
return UR_RESULT_SUCCESS;
}

/// Loads the images from a UR program into a CUmodule that can be
@@ -202,6 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
ScopedContext Active(hProgram->getContext());

hProgram->buildProgram(pOptions);
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;

} catch (ur_result_t Err) {
Result = Err;
@@ -241,6 +280,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
RetProgram->setBinary(static_cast<const char *>(CuBin), CuBinSize);

Result = RetProgram->buildProgram(pOptions);
RetProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
} catch (...) {
// Upon error attempt cleanup
UR_CHECK_ERROR(cuLinkDestroy(State));
@@ -287,6 +327,9 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice,
return ReturnValue(hProgram->BuildOptions.c_str());
case UR_PROGRAM_BUILD_INFO_LOG:
return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize);
case UR_PROGRAM_BUILD_INFO_BINARY_TYPE: {
return ReturnValue(hProgram->BinaryType);
}
default:
break;
}
@@ -384,44 +427,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
return UR_RESULT_SUCCESS;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
const uint8_t *pBinary, const ur_program_properties_t *pProperties,
ur_program_handle_t *phProgram) {
UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
UR_RESULT_ERROR_INVALID_CONTEXT);
UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

ur_result_t Result = UR_RESULT_SUCCESS;
UR_CHECK_ERROR(
createProgram(hContext, hDevice, size, pBinary, pProperties, phProgram));
(*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;

std::unique_ptr<ur_program_handle_t_> RetProgram{
new ur_program_handle_t_{hContext}};

if (pProperties) {
if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
return UR_RESULT_ERROR_INVALID_NULL_POINTER;
} else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
return UR_RESULT_ERROR_INVALID_SIZE;
}
Result =
RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
}
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

auto pBinary_string = reinterpret_cast<const char *>(pBinary);

Result = RetProgram->setBinary(pBinary_string, size);
UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);

*phProgram = RetProgram.release();

return Result;
return UR_RESULT_SUCCESS;
}

// This entry point is only used for native specialization constants (SPIR-V),
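With BinaryType now tracked on the program handle, UR_PROGRAM_BUILD_INFO_BINARY_TYPE becomes queryable through urProgramGetBuildInfo, as the new case in the switch above shows. A minimal sketch of that query follows; the helper name QueryBinaryType is illustrative, and the urProgramGetBuildInfo parameter order is taken from the UR spec rather than from this diff.

```cpp
#include <ur_api.h>

// Sketch only: report whether a program currently holds a compiled object
// (after urProgramCompile / urProgramCreateWithBinary) or an executable
// (after urProgramBuild / urProgramLink).
ur_program_binary_type_t QueryBinaryType(ur_program_handle_t Program,
                                         ur_device_handle_t Device) {
  ur_program_binary_type_t Type = UR_PROGRAM_BINARY_TYPE_NONE;
  if (urProgramGetBuildInfo(Program, Device,
                            UR_PROGRAM_BUILD_INFO_BINARY_TYPE, sizeof(Type),
                            &Type, /*pPropSizeRet=*/nullptr) !=
      UR_RESULT_SUCCESS) {
    return UR_PROGRAM_BINARY_TYPE_NONE;
  }
  return Type;
}
```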
6 changes: 6 additions & 0 deletions source/adapters/cuda/program.hpp
@@ -25,6 +25,12 @@ struct ur_program_handle_t_ {
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;

/* The ur_program_binary_type_t property is defined individually for every
* device in a program. However, since the CUDA adapter only has 1 device per
* context / program, there is no need to keep track of its value for each
* device. */
ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE;

// Metadata
std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
KernelReqdWorkGroupSizeMD;
56 changes: 32 additions & 24 deletions source/adapters/hip/device.cpp
@@ -210,14 +210,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(uint64_t{MaxAlloc});
}
case UR_DEVICE_INFO_IMAGE_SUPPORTED: {
return ReturnValue(uint32_t{true});
return ReturnValue(true);
}
case UR_DEVICE_INFO_MAX_READ_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
// but some searching found as of SM 2.x 128 are supported.
return ReturnValue(128u);
}
case UR_DEVICE_INFO_MAX_READ_WRITE_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
// but some searching found as of SM 2.x 128 are supported.
return ReturnValue(128u);
}
case UR_DEVICE_INFO_MAX_WRITE_IMAGE_ARGS: {
// This call doesn't match to HIP as it doesn't have images, but instead
// surfaces and textures. No clear call in the HIP API to determine this,
@@ -339,7 +345,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(0u);
}
case UR_DEVICE_INFO_SINGLE_FP_CONFIG: {
uint64_t Config =
ur_device_fp_capability_flags_t Config =
UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
@@ -350,12 +356,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(Config);
}
case UR_DEVICE_INFO_DOUBLE_FP_CONFIG: {
uint64_t Config = UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
UR_DEVICE_FP_CAPABILITY_FLAG_FMA;
ur_device_fp_capability_flags_t Config =
UR_DEVICE_FP_CAPABILITY_FLAG_DENORM |
UR_DEVICE_FP_CAPABILITY_FLAG_INF_NAN |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_NEAREST |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_ZERO |
UR_DEVICE_FP_CAPABILITY_FLAG_ROUND_TO_INF |
UR_DEVICE_FP_CAPABILITY_FLAG_FMA;
return ReturnValue(Config);
}
case UR_DEVICE_INFO_GLOBAL_MEM_CACHE_TYPE: {
@@ -459,14 +466,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
}
case UR_DEVICE_INFO_QUEUE_ON_DEVICE_PROPERTIES: {
// The mandated minimum capability:
uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE |
UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE;
return ReturnValue(Capability);
}
case UR_DEVICE_INFO_QUEUE_ON_HOST_PROPERTIES:
case UR_DEVICE_INFO_QUEUE_PROPERTIES: {
// The mandated minimum capability:
uint64_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE;
ur_queue_flags_t Capability = UR_QUEUE_FLAG_PROFILING_ENABLE;
return ReturnValue(Capability);
}
case UR_DEVICE_INFO_BUILT_IN_KERNELS: {
@@ -730,9 +737,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
}

case UR_DEVICE_INFO_ATOMIC_MEMORY_ORDER_CAPABILITIES: {
uint64_t Capabilities = UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
ur_memory_order_capability_flags_t Capabilities =
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELAXED |
UR_MEMORY_ORDER_CAPABILITY_FLAG_ACQUIRE |
UR_MEMORY_ORDER_CAPABILITY_FLAG_RELEASE;
return ReturnValue(Capabilities);
}
case UR_DEVICE_INFO_ATOMIC_MEMORY_SCOPE_CAPABILITIES:
@@ -821,7 +829,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
case UR_DEVICE_INFO_GPU_HW_THREADS_PER_EU:
case UR_DEVICE_INFO_MAX_MEMORY_BANDWIDTH:
case UR_DEVICE_INFO_BFLOAT16:
return UR_RESULT_ERROR_INVALID_ENUMERATION;
case UR_DEVICE_INFO_IL_VERSION:
case UR_DEVICE_INFO_ASYNC_BARRIER:
case UR_DEVICE_INFO_VIRTUAL_MEMORY_SUPPORT:
return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;

default:
break;
@@ -939,21 +950,18 @@ ur_result_t UR_APICALL urDeviceGetGlobalTimestamps(ur_device_handle_t hDevice,
if (pDeviceTimestamp) {
UR_CHECK_ERROR(hipEventCreateWithFlags(&Event, hipEventDefault));
UR_CHECK_ERROR(hipEventRecord(Event));
}
if (pHostTimestamp) {
using namespace std::chrono;
*pHostTimestamp =
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
.count();
}

if (pDeviceTimestamp) {
UR_CHECK_ERROR(hipEventSynchronize(Event));
float ElapsedTime = 0.0f;
UR_CHECK_ERROR(hipEventElapsedTime(&ElapsedTime,
ur_platform_handle_t_::EvBase, Event));
*pDeviceTimestamp = (uint64_t)(ElapsedTime * (double)1e6);
}

if (pHostTimestamp) {
using namespace std::chrono;
*pHostTimestamp =
duration_cast<nanoseconds>(steady_clock::now().time_since_epoch())
.count();
}
return UR_RESULT_SUCCESS;
}
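The reordered urDeviceGetGlobalTimestamps above records the HIP event and samples the host clock before synchronizing, so the device and host timestamps come from nearly the same instant. A short usage sketch follows; the ClockSync struct and SampleClocks helper are illustrative names, and only the urDeviceGetGlobalTimestamps signature visible in the diff is assumed.

```cpp
#include <cstdint>
#include <ur_api.h>

// Sketch only: take one paired host/device clock sample, e.g. so a profiler
// can map later device event timestamps onto the host timeline.
struct ClockSync {
  uint64_t DeviceNs = 0;
  uint64_t HostNs = 0;
};

ur_result_t SampleClocks(ur_device_handle_t Device, ClockSync &Out) {
  // Both output pointers are optional in the API; passing both gives a
  // correlated pair of timestamps in nanoseconds.
  return urDeviceGetGlobalTimestamps(Device, &Out.DeviceNs, &Out.HostNs);
}
```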
15 changes: 13 additions & 2 deletions source/adapters/hip/kernel.cpp
@@ -22,8 +22,12 @@ urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
ScopedContext Active(hProgram->getContext()->getDevice());

hipFunction_t HIPFunc;
UR_CHECK_ERROR(
hipModuleGetFunction(&HIPFunc, hProgram->get(), pKernelName));
hipError_t KernelError =
hipModuleGetFunction(&HIPFunc, hProgram->get(), pKernelName);
if (KernelError == hipErrorNotFound) {
return UR_RESULT_ERROR_INVALID_KERNEL_NAME;
}
UR_CHECK_ERROR(KernelError);

std::string KernelNameWoffset = std::string(pKernelName) + "_with_offset";
hipFunction_t HIPFuncWithOffsetParam;
@@ -321,3 +325,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelCreateWithNativeHandle(
const ur_kernel_native_properties_t *, ur_kernel_handle_t *) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

UR_APIEXPORT ur_result_t UR_APICALL urKernelSetSpecializationConstants(
[[maybe_unused]] ur_kernel_handle_t hKernel,
[[maybe_unused]] uint32_t count,
[[maybe_unused]] const ur_specialization_constant_info_t *pSpecConstants) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}
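With hipErrorNotFound now mapped to UR_RESULT_ERROR_INVALID_KERNEL_NAME in urKernelCreate, a caller can tell a misspelled kernel name apart from other adapter failures. Below is a minimal sketch of probing a built program for a kernel; the helper name ProgramHasKernel is illustrative, and urKernelRelease is assumed from the standard UR reference-counting API rather than from this diff.

```cpp
#include <ur_api.h>

// Sketch only: check whether a kernel with the given name exists in a
// program that has already been built.
bool ProgramHasKernel(ur_program_handle_t Program, const char *Name) {
  ur_kernel_handle_t Kernel = nullptr;
  ur_result_t Res = urKernelCreate(Program, Name, &Kernel);
  if (Res == UR_RESULT_SUCCESS) {
    urKernelRelease(Kernel); // Only probing; drop the handle immediately.
    return true;
  }
  // UR_RESULT_ERROR_INVALID_KERNEL_NAME means the name was not found; any
  // other error is treated as "not available" here for simplicity.
  return false;
}
```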