Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions source/adapters/cuda/enqueue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,

for (auto &UnmappedFlag : UnmappedMemAdviceFlags) {
if (URAdviceFlags & UnmappedFlag) {
throw UR_RESULT_ERROR_INVALID_ENUMERATION;
setErrorMessage("Memory advice ignored because the CUDA backend does not "
"support some of the specified flags",
UR_RESULT_SUCCESS);
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}
}

Expand Down Expand Up @@ -1355,15 +1358,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
ur_queue_handle_t hQueue, const void *pMem, size_t size,
ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList,
const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
unsigned int PointerRangeSize = 0;
std::ignore = flags;

size_t PointerRangeSize = 0;
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
ur_device_handle_t Device = hQueue->getContext()->getDevice();

// Certain cuda devices and Windows do not have support for some Unified
// Memory features. cuMemPrefetchAsync requires concurrent memory access
// for managed memory. Therfore, ignore prefetch hint if concurrent managed
// for managed memory. Therefore, ignore prefetch hint if concurrent managed
// memory access is not available.
if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
setErrorMessage("Prefetch hint ignored as device does not support "
Expand All @@ -1381,10 +1386,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
}

// flags is currently unused so fail if set
if (flags != 0)
return UR_RESULT_ERROR_INVALID_VALUE;

ur_result_t Result = UR_RESULT_SUCCESS;
std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};

Expand Down Expand Up @@ -1415,7 +1416,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
UR_APIEXPORT ur_result_t UR_APICALL
urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) {
unsigned int PointerRangeSize = 0;
size_t PointerRangeSize = 0;
UR_CHECK_ERROR(cuPointerGetAttribute(
&PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
Expand Down
85 changes: 50 additions & 35 deletions source/adapters/cuda/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,42 @@ ur_result_t getKernelNames(ur_program_handle_t) {
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
/// Creates a UR program object from a PTX or CUBIN image.
/// The raw binary is only stored on the returned handle; no CUDA driver API
/// calls happen here — module loading is deferred until the program is built.
///
/// Note: the CUDA adapter supports exactly one device per context, so the
/// supplied device must be the context's device.
ur_result_t createProgram(ur_context_handle_t hContext,
                          ur_device_handle_t hDevice, size_t size,
                          const uint8_t *pBinary,
                          const ur_program_properties_t *pProperties,
                          ur_program_handle_t *phProgram) {
  UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
            UR_RESULT_ERROR_INVALID_CONTEXT);
  UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);

  // A metadata pointer and a non-zero count must be supplied together.
  if (pProperties) {
    const bool HasCount = pProperties->count > 0;
    const bool HasMetadata = pProperties->pMetadatas != nullptr;
    if (HasCount && !HasMetadata)
      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
    if (!HasCount && HasMetadata)
      return UR_RESULT_ERROR_INVALID_SIZE;
  }

  std::unique_ptr<ur_program_handle_t_> Program{
      new ur_program_handle_t_{hContext}};

  if (pProperties)
    UR_CHECK_ERROR(
        Program->setMetadata(pProperties->pMetadatas, pProperties->count));

  // Stash the image bytes on the handle for the later build step.
  UR_CHECK_ERROR(
      Program->setBinary(reinterpret_cast<const char *>(pBinary), size));

  *phProgram = Program.release();
  return UR_RESULT_SUCCESS;
}

/// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
/// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
/// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
Expand All @@ -175,8 +211,8 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
ur_device_handle_t hDevice = hContext->getDevice();
auto pBinary = reinterpret_cast<const uint8_t *>(pIL);

return urProgramCreateWithBinary(hContext, hDevice, length, pBinary,
pProperties, phProgram);
return createProgram(hContext, hDevice, length, pBinary, pProperties,
phProgram);
}

/// CUDA will handle the PTX/CUBIN binaries internally through a call to
Expand All @@ -185,7 +221,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
/// Compiles the program for the CUDA adapter. CUDA handles PTX/CUBIN images
/// internally through a CUmodule, so compiling is implemented as a full build;
/// the only observable difference is the reported binary type.
///
/// @param hContext  Context the program belongs to.
/// @param hProgram  Program whose stored binary is compiled.
/// @param pOptions  Build options forwarded to urProgramBuild (may be null).
/// @return UR_RESULT_SUCCESS on success, otherwise the error from the build.
UR_APIEXPORT ur_result_t UR_APICALL
urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
                 const char *pOptions) {
  UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions));
  // urProgramBuild marks the program EXECUTABLE; a compile-only entry point
  // must report a compiled object instead.
  hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
  return UR_RESULT_SUCCESS;
}

/// Loads the images from a UR program into a CUmodule that can be
Expand All @@ -202,6 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
ScopedContext Active(hProgram->getContext());

hProgram->buildProgram(pOptions);
hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;

} catch (ur_result_t Err) {
Result = Err;
Expand Down Expand Up @@ -241,6 +280,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
RetProgram->setBinary(static_cast<const char *>(CuBin), CuBinSize);

Result = RetProgram->buildProgram(pOptions);
RetProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
} catch (...) {
// Upon error attempt cleanup
UR_CHECK_ERROR(cuLinkDestroy(State));
Expand Down Expand Up @@ -287,6 +327,9 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice,
return ReturnValue(hProgram->BuildOptions.c_str());
case UR_PROGRAM_BUILD_INFO_LOG:
return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize);
case UR_PROGRAM_BUILD_INFO_BINARY_TYPE: {
return ReturnValue(hProgram->BinaryType);
}
default:
break;
}
Expand Down Expand Up @@ -384,44 +427,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
return UR_RESULT_SUCCESS;
}

/// Loads images from a list of PTX or CUBIN binaries.
/// Note: No calls to CUDA driver API in this function, only store binaries
/// for later.
///
/// Note: Only supports one device
///
/// Creates a UR program from a device binary (PTX or CUBIN).
///
/// Delegates argument validation and binary storage to the file-local
/// createProgram helper (no CUDA driver calls are made at this point), then
/// records that the program holds a compiled object — it is promoted to
/// EXECUTABLE only by urProgramBuild / urProgramLink.
///
/// Note: Only supports one device per context.
UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
    ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
    const uint8_t *pBinary, const ur_program_properties_t *pProperties,
    ur_program_handle_t *phProgram) {
  UR_CHECK_ERROR(
      createProgram(hContext, hDevice, size, pBinary, pProperties, phProgram));
  (*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;

  return UR_RESULT_SUCCESS;
}

// This entry point is only used for native specialization constants (SPIR-V),
Expand Down
6 changes: 6 additions & 0 deletions source/adapters/cuda/program.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,12 @@ struct ur_program_handle_t_ {
std::atomic_uint32_t RefCount;
ur_context_handle_t Context;

/* The ur_program_binary_type_t property is defined individually for every
* device in a program. However, since the CUDA adapter only has 1 device per
* context / program, there is no need to keep track of its value for each
* device. */
ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE;

// Metadata
std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
KernelReqdWorkGroupSizeMD;
Expand Down