diff --git a/source/adapters/cuda/enqueue.cpp b/source/adapters/cuda/enqueue.cpp
index 5761f24e0a..c752c3fd14 100644
--- a/source/adapters/cuda/enqueue.cpp
+++ b/source/adapters/cuda/enqueue.cpp
@@ -121,7 +121,10 @@ ur_result_t setCuMemAdvise(CUdeviceptr DevPtr, size_t Size,

   for (auto &UnmappedFlag : UnmappedMemAdviceFlags) {
     if (URAdviceFlags & UnmappedFlag) {
-      throw UR_RESULT_ERROR_INVALID_ENUMERATION;
+      setErrorMessage("Memory advice ignored because the CUDA backend does not "
+                      "support some of the specified flags",
+                      UR_RESULT_SUCCESS);
+      return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
     }
   }

@@ -1355,7 +1358,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
     ur_queue_handle_t hQueue, const void *pMem, size_t size,
     ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList,
     const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
-  unsigned int PointerRangeSize = 0;
+  std::ignore = flags;
+
+  size_t PointerRangeSize = 0;
   UR_CHECK_ERROR(cuPointerGetAttribute(
       &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
   UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);
@@ -1363,7 +1368,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(

   // Certain cuda devices and Windows do not have support for some Unified
   // Memory features. cuMemPrefetchAsync requires concurrent memory access
-  // for managed memory. Therfore, ignore prefetch hint if concurrent managed
+  // for managed memory. Therefore, ignore prefetch hint if concurrent managed
   // memory access is not available.
   if (!getAttribute(Device, CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS)) {
     setErrorMessage("Prefetch hint ignored as device does not support "
@@ -1381,10 +1386,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
     return UR_RESULT_ERROR_ADAPTER_SPECIFIC;
   }

-  // flags is currently unused so fail if set
-  if (flags != 0)
-    return UR_RESULT_ERROR_INVALID_VALUE;
-
   ur_result_t Result = UR_RESULT_SUCCESS;
   std::unique_ptr<ur_event_handle_t_> EventPtr{nullptr};

@@ -1415,7 +1416,7 @@ UR_APIEXPORT ur_result_t UR_APICALL
 urEnqueueUSMAdvise(ur_queue_handle_t hQueue, const void *pMem, size_t size,
                    ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) {
-  unsigned int PointerRangeSize = 0;
+  size_t PointerRangeSize = 0;
   UR_CHECK_ERROR(cuPointerGetAttribute(
       &PointerRangeSize, CU_POINTER_ATTRIBUTE_RANGE_SIZE, (CUdeviceptr)pMem));
   UR_ASSERT(size <= PointerRangeSize, UR_RESULT_ERROR_INVALID_SIZE);

diff --git a/source/adapters/cuda/program.cpp b/source/adapters/cuda/program.cpp
index e868793319..bee94d00a6 100644
--- a/source/adapters/cuda/program.cpp
+++ b/source/adapters/cuda/program.cpp
@@ -165,6 +165,42 @@ ur_result_t getKernelNames(ur_program_handle_t) {
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }

+/// Loads images from a list of PTX or CUBIN binaries.
+/// Note: No calls to CUDA driver API in this function, only store binaries
+/// for later.
+///
+/// Note: Only supports one device
+///
+ur_result_t createProgram(ur_context_handle_t hContext,
+                          ur_device_handle_t hDevice, size_t size,
+                          const uint8_t *pBinary,
+                          const ur_program_properties_t *pProperties,
+                          ur_program_handle_t *phProgram) {
+  UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
+            UR_RESULT_ERROR_INVALID_CONTEXT);
+  UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);
+
+  std::unique_ptr<ur_program_handle_t_> RetProgram{
+      new ur_program_handle_t_{hContext}};
+
+  if (pProperties) {
+    if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
+      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+    } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
+      return UR_RESULT_ERROR_INVALID_SIZE;
+    }
+    UR_CHECK_ERROR(
+        RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count));
+  }
+
+  auto pBinary_string = reinterpret_cast<const char *>(pBinary);
+
+  UR_CHECK_ERROR(RetProgram->setBinary(pBinary_string, size));
+  *phProgram = RetProgram.release();
+
+  return UR_RESULT_SUCCESS;
+}
+
 /// CUDA will handle the PTX/CUBIN binaries internally through CUmodule object.
 /// So, urProgramCreateWithIL and urProgramCreateWithBinary are equivalent in
 /// terms of CUDA adapter. See \ref urProgramCreateWithBinary.
@@ -175,8 +211,8 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
   ur_device_handle_t hDevice = hContext->getDevice();
   auto pBinary = reinterpret_cast<const uint8_t *>(pIL);

-  return urProgramCreateWithBinary(hContext, hDevice, length, pBinary,
-                                   pProperties, phProgram);
+  return createProgram(hContext, hDevice, length, pBinary, pProperties,
+                       phProgram);
 }

 /// CUDA will handle the PTX/CUBIN binaries internally through a call to
@@ -185,7 +221,9 @@ urProgramCreateWithIL(ur_context_handle_t hContext, const void *pIL,
 UR_APIEXPORT ur_result_t UR_APICALL
 urProgramCompile(ur_context_handle_t hContext, ur_program_handle_t hProgram,
                  const char *pOptions) {
-  return urProgramBuild(hContext, hProgram, pOptions);
+  UR_CHECK_ERROR(urProgramBuild(hContext, hProgram, pOptions));
+  hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;
+  return UR_RESULT_SUCCESS;
 }

 /// Loads the images from a UR program into a CUmodule that can be
@@ -202,6 +240,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t hContext,
     ScopedContext Active(hProgram->getContext());

     hProgram->buildProgram(pOptions);
+    hProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;

   } catch (ur_result_t Err) {
     Result = Err;
@@ -241,6 +280,7 @@ urProgramLink(ur_context_handle_t hContext, uint32_t count,
     RetProgram->setBinary(static_cast<const char *>(CuBin), CuBinSize);

     Result = RetProgram->buildProgram(pOptions);
+    RetProgram->BinaryType = UR_PROGRAM_BINARY_TYPE_EXECUTABLE;
   } catch (...) {
     // Upon error attempt cleanup
     UR_CHECK_ERROR(cuLinkDestroy(State));
@@ -287,6 +327,9 @@ urProgramGetBuildInfo(ur_program_handle_t hProgram, ur_device_handle_t hDevice,
     return ReturnValue(hProgram->BuildOptions.c_str());
   case UR_PROGRAM_BUILD_INFO_LOG:
     return ReturnValue(hProgram->InfoLog, hProgram->MaxLogSize);
+  case UR_PROGRAM_BUILD_INFO_BINARY_TYPE: {
+    return ReturnValue(hProgram->BinaryType);
+  }
   default:
     break;
   }
@@ -384,44 +427,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramGetNativeHandle(
   return UR_RESULT_SUCCESS;
 }

-/// Loads images from a list of PTX or CUBIN binaries.
-/// Note: No calls to CUDA driver API in this function, only store binaries
-/// for later.
-///
-/// Note: Only supports one device
-///
 UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     ur_context_handle_t hContext, ur_device_handle_t hDevice, size_t size,
     const uint8_t *pBinary, const ur_program_properties_t *pProperties,
     ur_program_handle_t *phProgram) {
-  UR_ASSERT(hContext->getDevice()->get() == hDevice->get(),
-            UR_RESULT_ERROR_INVALID_CONTEXT);
-  UR_ASSERT(size, UR_RESULT_ERROR_INVALID_SIZE);
-  ur_result_t Result = UR_RESULT_SUCCESS;
+  UR_CHECK_ERROR(
+      createProgram(hContext, hDevice, size, pBinary, pProperties, phProgram));
+  (*phProgram)->BinaryType = UR_PROGRAM_BINARY_TYPE_COMPILED_OBJECT;

-  std::unique_ptr<ur_program_handle_t_> RetProgram{
-      new ur_program_handle_t_{hContext}};
-
-  if (pProperties) {
-    if (pProperties->count > 0 && pProperties->pMetadatas == nullptr) {
-      return UR_RESULT_ERROR_INVALID_NULL_POINTER;
-    } else if (pProperties->count == 0 && pProperties->pMetadatas != nullptr) {
-      return UR_RESULT_ERROR_INVALID_SIZE;
-    }
-    Result =
-        RetProgram->setMetadata(pProperties->pMetadatas, pProperties->count);
-  }
-  UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
-
-  auto pBinary_string = reinterpret_cast<const char *>(pBinary);
-
-  Result = RetProgram->setBinary(pBinary_string, size);
-  UR_ASSERT(Result == UR_RESULT_SUCCESS, Result);
-
-  *phProgram = RetProgram.release();
-
-  return Result;
+  return UR_RESULT_SUCCESS;
 }

 // This entry point is only used for native specialization constants (SPIR-V),
diff --git a/source/adapters/cuda/program.hpp b/source/adapters/cuda/program.hpp
index e27c2d8863..30131a4120 100644
--- a/source/adapters/cuda/program.hpp
+++ b/source/adapters/cuda/program.hpp
@@ -25,6 +25,12 @@ struct ur_program_handle_t_ {
   std::atomic_uint32_t RefCount;
   ur_context_handle_t Context;

+  /* The ur_program_binary_type_t property is defined individually for every
+   * device in a program. However, since the CUDA adapter only has 1 device per
+   * context / program, there is no need to keep track of its value for each
+   * device. */
+  ur_program_binary_type_t BinaryType = UR_PROGRAM_BINARY_TYPE_NONE;
+
   // Metadata
   std::unordered_map<std::string, std::tuple<uint32_t, uint32_t, uint32_t>>
       KernelReqdWorkGroupSizeMD;