Commit

Add try catch in command_buffer.cpp
hdelan committed Oct 22, 2024
1 parent f5ac85b commit ebc7760
Showing 2 changed files with 87 additions and 75 deletions.
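
This change wraps the body of urCommandBufferUpdateKernelLaunchExp in a try/catch in both the CUDA and HIP adapters, so that a ur_result_t thrown inside the block is turned back into a plain return code at the entry point instead of escaping the call. Below is a minimal sketch of that pattern, using hypothetical stand-ins for the UR types and assuming, as the catch clauses in the diff suggest, that UR_CHECK_ERROR throws the failing ur_result_t:

#include <cstdio>

// Hypothetical stand-ins for the real UR definitions; names and values are
// illustrative only.
using ur_result_t = int;
constexpr ur_result_t UR_RESULT_SUCCESS = 0;
constexpr ur_result_t UR_RESULT_ERROR_INVALID_OPERATION = 1;

// Assumed behaviour: throw the failing ur_result_t so a single catch at the
// entry point can convert it back into a return code.
#define UR_CHECK_ERROR(Call)                                                   \
  do {                                                                         \
    ur_result_t Res = (Call);                                                  \
    if (Res != UR_RESULT_SUCCESS)                                              \
      throw Res;                                                               \
  } while (0)

static ur_result_t doUpdateStep(bool Fail) {
  return Fail ? UR_RESULT_ERROR_INVALID_OPERATION : UR_RESULT_SUCCESS;
}

// The shape added by this commit: do the work inside try, catch ur_result_t,
// and return it; otherwise fall through to UR_RESULT_SUCCESS.
ur_result_t updateKernelLaunch(bool Fail) {
  try {
    UR_CHECK_ERROR(doUpdateStep(Fail));
  } catch (ur_result_t Err) {
    return Err;
  }
  return UR_RESULT_SUCCESS;
}

int main() {
  std::printf("ok path: %d\n", updateKernelLaunch(false));  // prints 0
  std::printf("fail path: %d\n", updateKernelLaunch(true)); // prints 1
}

If UR_CHECK_ERROR does throw, the previous code would have let that exception propagate out of the entry point; the added catch keeps the C-style error-code contract instead.
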
84 changes: 45 additions & 39 deletions source/adapters/cuda/command_buffer.cpp
@@ -1361,49 +1361,55 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(
return UR_RESULT_ERROR_INVALID_VALUE;
}

auto KernelCommandHandle = static_cast<kernel_command_handle *>(hCommand);
try {
auto KernelCommandHandle = static_cast<kernel_command_handle *>(hCommand);

UR_CHECK_ERROR(validateCommandDesc(KernelCommandHandle, pUpdateKernelLaunch));
UR_CHECK_ERROR(
updateKernelArguments(CommandBuffer->Device, pUpdateKernelLaunch));
UR_CHECK_ERROR(updateCommand(KernelCommandHandle, pUpdateKernelLaunch));

// If no work-size is provided make sure we pass nullptr to setKernelParams so
// it can guess the local work size.
const bool ProvidedLocalSize = !KernelCommandHandle->isNullLocalSize();
size_t *LocalWorkSize =
ProvidedLocalSize ? KernelCommandHandle->LocalWorkSize : nullptr;

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number.
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
CUfunction CuFunc = KernelCommandHandle->Kernel->get();
auto Result = setKernelParams(
CommandBuffer->Context, CommandBuffer->Device,
KernelCommandHandle->WorkDim, KernelCommandHandle->GlobalWorkOffset,
KernelCommandHandle->GlobalWorkSize, LocalWorkSize,
KernelCommandHandle->Kernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
if (Result != UR_RESULT_SUCCESS) {
return Result;
}
UR_CHECK_ERROR(
validateCommandDesc(KernelCommandHandle, pUpdateKernelLaunch));
UR_CHECK_ERROR(
updateKernelArguments(CommandBuffer->Device, pUpdateKernelLaunch));
UR_CHECK_ERROR(updateCommand(KernelCommandHandle, pUpdateKernelLaunch));

CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params;
// If no work-size is provided make sure we pass nullptr to setKernelParams
// so it can guess the local work size.
const bool ProvidedLocalSize = !KernelCommandHandle->isNullLocalSize();
size_t *LocalWorkSize =
ProvidedLocalSize ? KernelCommandHandle->LocalWorkSize : nullptr;

Params.func = CuFunc;
Params.gridDimX = BlocksPerGrid[0];
Params.gridDimY = BlocksPerGrid[1];
Params.gridDimZ = BlocksPerGrid[2];
Params.blockDimX = ThreadsPerBlock[0];
Params.blockDimY = ThreadsPerBlock[1];
Params.blockDimZ = ThreadsPerBlock[2];
Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
Params.kernelParams =
const_cast<void **>(KernelCommandHandle->Kernel->getArgIndices().data());
// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number.
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
CUfunction CuFunc = KernelCommandHandle->Kernel->get();
auto Result = setKernelParams(
CommandBuffer->Context, CommandBuffer->Device,
KernelCommandHandle->WorkDim, KernelCommandHandle->GlobalWorkOffset,
KernelCommandHandle->GlobalWorkSize, LocalWorkSize,
KernelCommandHandle->Kernel, CuFunc, ThreadsPerBlock, BlocksPerGrid);
if (Result != UR_RESULT_SUCCESS) {
return Result;
}

CUgraphNode Node = KernelCommandHandle->Node;
CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec;
UR_CHECK_ERROR(cuGraphExecKernelNodeSetParams(CudaGraphExec, Node, &Params));
CUDA_KERNEL_NODE_PARAMS &Params = KernelCommandHandle->Params;

Params.func = CuFunc;
Params.gridDimX = BlocksPerGrid[0];
Params.gridDimY = BlocksPerGrid[1];
Params.gridDimZ = BlocksPerGrid[2];
Params.blockDimX = ThreadsPerBlock[0];
Params.blockDimY = ThreadsPerBlock[1];
Params.blockDimZ = ThreadsPerBlock[2];
Params.sharedMemBytes = KernelCommandHandle->Kernel->getLocalSize();
Params.kernelParams = const_cast<void **>(
KernelCommandHandle->Kernel->getArgIndices().data());

CUgraphNode Node = KernelCommandHandle->Node;
CUgraphExec CudaGraphExec = CommandBuffer->CudaGraphExec;
UR_CHECK_ERROR(
cuGraphExecKernelNodeSetParams(CudaGraphExec, Node, &Params));
} catch (ur_result_t Err) {
return Err;
}
return UR_RESULT_SUCCESS;
}

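The comments in this hunk (and in the matching HIP hunk below) lean on setKernelParams to pick a local work size when the caller did not give one, starting from a one-warp {32u, 1u, 1u} default for ThreadsPerBlock. Here is a rough, hedged illustration of the block/grid arithmetic that implies (the real setKernelParams in these adapters also consults device limits and kernel properties, which this sketch ignores):

#include <cstddef>
#include <cstdio>

// Illustrative only: derive launch dimensions from a global work size and an
// optional local work size, defaulting to one warp per block as the diff does.
static void pickLaunchDims(const size_t GlobalWorkSize[3],
                           const size_t *LocalWorkSize,
                           size_t ThreadsPerBlock[3], size_t BlocksPerGrid[3]) {
  const size_t DefaultBlock[3] = {32u, 1u, 1u};
  for (int i = 0; i < 3; ++i) {
    ThreadsPerBlock[i] = LocalWorkSize ? LocalWorkSize[i] : DefaultBlock[i];
    // Round up so every work-item is covered by some block.
    BlocksPerGrid[i] =
        (GlobalWorkSize[i] + ThreadsPerBlock[i] - 1) / ThreadsPerBlock[i];
  }
}

int main() {
  size_t Global[3] = {1000u, 1u, 1u};
  size_t Threads[3] = {0, 0, 0};
  size_t Blocks[3] = {0, 0, 0};
  pickLaunchDims(Global, nullptr, Threads, Blocks); // nullptr: no local size given
  std::printf("%zu threads/block, %zu blocks\n", Threads[0], Blocks[0]); // 32, 32
}

Passing nullptr for LocalWorkSize here mirrors what the diff does when KernelCommandHandle->isNullLocalSize() is true, so the helper (setKernelParams in the adapters) is free to choose the block size itself.
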
78 changes: 42 additions & 36 deletions source/adapters/hip/command_buffer.cpp
@@ -1066,42 +1066,48 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferUpdateKernelLaunchExp(

ur_exp_command_buffer_handle_t CommandBuffer = hCommand->CommandBuffer;

UR_CHECK_ERROR(validateCommandDesc(hCommand, pUpdateKernelLaunch));
UR_CHECK_ERROR(
updateKernelArguments(CommandBuffer->Device, pUpdateKernelLaunch));
UR_CHECK_ERROR(updateCommand(hCommand, pUpdateKernelLaunch));

// If no worksize is provided make sure we pass nullptr to setKernelParams
// so it can guess the local work size.
const bool ProvidedLocalSize = !hCommand->isNullLocalSize();
size_t *LocalWorkSize = ProvidedLocalSize ? hCommand->LocalWorkSize : nullptr;

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
hipFunction_t HIPFunc = hCommand->Kernel->get();
UR_CHECK_ERROR(setKernelParams(
CommandBuffer->Device, hCommand->WorkDim, hCommand->GlobalWorkOffset,
hCommand->GlobalWorkSize, LocalWorkSize, hCommand->Kernel, HIPFunc,
ThreadsPerBlock, BlocksPerGrid));

hipKernelNodeParams &Params = hCommand->Params;

Params.func = HIPFunc;
Params.gridDim.x = BlocksPerGrid[0];
Params.gridDim.y = BlocksPerGrid[1];
Params.gridDim.z = BlocksPerGrid[2];
Params.blockDim.x = ThreadsPerBlock[0];
Params.blockDim.y = ThreadsPerBlock[1];
Params.blockDim.z = ThreadsPerBlock[2];
Params.sharedMemBytes = hCommand->Kernel->getLocalSize();
Params.kernelParams =
const_cast<void **>(hCommand->Kernel->getArgIndices().data());

hipGraphNode_t Node = hCommand->Node;
hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec;
UR_CHECK_ERROR(hipGraphExecKernelNodeSetParams(HipGraphExec, Node, &Params));
try {
UR_CHECK_ERROR(validateCommandDesc(hCommand, pUpdateKernelLaunch));
UR_CHECK_ERROR(
updateKernelArguments(CommandBuffer->Device, pUpdateKernelLaunch));
UR_CHECK_ERROR(updateCommand(hCommand, pUpdateKernelLaunch));

// If no worksize is provided make sure we pass nullptr to setKernelParams
// so it can guess the local work size.
const bool ProvidedLocalSize = !hCommand->isNullLocalSize();
size_t *LocalWorkSize =
ProvidedLocalSize ? hCommand->LocalWorkSize : nullptr;

// Set the number of threads per block to the number of threads per warp
// by default unless user has provided a better number
size_t ThreadsPerBlock[3] = {32u, 1u, 1u};
size_t BlocksPerGrid[3] = {1u, 1u, 1u};
hipFunction_t HIPFunc = hCommand->Kernel->get();
UR_CHECK_ERROR(setKernelParams(
CommandBuffer->Device, hCommand->WorkDim, hCommand->GlobalWorkOffset,
hCommand->GlobalWorkSize, LocalWorkSize, hCommand->Kernel, HIPFunc,
ThreadsPerBlock, BlocksPerGrid));

hipKernelNodeParams &Params = hCommand->Params;

Params.func = HIPFunc;
Params.gridDim.x = BlocksPerGrid[0];
Params.gridDim.y = BlocksPerGrid[1];
Params.gridDim.z = BlocksPerGrid[2];
Params.blockDim.x = ThreadsPerBlock[0];
Params.blockDim.y = ThreadsPerBlock[1];
Params.blockDim.z = ThreadsPerBlock[2];
Params.sharedMemBytes = hCommand->Kernel->getLocalSize();
Params.kernelParams =
const_cast<void **>(hCommand->Kernel->getArgIndices().data());

hipGraphNode_t Node = hCommand->Node;
hipGraphExec_t HipGraphExec = CommandBuffer->HIPGraphExec;
UR_CHECK_ERROR(
hipGraphExecKernelNodeSetParams(HipGraphExec, Node, &Params));
} catch (ur_result_t Err) {
return Err;
}
return UR_RESULT_SUCCESS;
}

