Skip to content

Commit f7254f9

Browse files
DEVICE/API: Update NIXL device API return status (#881)
Signed-off-by: Michal Shalev <mshalev@nvidia.com>
1 parent 24d691f commit f7254f9

File tree

1 file changed

+19
-11
lines changed

1 file changed

+19
-11
lines changed

src/api/gpu/ucx/nixl_device.cuh

Lines changed: 19 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,11 @@ struct nixlGpuXferReqParams {
6363
*/
6464
__device__ inline nixl_status_t
6565
nixlGpuConvertUcsStatus(ucs_status_t status) {
66-
return status == UCS_OK ? NIXL_SUCCESS : NIXL_ERR_BACKEND;
66+
if (!UCS_STATUS_IS_ERR(status)) {
67+
return NIXL_SUCCESS;
68+
}
69+
printf("UCX returned error: %d\n", status);
70+
return NIXL_ERR_BACKEND;
6771
}
6872

6973
/**
@@ -76,7 +80,8 @@ nixlGpuConvertUcsStatus(ucs_status_t status) {
7680
* @param size [in] Size in bytes of the memory to be transferred.
7781
* @param channel_id [in] Channel ID to use for the transfer.
7882
* @param is_no_delay [in] Whether to use no-delay mode.
79-
* @param xfer_status [out] Status of the transfer. If null, the status is not reported.
83+
* @param xfer_status [out] Status of the transfer. If not null, use @ref
84+
* nixlGpuGetXferStatus to check for completion.
8085
*
8186
* @return nixl_status_t Error code if call was not successful
8287
*/
@@ -107,9 +112,10 @@ nixlGpuPostSingleWriteXferReq(nixlGpuXferReqH req_hndl,
107112
* @param signal_offset [in] Offset of the signal to be sent.
108113
* @param channel_id [in] Channel ID to use for the transfer.
109114
* @param is_no_delay [in] Whether to use no-delay mode.
110-
* @param xfer_status [out] Status of the transfer. If null, the status is not reported.
115+
* @param xfer_status [out] Status of the transfer. If not null, use @ref
116+
* nixlGpuGetXferStatus to check for completion.
111117
*
112-
* @return nixl_status_t Error code if call was not successful
118+
* @return nixl_status_t Error code if call was not successful
113119
*/
114120
template<nixl_gpu_level_t level = nixl_gpu_level_t::THREAD>
115121
__device__ nixl_status_t
@@ -143,9 +149,10 @@ nixlGpuPostSignalXferReq(nixlGpuXferReqH req_hndl,
143149
* @param signal_offset [in] Offset of the signal to be sent.
144150
* @param channel_id [in] Channel ID to use for the transfer.
145151
* @param is_no_delay [in] Whether to use no-delay mode.
146-
* @param xfer_status [out] Status of the transfer. If null, the status is not reported.
152+
* @param xfer_status [out] Status of the transfer. If not null, use @ref
153+
* nixlGpuGetXferStatus to check for completion.
147154
*
148-
* @return nixl_status_t Error code if call was not successful
155+
* @return nixl_status_t Error code if call was not successful
149156
*/
150157
template<nixl_gpu_level_t level = nixl_gpu_level_t::THREAD>
151158
__device__ nixl_status_t
@@ -188,9 +195,10 @@ nixlGpuPostPartialWriteXferReq(nixlGpuXferReqH req_hndl,
188195
* @param signal_offset [in] Offset of the signal to be sent.
189196
* @param channel_id [in] Channel ID to use for the transfer.
190197
* @param is_no_delay [in] Whether to use no-delay mode.
191-
* @param xfer_status [out] Status of the transfer. If null, the status is not reported.
198+
* @param xfer_status [out] Status of the transfer. If not null, use @ref
199+
* nixlGpuGetXferStatus to check for completion.
192200
*
193-
* @return nixl_status_t Error code if call was not successful
201+
* @return nixl_status_t Error code if call was not successful
194202
*/
195203
template<nixl_gpu_level_t level = nixl_gpu_level_t::THREAD>
196204
__device__ nixl_status_t
@@ -218,9 +226,9 @@ nixlGpuPostWriteXferReq(nixlGpuXferReqH req_hndl,
218226
*
219227
* @param xfer_status [in] Status of the transfer.
220228
*
221-
* @return NIXL_SUCCESS The request has completed, no more operations are in progress.
222-
* @return NIXL_IN_PROG One or more operations in the request have not completed.
223-
* @return Error code if call was not successful
229+
* @return NIXL_SUCCESS The request has completed, no more operations are in progress.
230+
* @return NIXL_IN_PROG One or more operations in the request have not completed.
231+
* @return NIXL_ERR_BACKEND An error occurred in UCX backend.
224232
*/
225233
template<nixl_gpu_level_t level = nixl_gpu_level_t::THREAD>
226234
__device__ nixl_status_t

0 commit comments

Comments
 (0)