@@ -1217,25 +1217,143 @@ nixlAgent::releaseXferReq(nixlXferReqH *req_hndl) const {
12171217}
12181218
12191219nixl_status_t
1220- nixlAgent::createGpuXferReq (const nixlXferReqH &req_hndl, nixlGpuXferReqH &gpu_req_hndl) const {
1221- if (!req_hndl.engine ) {
1222- NIXL_ERROR_FUNC << " Invalid request handle[" << &req_hndl << " ]: engine is null" ;
1223- return NIXL_ERR_INVALID_PARAM;
1220+ nixlAgent::createGpuXferReq (const nixl_xfer_dlist_t &local_descs,
1221+ const nixl_xfer_dlist_t &remote_descs,
1222+ const std::string &remote_agent,
1223+ nixlGpuXferReqH &gpu_req_hndl,
1224+ nixlXferReqH *&req_hndl,
1225+ const nixl_opt_args_t *extra_params) const {
1226+ nixl_status_t ret1, ret2;
1227+ nixl_opt_b_args_t opt_args;
1228+
1229+ std::unique_ptr<backend_set_t > backend_set = std::make_unique<backend_set_t >();
1230+
1231+ req_hndl = nullptr ;
1232+
1233+ NIXL_SHARED_LOCK_GUARD (data->lock );
1234+
1235+ if (data->remoteSections .count (remote_agent) == 0 )
1236+ {
1237+ NIXL_ERROR_FUNC << " metadata for remote agent '" << remote_agent << " ' not found" ;
1238+ data->addErrorTelemetry (NIXL_ERR_NOT_FOUND);
1239+ return NIXL_ERR_NOT_FOUND;
12241240 }
12251241
1226- if (!req_hndl.backendHandle ) {
1227- NIXL_ERROR_FUNC << " Invalid request handle[" << &req_hndl << " ]: backendHandle is null" ;
1242+ size_t total_bytes = 0 ;
1243+ if (local_descs.descCount () != remote_descs.descCount ()) {
1244+ NIXL_ERROR_FUNC << " different descriptor list sizes (local=" << local_descs.descCount ()
1245+ << " , remote=" << remote_descs.descCount () << " )" ;
12281246 return NIXL_ERR_INVALID_PARAM;
12291247 }
1248+ for (int i = 0 ; i < local_descs.descCount (); ++i) {
1249+ if (local_descs[i].len != remote_descs[i].len ) {
1250+ NIXL_ERROR_FUNC << " length mismatch at index " << i;
1251+ return NIXL_ERR_INVALID_PARAM;
1252+ }
1253+ total_bytes += local_descs[i].len ;
1254+ }
12301255
1231- NIXL_SHARED_LOCK_GUARD (data->lock );
1232- const auto status = req_hndl.engine ->createGpuXferReq (
1233- *req_hndl.backendHandle , *req_hndl.initiatorDescs , *req_hndl.targetDescs , gpu_req_hndl);
1256+ if (!extra_params || extra_params->backends .size () == 0 ) {
1257+ // Finding backends that support the corresponding memories
1258+ // locally and remotely, and find the common ones.
1259+ backend_set_t * local_set =
1260+ data->memorySection ->queryBackends (local_descs.getType ());
1261+ backend_set_t * remote_set =
1262+ data->remoteSections [remote_agent]->queryBackends (
1263+ remote_descs.getType ());
1264+ if (!local_set || !remote_set) {
1265+ NIXL_ERROR_FUNC << " no backends found for local or remote for their "
1266+ " corresponding memory type" ;
1267+ return NIXL_ERR_NOT_FOUND;
1268+ }
1269+
1270+ for (auto & elm : *local_set)
1271+ if (remote_set->count (elm) != 0 )
1272+ backend_set->insert (elm);
1273+
1274+ if (backend_set->empty ()) {
1275+ NIXL_ERROR_FUNC << " no potential backend found to be able to do the transfer" ;
1276+ return NIXL_ERR_NOT_FOUND;
1277+ }
1278+ } else {
1279+ for (auto & elm : extra_params->backends )
1280+ backend_set->insert (elm->engine );
1281+ }
1282+
1283+ std::unique_ptr<nixlXferReqH> handle = std::make_unique<nixlXferReqH>();
1284+ handle->initiatorDescs = new nixl_meta_dlist_t (local_descs.getType ());
1285+
1286+ handle->targetDescs = new nixl_meta_dlist_t (remote_descs.getType ());
1287+
1288+ for (auto & backend : *backend_set) {
1289+ ret1 = data->memorySection ->populate (
1290+ local_descs, backend, *handle->initiatorDescs );
1291+ ret2 = data->remoteSections [remote_agent]->populate (
1292+ remote_descs, backend, *handle->targetDescs );
1293+
1294+ if ((ret1 == NIXL_SUCCESS) && (ret2 == NIXL_SUCCESS)) {
1295+ NIXL_INFO << " Selected backend: " << backend->getType ();
1296+ handle->engine = backend;
1297+ break ;
1298+ }
1299+ }
1300+
1301+ if (!handle->engine ) {
1302+ NIXL_ERROR_FUNC << " no specified or potential backend had the required "
1303+ " registrations to be able to do the transfer" ;
1304+ data->addErrorTelemetry (NIXL_ERR_NOT_FOUND);
1305+ return NIXL_ERR_NOT_FOUND;
1306+ }
1307+
1308+ if (extra_params) {
1309+ if (extra_params->hasNotif ) {
1310+ opt_args.notifMsg = extra_params->notifMsg ;
1311+ opt_args.hasNotif = true ;
1312+ }
1313+
1314+ if (extra_params->customParam .length () > 0 )
1315+ opt_args.customParam = extra_params->customParam ;
1316+ }
1317+
1318+ if (opt_args.hasNotif && (!handle->engine ->supportsNotif ())) {
1319+ NIXL_ERROR_FUNC << " the selected backend '" << handle->engine ->getType ()
1320+ << " ' does not support notifications" ;
1321+ data->addErrorTelemetry (NIXL_ERR_BACKEND);
1322+ return NIXL_ERR_BACKEND;
1323+ }
1324+
1325+ handle->remoteAgent = remote_agent;
1326+ handle->status = NIXL_ERR_NOT_POSTED;
1327+ handle->notifMsg = opt_args.notifMsg ;
1328+ handle->hasNotif = opt_args.hasNotif ;
1329+
1330+ if (data->telemetryEnabled ) {
1331+ handle->telemetry .totalBytes = total_bytes;
1332+ handle->telemetry .descCount = handle->initiatorDescs ->descCount ();
1333+ }
1334+
1335+ ret1 = handle->engine ->prepXfer (handle->backendOp ,
1336+ *handle->initiatorDescs ,
1337+ *handle->targetDescs ,
1338+ handle->remoteAgent ,
1339+ handle->backendHandle ,
1340+ &opt_args);
1341+ if (ret1 != NIXL_SUCCESS) {
1342+ NIXL_ERROR_FUNC << " backend '" << handle->engine ->getType ()
1343+ << " ' failed to prepare the transfer request with status " << ret1;
1344+ data->addErrorTelemetry (ret1);
1345+ return ret1;
1346+ }
1347+
1348+ req_hndl = handle.release ();
1349+
1350+ const auto status = req_hndl->engine ->createGpuXferReq (
1351+ *req_hndl->backendHandle , *req_hndl->initiatorDescs , *req_hndl->targetDescs , gpu_req_hndl);
12341352 if (status == NIXL_SUCCESS) {
1235- data->gpuReqToEngine .emplace (gpu_req_hndl, req_hndl. engine );
1353+ data->gpuReqToEngine .emplace (gpu_req_hndl, req_hndl-> engine );
12361354 }
12371355
1238- return status ;
1356+ return NIXL_SUCCESS ;
12391357}
12401358
12411359void
0 commit comments