Skip to content

Commit 2d45ba0

Browse files
committed
libfabric: Use desc-specific target offset
This fixes a bug in multi-descriptor transfers where the descriptors refer to different offsets within a region (each descriptor's .addr field points to a different sub-region), as happens with partial buffer updates, scatter-gather (SG) operations, etc. Without this fix, RDMA reads always target offset 0. The backend should extract each descriptor's specific target address instead. Signed-off-by: Tushar Gohad <tushar.gohad@intel.com>
1 parent 5ee7e35 commit 2d45ba0

File tree

1 file changed

+6
-2
lines changed

1 file changed

+6
-2
lines changed

src/plugins/libfabric/libfabric_backend.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1017,7 +1017,8 @@ nixlLibfabricEngine::postXfer(const nixl_xfer_op_t &operation,
10171017
int gpu_id = local[desc_idx].devId;
10181018

10191019
NIXL_DEBUG << "Processing descriptor " << desc_idx << " GPU " << gpu_id
1020-
<< " addr: " << transfer_addr << " size: " << transfer_size;
1020+
<< " local_addr: " << transfer_addr << " size: " << transfer_size
1021+
<< " remote_addr: " << (void *)remote[desc_idx].addr;
10211022

10221023
NIXL_DEBUG << "DEBUG: remote_agent='" << remote_agent << "' localAgent='" << localAgent
10231024
<< "'";
@@ -1053,11 +1054,14 @@ nixlLibfabricEngine::postXfer(const nixl_xfer_op_t &operation,
10531054
}
10541055

10551056
// Prepare and submit transfer for remote agents
1057+
// Use descriptor's specific target address
1058+
uint64_t remote_target_addr = remote[desc_idx].addr;
1059+
10561060
nixl_status_t status = rail_manager.prepareAndSubmitTransfer(
10571061
op_type,
10581062
transfer_addr,
10591063
transfer_size,
1060-
remote_md->remote_buf_addr_,
1064+
remote_target_addr, // Use descriptor's specific target address
10611065
local_md->selected_rails_,
10621066
local_md->rail_mr_list_,
10631067
remote_md->rail_remote_key_list_,

0 commit comments

Comments
 (0)