Skip to content

Commit ae91b11

Browse files
committed
btl/vader: fragment large RDMA operations when using single-copy emulation
This commit changes how single-copy emulation in the vader BTL operates. Before this change, the BTL set its put and get limits based on the maximum send size. After this change, the limits are unset and each put or get operation is fragmented internally. References #6568. Signed-off-by: Nathan Hjelm <hjelmn@google.com>
1 parent 5ff6cb6 commit ae91b11

File tree

5 files changed

+90
-157
lines changed

5 files changed

+90
-157
lines changed

opal/mca/btl/vader/btl_vader_atomic.c

+10-73
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
/*
33
* Copyright (c) 2010-2017 Los Alamos National Security, LLC. All rights
44
* reserved.
5+
* Copyright (c) 2019 Google, Inc. All rights reserved.
56
* $COPYRIGHT$
67
*
78
* Additional copyrights may follow
@@ -16,58 +17,14 @@
1617
#include "btl_vader_endpoint.h"
1718
#include "btl_vader_xpmem.h"
1819

19-
static void mca_btl_vader_sc_emu_aop_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
20-
mca_btl_base_descriptor_t *desc, int status)
21-
{
22-
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
23-
void *local_address = frag->rdma.local_address;
24-
void *context = frag->rdma.context;
25-
void *cbdata = frag->rdma.cbdata;
26-
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
27-
28-
/* return the fragment first since the callback may call put/get/amo and could use this fragment */
29-
MCA_BTL_VADER_FRAG_RETURN(frag);
30-
31-
cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
32-
}
33-
3420
int mca_btl_vader_emu_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
3521
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
3622
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
3723
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
3824
{
39-
mca_btl_vader_frag_t *frag;
40-
41-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, 0, order, flags, NULL,
42-
remote_address, cbfunc, cbcontext, cbdata, mca_btl_vader_sc_emu_aop_complete);
43-
if (OPAL_UNLIKELY(NULL == frag)) {
44-
return OPAL_ERR_OUT_OF_RESOURCE;
45-
}
46-
47-
/* send is always successful */
48-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
49-
50-
return OPAL_SUCCESS;
51-
}
52-
53-
static void mca_btl_vader_sc_emu_afop_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
54-
mca_btl_base_descriptor_t *desc, int status)
55-
{
56-
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
57-
mca_btl_vader_sc_emu_hdr_t *hdr;
58-
void *local_address = frag->rdma.local_address;
59-
void *context = frag->rdma.context;
60-
void *cbdata = frag->rdma.cbdata;
61-
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
62-
63-
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
64-
65-
*((int64_t *) frag->rdma.local_address) = hdr->operand[0];
66-
67-
/* return the fragment first since the callback may call put/get/amo and could use this fragment */
68-
MCA_BTL_VADER_FRAG_RETURN(frag);
69-
70-
cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
25+
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
26+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, order, flags,
27+
size, NULL, remote_address, cbfunc, cbcontext, cbdata);
7128
}
7229

7330
int mca_btl_vader_emu_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
@@ -76,37 +33,17 @@ int mca_btl_vader_emu_afop (struct mca_btl_base_module_t *btl, struct mca_btl_ba
7633
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
7734
void *cbcontext, void *cbdata)
7835
{
79-
mca_btl_vader_frag_t *frag;
80-
81-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, 0, order, flags,
82-
local_address, remote_address, cbfunc, cbcontext, cbdata,
83-
mca_btl_vader_sc_emu_afop_complete);
84-
if (OPAL_UNLIKELY(NULL == frag)) {
85-
return OPAL_ERR_OUT_OF_RESOURCE;
86-
}
87-
88-
/* send is always successful */
89-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
90-
91-
return OPAL_SUCCESS;
36+
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
37+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, order, flags,
38+
size, local_address, remote_address, cbfunc, cbcontext, cbdata);
9239
}
9340

9441
int mca_btl_vader_emu_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
9542
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
9643
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
9744
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
9845
{
99-
mca_btl_vader_frag_t *frag;
100-
101-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_CSWAP, compare, value, 0, 0, order,
102-
flags, local_address, remote_address, cbfunc, cbcontext, cbdata,
103-
mca_btl_vader_sc_emu_afop_complete);
104-
if (OPAL_UNLIKELY(NULL == frag)) {
105-
return OPAL_ERR_OUT_OF_RESOURCE;
106-
}
107-
108-
/* send is always successful */
109-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
110-
111-
return OPAL_SUCCESS;
46+
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
47+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_CSWAP, compare, value, 0, order,
48+
flags, size, local_address, remote_address, cbfunc, cbcontext, cbdata);
11249
}

opal/mca/btl/vader/btl_vader_component.c

+1-6
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
2222
* Copyright (c) 2018 Triad National Security, LLC. All rights
2323
* reserved.
24+
* Copyright (c) 2019 Google, Inc. All rights reserved.
2425
* $COPYRIGHT$
2526
*
2627
* Additional copyrights may follow
@@ -478,12 +479,6 @@ static void mca_btl_vader_check_single_copy (void)
478479
mca_btl_vader.super.btl_get = NULL;
479480
mca_btl_vader.super.btl_put = NULL;
480481
}
481-
482-
if (MCA_BTL_VADER_EMUL == mca_btl_vader_component.single_copy_mechanism) {
483-
/* limit to the maximum fragment size */
484-
mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
485-
mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
486-
}
487482
}
488483

489484
/*

opal/mca/btl/vader/btl_vader_frag.h

+73-9
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
1616
* reserved.
17+
* Copyright (c) 2019 Google, Inc. All rights reserved.
1718
* $COPYRIGHT$
1819
*
1920
* Additional copyrights may follow
@@ -91,9 +92,12 @@ struct mca_btl_vader_frag_t {
9192
/** rdma callback data */
9293
struct mca_btl_vader_rdma_cbdata_t {
9394
void *local_address;
95+
uint64_t remote_address;
9496
mca_btl_base_rdma_completion_fn_t cbfunc;
9597
void *context;
9698
void *cbdata;
99+
size_t remaining;
100+
size_t sent;
97101
} rdma;
98102
};
99103

@@ -151,28 +155,87 @@ static inline void mca_btl_vader_frag_complete (mca_btl_vader_frag_t *frag) {
151155

152156
int mca_btl_vader_frag_init (opal_free_list_item_t *item, void *ctx);
153157

154-
static inline mca_btl_vader_frag_t *
155-
mca_btl_vader_rdma_frag_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, int type,
158+
static inline void mca_btl_vader_rdma_frag_advance (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
159+
mca_btl_vader_frag_t *frag, int status)
160+
{
161+
mca_btl_vader_sc_emu_hdr_t *hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
162+
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
163+
size_t hdr_size = sizeof (*hdr);
164+
size_t len = frag->rdma.sent ? frag->segments[0].seg_len - hdr_size : 0;
165+
void *context = frag->rdma.context;
166+
void *cbdata = frag->rdma.cbdata;
167+
void *data = (void *) (hdr + 1);
168+
169+
if (frag->rdma.sent) {
170+
if (MCA_BTL_VADER_OP_GET == hdr->type) {
171+
memcpy (frag->rdma.local_address, data, len);
172+
} else if ((MCA_BTL_VADER_OP_ATOMIC == hdr->type || MCA_BTL_VADER_OP_CSWAP == hdr->type) &&
173+
frag->rdma.local_address) {
174+
if (8 == len) {
175+
*((int64_t *) frag->rdma.local_address) = hdr->operand[0];
176+
} else {
177+
*((int32_t *) frag->rdma.local_address) = (int32_t) hdr->operand[0];
178+
}
179+
}
180+
}
181+
182+
if (frag->rdma.remaining) {
183+
size_t packet_size = (frag->rdma.remaining + hdr_size) <= mca_btl_vader.super.btl_max_send_size ?
184+
frag->rdma.remaining : mca_btl_vader.super.btl_max_send_size - hdr_size;
185+
186+
/* advance the local and remote pointers */
187+
frag->rdma.local_address = (void *)((uintptr_t) frag->rdma.local_address + len);
188+
frag->rdma.remote_address += len;
189+
190+
if (MCA_BTL_VADER_OP_PUT == hdr->type) {
191+
/* copy the next block into the fragment buffer */
192+
memcpy ((void *) (hdr + 1), frag->rdma.local_address, packet_size);
193+
}
194+
195+
hdr->addr = frag->rdma.remote_address;
196+
/* clear out the complete flag before sending the fragment again */
197+
frag->hdr->flags &= ~MCA_BTL_VADER_FLAG_COMPLETE;
198+
frag->segments[0].seg_len = packet_size + sizeof (*hdr);
199+
frag->rdma.sent += packet_size;
200+
frag->rdma.remaining -= packet_size;
201+
202+
/* send is always successful */
203+
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
204+
return;
205+
}
206+
207+
/* return the fragment before calling the callback */
208+
MCA_BTL_VADER_FRAG_RETURN(frag);
209+
cbfunc (btl, endpoint, (void *)((uintptr_t) frag->rdma.local_address - frag->rdma.sent), NULL,
210+
context, cbdata, status);
211+
}
212+
213+
static inline int
214+
mca_btl_vader_rdma_frag_start (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, int type,
156215
uint64_t operand1, uint64_t operand2, mca_btl_base_atomic_op_t op, int order,
157216
int flags, size_t size, void *local_address, int64_t remote_address,
158-
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
159-
void *cbdata, mca_btl_base_completion_fn_t des_cbfunc)
217+
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
160218
{
161219
mca_btl_vader_sc_emu_hdr_t *hdr;
162-
size_t total_size = size + sizeof (*hdr);
220+
size_t hdr_size = sizeof (*hdr);
221+
size_t packet_size = (size + hdr_size) <= mca_btl_vader.super.btl_max_send_size ? size :
222+
mca_btl_vader.super.btl_max_send_size - hdr_size;
163223
mca_btl_vader_frag_t *frag;
164224

165-
frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, total_size,
225+
frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, packet_size + hdr_size,
166226
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
167227
if (OPAL_UNLIKELY(NULL == frag)) {
168-
return NULL;
228+
return OPAL_ERR_OUT_OF_RESOURCE;
169229
}
170230

171-
frag->base.des_cbfunc = des_cbfunc;
231+
frag->base.des_cbfunc = (mca_btl_base_completion_fn_t) mca_btl_vader_rdma_frag_advance;
172232
frag->rdma.local_address = local_address;
233+
frag->rdma.remote_address = remote_address;
173234
frag->rdma.cbfunc = cbfunc;
174235
frag->rdma.context = cbcontext;
175236
frag->rdma.cbdata = cbdata;
237+
frag->rdma.remaining = size;
238+
frag->rdma.sent = 0;
176239

177240
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
178241

@@ -183,7 +246,8 @@ mca_btl_vader_rdma_frag_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint
183246
hdr->operand[0] = operand1;
184247
hdr->operand[1] = operand2;
185248

186-
return frag;
249+
mca_btl_vader_rdma_frag_advance (btl, endpoint, frag, OPAL_SUCCESS);
250+
return OPAL_SUCCESS;
187251
}
188252

189253
#endif /* MCA_BTL_VADER_SEND_FRAG_H */

opal/mca/btl/vader/btl_vader_get.c

+3-36
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* reserved.
55
* Copyright (c) 2018 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
7+
* Copyright (c) 2019 Google, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -156,49 +157,15 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
156157
}
157158
#endif
158159

159-
static void mca_btl_vader_sc_emu_get_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
160-
mca_btl_base_descriptor_t *desc, int status)
161-
{
162-
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
163-
mca_btl_vader_sc_emu_hdr_t *hdr;
164-
void *local_address = frag->rdma.local_address;
165-
size_t len = frag->segments[0].seg_len - sizeof (*hdr);
166-
void *context = frag->rdma.context;
167-
void *cbdata = frag->rdma.cbdata;
168-
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
169-
void *data;
170-
171-
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
172-
data = (void *) (hdr + 1);
173-
174-
memcpy (local_address, data, len);
175-
176-
/* return the fragment before calling the callback */
177-
MCA_BTL_VADER_FRAG_RETURN(frag);
178-
179-
cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
180-
}
181-
182160
int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
183161
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
184162
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
185163
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
186164
{
187-
mca_btl_vader_frag_t *frag;
188-
189165
if (size > mca_btl_vader.super.btl_get_limit) {
190166
return OPAL_ERR_NOT_AVAILABLE;
191167
}
192168

193-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_GET, 0, 0, 0, order, flags, size,
194-
local_address, remote_address, cbfunc, cbcontext, cbdata,
195-
mca_btl_vader_sc_emu_get_complete);
196-
if (OPAL_UNLIKELY(NULL == frag)) {
197-
return OPAL_ERR_OUT_OF_RESOURCE;
198-
}
199-
200-
/* send is always successful */
201-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
202-
203-
return OPAL_SUCCESS;
169+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_GET, 0, 0, 0, order, flags, size,
170+
local_address, remote_address, cbfunc, cbcontext, cbdata);
204171
}

opal/mca/btl/vader/btl_vader_put.c

+3-33
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
* reserved.
55
* Copyright (c) 2014-2018 Research Organization for Information Science
66
* and Technology (RIST). All rights reserved.
7+
* Copyright (c) 2019 Google, Inc. All rights reserved.
78
* $COPYRIGHT$
89
*
910
* Additional copyrights may follow
@@ -135,21 +136,6 @@ int mca_btl_vader_put_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
135136
}
136137
#endif
137138

138-
static void mca_btl_vader_sc_emu_put_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
139-
mca_btl_base_descriptor_t *desc, int status)
140-
{
141-
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
142-
void *local_address = frag->rdma.local_address;
143-
void *context = frag->rdma.context;
144-
void *cbdata = frag->rdma.cbdata;
145-
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
146-
147-
/* return the fragment first since the callback may call put/get/amo and could use this fragment */
148-
MCA_BTL_VADER_FRAG_RETURN(frag);
149-
150-
cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
151-
}
152-
153139
/**
154140
* @brief Provides an emulated put path which uses copy-in copy-out with shared memory buffers
155141
*/
@@ -158,26 +144,10 @@ int mca_btl_vader_put_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_
158144
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
159145
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
160146
{
161-
mca_btl_vader_sc_emu_hdr_t *hdr;
162-
mca_btl_vader_frag_t *frag;
163-
164147
if (size > mca_btl_vader.super.btl_put_limit) {
165148
return OPAL_ERR_NOT_AVAILABLE;
166149
}
167150

168-
frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_PUT, 0, 0, 0, order, flags, size,
169-
local_address, remote_address, cbfunc, cbcontext, cbdata,
170-
mca_btl_vader_sc_emu_put_complete);
171-
if (OPAL_UNLIKELY(NULL == frag)) {
172-
return OPAL_ERR_OUT_OF_RESOURCE;
173-
}
174-
175-
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
176-
177-
memcpy ((void *) (hdr + 1), local_address, size);
178-
179-
/* send is always successful */
180-
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
181-
182-
return OPAL_SUCCESS;
151+
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_PUT, 0, 0, 0, order, flags, size,
152+
local_address, remote_address, cbfunc, cbcontext, cbdata);
183153
}

0 commit comments

Comments
 (0)