Skip to content

btl/vader: when using single-copy emulation fragment large rdma #6986

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 18, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 10 additions & 73 deletions opal/mca/btl/vader/btl_vader_atomic.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
/*
* Copyright (c) 2010-2017 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Google, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -16,58 +17,14 @@
#include "btl_vader_endpoint.h"
#include "btl_vader_xpmem.h"

/**
 * Descriptor completion handler installed by mca_btl_vader_emu_aop.
 *
 * Copies the user's completion callback and its arguments out of the fragment,
 * gives the fragment back, and only then invokes the callback with @p status.
 * Nothing is written to the local address here.
 */
static void mca_btl_vader_sc_emu_aop_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *desc, int status)
{
    mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
    mca_btl_base_rdma_completion_fn_t user_cb;
    void *origin;
    void *user_ctx;
    void *user_data;

    /* snapshot everything the callback needs while the fragment is still ours */
    origin = frag->rdma.local_address;
    user_ctx = frag->rdma.context;
    user_data = frag->rdma.cbdata;
    user_cb = frag->rdma.cbfunc;

    /* hand the fragment back before the callback runs: the callback may start
     * another put/get/amo and could be given this very fragment */
    MCA_BTL_VADER_FRAG_RETURN(frag);

    user_cb (btl, endpoint, origin, NULL, user_ctx, user_data, status);
}

/**
 * Emulated atomic operation without a local result buffer.
 *
 * Builds an MCA_BTL_VADER_OP_ATOMIC fragment carrying @p op and @p operand for
 * @p remote_address (the local address passed to the allocator is NULL) and
 * sends it with tag MCA_BTL_TAG_VADER. @p remote_handle is not referenced.
 *
 * @return OPAL_SUCCESS once the fragment has been handed to the send path,
 *         OPAL_ERR_OUT_OF_RESOURCE if no fragment could be allocated.
 */
int mca_btl_vader_emu_aop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
uint64_t remote_address, mca_btl_base_registration_handle_t *remote_handle,
mca_btl_base_atomic_op_t op, uint64_t operand, int flags, int order,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
    mca_btl_vader_frag_t *frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC,
                                                                operand, 0, op, 0, order, flags, NULL,
                                                                remote_address, cbfunc, cbcontext, cbdata,
                                                                mca_btl_vader_sc_emu_aop_complete);

    if (OPAL_UNLIKELY(NULL == frag)) {
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* the send result is deliberately discarded: the send is treated as always succeeding */
    (void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);

    return OPAL_SUCCESS;
}

/*
 * Descriptor completion callback for the emulated fetching atomics (it is the
 * callback passed by mca_btl_vader_emu_afop and mca_btl_vader_emu_acswap).
 *
 * Stores hdr->operand[0] from the fragment's first segment into the saved local
 * address as a 64-bit value, returns the fragment, then calls the saved RDMA
 * completion callback with the given status.
 *
 * NOTE(review): the last two statements before the closing brace (the `size`
 * declaration and `return mca_btl_vader_rdma_frag_start (...)`) use names that
 * are not declared in this function (flags, operand, op, order, remote_address,
 * cbcontext) and return a value from a void function. They look like added
 * lines of this diff that belong to mca_btl_vader_emu_aop rather than to this
 * callback -- confirm against the patched file.
 */
static void mca_btl_vader_sc_emu_afop_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *desc, int status)
{
mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
mca_btl_vader_sc_emu_hdr_t *hdr;
/* copy the callback and its arguments out of the fragment before it is returned below */
void *local_address = frag->rdma.local_address;
void *context = frag->rdma.context;
void *cbdata = frag->rdma.cbdata;
mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;

/* the emulation header sits at the start of the first segment */
hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;

/* always written as 64 bits here, whatever width the operation was issued with */
*((int64_t *) frag->rdma.local_address) = hdr->operand[0];

/* return the fragment first since the callback may call put/get/amo and could use this fragment */
MCA_BTL_VADER_FRAG_RETURN(frag);

cbfunc (btl, endpoint, local_address, NULL, context, cbdata, status);
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, order, flags,
size, NULL, remote_address, cbfunc, cbcontext, cbdata);
}

int mca_btl_vader_emu_afop (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
Expand All @@ -76,37 +33,17 @@ int mca_btl_vader_emu_afop (struct mca_btl_base_module_t *btl, struct mca_btl_ba
uint64_t operand, int flags, int order, mca_btl_base_rdma_completion_fn_t cbfunc,
void *cbcontext, void *cbdata)
{
mca_btl_vader_frag_t *frag;

frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, 0, order, flags,
local_address, remote_address, cbfunc, cbcontext, cbdata,
mca_btl_vader_sc_emu_afop_complete);
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}

/* send is always successful */
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);

return OPAL_SUCCESS;
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_ATOMIC, operand, 0, op, order, flags,
size, local_address, remote_address, cbfunc, cbcontext, cbdata);
}

/*
 * Emulated compare-and-swap.
 *
 * Allocates an MCA_BTL_VADER_OP_CSWAP fragment carrying `compare` and `value`
 * as its two operands, installs mca_btl_vader_sc_emu_afop_complete as the
 * descriptor callback, and sends it with tag MCA_BTL_TAG_VADER.
 * local_handle and remote_handle are not referenced in the body.
 *
 * Returns OPAL_ERR_OUT_OF_RESOURCE when no fragment could be allocated and
 * OPAL_SUCCESS otherwise.
 *
 * NOTE(review): everything after the first `return OPAL_SUCCESS;` is
 * unreachable as rendered. It appears to be the replacement body from this
 * diff, which picks a 4- or 8-byte size from MCA_BTL_ATOMIC_FLAG_32BIT and
 * delegates to mca_btl_vader_rdma_frag_start -- confirm against the patched file.
 */
int mca_btl_vader_emu_acswap (struct mca_btl_base_module_t *btl, struct mca_btl_base_endpoint_t *endpoint,
void *local_address, uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, uint64_t compare, uint64_t value, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_vader_frag_t *frag;

frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_CSWAP, compare, value, 0, 0, order,
flags, local_address, remote_address, cbfunc, cbcontext, cbdata,
mca_btl_vader_sc_emu_afop_complete);
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}

/* send is always successful */
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);

return OPAL_SUCCESS;
size_t size = (flags & MCA_BTL_ATOMIC_FLAG_32BIT) ? 4 : 8;
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_CSWAP, compare, value, 0, order,
flags, size, local_address, remote_address, cbfunc, cbcontext, cbdata);
}
7 changes: 1 addition & 6 deletions opal/mca/btl/vader/btl_vader_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2018 Triad National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Google, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -477,12 +478,6 @@ static void mca_btl_vader_check_single_copy (void)
mca_btl_vader.super.btl_get = NULL;
mca_btl_vader.super.btl_put = NULL;
}

if (MCA_BTL_VADER_EMUL == mca_btl_vader_component.single_copy_mechanism) {
/* limit to the maximum fragment size */
mca_btl_vader.super.btl_put_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
mca_btl_vader.super.btl_get_limit = mca_btl_vader.super.btl_max_send_size - sizeof (mca_btl_vader_sc_emu_hdr_t);
}
}

/*
Expand Down
82 changes: 73 additions & 9 deletions opal/mca/btl/vader/btl_vader_frag.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
* Copyright (c) 2009 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Google, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -91,9 +92,12 @@ struct mca_btl_vader_frag_t {
/** rdma callback data */
struct mca_btl_vader_rdma_cbdata_t {
void *local_address;
uint64_t remote_address;
mca_btl_base_rdma_completion_fn_t cbfunc;
void *context;
void *cbdata;
size_t remaining;
size_t sent;
} rdma;
};

Expand Down Expand Up @@ -151,28 +155,87 @@ static inline void mca_btl_vader_frag_complete (mca_btl_vader_frag_t *frag) {

int mca_btl_vader_frag_init (opal_free_list_item_t *item, void *ctx);

static inline mca_btl_vader_frag_t *
mca_btl_vader_rdma_frag_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, int type,
/**
 * @brief Advance an emulated single-copy put/get/atomic by one packet.
 *
 * Called once directly to kick the operation off (frag->rdma.sent == 0) and
 * then again each time the fragment's send completes. Each call:
 *   1. consumes the packet that just completed (copies GET payload, or the
 *      atomic/cswap result, into the local buffer);
 *   2. if bytes remain, advances the local/remote cursors, refills the
 *      fragment for the next packet and re-sends it;
 *   3. otherwise returns the fragment and invokes the user's completion
 *      callback with the address the operation was started with.
 *
 * @param[in] btl       BTL module, forwarded to the send path and the callback
 * @param[in] endpoint  peer endpoint, forwarded likewise
 * @param[in] frag      fragment carrying the operation state in frag->rdma
 * @param[in] status    forwarded unchanged to the user callback; it is not
 *                      inspected here
 */
static inline void mca_btl_vader_rdma_frag_advance (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_vader_frag_t *frag, int status)
{
    mca_btl_vader_sc_emu_hdr_t *hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
    mca_btl_base_rdma_completion_fn_t cbfunc = frag->rdma.cbfunc;
    size_t hdr_size = sizeof (*hdr);
    /* payload size of the packet that just completed; 0 on the initial call */
    size_t len = frag->rdma.sent ? frag->segments[0].seg_len - hdr_size : 0;
    void *context = frag->rdma.context;
    void *cbdata = frag->rdma.cbdata;
    void *data = (void *) (hdr + 1);
    void *origin;

    if (frag->rdma.sent) {
        if (MCA_BTL_VADER_OP_GET == hdr->type) {
            memcpy (frag->rdma.local_address, data, len);
        } else if ((MCA_BTL_VADER_OP_ATOMIC == hdr->type || MCA_BTL_VADER_OP_CSWAP == hdr->type) &&
                   frag->rdma.local_address) {
            /* the result comes back in operand[0]; store it at the width that was sent */
            if (8 == len) {
                *((int64_t *) frag->rdma.local_address) = hdr->operand[0];
            } else {
                *((int32_t *) frag->rdma.local_address) = (int32_t) hdr->operand[0];
            }
        }
    }

    if (frag->rdma.remaining) {
        /* as much as fits in one send after the header */
        size_t packet_size = (frag->rdma.remaining + hdr_size) <= mca_btl_vader.super.btl_max_send_size ?
            frag->rdma.remaining : mca_btl_vader.super.btl_max_send_size - hdr_size;

        /* advance the local and remote pointers */
        frag->rdma.local_address = (void *)((uintptr_t) frag->rdma.local_address + len);
        frag->rdma.remote_address += len;

        if (MCA_BTL_VADER_OP_PUT == hdr->type) {
            /* copy the next block into the fragment buffer */
            memcpy (data, frag->rdma.local_address, packet_size);
        }

        hdr->addr = frag->rdma.remote_address;
        /* clear out the complete flag before sending the fragment again */
        frag->hdr->flags &= ~MCA_BTL_VADER_FLAG_COMPLETE;
        frag->segments[0].seg_len = packet_size + sizeof (*hdr);
        frag->rdma.sent += packet_size;
        frag->rdma.remaining -= packet_size;

        /* send is always successful */
        (void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);
        return;
    }

    /* The local cursor points at the start of the last packet, i.e. it has
     * advanced by (sent - len) bytes, so undo exactly that much to recover the
     * address the operation was started with. Compute it before the fragment is
     * returned: once returned, the fragment may be reused and must not be read. */
    origin = (void *)((uintptr_t) frag->rdma.local_address + len - frag->rdma.sent);

    /* return the fragment before calling the callback */
    MCA_BTL_VADER_FRAG_RETURN(frag);
    cbfunc (btl, endpoint, origin, NULL, context, cbdata, status);
}

static inline int
mca_btl_vader_rdma_frag_start (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, int type,
uint64_t operand1, uint64_t operand2, mca_btl_base_atomic_op_t op, int order,
int flags, size_t size, void *local_address, int64_t remote_address,
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext,
void *cbdata, mca_btl_base_completion_fn_t des_cbfunc)
mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_vader_sc_emu_hdr_t *hdr;
size_t total_size = size + sizeof (*hdr);
size_t hdr_size = sizeof (*hdr);
size_t packet_size = (size + hdr_size) <= mca_btl_vader.super.btl_max_send_size ? size :
mca_btl_vader.super.btl_max_send_size - hdr_size;
mca_btl_vader_frag_t *frag;

frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, total_size,
frag = (mca_btl_vader_frag_t *) mca_btl_vader_alloc (btl, endpoint, order, packet_size + hdr_size,
MCA_BTL_DES_SEND_ALWAYS_CALLBACK);
if (OPAL_UNLIKELY(NULL == frag)) {
return NULL;
return OPAL_ERR_OUT_OF_RESOURCE;
}

frag->base.des_cbfunc = des_cbfunc;
frag->base.des_cbfunc = (mca_btl_base_completion_fn_t) mca_btl_vader_rdma_frag_advance;
frag->rdma.local_address = local_address;
frag->rdma.remote_address = remote_address;
frag->rdma.cbfunc = cbfunc;
frag->rdma.context = cbcontext;
frag->rdma.cbdata = cbdata;
frag->rdma.remaining = size;
frag->rdma.sent = 0;

hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;

Expand All @@ -183,7 +246,8 @@ mca_btl_vader_rdma_frag_alloc (mca_btl_base_module_t *btl, mca_btl_base_endpoint
hdr->operand[0] = operand1;
hdr->operand[1] = operand2;

return frag;
mca_btl_vader_rdma_frag_advance (btl, endpoint, frag, OPAL_SUCCESS);
return OPAL_SUCCESS;
}

#endif /* MCA_BTL_VADER_SEND_FRAG_H */
39 changes: 3 additions & 36 deletions opal/mca/btl/vader/btl_vader_get.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* reserved.
* Copyright (c) 2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2019 Google, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -156,49 +157,15 @@ int mca_btl_vader_get_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
}
#endif

/**
 * Descriptor completion handler for an emulated get.
 *
 * The payload follows the emulation header in the fragment's first segment.
 * It is copied to the saved local address, the fragment is given back, and
 * the user's completion callback is then invoked with @p status.
 */
static void mca_btl_vader_sc_emu_get_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *desc, int status)
{
    mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
    mca_btl_vader_sc_emu_hdr_t *emu_hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;
    size_t payload_len = frag->segments[0].seg_len - sizeof (*emu_hdr);
    mca_btl_base_rdma_completion_fn_t user_cb = frag->rdma.cbfunc;
    void *dest = frag->rdma.local_address;
    void *user_ctx = frag->rdma.context;
    void *user_data = frag->rdma.cbdata;

    /* the fetched bytes start right after the header */
    memcpy (dest, (void *) (emu_hdr + 1), payload_len);

    /* give the fragment back before running the callback */
    MCA_BTL_VADER_FRAG_RETURN(frag);

    user_cb (btl, endpoint, dest, NULL, user_ctx, user_data, status);
}

/*
 * Emulated get.
 *
 * As rendered: rejects requests larger than mca_btl_vader.super.btl_get_limit
 * with OPAL_ERR_NOT_AVAILABLE, allocates an MCA_BTL_VADER_OP_GET fragment with
 * mca_btl_vader_sc_emu_get_complete as its descriptor callback, and sends it
 * with tag MCA_BTL_TAG_VADER. Returns OPAL_ERR_OUT_OF_RESOURCE when no
 * fragment could be allocated and OPAL_SUCCESS otherwise.
 * local_handle and remote_handle are not referenced in the body.
 *
 * NOTE(review): the final `return mca_btl_vader_rdma_frag_start (...)` follows
 * an unconditional `return OPAL_SUCCESS;` and is unreachable as rendered. It
 * appears to be the replacement body from this diff -- confirm against the
 * patched file.
 */
int mca_btl_vader_get_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint, void *local_address,
uint64_t remote_address, mca_btl_base_registration_handle_t *local_handle,
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_vader_frag_t *frag;

/* a request above the get limit is refused rather than split */
if (size > mca_btl_vader.super.btl_get_limit) {
return OPAL_ERR_NOT_AVAILABLE;
}

frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_GET, 0, 0, 0, order, flags, size,
local_address, remote_address, cbfunc, cbcontext, cbdata,
mca_btl_vader_sc_emu_get_complete);
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}

/* send is always successful */
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);

return OPAL_SUCCESS;
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_GET, 0, 0, 0, order, flags, size,
local_address, remote_address, cbfunc, cbcontext, cbdata);
}
36 changes: 3 additions & 33 deletions opal/mca/btl/vader/btl_vader_put.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
* reserved.
* Copyright (c) 2014-2018 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2019 Google, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -135,21 +136,6 @@ int mca_btl_vader_put_knem (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t
}
#endif

/**
 * Descriptor completion handler for an emulated put.
 *
 * Copies the user's completion callback and its arguments out of the fragment,
 * gives the fragment back, and only then invokes the callback with @p status.
 */
static void mca_btl_vader_sc_emu_put_complete (mca_btl_base_module_t *btl, mca_btl_base_endpoint_t *endpoint,
mca_btl_base_descriptor_t *desc, int status)
{
    mca_btl_vader_frag_t *frag = (mca_btl_vader_frag_t *) desc;
    mca_btl_base_rdma_completion_fn_t user_cb;
    void *origin;
    void *user_ctx;
    void *user_data;

    /* snapshot everything the callback needs while the fragment is still ours */
    origin = frag->rdma.local_address;
    user_ctx = frag->rdma.context;
    user_data = frag->rdma.cbdata;
    user_cb = frag->rdma.cbfunc;

    /* hand the fragment back before the callback runs: the callback may start
     * another put/get/amo and could be given this very fragment */
    MCA_BTL_VADER_FRAG_RETURN(frag);

    user_cb (btl, endpoint, origin, NULL, user_ctx, user_data, status);
}

/**
* @brief Provides an emulated put path which uses copy-in copy-out with shared memory buffers
*/
Expand All @@ -158,26 +144,10 @@ int mca_btl_vader_put_sc_emu (mca_btl_base_module_t *btl, mca_btl_base_endpoint_
mca_btl_base_registration_handle_t *remote_handle, size_t size, int flags,
int order, mca_btl_base_rdma_completion_fn_t cbfunc, void *cbcontext, void *cbdata)
{
mca_btl_vader_sc_emu_hdr_t *hdr;
mca_btl_vader_frag_t *frag;

if (size > mca_btl_vader.super.btl_put_limit) {
return OPAL_ERR_NOT_AVAILABLE;
}

frag = mca_btl_vader_rdma_frag_alloc (btl, endpoint, MCA_BTL_VADER_OP_PUT, 0, 0, 0, order, flags, size,
local_address, remote_address, cbfunc, cbcontext, cbdata,
mca_btl_vader_sc_emu_put_complete);
if (OPAL_UNLIKELY(NULL == frag)) {
return OPAL_ERR_OUT_OF_RESOURCE;
}

hdr = (mca_btl_vader_sc_emu_hdr_t *) frag->segments[0].seg_addr.pval;

memcpy ((void *) (hdr + 1), local_address, size);

/* send is always successful */
(void) mca_btl_vader_send (btl, endpoint, &frag->base, MCA_BTL_TAG_VADER);

return OPAL_SUCCESS;
return mca_btl_vader_rdma_frag_start (btl, endpoint, MCA_BTL_VADER_OP_PUT, 0, 0, 0, order, flags, size,
local_address, remote_address, cbfunc, cbcontext, cbdata);
}