Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion ompi/mca/btl/usnic/btl_usnic.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -181,6 +181,9 @@ typedef struct opal_btl_usnic_component_t {
/** max completion queue entries per module */
int32_t cq_num;

/** max number of entries in AV EQ */
int32_t av_eq_num;

/** retrans characteristics */
int retrans_timeout;

Expand Down
5 changes: 2 additions & 3 deletions ompi/mca/btl/usnic/btl_usnic_ack.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -207,8 +207,7 @@ opal_btl_usnic_ack_send(
/* send the seq of the lowest item in the window that
we've received */
ack->ss_base.us_btl_header->ack_seq =
endpoint->endpoint_next_contig_seq_to_recv - 1;

SEQ_DIFF(endpoint->endpoint_next_contig_seq_to_recv, 1);
ack->ss_len = sizeof(opal_btl_usnic_btl_header_t);

#if MSGDEBUG1
Expand Down
4 changes: 2 additions & 2 deletions ompi/mca/btl/usnic/btl_usnic_ack.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -92,7 +92,7 @@ opal_btl_usnic_piggyback_ack(
if (endpoint->endpoint_ack_needed) {
opal_btl_usnic_remove_from_endpoints_needing_ack(endpoint);
sseg->ss_base.us_btl_header->ack_seq =
endpoint->endpoint_next_contig_seq_to_recv - 1;
SEQ_DIFF(endpoint->endpoint_next_contig_seq_to_recv, 1);
sseg->ss_base.us_btl_header->ack_present = 1;
#if MSGDEBUG1
opal_output(0, "Piggy-backing ACK for sequence %"UDSEQ"\n",
Expand Down
5 changes: 3 additions & 2 deletions ompi/mca/btl/usnic/btl_usnic_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -956,11 +956,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
/* Output all of this module's values. */
const char *devname = module->fabric_info->fabric_attr->name;
opal_output_verbose(5, USNIC_OUT,
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d",
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveq=%d",
devname,
module->sd_num,
module->rd_num,
module->cq_num);
module->cq_num,
module->av_eq_num);
opal_output_verbose(5, USNIC_OUT,
"btl:usnic: %s priority MTU = %" PRIsize_t,
devname,
Expand Down
9 changes: 7 additions & 2 deletions ompi/mca/btl/usnic/btl_usnic_mca.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2015 Intel, Inc. All rights reserved.
Expand Down Expand Up @@ -162,6 +162,7 @@ int opal_btl_usnic_component_register(void)
static int prio_sd_num;
static int prio_rd_num;
static int cq_num;
static int av_eq_num;
static int udp_port_base;
static int max_tiny_msg_size;
static int eager_limit;
Expand Down Expand Up @@ -235,12 +236,16 @@ int opal_btl_usnic_component_register(void)
-1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
mca_btl_usnic_component.cq_num = (int32_t) cq_num;

CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution",
1024, &av_eq_num, REGINT_GE_ONE, OPAL_INFO_LVL_5));
mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num;

CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)",
0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5));
mca_btl_usnic_component.udp_port_base = (int) udp_port_base;

CHECK(reg_int("retrans_timeout", "Number of microseconds before retransmitting a frame",
1000, &mca_btl_usnic_component.retrans_timeout,
5000, &mca_btl_usnic_component.retrans_timeout,
REGINT_GE_ONE, OPAL_INFO_LVL_5));

CHECK(reg_int("priority_limit", "Max size of \"priority\" messages (0 = use pre-set defaults; depends on number and type of devices available)",
Expand Down
127 changes: 97 additions & 30 deletions ompi/mca/btl/usnic/btl_usnic_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2014 Intel, Inc. All rights reserved
Expand Down Expand Up @@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,


/*
* Loop over all procs sent to us in add_procs and see if we want to
* add a proc/endpoint for them.
* Loop over a block of procs sent to us in add_procs and see if we
* want to add a proc/endpoint for them.
*/
static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
size_t nprocs,
opal_proc_t **procs,
mca_btl_base_endpoint_t **endpoints)
static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
size_t block_offset,
size_t block_len,
opal_proc_t **procs,
mca_btl_base_endpoint_t **endpoints)
{
int rc;
opal_proc_t* my_proc;
Expand All @@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
return OPAL_ERR_OUT_OF_RESOURCE;
}

/* Loop over the procs we were given */
for (size_t i = 0; i < nprocs; i++) {
/* Loop over a block in the procs we were given */
for (size_t i = block_offset; i < (block_offset + block_len); i++) {
struct opal_proc_t* opal_proc = procs[i];
opal_btl_usnic_proc_t* usnic_proc;
mca_btl_base_endpoint_t* usnic_endpoint;
Expand Down Expand Up @@ -195,22 +196,22 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
* invoked. Go reap them all.
*/
static int
add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
size_t array_len,
struct mca_btl_base_endpoint_t **endpoints)
add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
size_t block_offset,
size_t block_len,
struct mca_btl_base_endpoint_t **endpoints)
{
int ret = OPAL_SUCCESS;
int num_left;
size_t i, channel;
uint32_t event;
struct fi_eq_entry entry;
struct fi_eq_err_entry err_entry;

bool error_occurred = false;

/* compute num fi_av_insert completions we are waiting for */
num_left = 0;
for (i = 0; i < array_len; ++i) {
for (i = block_offset; i < (block_offset + block_len); ++i) {
if (NULL != endpoints[i]) {
num_left += USNIC_NUM_CHANNELS;
}
Expand Down Expand Up @@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
We therefore only want to print a pretty
warning about (and OBJ_RELEASE) that endpoint
the *first* time it is reported. */
for (i = 0; i < array_len; ++i) {
for (i = block_offset; i < (block_offset + block_len); ++i) {
if (endpoints[i] == context->endpoint) {
add_procs_warn_unreachable(module,
context->endpoint);
Expand Down Expand Up @@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
- If an otherwise-valid endpoint has no dest, that means we timed
out trying to resolve it, so just release that endpoint. */
size_t num_endpoints_created = 0;
for (i = 0; i < array_len; i++) {
for (i = block_offset; i < (block_offset + block_len); i++) {
if (NULL != endpoints[i]) {
bool happy;

Expand Down Expand Up @@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
return ret;
}

/*
 * Create endpoints for the procs we were given in add_procs.
 *
 * Each proc needs USNIC_NUM_CHANNELS AV inserts, and every insert
 * produces an event on the libfabric AV EQ.  To avoid overrunning
 * that EQ, the procs are processed in fixed-size blocks: for each
 * block we first create the endpoints (which starts the AV inserts)
 * and then reap all of that block's AV insert completions before
 * moving on to the next block.
 *
 * module:    the usnic module; module->av_eq_size is the AV EQ size
 * nprocs:    number of entries in procs[] / endpoints[]
 * procs:     procs passed to add_procs
 * endpoints: output array, indexed the same way as procs[]
 *
 * Returns OPAL_SUCCESS if every block was processed (including the
 * trivial nprocs == 0 case), OPAL_ERR_OUT_OF_RESOURCE if the AV EQ
 * is too small to hold even one proc's worth of inserts plus the
 * reserved slots, or the first non-success code returned by one of
 * the per-block helpers.
 */
static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module,
                                      size_t nprocs,
                                      struct opal_proc_t **procs,
                                      struct mca_btl_base_endpoint_t** endpoints)
{
    /* We need to ensure that we don't overrun the libfabric AV EQ.
       Divide up all the peer address resolutions we need to do into a
       series of blocks; insert and complete each block before moving
       to the next (note: if performance mandates it, we can move to a
       sliding window style of AV inserts to get better concurrency of
       AV resolution). */

    /* Leave a few empty slots in the AV EQ, just for good measure */
    const size_t reserved_slots = 8;

    /* Beyond the reserved slots, the EQ must be able to hold at least
       one proc's worth of AV inserts.  Otherwise the per-block proc
       count below would be zero and no progress could ever be made
       (the previous "< 8" check let sizes through that led to a
       division by zero or to empty blocks). */
    const size_t min_eq_size = reserved_slots + (size_t) USNIC_NUM_CHANNELS;
    if (module->av_eq_size < min_eq_size) {
        opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small",
                       true,
                       opal_process_info.nodename,
                       module->av_eq_size,
                       (int) min_eq_size);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }

    /* Number of procs whose AV inserts fit in the usable part of the
       EQ at one time.  Guaranteed >= 1 by the check above. */
    const size_t usable_slots = module->av_eq_size - reserved_slots;
    const size_t block_len = usable_slots / (size_t) USNIC_NUM_CHANNELS;

    /* Per above, loop over creating the endpoints so that we do not
       overrun the libfabric AV EQ.  Iterating on the proc offset
       (rather than on a precomputed block count) guarantees that
       every proc is covered exactly once, including a partial final
       block, and that nprocs == 0 is a no-op. */
    for (size_t block_offset = 0; block_offset < nprocs;
         block_offset += block_len) {
        /* Adjust for the last (possibly partial) block */
        size_t this_block_len = nprocs - block_offset;
        if (this_block_len > block_len) {
            this_block_len = block_len;
        }

        /* First, create endpoints (and procs, if they're not already
           created) for the usnic-reachable procs we were given. */
        int rc = add_procs_block_create_endpoints(module,
                                                  block_offset,
                                                  this_block_len,
                                                  procs, endpoints);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }

        /* For each endpoint that was created, we initiated the
           process to create NUM_CHANNELS fi_addrs.  Go finish all of
           those.  This will be the final determination of whether we
           can use the endpoint or not because we'll find out if each
           endpoint is reachable or not. */
        rc = add_procs_block_reap_fi_av_inserts(module,
                                                block_offset,
                                                this_block_len,
                                                endpoints);
        if (OPAL_SUCCESS != rc) {
            return rc;
        }
    }

    return OPAL_SUCCESS;
}

/*
* Add procs to this BTL module, receiving endpoint information from
* the modex. This is done in 2 phases:
Expand All @@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
int rc;

/* First, create endpoints (and procs, if they're not already
created) for all the usnic-reachable procs we were given. */
/* Go create the endpoints (including all relevant address
resolution) */
rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
if (OPAL_SUCCESS != rc) {
goto fail;
}

/* For each endpoint that was created, we initiated the process to
create NUM_CHANNELS fi_addrs. Go finish all of those. This
will be the final determination of whether we can use the
endpoint or not because we'll find out if each endpoint is
reachable or not. */
rc = add_procs_reap_fi_av_inserts(module, nprocs, endpoints);
if (OPAL_SUCCESS != rc) {
goto fail;
}

/* Find all the endpoints with a complete set of USD destinations
and mark them as reachable */
for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
Expand Down Expand Up @@ -1205,7 +1269,7 @@ usnic_send(
/* assign length */
sseg->ss_len = sizeof(opal_btl_usnic_btl_header_t) + frag->sf_size;

sseg->ss_channel = USNIC_PRIORITY_CHANNEL;
sseg->ss_channel = USNIC_DATA_CHANNEL;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So before this change, all sends were going over the priority channel (is that for acks?)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just small sends. See old line 1178 (a few lines above this one):

if (frag->sf_base.uf_type == OPAL_BTL_USNIC_FRAG_SMALL_SEND &&
             frag->sf_ack_bytes_left < module->max_tiny_payload &&
// ...etc.

sseg->ss_base.us_btl_header->tag = tag;
#if MSGDEBUG1
opal_output(0, "INLINE send, sseg=%p", (void *)sseg);
Expand Down Expand Up @@ -2018,12 +2082,15 @@ static int init_channels(opal_btl_usnic_module_t *module)
}

memset(&eq_attr, 0, sizeof(eq_attr));
eq_attr.size = 1024;
eq_attr.size = module->av_eq_num;
eq_attr.wait_obj = FI_WAIT_UNSPEC;
rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
if (rc != OPAL_SUCCESS) {
goto destroy;
}
// Save the size of the created EQ
module->av_eq_size = eq_attr.size;

eq_attr.wait_obj = FI_WAIT_FD;
rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
if (rc != OPAL_SUCCESS) {
Expand Down
5 changes: 4 additions & 1 deletion ompi/mca/btl/usnic/btl_usnic_module.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
* All rights reserved.
* Copyright (c) 2006 Sandia National Laboratories. All rights
* reserved.
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -110,6 +110,8 @@ typedef struct opal_btl_usnic_module_t {
struct fid_eq *av_eq;
struct fid_av *av;

size_t av_eq_size;

mca_btl_base_module_error_cb_fn_t pml_error_callback;

/* Information about the events */
Expand All @@ -127,6 +129,7 @@ typedef struct opal_btl_usnic_module_t {
int sd_num;
int rd_num;
int cq_num;
int av_eq_num;
int prio_sd_num;
int prio_rd_num;

Expand Down
6 changes: 5 additions & 1 deletion ompi/mca/btl/usnic/btl_usnic_send.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -76,6 +76,8 @@ opal_btl_usnic_post_segment(
sseg->ss_len);
#endif

assert(channel_id == USNIC_DATA_CHANNEL);

/* Send the segment */
ret = fi_send(channel->ep,
sseg->ss_ptr,
Expand Down Expand Up @@ -126,6 +128,8 @@ opal_btl_usnic_post_ack(
sseg->ss_len);
#endif

assert(channel_id == USNIC_PRIORITY_CHANNEL);

ret = fi_send(channel->ep,
sseg->ss_ptr,
sseg->ss_len + mca_btl_usnic_component.prefix_send_offset,
Expand Down
7 changes: 4 additions & 3 deletions ompi/mca/btl/usnic/btl_usnic_stats.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -143,8 +143,9 @@ void opal_btl_usnic_print_stats(
/* Number of un-acked sends (i.e., sends for which we're
still waiting for ACK) */
send_unacked =
endpoint->endpoint_next_seq_to_send -
endpoint->endpoint_ack_seq_rcvd - 1;
SEQ_DIFF(endpoint->endpoint_next_seq_to_send,
SEQ_DIFF(endpoint->endpoint_ack_seq_rcvd, 1));

if (send_unacked > su_max) su_max = send_unacked;
if (send_unacked < su_min) su_min = send_unacked;

Expand Down
Loading