From 18177f32f123495468bf7fe08579796577c91999 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 30 Jan 2016 07:34:20 -0800 Subject: [PATCH 1/6] usnic: ensure all messages are sent on the data channel Messages should go on the data channel, even if they're short. Only ACKs go on the priority channel. (cherry picked from commit open-mpi/ompi@4de4a263f5dea060ff7fca16c7bf01ebf14d2d13) --- ompi/mca/btl/usnic/btl_usnic_module.c | 4 ++-- ompi/mca/btl/usnic/btl_usnic_send.h | 6 +++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/ompi/mca/btl/usnic/btl_usnic_module.c b/ompi/mca/btl/usnic/btl_usnic_module.c index 6105fd4281..4b7851fe74 100644 --- a/ompi/mca/btl/usnic/btl_usnic_module.c +++ b/ompi/mca/btl/usnic/btl_usnic_module.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved @@ -1205,7 +1205,7 @@ usnic_send( /* assign length */ sseg->ss_len = sizeof(opal_btl_usnic_btl_header_t) + frag->sf_size; - sseg->ss_channel = USNIC_PRIORITY_CHANNEL; + sseg->ss_channel = USNIC_DATA_CHANNEL; sseg->ss_base.us_btl_header->tag = tag; #if MSGDEBUG1 opal_output(0, "INLINE send, sseg=%p", (void *)sseg); diff --git a/ompi/mca/btl/usnic/btl_usnic_send.h b/ompi/mca/btl/usnic/btl_usnic_send.h index 796008d2f7..2020544f20 100644 --- a/ompi/mca/btl/usnic/btl_usnic_send.h +++ b/ompi/mca/btl/usnic/btl_usnic_send.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -76,6 +76,8 @@ opal_btl_usnic_post_segment( sseg->ss_len); #endif + assert(channel_id == USNIC_DATA_CHANNEL); + /* Send the segment */ ret = fi_send(channel->ep, sseg->ss_ptr, @@ -126,6 +128,8 @@ opal_btl_usnic_post_ack( sseg->ss_len); #endif + assert(channel_id == USNIC_PRIORITY_CHANNEL); + ret = fi_send(channel->ep, sseg->ss_ptr, sseg->ss_len + mca_btl_usnic_component.prefix_send_offset, From ff6c26103caa3a6e06e0ff946056309de5a0e26b Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 30 Jan 2016 07:33:19 -0800 Subject: [PATCH 2/6] usnic: fix wraparound sequence number issue Sequence numbers will wrap around; it is not sufficient to check for (seq-1) -- must use the SEQ_DIFF macro to properly handle the wraparound. This bug wasn't serious; it just meant we might retransmit one or two extra times when retransmits were triggered and the sequence numbers wrapped around their sliding windows. (cherry picked from commit open-mpi/ompi@d624e0d60fc52f61747d431a8deaa1afb435cd2f) --- ompi/mca/btl/usnic/btl_usnic_ack.c | 5 ++--- ompi/mca/btl/usnic/btl_usnic_ack.h | 4 ++-- ompi/mca/btl/usnic/btl_usnic_stats.c | 7 ++++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/ompi/mca/btl/usnic/btl_usnic_ack.c b/ompi/mca/btl/usnic/btl_usnic_ack.c index 3a6ae5baac..4616516f52 100644 --- a/ompi/mca/btl/usnic/btl_usnic_ack.c +++ b/ompi/mca/btl/usnic/btl_usnic_ack.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
* $COPYRIGHT$ * * Additional copyrights may follow @@ -207,8 +207,7 @@ opal_btl_usnic_ack_send( /* send the seq of the lowest item in the window that we've received */ ack->ss_base.us_btl_header->ack_seq = - endpoint->endpoint_next_contig_seq_to_recv - 1; - + SEQ_DIFF(endpoint->endpoint_next_contig_seq_to_recv, 1); ack->ss_len = sizeof(opal_btl_usnic_btl_header_t); #if MSGDEBUG1 diff --git a/ompi/mca/btl/usnic/btl_usnic_ack.h b/ompi/mca/btl/usnic/btl_usnic_ack.h index 0aaf8306d7..1ef85544c0 100644 --- a/ompi/mca/btl/usnic/btl_usnic_ack.h +++ b/ompi/mca/btl/usnic/btl_usnic_ack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -92,7 +92,7 @@ opal_btl_usnic_piggyback_ack( if (endpoint->endpoint_ack_needed) { opal_btl_usnic_remove_from_endpoints_needing_ack(endpoint); sseg->ss_base.us_btl_header->ack_seq = - endpoint->endpoint_next_contig_seq_to_recv - 1; + SEQ_DIFF(endpoint->endpoint_next_contig_seq_to_recv, 1); sseg->ss_base.us_btl_header->ack_present = 1; #if MSGDEBUG1 opal_output(0, "Piggy-backing ACK for sequence %"UDSEQ"\n", diff --git a/ompi/mca/btl/usnic/btl_usnic_stats.c b/ompi/mca/btl/usnic/btl_usnic_stats.c index 18f24aa7c5..2f8d110e36 100644 --- a/ompi/mca/btl/usnic/btl_usnic_stats.c +++ b/ompi/mca/btl/usnic/btl_usnic_stats.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -143,8 +143,9 @@ void opal_btl_usnic_print_stats( /* Number of un-acked sends (i.e., sends for which we're still waiting for ACK) */ send_unacked = - endpoint->endpoint_next_seq_to_send - - endpoint->endpoint_ack_seq_rcvd - 1; + SEQ_DIFF(endpoint->endpoint_next_seq_to_send, + SEQ_DIFF(endpoint->endpoint_ack_seq_rcvd, 1)); + if (send_unacked > su_max) su_max = send_unacked; if (send_unacked < su_min) su_min = send_unacked; From db13b3a0bf5eb3a0d14b13eed95b453d723f3ea3 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 30 Jan 2016 06:58:46 -0800 Subject: [PATCH 3/6] usnic: don't overrun the fi_av_insert() EQ Add endpoints in a blocked manner so that we don't overrun the fi_av_insert() event queue. Also make the AV EQ length an MCA param, and report it in mca_btl_base_verbose >=5 output. (cherry picked from commit open-mpi/ompi@db825abc003d24b787005191b58af692f6ff2c77) --- ompi/mca/btl/usnic/btl_usnic.h | 5 +- ompi/mca/btl/usnic/btl_usnic_component.c | 5 +- ompi/mca/btl/usnic/btl_usnic_mca.c | 7 +- ompi/mca/btl/usnic/btl_usnic_module.c | 128 +++++++++++++++++----- ompi/mca/btl/usnic/btl_usnic_module.h | 5 +- ompi/mca/btl/usnic/help-mpi-btl-usnic.txt | 16 ++- 6 files changed, 132 insertions(+), 34 deletions(-) diff --git a/ompi/mca/btl/usnic/btl_usnic.h b/ompi/mca/btl/usnic/btl_usnic.h index 0d815cc438..cc094ce38f 100644 --- a/ompi/mca/btl/usnic/btl_usnic.h +++ b/ompi/mca/btl/usnic/btl_usnic.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -181,6 +181,9 @@ typedef struct opal_btl_usnic_component_t { /** max completion queue entries per module */ int32_t cq_num; + /** max number of entries in AV EQ */ + int32_t av_eq_num; + /** retrans characteristics */ int retrans_timeout; diff --git a/ompi/mca/btl/usnic/btl_usnic_component.c b/ompi/mca/btl/usnic/btl_usnic_component.c index b33e11df9c..53d95ae8fe 100644 --- a/ompi/mca/btl/usnic/btl_usnic_component.c +++ b/ompi/mca/btl/usnic/btl_usnic_component.c @@ -956,11 +956,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, /* Output all of this module's values. */ const char *devname = module->fabric_info->fabric_attr->name; opal_output_verbose(5, USNIC_OUT, - "btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d", + "btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveq=%d", devname, module->sd_num, module->rd_num, - module->cq_num); + module->cq_num, + module->av_eq_num); opal_output_verbose(5, USNIC_OUT, "btl:usnic: %s priority MTU = %" PRIsize_t, devname, diff --git a/ompi/mca/btl/usnic/btl_usnic_mca.c b/ompi/mca/btl/usnic/btl_usnic_mca.c index c0df778871..655128fd5f 100644 --- a/ompi/mca/btl/usnic/btl_usnic_mca.c +++ b/ompi/mca/btl/usnic/btl_usnic_mca.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. 
@@ -162,6 +162,7 @@ int opal_btl_usnic_component_register(void) static int prio_sd_num; static int prio_rd_num; static int cq_num; + static int av_eq_num; static int udp_port_base; static int max_tiny_msg_size; static int eager_limit; @@ -235,6 +236,10 @@ int opal_btl_usnic_component_register(void) -1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5)); mca_btl_usnic_component.cq_num = (int32_t) cq_num; + CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution (-1 = pre-set defaults; depends on number and type of devices available; will error if ac_eq_num < 8)", + -1, &av_eq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5)); + mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num; + CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)", 0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5)); mca_btl_usnic_component.udp_port_base = (int) udp_port_base; diff --git a/ompi/mca/btl/usnic/btl_usnic_module.c b/ompi/mca/btl/usnic/btl_usnic_module.c index 4b7851fe74..d631b283d6 100644 --- a/ompi/mca/btl/usnic/btl_usnic_module.c +++ b/ompi/mca/btl/usnic/btl_usnic_module.c @@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module, /* - * Loop over all procs sent to us in add_procs and see if we want to - * add a proc/endpoint for them. + * Loop over a block of procs sent to us in add_procs and see if we + * want to add a proc/endpoint for them. 
*/ -static int add_procs_create_endpoints(opal_btl_usnic_module_t *module, - size_t nprocs, - opal_proc_t **procs, - mca_btl_base_endpoint_t **endpoints) +static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, + size_t block_offset, + size_t block_len, + opal_proc_t **procs, + mca_btl_base_endpoint_t **endpoints) { int rc; opal_proc_t* my_proc; @@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module, return OPAL_ERR_OUT_OF_RESOURCE; } - /* Loop over the procs we were given */ - for (size_t i = 0; i < nprocs; i++) { + /* Loop over a block in the procs we were given */ + for (size_t i = block_offset; i < (block_offset + block_len); i++) { struct opal_proc_t* opal_proc = procs[i]; opal_btl_usnic_proc_t* usnic_proc; mca_btl_base_endpoint_t* usnic_endpoint; @@ -195,9 +196,10 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module, * invoked. Go reap them all. */ static int -add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, - size_t array_len, - struct mca_btl_base_endpoint_t **endpoints) +add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module, + size_t block_offset, + size_t block_len, + struct mca_btl_base_endpoint_t **endpoints) { int ret = OPAL_SUCCESS; int num_left; @@ -205,12 +207,11 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, uint32_t event; struct fi_eq_entry entry; struct fi_eq_err_entry err_entry; - bool error_occurred = false; /* compute num fi_av_insert completions we are waiting for */ num_left = 0; - for (i = 0; i < array_len; ++i) { + for (i = block_offset; i < (block_offset + block_len); ++i) { if (NULL != endpoints[i]) { num_left += USNIC_NUM_CHANNELS; } @@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, We therefore only want to print a pretty warning about (and OBJ_RELEASE) that endpoint the *first* time it is reported. 
*/ - for (i = 0; i < array_len; ++i) { + for (i = block_offset; i < (block_offset + block_len); ++i) { if (endpoints[i] == context->endpoint) { add_procs_warn_unreachable(module, context->endpoint); @@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, - If an otherwise-valid endpoint has no dest, that means we timed out trying to resolve it, so just release that endpoint. */ size_t num_endpoints_created = 0; - for (i = 0; i < array_len; i++) { + for (i = block_offset; i < (block_offset + block_len); i++) { if (NULL != endpoints[i]) { bool happy; @@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, return ret; } +/* + * Create endpoints for the procs we were given in add_procs. + */ +static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module, + size_t nprocs, + struct opal_proc_t **procs, + struct mca_btl_base_endpoint_t** endpoints) +{ + /* We need to ensure that we don't overrun the libfabric AV EQ. + Divide up all the peer address resolutions we need to do into a + series of blocks; insert and complete each block before moving + to the next (note: if performance mandates it, we can move to a + sliding window style of AV inserts to get better concurrency of + AV resolution). */ + + /* Leave a few empty slots in the AV EQ, just for good measure */ + if (module->av_eq_size < 8) { + opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small", + true, + opal_process_info.nodename, + module->av_eq_size, + 8); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + size_t eq_size = module->av_eq_size - 8; + size_t block_len = eq_size; + size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS; + size_t num_blocks = num_av_inserts / eq_size; + if (eq_size % num_av_inserts != 0) { + ++num_blocks; + } + + /* Per above, the blocks are expressed in terms of number of AV + inserts. Convert them to be expressed in terms of number of + procs. 
*/ + block_len /= USNIC_NUM_CHANNELS; + + /* Per above, loop over creating the endpoints so that we do not + overrun the libfabric AV EQ. */ + int rc; + for (size_t block_offset = 0, block = 0; block < num_blocks; + block_offset += block_len, ++block) { + /* Adjust for the last block */ + if (block_len > (nprocs - block_offset)) { + block_len = nprocs - block_offset; + } + + /* First, create endpoints (and procs, if they're not already + created) for the usnic-reachable procs we were given. */ + rc = add_procs_block_create_endpoints(module, + block_offset, block_len, + procs, endpoints); + if (OPAL_SUCCESS != rc) { + return rc; + } + + /* For each endpoint that was created, we initiated the + process to create NUM_CHANNELS fi_addrs. Go finish all of + those. This will be the final determination of whether we + can use the endpoint or not because we'll find out if each + endpoint is reachable or not. */ + rc = add_procs_block_reap_fi_av_inserts(module, + block_offset, block_len, + endpoints); + if (OPAL_SUCCESS != rc) { + return rc; + } + } + + return OPAL_SUCCESS; +} + /* * Add procs to this BTL module, receiving endpoint information from * the modex. This is done in 2 phases: @@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module, opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module; int rc; - /* First, create endpoints (and procs, if they're not already - created) for all the usnic-reachable procs we were given. */ + /* Go create the endpoints (including all relevant address + resolution) */ rc = add_procs_create_endpoints(module, nprocs, procs, endpoints); if (OPAL_SUCCESS != rc) { goto fail; } - /* For each endpoint that was created, we initiated the process to - create NUM_CHANNELS fi_addrs. Go finish all of those. This - will be the final determination of whether we can use the - endpoint or not because we'll find out if each endpoint is - reachable or not. 
*/ - rc = add_procs_reap_fi_av_inserts(module, nprocs, endpoints); - if (OPAL_SUCCESS != rc) { - goto fail; - } - /* Find all the endpoints with a complete set of USD destinations and mark them as reachable */ for (size_t i = 0; NULL != reachable && i < nprocs; ++i) { @@ -1831,6 +1895,11 @@ static void init_queue_lengths(opal_btl_usnic_module_t *module) } else { module->cq_num = mca_btl_usnic_component.cq_num; } + if (-1 == mca_btl_usnic_component.av_eq_num) { + module->av_eq_num = 1024; + } else { + module->av_eq_num = mca_btl_usnic_component.av_eq_num; + } /* * Queue sizes for priority channel scale with # of endpoint. A @@ -2018,12 +2087,15 @@ static int init_channels(opal_btl_usnic_module_t *module) } memset(&eq_attr, 0, sizeof(eq_attr)); - eq_attr.size = 1024; + eq_attr.size = module->av_eq_num; eq_attr.wait_obj = FI_WAIT_UNSPEC; rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL); if (rc != OPAL_SUCCESS) { goto destroy; } + // Save the size of the created EQ + module->av_eq_size = eq_attr.size; + eq_attr.wait_obj = FI_WAIT_FD; rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL); if (rc != OPAL_SUCCESS) { diff --git a/ompi/mca/btl/usnic/btl_usnic_module.h b/ompi/mca/btl/usnic/btl_usnic_module.h index 4f23eeac44..b4f5d0c739 100644 --- a/ompi/mca/btl/usnic/btl_usnic_module.h +++ b/ompi/mca/btl/usnic/btl_usnic_module.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -110,6 +110,8 @@ typedef struct opal_btl_usnic_module_t { struct fid_eq *av_eq; struct fid_av *av; + size_t av_eq_size; + mca_btl_base_module_error_cb_fn_t pml_error_callback; /* Information about the events */ @@ -127,6 +129,7 @@ typedef struct opal_btl_usnic_module_t { int sd_num; int rd_num; int cq_num; + int av_eq_num; int prio_sd_num; int prio_rd_num; diff --git a/ompi/mca/btl/usnic/help-mpi-btl-usnic.txt b/ompi/mca/btl/usnic/help-mpi-btl-usnic.txt index c10770aef9..ae29d64bac 100644 --- a/ompi/mca/btl/usnic/help-mpi-btl-usnic.txt +++ b/ompi/mca/btl/usnic/help-mpi-btl-usnic.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012-2016 Cisco Systems, Inc. All rights reserved. # # $COPYRIGHT$ # @@ -240,6 +240,20 @@ abort. usNIC interface: %s Current ARP timeout: %d (btl_usnic_arp_timeout MCA param) # +[fi_av_eq too small] + +The usnic BTL was told to create an address resolution queue that was +too small via the mca_btl_usnic_av_eq_num MCA parameter. This +parameter controls how many outstanding peer address resolutions can +be outstanding at a time. Larger values allow more concurrent address +resolutions, but consume more memory. + + Server: %s + av_eq_num param value: %d + av_eq_num minimum value: %d + +Your job will likely either perform poorly, or will abort. +# [unreachable peer IP] WARNING: Open MPI failed to find a route to a peer IP address via a specific usNIC interface. 
This usually indicates a problem in the IP From 6dc44048272dfc5bc0977e29d327435fc673e3b9 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 30 Jan 2016 10:46:14 -0800 Subject: [PATCH 4/6] usnic: better av_eq_num default value handling (cherry picked from commit open-mpi/ompi@797d5026c889988b250bbdd8519c798e11b3f27e) --- ompi/mca/btl/usnic/btl_usnic_mca.c | 4 ++-- ompi/mca/btl/usnic/btl_usnic_module.c | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/ompi/mca/btl/usnic/btl_usnic_mca.c b/ompi/mca/btl/usnic/btl_usnic_mca.c index 655128fd5f..72c5583f77 100644 --- a/ompi/mca/btl/usnic/btl_usnic_mca.c +++ b/ompi/mca/btl/usnic/btl_usnic_mca.c @@ -236,8 +236,8 @@ int opal_btl_usnic_component_register(void) -1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5)); mca_btl_usnic_component.cq_num = (int32_t) cq_num; - CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution (-1 = pre-set defaults; depends on number and type of devices available; will error if ac_eq_num < 8)", - -1, &av_eq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5)); + CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution", + 1024, &av_eq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5)); mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num; CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. 
If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)", diff --git a/ompi/mca/btl/usnic/btl_usnic_module.c b/ompi/mca/btl/usnic/btl_usnic_module.c index d631b283d6..f35ce4ad98 100644 --- a/ompi/mca/btl/usnic/btl_usnic_module.c +++ b/ompi/mca/btl/usnic/btl_usnic_module.c @@ -1895,11 +1895,6 @@ static void init_queue_lengths(opal_btl_usnic_module_t *module) } else { module->cq_num = mca_btl_usnic_component.cq_num; } - if (-1 == mca_btl_usnic_component.av_eq_num) { - module->av_eq_num = 1024; - } else { - module->av_eq_num = mca_btl_usnic_component.av_eq_num; - } /* * Queue sizes for priority channel scale with # of endpoint. A From 2715e03aabd91e21972b289539376ef48f83fd69 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Sat, 30 Jan 2016 10:49:14 -0800 Subject: [PATCH 5/6] usnic: change retrans timeout to 5ms A bunch of empirical testing has shown that increasing the retransmit timeout from 1ms to 5ms doesn't adversely affect performance, yet decreases the number of gratuitous retransmissions.
(cherry picked from commit open-mpi/ompi@c2615a473233b7f331d9925a687d310981420a0f) --- ompi/mca/btl/usnic/btl_usnic_mca.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ompi/mca/btl/usnic/btl_usnic_mca.c b/ompi/mca/btl/usnic/btl_usnic_mca.c index 72c5583f77..9e74e4b74c 100644 --- a/ompi/mca/btl/usnic/btl_usnic_mca.c +++ b/ompi/mca/btl/usnic/btl_usnic_mca.c @@ -245,7 +245,7 @@ int opal_btl_usnic_component_register(void) mca_btl_usnic_component.udp_port_base = (int) udp_port_base; CHECK(reg_int("retrans_timeout", "Number of microseconds before retransmitting a frame", - 1000, &mca_btl_usnic_component.retrans_timeout, + 5000, &mca_btl_usnic_component.retrans_timeout, REGINT_GE_ONE, OPAL_INFO_LVL_5)); CHECK(reg_int("priority_limit", "Max size of \"priority\" messages (0 = use pre-set defaults; depends on number and type of devices available)", From 6f517bd88d6bdd677384b5a109c317e15df3c7f7 Mon Sep 17 00:00:00 2001 From: Jeff Squyres Date: Mon, 1 Feb 2016 11:14:30 -0800 Subject: [PATCH 6/6] usnic: minor updates from code review Three minor updates from the code review of https://github.com/open-mpi/ompi-release/pull/933: * Remove an extra blank line in a show_help message * We no longer allow -1 for the MCA param btl_usnic_av_eq_num, so change the flag to REGINT_GE_ONE * Change "num_blocks" definition to be in terms of block_len (not eq_size) (cherry picked from commit open-mpi/ompi@9f3ed00125a2960c39c0e5597a35da15b36dd848) --- ompi/mca/btl/usnic/btl_usnic_mca.c | 2 +- ompi/mca/btl/usnic/btl_usnic_module.c | 2 +- ompi/mca/btl/usnic/help-mpi-btl-usnic.txt | 1 - 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/ompi/mca/btl/usnic/btl_usnic_mca.c b/ompi/mca/btl/usnic/btl_usnic_mca.c index 9e74e4b74c..910131d8f4 100644 --- a/ompi/mca/btl/usnic/btl_usnic_mca.c +++ b/ompi/mca/btl/usnic/btl_usnic_mca.c @@ -237,7 +237,7 @@ int opal_btl_usnic_component_register(void) mca_btl_usnic_component.cq_num = (int32_t) cq_num; CHECK(reg_int("av_eq_num", "Number
of event queue entries for peer address resolution", - 1024, &av_eq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5)); + 1024, &av_eq_num, REGINT_GE_ONE, OPAL_INFO_LVL_5)); mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num; CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)", diff --git a/ompi/mca/btl/usnic/btl_usnic_module.c b/ompi/mca/btl/usnic/btl_usnic_module.c index f35ce4ad98..2fc6288642 100644 --- a/ompi/mca/btl/usnic/btl_usnic_module.c +++ b/ompi/mca/btl/usnic/btl_usnic_module.c @@ -411,7 +411,7 @@ static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module, size_t eq_size = module->av_eq_size - 8; size_t block_len = eq_size; size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS; - size_t num_blocks = num_av_inserts / eq_size; + size_t num_blocks = num_av_inserts / block_len; if (eq_size % num_av_inserts != 0) { ++num_blocks; } diff --git a/ompi/mca/btl/usnic/help-mpi-btl-usnic.txt b/ompi/mca/btl/usnic/help-mpi-btl-usnic.txt index ae29d64bac..055b0954b7 100644 --- a/ompi/mca/btl/usnic/help-mpi-btl-usnic.txt +++ b/ompi/mca/btl/usnic/help-mpi-btl-usnic.txt @@ -241,7 +241,6 @@ abort. Current ARP timeout: %d (btl_usnic_arp_timeout MCA param) # [fi_av_eq too small] - The usnic BTL was told to create an address resolution queue that was too small via the mca_btl_usnic_av_eq_num MCA parameter. This parameter controls how many outstanding peer address resolutions can