diff --git a/opal/mca/btl/usnic/btl_usnic.h b/opal/mca/btl/usnic/btl_usnic.h index 0d815cc438..cc094ce38f 100644 --- a/opal/mca/btl/usnic/btl_usnic.h +++ b/opal/mca/btl/usnic/btl_usnic.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -181,6 +181,9 @@ typedef struct opal_btl_usnic_component_t { /** max completion queue entries per module */ int32_t cq_num; + /** max number of entries in AV EQ */ + int32_t av_eq_num; + /** retrans characteristics */ int retrans_timeout; diff --git a/opal/mca/btl/usnic/btl_usnic_ack.c b/opal/mca/btl/usnic/btl_usnic_ack.c index 3a6ae5baac..4616516f52 100644 --- a/opal/mca/btl/usnic/btl_usnic_ack.c +++ b/opal/mca/btl/usnic/btl_usnic_ack.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -207,8 +207,7 @@ opal_btl_usnic_ack_send( /* send the seq of the lowest item in the window that we've received */ ack->ss_base.us_btl_header->ack_seq = - endpoint->endpoint_next_contig_seq_to_recv - 1; - + SEQ_DIFF(endpoint->endpoint_next_contig_seq_to_recv, 1); ack->ss_len = sizeof(opal_btl_usnic_btl_header_t); #if MSGDEBUG1 diff --git a/opal/mca/btl/usnic/btl_usnic_ack.h b/opal/mca/btl/usnic/btl_usnic_ack.h index 0aaf8306d7..1ef85544c0 100644 --- a/opal/mca/btl/usnic/btl_usnic_ack.h +++ b/opal/mca/btl/usnic/btl_usnic_ack.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -92,7 +92,7 @@ opal_btl_usnic_piggyback_ack( if (endpoint->endpoint_ack_needed) { opal_btl_usnic_remove_from_endpoints_needing_ack(endpoint); sseg->ss_base.us_btl_header->ack_seq = - endpoint->endpoint_next_contig_seq_to_recv - 1; + SEQ_DIFF(endpoint->endpoint_next_contig_seq_to_recv, 1); sseg->ss_base.us_btl_header->ack_present = 1; #if MSGDEBUG1 opal_output(0, "Piggy-backing ACK for sequence %"UDSEQ"\n", diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index 18e33c9629..b49e51bb57 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -956,11 +956,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, /* Output all of this module's values. */ const char *devname = module->fabric_info->fabric_attr->name; opal_output_verbose(5, USNIC_OUT, - "btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d", + "btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d", devname, module->sd_num, module->rd_num, - module->cq_num); + module->cq_num, + module->av_eq_num); opal_output_verbose(5, USNIC_OUT, "btl:usnic: %s priority MTU = %" PRIsize_t, devname, diff --git a/opal/mca/btl/usnic/btl_usnic_mca.c b/opal/mca/btl/usnic/btl_usnic_mca.c index c0df778871..910131d8f4 100644 --- a/opal/mca/btl/usnic/btl_usnic_mca.c +++ b/opal/mca/btl/usnic/btl_usnic_mca.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2012 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. 
@@ -162,6 +162,7 @@ int opal_btl_usnic_component_register(void) static int prio_sd_num; static int prio_rd_num; static int cq_num; + static int av_eq_num; static int udp_port_base; static int max_tiny_msg_size; static int eager_limit; @@ -235,12 +236,16 @@ int opal_btl_usnic_component_register(void) -1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5)); mca_btl_usnic_component.cq_num = (int32_t) cq_num; + CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution", + 1024, &av_eq_num, REGINT_GE_ONE, OPAL_INFO_LVL_5)); + mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num; + CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)", 0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5)); mca_btl_usnic_component.udp_port_base = (int) udp_port_base; CHECK(reg_int("retrans_timeout", "Number of microseconds before retransmitting a frame", - 1000, &mca_btl_usnic_component.retrans_timeout, + 5000, &mca_btl_usnic_component.retrans_timeout, REGINT_GE_ONE, OPAL_INFO_LVL_5)); CHECK(reg_int("priority_limit", "Max size of \"priority\" messages (0 = use pre-set defaults; depends on number and type of devices available)", diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 6105fd4281..2fc6288642 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -12,7 +12,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014 Los Alamos National Security, LLC. All rights * reserved. * Copyright (c) 2014 Intel, Inc. 
All rights reserved @@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module, /* - * Loop over all procs sent to us in add_procs and see if we want to - * add a proc/endpoint for them. + * Loop over a block of procs sent to us in add_procs and see if we + * want to add a proc/endpoint for them. */ -static int add_procs_create_endpoints(opal_btl_usnic_module_t *module, - size_t nprocs, - opal_proc_t **procs, - mca_btl_base_endpoint_t **endpoints) +static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, + size_t block_offset, + size_t block_len, + opal_proc_t **procs, + mca_btl_base_endpoint_t **endpoints) { int rc; opal_proc_t* my_proc; @@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module, return OPAL_ERR_OUT_OF_RESOURCE; } - /* Loop over the procs we were given */ - for (size_t i = 0; i < nprocs; i++) { + /* Loop over a block in the procs we were given */ + for (size_t i = block_offset; i < (block_offset + block_len); i++) { struct opal_proc_t* opal_proc = procs[i]; opal_btl_usnic_proc_t* usnic_proc; mca_btl_base_endpoint_t* usnic_endpoint; @@ -195,9 +196,10 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module, * invoked. Go reap them all. 
*/ static int -add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, - size_t array_len, - struct mca_btl_base_endpoint_t **endpoints) +add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module, + size_t block_offset, + size_t block_len, + struct mca_btl_base_endpoint_t **endpoints) { int ret = OPAL_SUCCESS; int num_left; @@ -205,12 +207,11 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, uint32_t event; struct fi_eq_entry entry; struct fi_eq_err_entry err_entry; - bool error_occurred = false; /* compute num fi_av_insert completions we are waiting for */ num_left = 0; - for (i = 0; i < array_len; ++i) { + for (i = block_offset; i < (block_offset + block_len); ++i) { if (NULL != endpoints[i]) { num_left += USNIC_NUM_CHANNELS; } @@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, We therefore only want to print a pretty warning about (and OBJ_RELEASE) that endpoint the *first* time it is reported. */ - for (i = 0; i < array_len; ++i) { + for (i = block_offset; i < (block_offset + block_len); ++i) { if (endpoints[i] == context->endpoint) { add_procs_warn_unreachable(module, context->endpoint); @@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, - If an otherwise-valid endpoint has no dest, that means we timed out trying to resolve it, so just release that endpoint. */ size_t num_endpoints_created = 0; - for (i = 0; i < array_len; i++) { + for (i = block_offset; i < (block_offset + block_len); i++) { if (NULL != endpoints[i]) { bool happy; @@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module, return ret; } +/* + * Create endpoints for the procs we were given in add_procs. + */ +static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module, + size_t nprocs, + struct opal_proc_t **procs, + struct mca_btl_base_endpoint_t** endpoints) +{ + /* We need to ensure that we don't overrun the libfabric AV EQ. 
+ Divide up all the peer address resolutions we need to do into a + series of blocks; insert and complete each block before moving + to the next (note: if performance mandates it, we can move to a + sliding window style of AV inserts to get better concurrency of + AV resolution). */ + + /* Leave 8 spare AV EQ slots; the rest must fit >= 1 proc's inserts */ + if (module->av_eq_size < 8 + USNIC_NUM_CHANNELS) { + opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small", + true, + opal_process_info.nodename, + (int) module->av_eq_size, + 8 + USNIC_NUM_CHANNELS); + return OPAL_ERR_OUT_OF_RESOURCE; + } + + size_t eq_size = module->av_eq_size - 8; + size_t block_len = eq_size - (eq_size % USNIC_NUM_CHANNELS); + size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS; + size_t num_blocks = num_av_inserts / block_len; + if (num_av_inserts % block_len != 0) { + ++num_blocks; + } + + /* Per above, the blocks are expressed in terms of number of AV + inserts. Convert them to be expressed in terms of number of + procs (exact: block_len is a multiple of USNIC_NUM_CHANNELS). */ + block_len /= USNIC_NUM_CHANNELS; + + /* Per above, loop over creating the endpoints so that we do not + overrun the libfabric AV EQ. */ + int rc; + for (size_t block_offset = 0, block = 0; block < num_blocks; + block_offset += block_len, ++block) { + /* Adjust for the last block */ + if (block_len > (nprocs - block_offset)) { + block_len = nprocs - block_offset; + } + + /* First, create endpoints (and procs, if they're not already + created) for the usnic-reachable procs we were given. */ + rc = add_procs_block_create_endpoints(module, + block_offset, block_len, + procs, endpoints); + if (OPAL_SUCCESS != rc) { + return rc; + } + + /* For each endpoint that was created, we initiated the + process to create NUM_CHANNELS fi_addrs. Go finish all of + those. This will be the final determination of whether we + can use the endpoint or not because we'll find out if each + endpoint is reachable or not.
*/ + rc = add_procs_block_reap_fi_av_inserts(module, + block_offset, block_len, + endpoints); + if (OPAL_SUCCESS != rc) { + return rc; + } + } + + return OPAL_SUCCESS; +} + /* * Add procs to this BTL module, receiving endpoint information from * the modex. This is done in 2 phases: @@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module, opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module; int rc; - /* First, create endpoints (and procs, if they're not already - created) for all the usnic-reachable procs we were given. */ + /* Go create the endpoints (including all relevant address + resolution) */ rc = add_procs_create_endpoints(module, nprocs, procs, endpoints); if (OPAL_SUCCESS != rc) { goto fail; } - /* For each endpoint that was created, we initiated the process to - create NUM_CHANNELS fi_addrs. Go finish all of those. This - will be the final determination of whether we can use the - endpoint or not because we'll find out if each endpoint is - reachable or not. 
*/ - rc = add_procs_reap_fi_av_inserts(module, nprocs, endpoints); - if (OPAL_SUCCESS != rc) { - goto fail; - } - /* Find all the endpoints with a complete set of USD destinations and mark them as reachable */ for (size_t i = 0; NULL != reachable && i < nprocs; ++i) { @@ -1205,7 +1269,7 @@ usnic_send( /* assign length */ sseg->ss_len = sizeof(opal_btl_usnic_btl_header_t) + frag->sf_size; - sseg->ss_channel = USNIC_PRIORITY_CHANNEL; + sseg->ss_channel = USNIC_DATA_CHANNEL; sseg->ss_base.us_btl_header->tag = tag; #if MSGDEBUG1 opal_output(0, "INLINE send, sseg=%p", (void *)sseg); @@ -2018,12 +2082,15 @@ static int init_channels(opal_btl_usnic_module_t *module) } memset(&eq_attr, 0, sizeof(eq_attr)); - eq_attr.size = 1024; + eq_attr.size = module->av_eq_num; eq_attr.wait_obj = FI_WAIT_UNSPEC; rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL); if (rc != OPAL_SUCCESS) { goto destroy; } + // Save the size of the created EQ + module->av_eq_size = eq_attr.size; + eq_attr.wait_obj = FI_WAIT_FD; rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL); if (rc != OPAL_SUCCESS) { diff --git a/opal/mca/btl/usnic/btl_usnic_module.h b/opal/mca/btl/usnic/btl_usnic_module.h index 4f23eeac44..b4f5d0c739 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.h +++ b/opal/mca/btl/usnic/btl_usnic_module.h @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -110,6 +110,8 @@ typedef struct opal_btl_usnic_module_t { struct fid_eq *av_eq; struct fid_av *av; + size_t av_eq_size; + mca_btl_base_module_error_cb_fn_t pml_error_callback; /* Information about the events */ @@ -127,6 +129,7 @@ typedef struct opal_btl_usnic_module_t { int sd_num; int rd_num; int cq_num; + int av_eq_num; int prio_sd_num; int prio_rd_num; diff --git a/opal/mca/btl/usnic/btl_usnic_send.h b/opal/mca/btl/usnic/btl_usnic_send.h index 796008d2f7..2020544f20 100644 --- a/opal/mca/btl/usnic/btl_usnic_send.h +++ b/opal/mca/btl/usnic/btl_usnic_send.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. * $COPYRIGHT$ * * Additional copyrights may follow @@ -76,6 +76,8 @@ opal_btl_usnic_post_segment( sseg->ss_len); #endif + assert(channel_id == USNIC_DATA_CHANNEL); + /* Send the segment */ ret = fi_send(channel->ep, sseg->ss_ptr, @@ -126,6 +128,8 @@ opal_btl_usnic_post_ack( sseg->ss_len); #endif + assert(channel_id == USNIC_PRIORITY_CHANNEL); + ret = fi_send(channel->ep, sseg->ss_ptr, sseg->ss_len + mca_btl_usnic_component.prefix_send_offset, diff --git a/opal/mca/btl/usnic/btl_usnic_stats.c b/opal/mca/btl/usnic/btl_usnic_stats.c index 68968d19b0..9c3acac868 100644 --- a/opal/mca/btl/usnic/btl_usnic_stats.c +++ b/opal/mca/btl/usnic/btl_usnic_stats.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. 
* $COPYRIGHT$ * * Additional copyrights may follow @@ -145,8 +145,9 @@ void opal_btl_usnic_print_stats( /* Number of un-acked sends (i.e., sends for which we're still waiting for ACK) */ send_unacked = - endpoint->endpoint_next_seq_to_send - - endpoint->endpoint_ack_seq_rcvd - 1; + SEQ_DIFF(endpoint->endpoint_next_seq_to_send, + SEQ_DIFF(endpoint->endpoint_ack_seq_rcvd, 1)); + if (send_unacked > su_max) su_max = send_unacked; if (send_unacked < su_min) su_min = send_unacked; diff --git a/opal/mca/btl/usnic/help-mpi-btl-usnic.txt b/opal/mca/btl/usnic/help-mpi-btl-usnic.txt index c10770aef9..055b0954b7 100644 --- a/opal/mca/btl/usnic/help-mpi-btl-usnic.txt +++ b/opal/mca/btl/usnic/help-mpi-btl-usnic.txt @@ -1,6 +1,6 @@ # -*- text -*- # -# Copyright (c) 2012-2015 Cisco Systems, Inc. All rights reserved. +# Copyright (c) 2012-2016 Cisco Systems, Inc. All rights reserved. # # $COPYRIGHT$ # @@ -240,6 +240,19 @@ abort. usNIC interface: %s Current ARP timeout: %d (btl_usnic_arp_timeout MCA param) # +[fi_av_eq too small] +The usnic BTL was told to create an address resolution queue that was +too small via the btl_usnic_av_eq_num MCA parameter. This +parameter controls how many peer address resolutions can +be outstanding at a time. Larger values allow more concurrent address +resolutions, but consume more memory. + + Server: %s + av_eq_num param value: %d + av_eq_num minimum value: %d + +Your job will likely either perform poorly, or will abort. +# [unreachable peer IP] WARNING: Open MPI failed to find a route to a peer IP address via a specific usNIC interface. This usually indicates a problem in the IP