Skip to content
This repository was archived by the owner on Sep 30, 2022. It is now read-only.

Commit 8948c6f

Browse files
committed
Merge pull request #934 from jsquyres/pr/v2.0.0/usnic-fixes
v2.0.0: usnic fixes
2 parents ebb9af5 + 37000f2 commit 8948c6f

10 files changed

+142
-46
lines changed

opal/mca/btl/usnic/btl_usnic.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sandia National Laboratories. All rights
1313
* reserved.
14-
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
1515
* $COPYRIGHT$
1616
*
1717
* Additional copyrights may follow
@@ -181,6 +181,9 @@ typedef struct opal_btl_usnic_component_t {
181181
/** max completion queue entries per module */
182182
int32_t cq_num;
183183

184+
/** max number of entries in AV EQ */
185+
int32_t av_eq_num;
186+
184187
/** retrans characteristics */
185188
int retrans_timeout;
186189

opal/mca/btl/usnic/btl_usnic_ack.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
2+
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -207,8 +207,7 @@ opal_btl_usnic_ack_send(
207207
/* send the seq of the lowest item in the window that
208208
we've received */
209209
ack->ss_base.us_btl_header->ack_seq =
210-
endpoint->endpoint_next_contig_seq_to_recv - 1;
211-
210+
SEQ_DIFF(endpoint->endpoint_next_contig_seq_to_recv, 1);
212211
ack->ss_len = sizeof(opal_btl_usnic_btl_header_t);
213212

214213
#if MSGDEBUG1

opal/mca/btl/usnic/btl_usnic_ack.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
2+
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -92,7 +92,7 @@ opal_btl_usnic_piggyback_ack(
9292
if (endpoint->endpoint_ack_needed) {
9393
opal_btl_usnic_remove_from_endpoints_needing_ack(endpoint);
9494
sseg->ss_base.us_btl_header->ack_seq =
95-
endpoint->endpoint_next_contig_seq_to_recv - 1;
95+
SEQ_DIFF(endpoint->endpoint_next_contig_seq_to_recv, 1);
9696
sseg->ss_base.us_btl_header->ack_present = 1;
9797
#if MSGDEBUG1
9898
opal_output(0, "Piggy-backing ACK for sequence %"UDSEQ"\n",

opal/mca/btl/usnic/btl_usnic_component.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -956,11 +956,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules,
956956
/* Output all of this module's values. */
957957
const char *devname = module->fabric_info->fabric_attr->name;
958958
opal_output_verbose(5, USNIC_OUT,
959-
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d",
959+
"btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d",
960960
devname,
961961
module->sd_num,
962962
module->rd_num,
963-
module->cq_num);
963+
module->cq_num,
964+
module->av_eq_num);
964965
opal_output_verbose(5, USNIC_OUT,
965966
"btl:usnic: %s priority MTU = %" PRIsize_t,
966967
devname,

opal/mca/btl/usnic/btl_usnic_mca.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sandia National Laboratories. All rights
1313
* reserved.
14-
* Copyright (c) 2008-2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2008-2016 Cisco Systems, Inc. All rights reserved.
1515
* Copyright (c) 2012 Los Alamos National Security, LLC. All rights
1616
* reserved.
1717
* Copyright (c) 2015 Intel, Inc. All rights reserved.
@@ -162,6 +162,7 @@ int opal_btl_usnic_component_register(void)
162162
static int prio_sd_num;
163163
static int prio_rd_num;
164164
static int cq_num;
165+
static int av_eq_num;
165166
static int udp_port_base;
166167
static int max_tiny_msg_size;
167168
static int eager_limit;
@@ -235,12 +236,16 @@ int opal_btl_usnic_component_register(void)
235236
-1, &cq_num, REGINT_NEG_ONE_OK, OPAL_INFO_LVL_5));
236237
mca_btl_usnic_component.cq_num = (int32_t) cq_num;
237238

239+
CHECK(reg_int("av_eq_num", "Number of event queue entries for peer address resolution",
240+
1024, &av_eq_num, REGINT_GE_ONE, OPAL_INFO_LVL_5));
241+
mca_btl_usnic_component.av_eq_num = (int32_t) av_eq_num;
242+
238243
CHECK(reg_int("base_udp_port", "Base UDP port to use for usNIC communications. If 0, system will pick the port number. If non-zero, it will be added to each process' local rank to obtain the final port number (default: 0)",
239244
0, &udp_port_base, REGINT_GE_ZERO, OPAL_INFO_LVL_5));
240245
mca_btl_usnic_component.udp_port_base = (int) udp_port_base;
241246

242247
CHECK(reg_int("retrans_timeout", "Number of microseconds before retransmitting a frame",
243-
1000, &mca_btl_usnic_component.retrans_timeout,
248+
5000, &mca_btl_usnic_component.retrans_timeout,
244249
REGINT_GE_ONE, OPAL_INFO_LVL_5));
245250

246251
CHECK(reg_int("priority_limit", "Max size of \"priority\" messages (0 = use pre-set defaults; depends on number and type of devices available)",

opal/mca/btl/usnic/btl_usnic_module.c

Lines changed: 97 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
* All rights reserved.
1313
* Copyright (c) 2006 Sandia National Laboratories. All rights
1414
* reserved.
15-
* Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
15+
* Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
1616
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
1717
* reserved.
1818
* Copyright (c) 2014 Intel, Inc. All rights reserved
@@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,
6969

7070

7171
/*
72-
* Loop over all procs sent to us in add_procs and see if we want to
73-
* add a proc/endpoint for them.
72+
* Loop over a block of procs sent to us in add_procs and see if we
73+
* want to add a proc/endpoint for them.
7474
*/
75-
static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
76-
size_t nprocs,
77-
opal_proc_t **procs,
78-
mca_btl_base_endpoint_t **endpoints)
75+
static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module,
76+
size_t block_offset,
77+
size_t block_len,
78+
opal_proc_t **procs,
79+
mca_btl_base_endpoint_t **endpoints)
7980
{
8081
int rc;
8182
opal_proc_t* my_proc;
@@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
8788
return OPAL_ERR_OUT_OF_RESOURCE;
8889
}
8990

90-
/* Loop over the procs we were given */
91-
for (size_t i = 0; i < nprocs; i++) {
91+
/* Loop over a block in the procs we were given */
92+
for (size_t i = block_offset; i < (block_offset + block_len); i++) {
9293
struct opal_proc_t* opal_proc = procs[i];
9394
opal_btl_usnic_proc_t* usnic_proc;
9495
mca_btl_base_endpoint_t* usnic_endpoint;
@@ -195,22 +196,22 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
195196
* invoked. Go reap them all.
196197
*/
197198
static int
198-
add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
199-
size_t array_len,
200-
struct mca_btl_base_endpoint_t **endpoints)
199+
add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
200+
size_t block_offset,
201+
size_t block_len,
202+
struct mca_btl_base_endpoint_t **endpoints)
201203
{
202204
int ret = OPAL_SUCCESS;
203205
int num_left;
204206
size_t i, channel;
205207
uint32_t event;
206208
struct fi_eq_entry entry;
207209
struct fi_eq_err_entry err_entry;
208-
209210
bool error_occurred = false;
210211

211212
/* compute num fi_av_insert completions we are waiting for */
212213
num_left = 0;
213-
for (i = 0; i < array_len; ++i) {
214+
for (i = block_offset; i < (block_offset + block_len); ++i) {
214215
if (NULL != endpoints[i]) {
215216
num_left += USNIC_NUM_CHANNELS;
216217
}
@@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
266267
We therefore only want to print a pretty
267268
warning about (and OBJ_RELEASE) that endpoint
268269
the *first* time it is reported. */
269-
for (i = 0; i < array_len; ++i) {
270+
for (i = block_offset; i < (block_offset + block_len); ++i) {
270271
if (endpoints[i] == context->endpoint) {
271272
add_procs_warn_unreachable(module,
272273
context->endpoint);
@@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
348349
- If an otherwise-valid endpoint has no dest, that means we timed
349350
out trying to resolve it, so just release that endpoint. */
350351
size_t num_endpoints_created = 0;
351-
for (i = 0; i < array_len; i++) {
352+
for (i = block_offset; i < (block_offset + block_len); i++) {
352353
if (NULL != endpoints[i]) {
353354
bool happy;
354355

@@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
382383
return ret;
383384
}
384385

386+
/*
387+
* Create endpoints for the procs we were given in add_procs.
388+
*/
389+
static int add_procs_create_endpoints(struct opal_btl_usnic_module_t* module,
390+
size_t nprocs,
391+
struct opal_proc_t **procs,
392+
struct mca_btl_base_endpoint_t** endpoints)
393+
{
394+
/* We need to ensure that we don't overrun the libfabric AV EQ.
395+
Divide up all the peer address resolutions we need to do into a
396+
series of blocks; insert and complete each block before moving
397+
to the next (note: if performance mandates it, we can move to a
398+
sliding window style of AV inserts to get better concurrency of
399+
AV resolution). */
400+
401+
/* Leave a few empty slots in the AV EQ, just for good measure */
402+
if (module->av_eq_size < 8) {
403+
opal_show_help("help-mpi-btl-usnic.txt", "fi_av_eq too small",
404+
true,
405+
opal_process_info.nodename,
406+
module->av_eq_size,
407+
8);
408+
return OPAL_ERR_OUT_OF_RESOURCE;
409+
}
410+
411+
size_t eq_size = module->av_eq_size - 8;
412+
size_t block_len = eq_size;
413+
size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS;
414+
size_t num_blocks = num_av_inserts / block_len;
415+
if (eq_size % num_av_inserts != 0) {
416+
++num_blocks;
417+
}
418+
419+
/* Per above, the blocks are expressed in terms of number of AV
420+
inserts. Convert them to be expressed in terms of number of
421+
procs. */
422+
block_len /= USNIC_NUM_CHANNELS;
423+
424+
/* Per above, loop over creating the endpoints so that we do not
425+
overrun the libfabric AV EQ. */
426+
int rc;
427+
for (size_t block_offset = 0, block = 0; block < num_blocks;
428+
block_offset += block_len, ++block) {
429+
/* Adjust for the last block */
430+
if (block_len > (nprocs - block_offset)) {
431+
block_len = nprocs - block_offset;
432+
}
433+
434+
/* First, create endpoints (and procs, if they're not already
435+
created) for the usnic-reachable procs we were given. */
436+
rc = add_procs_block_create_endpoints(module,
437+
block_offset, block_len,
438+
procs, endpoints);
439+
if (OPAL_SUCCESS != rc) {
440+
return rc;
441+
}
442+
443+
/* For each endpoint that was created, we initiated the
444+
process to create NUM_CHANNELS fi_addrs. Go finish all of
445+
those. This will be the final determination of whether we
446+
can use the endpoint or not because we'll find out if each
447+
endpoint is reachable or not. */
448+
rc = add_procs_block_reap_fi_av_inserts(module,
449+
block_offset, block_len,
450+
endpoints);
451+
if (OPAL_SUCCESS != rc) {
452+
return rc;
453+
}
454+
}
455+
456+
return OPAL_SUCCESS;
457+
}
458+
385459
/*
386460
* Add procs to this BTL module, receiving endpoint information from
387461
* the modex. This is done in 2 phases:
@@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
408482
opal_btl_usnic_module_t* module = (opal_btl_usnic_module_t*) base_module;
409483
int rc;
410484

411-
/* First, create endpoints (and procs, if they're not already
412-
created) for all the usnic-reachable procs we were given. */
485+
/* Go create the endpoints (including all relevant address
486+
resolution) */
413487
rc = add_procs_create_endpoints(module, nprocs, procs, endpoints);
414488
if (OPAL_SUCCESS != rc) {
415489
goto fail;
416490
}
417491

418-
/* For each endpoint that was created, we initiated the process to
419-
create NUM_CHANNELS fi_addrs. Go finish all of those. This
420-
will be the final determination of whether we can use the
421-
endpoint or not because we'll find out if each endpoint is
422-
reachable or not. */
423-
rc = add_procs_reap_fi_av_inserts(module, nprocs, endpoints);
424-
if (OPAL_SUCCESS != rc) {
425-
goto fail;
426-
}
427-
428492
/* Find all the endpoints with a complete set of USD destinations
429493
and mark them as reachable */
430494
for (size_t i = 0; NULL != reachable && i < nprocs; ++i) {
@@ -1205,7 +1269,7 @@ usnic_send(
12051269
/* assign length */
12061270
sseg->ss_len = sizeof(opal_btl_usnic_btl_header_t) + frag->sf_size;
12071271

1208-
sseg->ss_channel = USNIC_PRIORITY_CHANNEL;
1272+
sseg->ss_channel = USNIC_DATA_CHANNEL;
12091273
sseg->ss_base.us_btl_header->tag = tag;
12101274
#if MSGDEBUG1
12111275
opal_output(0, "INLINE send, sseg=%p", (void *)sseg);
@@ -2018,12 +2082,15 @@ static int init_channels(opal_btl_usnic_module_t *module)
20182082
}
20192083

20202084
memset(&eq_attr, 0, sizeof(eq_attr));
2021-
eq_attr.size = 1024;
2085+
eq_attr.size = module->av_eq_num;
20222086
eq_attr.wait_obj = FI_WAIT_UNSPEC;
20232087
rc = fi_eq_open(module->fabric, &eq_attr, &module->av_eq, NULL);
20242088
if (rc != OPAL_SUCCESS) {
20252089
goto destroy;
20262090
}
2091+
// Save the size of the created EQ
2092+
module->av_eq_size = eq_attr.size;
2093+
20272094
eq_attr.wait_obj = FI_WAIT_FD;
20282095
rc = fi_eq_open(module->fabric, &eq_attr, &module->dom_eq, NULL);
20292096
if (rc != OPAL_SUCCESS) {

opal/mca/btl/usnic/btl_usnic_module.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
* All rights reserved.
1212
* Copyright (c) 2006 Sandia National Laboratories. All rights
1313
* reserved.
14-
* Copyright (c) 2011-2015 Cisco Systems, Inc. All rights reserved.
14+
* Copyright (c) 2011-2016 Cisco Systems, Inc. All rights reserved.
1515
* $COPYRIGHT$
1616
*
1717
* Additional copyrights may follow
@@ -110,6 +110,8 @@ typedef struct opal_btl_usnic_module_t {
110110
struct fid_eq *av_eq;
111111
struct fid_av *av;
112112

113+
size_t av_eq_size;
114+
113115
mca_btl_base_module_error_cb_fn_t pml_error_callback;
114116

115117
/* Information about the events */
@@ -127,6 +129,7 @@ typedef struct opal_btl_usnic_module_t {
127129
int sd_num;
128130
int rd_num;
129131
int cq_num;
132+
int av_eq_num;
130133
int prio_sd_num;
131134
int prio_rd_num;
132135

opal/mca/btl/usnic/btl_usnic_send.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
2+
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -76,6 +76,8 @@ opal_btl_usnic_post_segment(
7676
sseg->ss_len);
7777
#endif
7878

79+
assert(channel_id == USNIC_DATA_CHANNEL);
80+
7981
/* Send the segment */
8082
ret = fi_send(channel->ep,
8183
sseg->ss_ptr,
@@ -126,6 +128,8 @@ opal_btl_usnic_post_ack(
126128
sseg->ss_len);
127129
#endif
128130

131+
assert(channel_id == USNIC_PRIORITY_CHANNEL);
132+
129133
ret = fi_send(channel->ep,
130134
sseg->ss_ptr,
131135
sseg->ss_len + mca_btl_usnic_component.prefix_send_offset,

opal/mca/btl/usnic/btl_usnic_stats.c

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved.
2+
* Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved.
33
* $COPYRIGHT$
44
*
55
* Additional copyrights may follow
@@ -145,8 +145,9 @@ void opal_btl_usnic_print_stats(
145145
/* Number of un-acked sends (i.e., sends for which we're
146146
still waiting for ACK) */
147147
send_unacked =
148-
endpoint->endpoint_next_seq_to_send -
149-
endpoint->endpoint_ack_seq_rcvd - 1;
148+
SEQ_DIFF(endpoint->endpoint_next_seq_to_send,
149+
SEQ_DIFF(endpoint->endpoint_ack_seq_rcvd, 1));
150+
150151
if (send_unacked > su_max) su_max = send_unacked;
151152
if (send_unacked < su_min) su_min = send_unacked;
152153

0 commit comments

Comments
 (0)