12
12
* All rights reserved.
13
13
* Copyright (c) 2006 Sandia National Laboratories. All rights
14
14
* reserved.
15
- * Copyright (c) 2009-2015 Cisco Systems, Inc. All rights reserved.
15
+ * Copyright (c) 2009-2016 Cisco Systems, Inc. All rights reserved.
16
16
* Copyright (c) 2014 Los Alamos National Security, LLC. All rights
17
17
* reserved.
18
18
* Copyright (c) 2014 Intel, Inc. All rights reserved
@@ -69,13 +69,14 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module,
69
69
70
70
71
71
/*
72
- * Loop over all procs sent to us in add_procs and see if we want to
73
- * add a proc/endpoint for them.
72
+ * Loop over a block of procs sent to us in add_procs and see if we
73
+ * want to add a proc/endpoint for them.
74
74
*/
75
- static int add_procs_create_endpoints (opal_btl_usnic_module_t * module ,
76
- size_t nprocs ,
77
- opal_proc_t * * procs ,
78
- mca_btl_base_endpoint_t * * endpoints )
75
+ static int add_procs_block_create_endpoints (opal_btl_usnic_module_t * module ,
76
+ size_t block_offset ,
77
+ size_t block_len ,
78
+ opal_proc_t * * procs ,
79
+ mca_btl_base_endpoint_t * * endpoints )
79
80
{
80
81
int rc ;
81
82
opal_proc_t * my_proc ;
@@ -87,8 +88,8 @@ static int add_procs_create_endpoints(opal_btl_usnic_module_t *module,
87
88
return OPAL_ERR_OUT_OF_RESOURCE ;
88
89
}
89
90
90
- /* Loop over the procs we were given */
91
- for (size_t i = 0 ; i < nprocs ; i ++ ) {
91
+ /* Loop over a block in the procs we were given */
92
+ for (size_t i = block_offset ; i < ( block_offset + block_len ) ; i ++ ) {
92
93
struct opal_proc_t * opal_proc = procs [i ];
93
94
opal_btl_usnic_proc_t * usnic_proc ;
94
95
mca_btl_base_endpoint_t * usnic_endpoint ;
@@ -195,22 +196,22 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module,
195
196
* invoked. Go reap them all.
196
197
*/
197
198
static int
198
- add_procs_reap_fi_av_inserts (opal_btl_usnic_module_t * module ,
199
- size_t array_len ,
200
- struct mca_btl_base_endpoint_t * * endpoints )
199
+ add_procs_block_reap_fi_av_inserts (opal_btl_usnic_module_t * module ,
200
+ size_t block_offset ,
201
+ size_t block_len ,
202
+ struct mca_btl_base_endpoint_t * * endpoints )
201
203
{
202
204
int ret = OPAL_SUCCESS ;
203
205
int num_left ;
204
206
size_t i , channel ;
205
207
uint32_t event ;
206
208
struct fi_eq_entry entry ;
207
209
struct fi_eq_err_entry err_entry ;
208
-
209
210
bool error_occurred = false;
210
211
211
212
/* compute num fi_av_insert completions we are waiting for */
212
213
num_left = 0 ;
213
- for (i = 0 ; i < array_len ; ++ i ) {
214
+ for (i = block_offset ; i < ( block_offset + block_len ) ; ++ i ) {
214
215
if (NULL != endpoints [i ]) {
215
216
num_left += USNIC_NUM_CHANNELS ;
216
217
}
@@ -266,7 +267,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
266
267
We therefore only want to print a pretty
267
268
warning about (and OBJ_RELEASE) that endpoint
268
269
the *first* time it is reported. */
269
- for (i = 0 ; i < array_len ; ++ i ) {
270
+ for (i = block_offset ; i < ( block_offset + block_len ) ; ++ i ) {
270
271
if (endpoints [i ] == context -> endpoint ) {
271
272
add_procs_warn_unreachable (module ,
272
273
context -> endpoint );
@@ -348,7 +349,7 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
348
349
- If an otherwise-valid endpoint has no dest, that means we timed
349
350
out trying to resolve it, so just release that endpoint. */
350
351
size_t num_endpoints_created = 0 ;
351
- for (i = 0 ; i < array_len ; i ++ ) {
352
+ for (i = block_offset ; i < ( block_offset + block_len ) ; i ++ ) {
352
353
if (NULL != endpoints [i ]) {
353
354
bool happy ;
354
355
@@ -382,6 +383,79 @@ add_procs_reap_fi_av_inserts(opal_btl_usnic_module_t *module,
382
383
return ret ;
383
384
}
384
385
386
+ /*
387
+ * Create endpoints for the procs we were given in add_procs.
388
+ */
389
+ static int add_procs_create_endpoints (struct opal_btl_usnic_module_t * module ,
390
+ size_t nprocs ,
391
+ struct opal_proc_t * * procs ,
392
+ struct mca_btl_base_endpoint_t * * endpoints )
393
+ {
394
+ /* We need to ensure that we don't overrun the libfabric AV EQ.
395
+ Divide up all the peer address resolutions we need to do into a
396
+ series of blocks; insert and complete each block before moving
397
+ to the next (note: if performance mandates it, we can move to a
398
+ sliding window style of AV inserts to get better concurrency of
399
+ AV resolution). */
400
+
401
+ /* Leave a few empty slots in the AV EQ, just for good measure */
402
+ if (module -> av_eq_size < 8 ) {
403
+ opal_show_help ("help-mpi-btl-usnic.txt" , "fi_av_eq too small" ,
404
+ true,
405
+ opal_process_info .nodename ,
406
+ module -> av_eq_size ,
407
+ 8 );
408
+ return OPAL_ERR_OUT_OF_RESOURCE ;
409
+ }
410
+
411
+ size_t eq_size = module -> av_eq_size - 8 ;
412
+ size_t block_len = eq_size ;
413
+ size_t num_av_inserts = nprocs * USNIC_NUM_CHANNELS ;
414
+ size_t num_blocks = num_av_inserts / block_len ;
415
+ if (eq_size % num_av_inserts != 0 ) {
416
+ ++ num_blocks ;
417
+ }
418
+
419
+ /* Per above, the blocks are expressed in terms of number of AV
420
+ inserts. Convert them to be expressed in terms of number of
421
+ procs. */
422
+ block_len /= USNIC_NUM_CHANNELS ;
423
+
424
+ /* Per above, loop over creating the endpoints so that we do not
425
+ overrun the libfabric AV EQ. */
426
+ int rc ;
427
+ for (size_t block_offset = 0 , block = 0 ; block < num_blocks ;
428
+ block_offset += block_len , ++ block ) {
429
+ /* Adjust for the last block */
430
+ if (block_len > (nprocs - block_offset )) {
431
+ block_len = nprocs - block_offset ;
432
+ }
433
+
434
+ /* First, create endpoints (and procs, if they're not already
435
+ created) for the usnic-reachable procs we were given. */
436
+ rc = add_procs_block_create_endpoints (module ,
437
+ block_offset , block_len ,
438
+ procs , endpoints );
439
+ if (OPAL_SUCCESS != rc ) {
440
+ return rc ;
441
+ }
442
+
443
+ /* For each endpoint that was created, we initiated the
444
+ process to create NUM_CHANNELS fi_addrs. Go finish all of
445
+ those. This will be the final determination of whether we
446
+ can use the endpoint or not because we'll find out if each
447
+ endpoint is reachable or not. */
448
+ rc = add_procs_block_reap_fi_av_inserts (module ,
449
+ block_offset , block_len ,
450
+ endpoints );
451
+ if (OPAL_SUCCESS != rc ) {
452
+ return rc ;
453
+ }
454
+ }
455
+
456
+ return OPAL_SUCCESS ;
457
+ }
458
+
385
459
/*
386
460
* Add procs to this BTL module, receiving endpoint information from
387
461
* the modex. This is done in 2 phases:
@@ -408,23 +482,13 @@ static int usnic_add_procs(struct mca_btl_base_module_t* base_module,
408
482
opal_btl_usnic_module_t * module = (opal_btl_usnic_module_t * ) base_module ;
409
483
int rc ;
410
484
411
- /* First, create endpoints (and procs, if they're not already
412
- created) for all the usnic-reachable procs we were given. */
485
+ /* Go create the endpoints (including all relevant address
486
+ resolution) */
413
487
rc = add_procs_create_endpoints (module , nprocs , procs , endpoints );
414
488
if (OPAL_SUCCESS != rc ) {
415
489
goto fail ;
416
490
}
417
491
418
- /* For each endpoint that was created, we initiated the process to
419
- create NUM_CHANNELS fi_addrs. Go finish all of those. This
420
- will be the final determination of whether we can use the
421
- endpoint or not because we'll find out if each endpoint is
422
- reachable or not. */
423
- rc = add_procs_reap_fi_av_inserts (module , nprocs , endpoints );
424
- if (OPAL_SUCCESS != rc ) {
425
- goto fail ;
426
- }
427
-
428
492
/* Find all the endpoints with a complete set of USD destinations
429
493
and mark them as reachable */
430
494
for (size_t i = 0 ; NULL != reachable && i < nprocs ; ++ i ) {
@@ -1205,7 +1269,7 @@ usnic_send(
1205
1269
/* assign length */
1206
1270
sseg -> ss_len = sizeof (opal_btl_usnic_btl_header_t ) + frag -> sf_size ;
1207
1271
1208
- sseg -> ss_channel = USNIC_PRIORITY_CHANNEL ;
1272
+ sseg -> ss_channel = USNIC_DATA_CHANNEL ;
1209
1273
sseg -> ss_base .us_btl_header -> tag = tag ;
1210
1274
#if MSGDEBUG1
1211
1275
opal_output (0 , "INLINE send, sseg=%p" , (void * )sseg );
@@ -2018,12 +2082,15 @@ static int init_channels(opal_btl_usnic_module_t *module)
2018
2082
}
2019
2083
2020
2084
memset (& eq_attr , 0 , sizeof (eq_attr ));
2021
- eq_attr .size = 1024 ;
2085
+ eq_attr .size = module -> av_eq_num ;
2022
2086
eq_attr .wait_obj = FI_WAIT_UNSPEC ;
2023
2087
rc = fi_eq_open (module -> fabric , & eq_attr , & module -> av_eq , NULL );
2024
2088
if (rc != OPAL_SUCCESS ) {
2025
2089
goto destroy ;
2026
2090
}
2091
+ // Save the size of the created EQ
2092
+ module -> av_eq_size = eq_attr .size ;
2093
+
2027
2094
eq_attr .wait_obj = FI_WAIT_FD ;
2028
2095
rc = fi_eq_open (module -> fabric , & eq_attr , & module -> dom_eq , NULL );
2029
2096
if (rc != OPAL_SUCCESS ) {
0 commit comments