diff --git a/opal/mca/btl/usnic/README.txt b/opal/mca/btl/usnic/README.txt index 1af846b1c9..ab0b7d12b7 100644 --- a/opal/mca/btl/usnic/README.txt +++ b/opal/mca/btl/usnic/README.txt @@ -335,3 +335,40 @@ libfabric abstractions: fi_fabric: corresponds to a VIC PF fi_domain: corresponds to a VIC VF fi_endpoint: resources inside the VIC VF (basically a QP) + +====================================== + +MPI_THREAD_MULTIPLE support + +In order to make the usnic BTL thread-safe, a mutex lock is taken +to protect the critical paths, i.e., libfabric routines, bookkeeping, etc. + +This lock is btl_usnic_lock. It is a RECURSIVE lock, meaning that +the same thread can take the lock again even if it already holds it. This +allows a callback function to post another segment right away if we know +that the current segment is completed inline (so we can call send in send +without deadlocking). + +The following two functions take care of hotel checkin/checkout, and we +have to protect that part, so we take the mutex lock before we enter each +function: + +- opal_btl_usnic_check_rts() +- opal_btl_usnic_handle_ack() + +We also have to protect the calls to libfabric routines: + +- opal_btl_usnic_endpoint_send_segment() (fi_send) +- opal_btl_usnic_recv_call() (fi_recvmsg) + +i.e., these two functions have to be protected as well. + +Also, cclient connection checking (opal_btl_usnic_connectivity_ping) has to be +protected. This happens only at the beginning, but cclient communicates with cagent +through opal_fd_read/write(), and if two or more clients call opal_fd_write() at the +same time, the data might be corrupted. + +With this approach, many functions in btl/usnic that make calls to the +listed functions are protected by the OPAL_THREAD_LOCK macro, which will only +be active if the user calls MPI_Init_thread() with MPI_THREAD_MULTIPLE +support. 
diff --git a/opal/mca/btl/usnic/btl_usnic.h b/opal/mca/btl/usnic/btl_usnic.h index cc094ce38f..1b7ece1899 100644 --- a/opal/mca/btl/usnic/btl_usnic.h +++ b/opal/mca/btl/usnic/btl_usnic.h @@ -56,6 +56,10 @@ BEGIN_C_DECLS * at other times as needed or as tuning dictates. */ extern uint64_t opal_btl_usnic_ticks; + +/* Lock for MPI_THREAD_MULTIPLE support */ +extern opal_recursive_mutex_t btl_usnic_lock; + static inline uint64_t get_nsec(void) { diff --git a/opal/mca/btl/usnic/btl_usnic_cclient.c b/opal/mca/btl/usnic/btl_usnic_cclient.c index d76b3b8ca9..77615937e4 100644 --- a/opal/mca/btl/usnic/btl_usnic_cclient.c +++ b/opal/mca/btl/usnic/btl_usnic_cclient.c @@ -197,7 +197,7 @@ int opal_btl_usnic_connectivity_listen(opal_btl_usnic_module_t *module) /* Ensure to NULL-terminate the passed strings */ strncpy(cmd.nodename, opal_process_info.nodename, CONNECTIVITY_NODENAME_LEN - 1); - strncpy(cmd.usnic_name, module->fabric_info->fabric_attr->name, + strncpy(cmd.usnic_name, module->linux_device_name, CONNECTIVITY_IFNAME_LEN - 1); if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(cmd), &cmd)) { @@ -234,6 +234,9 @@ int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port, return OPAL_SUCCESS; } + /* Protect opal_fd_write for multithreaded case */ + OPAL_THREAD_LOCK(&btl_usnic_lock); + /* Send the PING command */ int id = CONNECTIVITY_AGENT_CMD_PING; if (OPAL_SUCCESS != opal_fd_write(agent_fd, sizeof(id), &id)) { @@ -260,6 +263,9 @@ int opal_btl_usnic_connectivity_ping(uint32_t src_ipv4_addr, int src_port, /* Will not return */ } + /* Unlock and return */ + OPAL_THREAD_UNLOCK(&btl_usnic_lock); + return OPAL_SUCCESS; } diff --git a/opal/mca/btl/usnic/btl_usnic_compat.c b/opal/mca/btl/usnic/btl_usnic_compat.c index 1289093ac1..de649cb514 100644 --- a/opal/mca/btl/usnic/btl_usnic_compat.c +++ b/opal/mca/btl/usnic/btl_usnic_compat.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2014-2015 Cisco Systems, Inc. All rights reserved. 
+ * Copyright (c) 2014-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2015 Intel, Inc. All rights reserved. * $COPYRIGHT$ * @@ -509,6 +509,7 @@ opal_btl_usnic_prepare_src( size_t* size, uint32_t flags) { + OPAL_THREAD_LOCK(&btl_usnic_lock); opal_btl_usnic_module_t *module = (opal_btl_usnic_module_t*) base_module; opal_btl_usnic_send_frag_t *frag; uint32_t payload_len; @@ -535,7 +536,7 @@ opal_btl_usnic_prepare_src( #if MSGDEBUG2 opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n", - module->fabric_info->fabric_attr->name, + module->linux_device_name, (reserve + *size) <= module->max_frag_payload?"small":"large", (void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize, (void *)convertor); @@ -552,6 +553,7 @@ opal_btl_usnic_prepare_src( #endif #endif + OPAL_THREAD_UNLOCK(&btl_usnic_lock); return &frag->sf_base.uf_base; } @@ -721,7 +723,7 @@ opal_btl_usnic_prepare_src(struct mca_btl_base_module_t *base_module, #if MSGDEBUG2 opal_output(0, "prep_src: %s %s frag %p, size=%d+%u (was %u), conv=%p\n", - module->fabric_info->fabric_attr->name, + module->linux_device_name, (reserve + *size) <= module->max_frag_payload?"small":"large", (void *)frag, (int)reserve, (unsigned)*size, (unsigned)osize, (void *)convertor); diff --git a/opal/mca/btl/usnic/btl_usnic_component.c b/opal/mca/btl/usnic/btl_usnic_component.c index 07803ce9ee..2a372964e4 100644 --- a/opal/mca/btl/usnic/btl_usnic_component.c +++ b/opal/mca/btl/usnic/btl_usnic_component.c @@ -86,6 +86,9 @@ #define OPAL_BTL_USNIC_NUM_COMPLETIONS 500 +/* MPI_THREAD_MULTIPLE_SUPPORT */ +opal_recursive_mutex_t btl_usnic_lock = OPAL_RECURSIVE_MUTEX_STATIC_INIT; + /* RNG buffer definition */ opal_rng_buff_t opal_btl_usnic_rand_buff = {0}; @@ -222,6 +225,8 @@ static int usnic_component_close(void) opal_btl_usnic_cleanup_tests(); #endif + OBJ_DESTRUCT(&btl_usnic_lock); + return OPAL_SUCCESS; } @@ -322,9 +327,7 @@ static int check_usnic_config(opal_btl_usnic_module_t *module, char str[128]; 
unsigned unlp; struct fi_usnic_info *uip; - struct fi_info *info; - info = module->fabric_info; uip = &module->usnic_info; /* Note: we add one to num_local_procs to account for *this* @@ -373,7 +376,7 @@ static int check_usnic_config(opal_btl_usnic_module_t *module, "not enough usnic resources", true, opal_process_info.nodename, - info->fabric_attr->name, + module->linux_device_name, str); return OPAL_ERROR; } @@ -538,10 +541,12 @@ static bool filter_module(opal_btl_usnic_module_t *module, struct fi_usnic_info *uip; struct fi_info *info; bool match; + const char *linux_device_name; info = module->fabric_info; uip = &module->usnic_info; src = info->src_addr; + linux_device_name = module->linux_device_name; module_mask = src->sin_addr.s_addr & uip->ui.v1.ui_netmask_be; match = false; for (i = 0; i < filter->n_elt; ++i) { @@ -554,7 +559,7 @@ static bool filter_module(opal_btl_usnic_module_t *module, } } else { - if (strcmp(filter->elts[i].if_name, info->fabric_attr->name) == 0) { + if (strcmp(filter->elts[i].if_name, linux_device_name) == 0) { match = true; break; } @@ -585,6 +590,25 @@ static void free_filter(usnic_if_filter_t *filter) free(filter); } +static int do_fi_getinfo(uint32_t version, struct fi_info **info_list) +{ + struct fi_info hints = {0}; + struct fi_ep_attr ep_attr = {0}; + struct fi_fabric_attr fabric_attr = {0}; + + /* We only want providers named "usnic" that are of type EP_DGRAM */ + fabric_attr.prov_name = "usnic"; + ep_attr.type = FI_EP_DGRAM; + + hints.caps = FI_MSG; + hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX; + hints.addr_format = FI_SOCKADDR; + hints.ep_attr = &ep_attr; + hints.fabric_attr = &fabric_attr; + + return fi_getinfo(version, NULL, 0, 0, &hints, info_list); +} + /* * UD component initialization: * (1) read interface list from kernel and compare against component @@ -606,35 +630,31 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, int min_distance, num_local_procs; struct fi_info *info_list; struct fi_info 
*info; - struct fi_info hints = {0}; - struct fi_ep_attr ep_attr = {0}; - struct fi_fabric_attr fabric_attr = {0}; struct fid_fabric *fabric; struct fid_domain *domain; int ret; *num_btl_modules = 0; - /* Currently refuse to run if MPI_THREAD_MULTIPLE is enabled */ + /* MPI_THREAD_MULTIPLE is only supported in 2.0+ */ if (want_mpi_threads && !mca_btl_base_thread_multiple_override) { - opal_output_verbose(5, USNIC_OUT, - "btl:usnic: MPI_THREAD_MULTIPLE not supported; skipping this component"); - return NULL; - } - /* We only want providers named "usnic that are of type EP_DGRAM */ - fabric_attr.prov_name = "usnic"; - ep_attr.type = FI_EP_DGRAM; + if (OMPI_MAJOR_VERSION >= 2) { + opal_output_verbose(5, USNIC_OUT, + "btl:usnic: MPI_THREAD_MULTIPLE support is in testing phase."); + } + else { + opal_output_verbose(5, USNIC_OUT, + "btl:usnic: MPI_THREAD_MULTIPLE is not supported in version < 2."); + return NULL; + } + } - hints.caps = FI_MSG; - hints.mode = FI_LOCAL_MR | FI_MSG_PREFIX; - hints.addr_format = FI_SOCKADDR; - hints.ep_attr = &ep_attr; - hints.fabric_attr = &fabric_attr; + OBJ_CONSTRUCT(&btl_usnic_lock, opal_recursive_mutex_t); - /* This code understands libfabric API v1.0 and v1.1. Even if we - were compiled with libfabric API v1.0, we still want to request - v1.1 -- here's why: + /* This code understands libfabric API versions v1.0, v1.1, and + v1.4. Even if we were compiled with libfabric API v1.0, we + still want to request v1.1 -- here's why: - In libfabric v1.0.0 (i.e., API v1.0), the usnic provider did not check the value of the "version" parameter passed into @@ -650,6 +670,17 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, So never request API v1.0 -- always request a minimum of v1.1. + The usnic provider changed the strings in the fabric and domain + names in API v1.4. 
With API <= v1.3: + + - fabric name is "usnic_X" (device name) + - domain name is NULL + + With libfabric API >= v1.4: + + - fabric name is "a.b.c.d/e" (CIDR notation of network) + - domain name is "usnic_X" (device name) + NOTE: The configure.m4 in this component will require libfabric >= v1.1.0 (i.e., it won't accept v1.0.0) because of a critical bug in the usnic provider in libfabric v1.0.0. However, the @@ -663,9 +694,17 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, Someday, #2 may no longer be true, and we may therefore rip out the libfabric v1.0.0 compatibility code. */ + + /* First try API version 1.4. If that doesn't work, try API + version 1.1. */ uint32_t libfabric_api; - libfabric_api = FI_VERSION(1, 1); - ret = fi_getinfo(libfabric_api, NULL, 0, 0, &hints, &info_list); + libfabric_api = FI_VERSION(1, 4); + ret = do_fi_getinfo(libfabric_api, &info_list); + // Libfabric core will return -FI_ENOSYS if it is too old + if (-FI_ENOSYS == ret) { + libfabric_api = FI_VERSION(1, 1); + ret = do_fi_getinfo(libfabric_api, &info_list); + } if (0 != ret) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: disqualifiying myself due to fi_getinfo failure: %s (%d)", strerror(-ret), ret); @@ -722,7 +761,6 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, mca_btl_usnic_component.cq_readerr_try_again_value = -FI_EAGAIN; } - /* libnl initialization */ opal_proc_t *me = opal_proc_local_get(); opal_process_name_t *name = &(me->proc_name); mca_btl_usnic_component.my_hashed_rte_name = @@ -786,13 +824,21 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, i < mca_btl_usnic_component.max_modules); ++i, info = info->next) { + // The fabric/domain names changed at libfabric API v1.4 (see above). 
+ char *linux_device_name; + if (libfabric_api <= FI_VERSION(1, 3)) { + linux_device_name = info->fabric_attr->name; + } else { + linux_device_name = info->domain_attr->name; + } + ret = fi_fabric(info->fabric_attr, &fabric, NULL); if (0 != ret) { opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed", true, opal_process_info.nodename, - info->fabric_attr->name, + linux_device_name, "fi_fabric()", __FILE__, __LINE__, ret, strerror(-ret)); @@ -806,7 +852,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, "libfabric API failed", true, opal_process_info.nodename, - info->fabric_attr->name, + linux_device_name, "fi_domain()", __FILE__, __LINE__, ret, strerror(-ret)); @@ -815,8 +861,8 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, opal_memchecker_base_mem_defined(&domain, sizeof(domain)); opal_output_verbose(5, USNIC_OUT, - "btl:usnic: found: usNIC direct device %s", - info->fabric_attr->name); + "btl:usnic: found: usNIC device %s", + linux_device_name); /* Save a little info on the module that we have already gathered. The rest of the module will be filled in @@ -827,6 +873,12 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, module->fabric = fabric; module->domain = domain; module->fabric_info = info; + module->libfabric_api = libfabric_api; + module->linux_device_name = strdup(linux_device_name); + if (NULL == module->linux_device_name) { + OPAL_ERROR_LOG(OPAL_ERR_OUT_OF_RESOURCE); + goto error; + } /* Obtain usnic-specific device info (e.g., netmask) that doesn't come in the normal fi_getinfo(). 
This allows us to @@ -836,7 +888,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, if (ret != 0) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s fabric_open_ops failed %d (%s)", - info->fabric_attr->name, ret, fi_strerror(-ret)); + module->linux_device_name, ret, fi_strerror(-ret)); fi_close(&domain->fid); fi_close(&fabric->fid); continue; @@ -849,14 +901,14 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, if (ret != 0) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s usnic_getinfo failed %d (%s)", - info->fabric_attr->name, ret, fi_strerror(-ret)); + module->linux_device_name, ret, fi_strerror(-ret)); fi_close(&domain->fid); fi_close(&fabric->fid); continue; } opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s usnic_info: link speed=%d, netmask=0x%x, ifname=%s, num_vf=%d, qp/vf=%d, cq/vf=%d", - info->fabric_attr->name, + module->linux_device_name, (unsigned int) module->usnic_info.ui.v1.ui_link_speed, (unsigned int) module->usnic_info.ui.v1.ui_netmask_be, module->usnic_info.ui.v1.ui_ifname, @@ -870,7 +922,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, opal_output_verbose(5, USNIC_OUT, "btl:usnic: %s %s due to %s", (keep_module ? "keeping" : "skipping"), - info->fabric_attr->name, + module->linux_device_name, (filter_incl ? 
"if_include" : "if_exclude")); if (!keep_module) { fi_close(&domain->fid); @@ -888,7 +940,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, check_usnic_config(module, num_local_procs) != OPAL_SUCCESS) { opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s is not provisioned with enough resources -- skipping", - info->fabric_attr->name); + module->linux_device_name); fi_close(&domain->fid); fi_close(&fabric->fid); @@ -902,7 +954,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, opal_output_verbose(5, USNIC_OUT, "btl:usnic: device %s looks good!", - info->fabric_attr->name); + module->linux_device_name); /* Let this module advance to the next round! */ btls[j++] = &(module->super); @@ -952,7 +1004,7 @@ static mca_btl_base_module_t** usnic_component_init(int* num_btl_modules, btls[num_final_modules++] = &(module->super); /* Output all of this module's values. */ - const char *devname = module->fabric_info->fabric_attr->name; + const char *devname = module->linux_device_name; opal_output_verbose(5, USNIC_OUT, "btl:usnic: %s num sqe=%d, num rqe=%d, num cqe=%d, num aveqe=%d", devname, @@ -1151,6 +1203,8 @@ static int usnic_handle_completion( /* Make the completion be Valgrind-defined */ opal_memchecker_base_mem_defined(seg, sizeof(*seg)); + OPAL_THREAD_LOCK(&btl_usnic_lock); + /* Handle work completions */ switch(seg->us_type) { @@ -1181,6 +1235,8 @@ static int usnic_handle_completion( BTL_ERROR(("Unhandled completion segment type %d", seg->us_type)); break; } + + OPAL_THREAD_UNLOCK(&btl_usnic_lock); return 1; } @@ -1194,7 +1250,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, if (cq_ret != -FI_EAVAIL) { BTL_ERROR(("%s: cq_read ret = %d (%s)", - module->fabric_info->fabric_attr->name, cq_ret, + module->linux_device_name, cq_ret, fi_strerror(-cq_ret))); channel->chan_error = true; } @@ -1204,7 +1260,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, return; } else if (rc != 
mca_btl_usnic_component.cq_readerr_success_value) { BTL_ERROR(("%s: cq_readerr ret = %d (expected %d)", - module->fabric_info->fabric_attr->name, rc, + module->linux_device_name, rc, (int) mca_btl_usnic_component.cq_readerr_success_value)); channel->chan_error = true; } @@ -1217,7 +1273,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, static int once = 0; if (once++ == 0) { BTL_ERROR(("%s: Channel %d, %s", - module->fabric_info->fabric_attr->name, + module->linux_device_name, channel->chan_index, FI_ECRC == err_entry.prov_errno ? "CRC error" : "message truncation")); @@ -1238,7 +1294,7 @@ usnic_handle_cq_error(opal_btl_usnic_module_t* module, } } else { BTL_ERROR(("%s: CQ[%d] prov_err = %d", - module->fabric_info->fabric_attr->name, channel->chan_index, + module->linux_device_name, channel->chan_index, err_entry.prov_errno)); channel->chan_error = true; } @@ -1451,7 +1507,7 @@ void opal_btl_usnic_component_debug(void) module = mca_btl_usnic_component.usnic_active_modules[i]; opal_output(0, "active_modules[%d]=%p %s max{frag,chunk,tiny}=%llu,%llu,%llu\n", - i, (void *)module, module->fabric_info->fabric_attr->name, + i, (void *)module, module->linux_device_name, (unsigned long long)module->max_frag_payload, (unsigned long long)module->max_chunk_payload, (unsigned long long)module->max_tiny_payload); diff --git a/opal/mca/btl/usnic/btl_usnic_hwloc.c b/opal/mca/btl/usnic/btl_usnic_hwloc.c index ff9442eef3..78ef4c3abc 100644 --- a/opal/mca/btl/usnic/btl_usnic_hwloc.c +++ b/opal/mca/btl/usnic/btl_usnic_hwloc.c @@ -162,7 +162,7 @@ static hwloc_obj_t find_device_numa(opal_btl_usnic_module_t *module) if (obj->type != HWLOC_OBJ_NODE) { opal_output_verbose(5, USNIC_OUT, "btl:usnic:filter_numa: could not find NUMA node for %s; filtering by NUMA distance not possible", - module->fabric_info->fabric_attr->name); + module->linux_device_name); return NULL; } @@ -218,7 +218,7 @@ int opal_btl_usnic_hwloc_distance(opal_btl_usnic_module_t *module) opal_output_verbose(5, 
USNIC_OUT, "btl:usnic:filter_numa: %s is distance %d from me", - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->numa_distance); } diff --git a/opal/mca/btl/usnic/btl_usnic_map.c b/opal/mca/btl/usnic/btl_usnic_map.c index ce2aca6abe..c9cbd8a83c 100644 --- a/opal/mca/btl/usnic/btl_usnic_map.c +++ b/opal/mca/btl/usnic/btl_usnic_map.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2014 Intel, Inc. All rights reserved * $COPYRIGHT$ * @@ -30,8 +30,8 @@ static int map_compare_modules(const void *aa, const void *bb) opal_btl_usnic_module_t *a = *((opal_btl_usnic_module_t**) aa); opal_btl_usnic_module_t *b = *((opal_btl_usnic_module_t**) bb); - return strcmp(a->fabric_info->fabric_attr->name, - b->fabric_info->fabric_attr->name); + return strcmp(a->linux_device_name, + b->linux_device_name); } /* @@ -74,7 +74,7 @@ static int map_output_modules(FILE *fp) prefix_len); fprintf(fp, "device=%s,ip=%s,mss=%" PRIsize_t "\n", - modules[i]->fabric_info->fabric_attr->name, + modules[i]->linux_device_name, ipv4, modules[i]->fabric_info->ep_attr->max_msg_size); } @@ -102,8 +102,8 @@ static int map_compare_endpoints(const void *aa, const void *bb) return -1; } - return strcmp(a->endpoint_module->fabric_info->fabric_attr->name, - b->endpoint_module->fabric_info->fabric_attr->name); + return strcmp(a->endpoint_module->linux_device_name, + b->endpoint_module->linux_device_name); } /* @@ -148,7 +148,7 @@ static int map_output_endpoints(FILE *fp, opal_btl_usnic_proc_t *proc) eps[i]->endpoint_remote_modex.netmask); fprintf(fp, "device=%s@peer_ip=%s", - eps[i]->endpoint_module->fabric_info->fabric_attr->name, + eps[i]->endpoint_module->linux_device_name, ipv4); ++num_output; } diff --git a/opal/mca/btl/usnic/btl_usnic_module.c b/opal/mca/btl/usnic/btl_usnic_module.c index 713c3f44eb..2834cf5cd8 100644 --- 
a/opal/mca/btl/usnic/btl_usnic_module.c +++ b/opal/mca/btl/usnic/btl_usnic_module.c @@ -67,6 +67,30 @@ static void finalize_one_channel(opal_btl_usnic_module_t *module, struct opal_btl_usnic_channel_t *channel); +static int channel_addr2str(opal_btl_usnic_module_t *module, int channel, + char *str, size_t len_param) +{ + size_t len; + + len = len_param; + fi_av_straddr(module->av, module->mod_channels[channel].info->src_addr, + str, &len); + if (len > len_param) { + opal_show_help("help-mpi-btl-usnic.txt", + "libfabric API failed", + true, + opal_process_info.nodename, + module->linux_device_name, + "fi_av_straddr", __FILE__, __LINE__, + FI_ENODATA, + "Failed to convert address to string: buffer too short"); + + return OPAL_ERR_OUT_OF_RESOURCE; + } + + return OPAL_SUCCESS; +} + /* * Loop over a block of procs sent to us in add_procs and see if we @@ -100,7 +124,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, if (opal_proc == my_proc) { opal_output_verbose(75, USNIC_OUT, "btl:usnic:add_procs:%s: not connecting to self", - module->fabric_info->fabric_attr->name); + module->linux_device_name); continue; } @@ -108,7 +132,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, if (OPAL_PROC_ON_LOCAL_NODE(opal_proc->proc_flags)) { opal_output_verbose(75, USNIC_OUT, "btl:usnic:add_procs:%s: not connecting to %s on same server", - module->fabric_info->fabric_attr->name, + module->linux_device_name, usnic_compat_proc_name_print(&opal_proc->proc_name)); continue; } @@ -124,7 +148,7 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, skip it */ opal_output_verbose(75, USNIC_OUT, "btl:usnic:add_procs:%s: peer %s on %s does not have usnic modex info; skipping", - module->fabric_info->fabric_attr->name, + module->linux_device_name, usnic_compat_proc_name_print(&opal_proc->proc_name), opal_get_proc_hostname(opal_proc)); continue; @@ -140,7 +164,7 @@ static int 
add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, if (OPAL_SUCCESS != rc) { opal_output_verbose(5, USNIC_OUT, "btl:usnic:add_procs:%s: unable to create endpoint to peer %s on %s", - module->fabric_info->fabric_attr->name, + module->linux_device_name, usnic_compat_proc_name_print(&opal_proc->proc_name), opal_get_proc_hostname(opal_proc)); OBJ_RELEASE(usnic_proc); @@ -157,12 +181,29 @@ static int add_procs_block_create_endpoints(opal_btl_usnic_module_t *module, modex->ipv4_addr, modex->netmask); + char local_pri_addr[64] = {0}; + rc = channel_addr2str(module, USNIC_PRIORITY_CHANNEL, + local_pri_addr, sizeof(local_pri_addr)); + if (OPAL_SUCCESS != rc) { + OBJ_RELEASE(usnic_proc); + continue; + } + + char local_data_addr[64] = {0}; + rc = channel_addr2str(module, USNIC_DATA_CHANNEL, + local_data_addr, sizeof(local_data_addr)); + if (OPAL_SUCCESS != rc) { + OBJ_RELEASE(usnic_proc); + continue; + } + opal_output_verbose(5, USNIC_OUT, - "btl:usnic:add_procs:%s: new usnic peer endpoint: %s, proirity port %d, data port %d", - module->fabric_info->fabric_attr->name, - str, - modex->ports[USNIC_PRIORITY_CHANNEL], - modex->ports[USNIC_DATA_CHANNEL]); + "btl:usnic:add_procs:%s: new usnic peer endpoint: pri=%s:%d, data=%s:%d (local: pri=%s, data=%s)", + module->linux_device_name, + str, modex->ports[USNIC_PRIORITY_CHANNEL], + str, modex->ports[USNIC_DATA_CHANNEL], + local_pri_addr, + local_data_addr); endpoints[i] = usnic_endpoint; ++num_created; @@ -195,14 +236,14 @@ static void add_procs_warn_unreachable(opal_btl_usnic_module_t *module, opal_output_verbose(15, USNIC_OUT, "btl:usnic: %s (which is %s) couldn't reach peer %s", - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->if_ipv4_addr_str, remote); opal_show_help("help-mpi-btl-usnic.txt", "unreachable peer IP", true, opal_process_info.nodename, module->if_ipv4_addr_str, - module->fabric_info->fabric_attr->name, + module->linux_device_name, 
opal_get_proc_hostname(endpoint->endpoint_proc->proc_opal), remote); } @@ -301,7 +342,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module, "libfabric API failed", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "async insertion result", __FILE__, __LINE__, err_entry.err, "Failed to insert address to AV"); @@ -325,7 +366,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_eq_readerr()", __FILE__, __LINE__, ret, "Returned != sizeof(err_entry)"); @@ -346,7 +387,7 @@ add_procs_block_reap_fi_av_inserts(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_eq_sread()", __FILE__, __LINE__, ret, "Returned != (sizeof(entry) or -FI_EAVAIL)"); @@ -898,6 +939,8 @@ static int usnic_finalize(struct mca_btl_base_module_t* btl) fi_close(&module->domain->fid); fi_close(&module->fabric->fid); + free(module->linux_device_name); + return OPAL_SUCCESS; } @@ -1080,6 +1123,7 @@ opal_btl_usnic_module_progress_sends( /* * Handle all the retransmits we can */ + OPAL_THREAD_LOCK(&btl_usnic_lock); if (OPAL_UNLIKELY(!opal_list_is_empty(&module->pending_resend_segs))) { usnic_do_resends(module); } @@ -1189,6 +1233,7 @@ opal_btl_usnic_module_progress_sends( endpoint = next_endpoint; } + OPAL_THREAD_UNLOCK(&btl_usnic_lock); } /* @@ -1223,6 +1268,7 @@ usnic_send( opal_btl_usnic_module_t *module; opal_btl_usnic_send_segment_t *sseg; + OPAL_THREAD_LOCK(&btl_usnic_lock); endpoint = (opal_btl_usnic_endpoint_t *)base_endpoint; module = (opal_btl_usnic_module_t *)base_module; frag = (opal_btl_usnic_send_frag_t*) descriptor; @@ -1331,6 +1377,7 @@ usnic_send( ++module->stats.pml_module_sends; + OPAL_THREAD_UNLOCK(&btl_usnic_lock); return rc; } @@ -1413,7 
+1460,7 @@ static void module_async_event_callback(int fd, short flags, void *arg) opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_eq_read()", __FILE__, __LINE__, ret, "Failed to get domain event"); @@ -1432,7 +1479,7 @@ static void module_async_event_callback(int fd, short flags, void *arg) ignore it. */ opal_output_verbose(10, USNIC_OUT, "btl:usnic: got LINK_UP on %s", - module->fabric_info->fabric_attr->name); + module->linux_device_name); break; case 1: // USD_EVENT_LINK_DOWN: @@ -1451,7 +1498,7 @@ static void module_async_event_callback(int fd, short flags, void *arg) opal_show_help("help-mpi-btl-usnic.txt", "async event", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, str, entry.data); fatal = true; } @@ -1482,7 +1529,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_dupinfo() failed", __FILE__, __LINE__, -1, "Unknown"); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1500,14 +1547,14 @@ static int create_ep(opal_btl_usnic_module_t* module, opal_process_info.my_local_rank); } - rc = fi_getinfo(FI_VERSION(1, 1), NULL, 0, 0, hint, &channel->info); + rc = fi_getinfo(module->libfabric_api, NULL, 0, 0, hint, &channel->info); fi_freeinfo(hint); if (0 != rc) { opal_show_help("help-mpi-btl-usnic.txt", "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_getinfo() failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1543,7 +1590,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_endpoint() 
failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1556,7 +1603,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_ep_bind() SCQ to EP failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1567,7 +1614,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_ep_bind() RCQ to EP failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1578,7 +1625,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_ep_bind() AV to EP failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1591,7 +1638,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_enable() failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1613,7 +1660,7 @@ static int create_ep(opal_btl_usnic_module_t* module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_getname() failed", __FILE__, __LINE__, rc, fi_strerror(-rc)); return OPAL_ERR_OUT_OF_RESOURCE; @@ -1621,6 +1668,21 @@ static int create_ep(opal_btl_usnic_module_t* module, assert(0 != sin->sin_port); } + char *str; + if (USNIC_PRIORITY_CHANNEL == channel->chan_index) { + str = "priority"; + } else if (USNIC_DATA_CHANNEL == channel->chan_index) { + str = "data"; + } else { + str = "UNKNOWN"; + } + opal_output_verbose(15, 
USNIC_OUT, + "btl:usnic:create_ep:%s: new usnic local endpoint channel %s: %s:%d", + module->fabric_info->fabric_attr->name, + str, + inet_ntoa(sin->sin_addr), + ntohs(sin->sin_port)); + /* actual sizes */ channel->chan_rd_num = channel->info->rx_attr->size; channel->chan_sd_num = channel->info->tx_attr->size; @@ -1704,7 +1766,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "failed to create CQ", __FILE__, __LINE__); goto error; } @@ -1760,7 +1822,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "Failed to get receive buffer from freelist", __FILE__, __LINE__); goto error; @@ -1776,7 +1838,7 @@ static int init_one_channel(opal_btl_usnic_module_t *module, "internal error during init", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "Failed to post receive buffer", __FILE__, __LINE__); goto error; @@ -1843,7 +1905,7 @@ static void init_local_modex_part1(opal_btl_usnic_module_t *module) opal_output_verbose(5, USNIC_OUT, "btl:usnic: %s IP charactertics: %s, %u Mbps", - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->if_ipv4_addr_str, modex->link_speed_mbps); } @@ -2055,7 +2117,7 @@ static int init_mpool(opal_btl_usnic_module_t *module) mpool_resources.register_mem = usnic_reg_mr; mpool_resources.deregister_mem = usnic_dereg_mr; asprintf(&mpool_resources.pool_name, "%s", - module->fabric_info->fabric_attr->name); + module->linux_device_name); module->super.btl_mpool = mca_mpool_base_module_create(mca_btl_usnic_component.usnic_mpool_name, &module->super, &mpool_resources); @@ -2064,7 +2126,7 @@ static int init_mpool(opal_btl_usnic_module_t *module) "internal error during init", true, 
opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "create mpool", __FILE__, __LINE__); return OPAL_ERROR; } @@ -2177,7 +2239,7 @@ static void init_async_event(opal_btl_usnic_module_t *module) "libfabric API failed", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_control(eq, FI_GETWAIT)", __FILE__, __LINE__, ret, fi_strerror(-ret)); diff --git a/opal/mca/btl/usnic/btl_usnic_module.h b/opal/mca/btl/usnic/btl_usnic_module.h index b4f5d0c739..b7d5d6fc0c 100644 --- a/opal/mca/btl/usnic/btl_usnic_module.h +++ b/opal/mca/btl/usnic/btl_usnic_module.h @@ -100,8 +100,10 @@ typedef struct opal_btl_usnic_module_t { /* Cache for use during component_init to associate a module with the libfabric device that it came from. */ + uint32_t libfabric_api; struct fid_fabric *fabric; struct fid_domain *domain; + char *linux_device_name; struct fi_info *fabric_info; struct fi_usnic_ops_fabric *usnic_fabric_ops; struct fi_usnic_ops_av *usnic_av_ops; diff --git a/opal/mca/btl/usnic/btl_usnic_proc.c b/opal/mca/btl/usnic/btl_usnic_proc.c index 9d71a6ed9d..f0fefbff96 100644 --- a/opal/mca/btl/usnic/btl_usnic_proc.c +++ b/opal/mca/btl/usnic/btl_usnic_proc.c @@ -11,7 +11,7 @@ * All rights reserved. * Copyright (c) 2006 Sandia National Laboratories. All rights * reserved. - * Copyright (c) 2013-2015 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2013-2016 Cisco Systems, Inc. All rights reserved. * Copyright (c) 2013-2014 Intel, Inc. All rights reserved * $COPYRIGHT$ * @@ -643,7 +643,7 @@ static int match_modex(opal_btl_usnic_module_t *module, opal_show_help("help-mpi-btl-usnic.txt", "MTU mismatch", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->fabric_info->ep_attr->max_msg_size, (NULL == proc->proc_opal->proc_hostname) ? 
"unknown" : proc->proc_opal->proc_hostname, @@ -700,7 +700,7 @@ static int start_av_insert(opal_btl_usnic_module_t *module, opal_show_help("help-mpi-btl-usnic.txt", "libfabric API failed", true, opal_process_info.nodename, - module->fabric_info->fabric_attr->name, + module->linux_device_name, "fi_av_insert()", __FILE__, __LINE__, ret, "Failed to initiate AV insert"); diff --git a/opal/mca/btl/usnic/btl_usnic_recv.c b/opal/mca/btl/usnic/btl_usnic_recv.c index c77388ef23..443e2b0e96 100644 --- a/opal/mca/btl/usnic/btl_usnic_recv.c +++ b/opal/mca/btl/usnic/btl_usnic_recv.c @@ -340,8 +340,9 @@ void opal_btl_usnic_recv_call(opal_btl_usnic_module_t *module, opal_output(0, " Received ACK for sequence number %" UDSEQ " from %s to %s\n", bseg->us_btl_header->ack_seq, remote_ip, local_ip); #endif + OPAL_THREAD_LOCK(&btl_usnic_lock); opal_btl_usnic_handle_ack(endpoint, ack_seq); - + OPAL_THREAD_UNLOCK(&btl_usnic_lock); goto repost; } diff --git a/opal/mca/btl/usnic/btl_usnic_recv.h b/opal/mca/btl/usnic/btl_usnic_recv.h index 4773bba4aa..70ffa7d4db 100644 --- a/opal/mca/btl/usnic/btl_usnic_recv.h +++ b/opal/mca/btl/usnic/btl_usnic_recv.h @@ -157,8 +157,10 @@ opal_btl_usnic_check_rx_seq( #if MSGDEBUG1 opal_output(0, "Handle piggy-packed ACK seq %"UDSEQ"\n", seg->rs_base.us_btl_header->ack_seq); #endif + OPAL_THREAD_LOCK(&btl_usnic_lock); opal_btl_usnic_handle_ack(endpoint, seg->rs_base.us_btl_header->ack_seq); + OPAL_THREAD_UNLOCK(&btl_usnic_lock); } /* Do we have room in the endpoint's receiver window? 
diff --git a/opal/mca/btl/usnic/btl_usnic_send.h b/opal/mca/btl/usnic/btl_usnic_send.h index 2020544f20..86676a35b9 100644 --- a/opal/mca/btl/usnic/btl_usnic_send.h +++ b/opal/mca/btl/usnic/btl_usnic_send.h @@ -216,7 +216,7 @@ opal_btl_usnic_endpoint_send_segment( "CHUNK" : "FRAG", sseg->ss_base.us_btl_header->pkt_seq, sseg->ss_base.us_btl_header->sender, - endpoint->endpoint_module->fabric_info->fabric_attr->name, + endpoint->endpoint_module->linux_device_name, local_ip, module->local_modex.ports[sseg->ss_channel], (void*)sseg, diff --git a/opal/mca/btl/usnic/btl_usnic_stats.c b/opal/mca/btl/usnic/btl_usnic_stats.c index 9c3acac868..a0c3393cc7 100644 --- a/opal/mca/btl/usnic/btl_usnic_stats.c +++ b/opal/mca/btl/usnic/btl_usnic_stats.c @@ -86,7 +86,7 @@ void opal_btl_usnic_print_stats( prefix, opal_proc_local_get()->proc_name.vpid, - module->fabric_info->fabric_attr->name, + module->linux_device_name, module->stats.num_total_sends, module->mod_channels[USNIC_PRIORITY_CHANNEL].num_channel_sends, @@ -394,7 +394,7 @@ static void setup_mpit_pvars_enum(void) devices[i].value = i; rc = asprintf(&str, "%s,%hhu.%hhu.%hhu.%hhu/%" PRIu32, - m->fabric_info->fabric_attr->name, + m->linux_device_name, c[0], c[1], c[2], c[3], usnic_netmask_to_cidrlen(sin->sin_addr.s_addr)); assert(rc > 0);