Skip to content

Commit

Permalink
Merge pull request #7134 from wckzhang/btl_tcp_interface_match
Browse files Browse the repository at this point in the history
btl tcp: Use reachability and graph solving for global interface matching
  • Loading branch information
bwbarrett authored Jan 27, 2020
2 parents 10f6a77 + e958f3c commit fc8c7a5
Show file tree
Hide file tree
Showing 6 changed files with 372 additions and 640 deletions.
8 changes: 2 additions & 6 deletions opal/mca/btl/tcp/btl_tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* Copyright (c) 2016-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
Expand Down Expand Up @@ -101,12 +103,6 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl,
continue;
}

/*
* Check to make sure that the peer has at least as many interface
* addresses exported as we are trying to use. If not, then
* don't bind this BTL instance to the proc.
*/

OPAL_THREAD_LOCK(&tcp_proc->proc_lock);

for (uint32_t j = 0 ; j < (uint32_t)tcp_proc->proc_endpoint_count ; ++j) {
Expand Down
6 changes: 6 additions & 0 deletions opal/mca/btl/tcp/btl_tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -107,6 +109,7 @@ struct mca_btl_tcp_component_t {
uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */
unsigned int tcp_num_links; /**< number of logical links per physical device */
struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */
opal_list_t local_ifs; /**< opal list of local opal_if_t interfaces */
int tcp_free_list_num; /**< initial size of free lists */
int tcp_free_list_max; /**< maximum size of free lists */
int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */
Expand Down Expand Up @@ -163,6 +166,9 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component;
*/
struct mca_btl_tcp_module_t {
mca_btl_base_module_t super; /**< base BTL interface */
uint32_t btl_index; /**< Local BTL module index, used for vertex
data and used as a hash key when
solving module matching problem */
uint16_t tcp_ifkindex; /**< BTL kernel interface index */
struct sockaddr_storage tcp_ifaddr; /**< First address
discovered for this
Expand Down
20 changes: 14 additions & 6 deletions opal/mca/btl/tcp/btl_tcp_addr.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,9 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand All @@ -30,37 +33,43 @@
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif

#include <assert.h>

/**
* Modex address structure.
*
* One of these structures will be sent for every btl module in use by
* the local BTL TCP component.
* the local BTL TCP component. This is used to construct an opal_if_t
* structure for the reachability component as well as populate the
* mca_btl_tcp_addr_t structure on remote procs. These will be used
* for interface matching and filling out the mca_btl_base_endpoint_t
* structure.
*/
struct mca_btl_tcp_modex_addr_t {
uint8_t addr[16]; /* Endpoint address. For an addr_family
of MCA_BTL_TCP_AF_INET, only the
first 4 bytes have meaning. */
uint32_t addr_ifkindex; /* Kernel interface index of the endpoint;
filled from the module's tcp_ifkindex
in mca_btl_tcp_component_exchange(). */
uint32_t addr_mask; /* IP mask of the interface, copied from the
module's tcp_ifmask.
NOTE(review): encoding (prefix length vs.
bitmask) is not visible here -- confirm. */
uint32_t addr_bandwidth; /* Interface bandwidth, copied from the
module's super.btl_bandwidth. */
uint16_t addr_port; /* Endpoint listen port (the component's
tcp_listen_port or tcp6_listen_port,
depending on the address family). */
uint8_t addr_family; /* Endpoint address family. Note that
this is
MCA_BTL_TCP_AF_{INET,INET6}, not
the traditional
AF_INET/AF_INET6. */
/* NOTE(review): two padding[1] lines are shown below. Presumably the
   first is the pre-change line of this diff hunk (comment typo "padd")
   and only the second exists in the committed header -- confirm against
   the real file before relying on the layout. */
uint8_t padding[1]; /* padd out to an 8-byte word */
uint8_t padding[1]; /* pad out to an 8-byte word */
};
/* Convenience alias so callers can omit the struct keyword. */
typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t;

_Static_assert(sizeof(struct mca_btl_tcp_modex_addr_t) == 32, "mca_btl_tcp_modex_addr_t");

/**
* Remote peer address structure
*
* One of these structures will be allocated for every remote endpoint
* associated with a remote proc. The data is pulled from the
* mca_btl_tcp_modex_addr_t structure, except for the addr_inuse
* field, which is local.
* mca_btl_tcp_modex_addr_t structure.
*/
struct mca_btl_tcp_addr_t {
union {
Expand All @@ -73,7 +82,6 @@ struct mca_btl_tcp_addr_t {
int addr_ifkindex; /**< remote interface index assigned with
this address */
uint8_t addr_family; /**< AF_INET or AF_INET6 */
bool addr_inuse; /**< local meaning only */
};
typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t;

Expand Down
54 changes: 41 additions & 13 deletions opal/mca/btl/tcp/btl_tcp_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,8 @@
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
Expand Down Expand Up @@ -69,13 +70,15 @@
#include "opal/util/net.h"
#include "opal/util/fd.h"
#include "opal/util/show_help.h"
#include "opal/util/string_copy.h"
#include "opal/util/printf.h"
#include "opal/constants.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/reachable/base/base.h"
#include "opal/threads/threads.h"

#include "opal/constants.h"
Expand Down Expand Up @@ -368,6 +371,7 @@ static int mca_btl_tcp_component_open(void)
mca_btl_tcp_component.tcp_btls = NULL;

/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_tcp_component.local_ifs, opal_list_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_proc_table_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
Expand Down Expand Up @@ -477,6 +481,7 @@ static int mca_btl_tcp_component_close(void)
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
OBJ_DESTRUCT(&mca_btl_tcp_component.local_ifs);

#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
Expand All @@ -493,8 +498,9 @@ static int mca_btl_tcp_component_close(void)
static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
{
struct mca_btl_tcp_module_t* btl;
opal_if_t *copied_interface, *selected_interface;
char param[256];
int i;
int i, if_index;
struct sockaddr_storage addr;
bool found = false;

Expand All @@ -515,18 +521,15 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
* 10.1.0.1 as the one that is published in the modex and used for
* connection.
*/
for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) {
int ret;

if (if_kindex != opal_ifindextokindex(i)) {
OPAL_LIST_FOREACH(selected_interface, &opal_if_list, opal_if_t) {
if (if_kindex != selected_interface->if_kernel_index) {
continue;
}

ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr,
sizeof(struct sockaddr_storage));
if (OPAL_SUCCESS != ret) {
return ret;
}
if_index = selected_interface->if_index;

memcpy((struct sockaddr*)&addr, &selected_interface->if_addr,
MIN(sizeof(struct sockaddr_storage), sizeof(selected_interface->if_addr)));

if (addr.ss_family == AF_INET &&
4 != mca_btl_tcp_component.tcp_disable_family) {
Expand All @@ -548,12 +551,19 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t));
if(NULL == btl)
return OPAL_ERR_OUT_OF_RESOURCE;
copied_interface = OBJ_NEW(opal_if_t);
if (NULL == copied_interface) {
free(btl);
return OPAL_ERR_OUT_OF_RESOURCE;
}
memcpy(btl, &mca_btl_tcp_module, sizeof(mca_btl_tcp_module));
OBJ_CONSTRUCT(&btl->tcp_endpoints, opal_list_t);
OBJ_CONSTRUCT(&btl->tcp_endpoints_mutex, opal_mutex_t);
mca_btl_tcp_component.tcp_btls[mca_btl_tcp_component.tcp_num_btls++] = btl;

/* initialize the btl */
/* This index is used as a key for a hash table used for interface matching. */
btl->btl_index = mca_btl_tcp_component.tcp_num_btls - 1;
btl->tcp_ifkindex = (uint16_t) if_kindex;
#if MCA_BTL_TCP_STATISTICS
btl->tcp_bytes_recv = 0;
Expand All @@ -562,6 +572,7 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
#endif

memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage));
btl->tcp_ifmask = selected_interface->if_mask;

/* allow user to specify interface bandwidth */
sprintf(param, "bandwidth_%s", if_name);
Expand Down Expand Up @@ -603,6 +614,21 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
}
}

/* Add another entry to the local interface list */
opal_string_copy(copied_interface->if_name, if_name, OPAL_IF_NAMESIZE);
copied_interface->if_index = if_index;
copied_interface->if_kernel_index = btl->tcp_ifkindex;
copied_interface->af_family = btl->tcp_ifaddr.ss_family;
copied_interface->if_flags = selected_interface->if_flags;
copied_interface->if_speed = selected_interface->if_speed;
memcpy(&copied_interface->if_addr, &btl->tcp_ifaddr, sizeof(struct sockaddr_storage));
copied_interface->if_mask = selected_interface->if_mask;
copied_interface->if_bandwidth = btl->super.btl_bandwidth;
memcpy(&copied_interface->if_mac, &selected_interface->if_mac, sizeof(copied_interface->if_mac));
copied_interface->ifmtu = selected_interface->ifmtu;

opal_list_append(&mca_btl_tcp_component.local_ifs, &(copied_interface->super));

opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n",
(void*)btl, if_name, (int) btl->tcp_ifkindex, i,
Expand Down Expand Up @@ -1188,7 +1214,6 @@ static int mca_btl_tcp_component_exchange(void)
memcpy(&addrs[i].addr, &(inaddr6->sin6_addr),
sizeof(struct in6_addr));
addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port;
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_family = MCA_BTL_TCP_AF_INET6;
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl: tcp: exchange: %d %d IPv6 %s",
Expand All @@ -1202,7 +1227,6 @@ static int mca_btl_tcp_component_exchange(void)
memcpy(&addrs[i].addr, &(inaddr->sin_addr),
sizeof(struct in_addr));
addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_family = MCA_BTL_TCP_AF_INET;
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl: tcp: exchange: %d %d IPv4 %s",
Expand All @@ -1212,6 +1236,10 @@ static int mca_btl_tcp_component_exchange(void)
BTL_ERROR(("Unexpected address family: %d", addr->sa_family));
return OPAL_ERR_BAD_PARAM;
}

addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_mask = btl->tcp_ifmask;
addrs[i].addr_bandwidth = btl->super.btl_bandwidth;
}

OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
Expand Down
Loading

0 comments on commit fc8c7a5

Please sign in to comment.