Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

btl tcp: Use reachability and graph solving for global interface matching #7134

Merged
merged 1 commit into from
Jan 27, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions opal/mca/btl/tcp/btl_tcp.c
Original file line number Diff line number Diff line change
@@ -15,6 +15,8 @@
* Copyright (c) 2016-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2016 Intel, Inc. All rights reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
@@ -90,12 +92,6 @@ int mca_btl_tcp_add_procs( struct mca_btl_base_module_t* btl,
continue;
}

/*
* Check to make sure that the peer has at least as many interface
* addresses exported as we are trying to use. If not, then
* don't bind this BTL instance to the proc.
*/

OPAL_THREAD_LOCK(&tcp_proc->proc_lock);

for (uint32_t j = 0 ; j < (uint32_t)tcp_proc->proc_endpoint_count ; ++j) {
6 changes: 6 additions & 0 deletions opal/mca/btl/tcp/btl_tcp.h
Original file line number Diff line number Diff line change
@@ -15,6 +15,8 @@
* and Technology (RIST). All rights reserved.
* Copyright (c) 2014-2015 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -107,6 +109,7 @@ struct mca_btl_tcp_component_t {
uint32_t tcp_num_btls; /**< number of interfaces available to the TCP component */
unsigned int tcp_num_links; /**< number of logical links per physical device */
struct mca_btl_tcp_module_t **tcp_btls; /**< array of available BTL modules */
opal_list_t local_ifs; /**< opal list of local opal_if_t interfaces */
int tcp_free_list_num; /**< initial size of free lists */
int tcp_free_list_max; /**< maximum size of free lists */
int tcp_free_list_inc; /**< number of elements to alloc when growing free lists */
@@ -163,6 +166,9 @@ OPAL_MODULE_DECLSPEC extern mca_btl_tcp_component_t mca_btl_tcp_component;
*/
struct mca_btl_tcp_module_t {
mca_btl_base_module_t super; /**< base BTL interface */
uint32_t btl_index; /**< Local BTL module index, used for vertex
data and used as a hash key when
solving the module matching problem */
uint16_t tcp_ifkindex; /**< BTL kernel interface index */
struct sockaddr_storage tcp_ifaddr; /**< First address
discovered for this
20 changes: 14 additions & 6 deletions opal/mca/btl/tcp/btl_tcp_addr.h
Original file line number Diff line number Diff line change
@@ -9,6 +9,9 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
*
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -30,37 +33,43 @@
#ifdef HAVE_NETINET_IN_H
#include <netinet/in.h>
#endif

#include <assert.h>

/**
* Modex address structure.
*
* One of these structures will be sent for every btl module in use by
* the local BTL TCP component.
* the local BTL TCP component. This is used both to construct an
* opal_if_t structure for the reachability component and to populate
* the mca_btl_tcp_addr_t structure on remote procs. These will be used
* for interface matching and filling out the mca_btl_base_endpoint_t
* structure.
*/
struct mca_btl_tcp_modex_addr_t {
uint8_t addr[16]; /* endpoint address. for addr_family
of MCA_BTL_TCP_AF_INET, only the
first 4 bytes have meaning. */
uint32_t addr_ifkindex; /* endpoint kernel index */
uint32_t addr_mask; /* ip mask */
uint32_t addr_bandwidth; /* interface bandwidth */
uint16_t addr_port; /* endpoint listen port */
uint8_t addr_family; /* endpoint address family. Note that
this is
MCA_BTL_TCP_AF_{INET,INET6}, not
the traditional
AF_INET/AF_INET6. */
uint8_t padding[1]; /* padd out to an 8-byte word */
uint8_t padding[1]; /* pad out to an 8-byte word */
};
typedef struct mca_btl_tcp_modex_addr_t mca_btl_tcp_modex_addr_t;

_Static_assert(sizeof(struct mca_btl_tcp_modex_addr_t) == 32, "mca_btl_tcp_modex_addr_t");

/**
* Remote peer address structure
*
* One of these structures will be allocated for every remote endpoint
* associated with a remote proc. The data is pulled from the
* mca_btl_tcp_modex_addr_t structure, except for the addr_inuse
* field, which is local.
* mca_btl_tcp_modex_addr_t structure.
*/
struct mca_btl_tcp_addr_t {
union {
@@ -73,7 +82,6 @@ struct mca_btl_tcp_addr_t {
int addr_ifkindex; /**< remote interface index assigned with
this address */
uint8_t addr_family; /**< AF_INET or AF_INET6 */
bool addr_inuse; /**< local meaning only */
};
typedef struct mca_btl_tcp_addr_t mca_btl_tcp_addr_t;

54 changes: 41 additions & 13 deletions opal/mca/btl/tcp/btl_tcp_component.c
Original file line number Diff line number Diff line change
@@ -19,7 +19,8 @@
* Copyright (c) 2014-2015 Intel, Inc. All rights reserved.
* Copyright (c) 2014-2017 Research Organization for Information Science
* and Technology (RIST). All rights reserved.
* Copyright (c) 2018 Amazon.com, Inc. or its affiliates. All Rights reserved.
* Copyright (c) 2018-2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -69,13 +70,15 @@
#include "opal/util/net.h"
#include "opal/util/fd.h"
#include "opal/util/show_help.h"
#include "opal/util/string_copy.h"
#include "opal/util/printf.h"
#include "opal/constants.h"
#include "opal/mca/btl/btl.h"
#include "opal/mca/btl/base/base.h"
#include "opal/mca/mpool/base/base.h"
#include "opal/mca/btl/base/btl_base_error.h"
#include "opal/mca/pmix/pmix.h"
#include "opal/mca/reachable/base/base.h"
#include "opal/threads/threads.h"

#include "opal/constants.h"
@@ -368,6 +371,7 @@ static int mca_btl_tcp_component_open(void)
mca_btl_tcp_component.tcp_btls = NULL;

/* initialize objects */
OBJ_CONSTRUCT(&mca_btl_tcp_component.local_ifs, opal_list_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_lock, opal_mutex_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_procs, opal_proc_table_t);
OBJ_CONSTRUCT(&mca_btl_tcp_component.tcp_events, opal_list_t);
@@ -477,6 +481,7 @@ static int mca_btl_tcp_component_close(void)
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_max);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_frag_user);
OBJ_DESTRUCT(&mca_btl_tcp_component.tcp_lock);
OBJ_DESTRUCT(&mca_btl_tcp_component.local_ifs);

#if OPAL_CUDA_SUPPORT
mca_common_cuda_fini();
@@ -493,8 +498,9 @@ static int mca_btl_tcp_component_close(void)
static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
{
struct mca_btl_tcp_module_t* btl;
opal_if_t *copied_interface, *selected_interface;
char param[256];
int i;
int i, if_index;
struct sockaddr_storage addr;
bool found = false;

@@ -515,18 +521,15 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
* 10.1.0.1 as the one that is published in the modex and used for
* connection.
*/
for (i = opal_ifbegin() ; i >= 0 ; i = opal_ifnext(i)) {
int ret;

if (if_kindex != opal_ifindextokindex(i)) {
OPAL_LIST_FOREACH(selected_interface, &opal_if_list, opal_if_t) {
if (if_kindex != selected_interface->if_kernel_index) {
continue;
}

ret = opal_ifindextoaddr(i, (struct sockaddr*)&addr,
sizeof(struct sockaddr_storage));
if (OPAL_SUCCESS != ret) {
return ret;
}
if_index = selected_interface->if_index;

memcpy((struct sockaddr*)&addr, &selected_interface->if_addr,
MIN(sizeof(struct sockaddr_storage), sizeof(selected_interface->if_addr)));

if (addr.ss_family == AF_INET &&
4 != mca_btl_tcp_component.tcp_disable_family) {
@@ -548,12 +551,19 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
btl = (struct mca_btl_tcp_module_t *)malloc(sizeof(mca_btl_tcp_module_t));
if(NULL == btl)
return OPAL_ERR_OUT_OF_RESOURCE;
copied_interface = OBJ_NEW(opal_if_t);
if (NULL == copied_interface) {
free(btl);
return OPAL_ERR_OUT_OF_RESOURCE;
}
memcpy(btl, &mca_btl_tcp_module, sizeof(mca_btl_tcp_module));
OBJ_CONSTRUCT(&btl->tcp_endpoints, opal_list_t);
OBJ_CONSTRUCT(&btl->tcp_endpoints_mutex, opal_mutex_t);
mca_btl_tcp_component.tcp_btls[mca_btl_tcp_component.tcp_num_btls++] = btl;

/* initialize the btl */
/* This index is used as a key for a hash table used for interface matching. */
btl->btl_index = mca_btl_tcp_component.tcp_num_btls - 1;
btl->tcp_ifkindex = (uint16_t) if_kindex;
#if MCA_BTL_TCP_STATISTICS
btl->tcp_bytes_recv = 0;
@@ -562,6 +572,7 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
#endif

memcpy(&btl->tcp_ifaddr, &addr, sizeof(struct sockaddr_storage));
btl->tcp_ifmask = selected_interface->if_mask;

/* allow user to specify interface bandwidth */
sprintf(param, "bandwidth_%s", if_name);
@@ -603,6 +614,21 @@ static int mca_btl_tcp_create(const int if_kindex, const char* if_name)
}
}

/* Add another entry to the local interface list */
opal_string_copy(copied_interface->if_name, if_name, OPAL_IF_NAMESIZE);
copied_interface->if_index = if_index;
copied_interface->if_kernel_index = btl->tcp_ifkindex;
copied_interface->af_family = btl->tcp_ifaddr.ss_family;
copied_interface->if_flags = selected_interface->if_flags;
copied_interface->if_speed = selected_interface->if_speed;
memcpy(&copied_interface->if_addr, &btl->tcp_ifaddr, sizeof(struct sockaddr_storage));
copied_interface->if_mask = selected_interface->if_mask;
copied_interface->if_bandwidth = btl->super.btl_bandwidth;
memcpy(&copied_interface->if_mac, &selected_interface->if_mac, sizeof(copied_interface->if_mac));
copied_interface->ifmtu = selected_interface->ifmtu;

opal_list_append(&mca_btl_tcp_component.local_ifs, &(copied_interface->super));

opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl:tcp: %p: if %s kidx %d cnt %i addr %s %s bw %d lt %d\n",
(void*)btl, if_name, (int) btl->tcp_ifkindex, i,
@@ -1188,7 +1214,6 @@ static int mca_btl_tcp_component_exchange(void)
memcpy(&addrs[i].addr, &(inaddr6->sin6_addr),
sizeof(struct in6_addr));
addrs[i].addr_port = mca_btl_tcp_component.tcp6_listen_port;
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_family = MCA_BTL_TCP_AF_INET6;
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl: tcp: exchange: %d %d IPv6 %s",
@@ -1202,7 +1227,6 @@ static int mca_btl_tcp_component_exchange(void)
memcpy(&addrs[i].addr, &(inaddr->sin_addr),
sizeof(struct in_addr));
addrs[i].addr_port = mca_btl_tcp_component.tcp_listen_port;
addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_family = MCA_BTL_TCP_AF_INET;
opal_output_verbose(5, opal_btl_base_framework.framework_output,
"btl: tcp: exchange: %d %d IPv4 %s",
@@ -1212,6 +1236,10 @@ static int mca_btl_tcp_component_exchange(void)
BTL_ERROR(("Unexpected address family: %d", addr->sa_family));
return OPAL_ERR_BAD_PARAM;
}

addrs[i].addr_ifkindex = btl->tcp_ifkindex;
addrs[i].addr_mask = btl->tcp_ifmask;
addrs[i].addr_bandwidth = btl->super.btl_bandwidth;
}

OPAL_MODEX_SEND(rc, OPAL_PMIX_GLOBAL,
874 changes: 304 additions & 570 deletions opal/mca/btl/tcp/btl_tcp_proc.c

Large diffs are not rendered by default.

50 changes: 5 additions & 45 deletions opal/mca/btl/tcp/btl_tcp_proc.h
Original file line number Diff line number Diff line change
@@ -10,6 +10,8 @@
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2010 Oracle and/or its affiliates. All rights reserved
* Copyright (c) 2019 Amazon.com, Inc. or its affiliates. All Rights
* reserved.
* $COPYRIGHT$
*
* Additional copyrights may follow
@@ -53,57 +55,15 @@ struct mca_btl_tcp_proc_t {
size_t proc_endpoint_count;
/**< number of endpoints */

opal_hash_table_t btl_index_to_endpoint;
/**< interface match table, matches btl_index to remote addresses of type mca_btl_tcp_addr_t */

opal_mutex_t proc_lock;
/**< lock to protect against concurrent access to proc state */
};
typedef struct mca_btl_tcp_proc_t mca_btl_tcp_proc_t;
OBJ_CLASS_DECLARATION(mca_btl_tcp_proc_t);

/* the highest possible interface kernel index we can handle */
#define MAX_KERNEL_INTERFACE_INDEX 65536

/* the maximum number of kernel interfaces we can handle */
#define MAX_KERNEL_INTERFACES 8

/* The maximum number of interfaces that we can have and use the
* recursion code for determining the best set of connections. When
* the number is greater than this, we switch to a simpler algorithm
* to speed things up. */
#define MAX_PERMUTATION_INTERFACES 8

/*
* FIXME: this should probably be part of an ompi list, so we need the
* appropriate definitions
*/

struct mca_btl_tcp_interface_t {
struct sockaddr_storage* ipv4_address;
struct sockaddr_storage* ipv6_address;
mca_btl_tcp_addr_t* ipv4_endpoint_addr;
mca_btl_tcp_addr_t* ipv6_endpoint_addr;
uint32_t ipv4_netmask;
uint32_t ipv6_netmask;
int kernel_index;
int peer_interface;
int index;
int inuse;
};

typedef struct mca_btl_tcp_interface_t mca_btl_tcp_interface_t;

/*
* describes the quality of a possible connection between a local and
* a remote network interface
*/
enum mca_btl_tcp_connection_quality {
CQ_NO_CONNECTION,
CQ_PRIVATE_DIFFERENT_NETWORK,
CQ_PRIVATE_SAME_NETWORK,
CQ_PUBLIC_DIFFERENT_NETWORK,
CQ_PUBLIC_SAME_NETWORK
};


mca_btl_tcp_proc_t* mca_btl_tcp_proc_create(opal_proc_t* proc);
mca_btl_tcp_proc_t* mca_btl_tcp_proc_lookup(const opal_process_name_t* name);
int mca_btl_tcp_proc_insert(mca_btl_tcp_proc_t*, mca_btl_base_endpoint_t*);