Skip to content

Commit

Permalink
tcp btl: Fix multiple-link connection establishment.
Browse files Browse the repository at this point in the history
Fix case where the btl_tcp_links MCA parameter is used to create multiple TCP connections between peers.
Three issues were resulting in hangs during large message transfer:
  * The 2nd..btl_tcp_link connections were dropped during establishment because the per-process
    address check was binary, rather than a count
  * The accept handler would not skip a btl module that was already in use, resulting in all
    connections for a given address being vectored to a single btl
  * Multiple addresses in the same subnet caused connections to be
    stalled, as the receiver would always use the same (first) address
    found.  Binding the outgoing connection solves this issue

Signed-off-by: Jordan Cherry <cherryj@amazon.com>
  • Loading branch information
jocherry committed Nov 28, 2017
1 parent d3a23f9 commit e20fc6c
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 5 deletions.
2 changes: 2 additions & 0 deletions opal/mca/btl/tcp/btl_tcp_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,8 @@ static int mca_btl_tcp_create(int if_kindex, const char* if_name)
btl->tcp_send_handler = 0;
#endif

opal_ifkindextoaddr(if_kindex, (struct sockaddr*) &btl->tcp_ifaddr,
sizeof (btl->tcp_ifaddr));
/* allow user to specify interface bandwidth */
sprintf(param, "bandwidth_%s", if_name);
mca_btl_tcp_param_register_uint(param, NULL, btl->super.btl_bandwidth, OPAL_INFO_LVL_5, &btl->super.btl_bandwidth);
Expand Down
15 changes: 15 additions & 0 deletions opal/mca/btl/tcp/btl_tcp_endpoint.c
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,21 @@ static int mca_btl_tcp_endpoint_start_connect(mca_btl_base_endpoint_t* btl_endpo
}
}

/* Bind the socket to one of the addresses associated with
* this btl module. This sets the source IP to one of the
* addresses shared in modex, so that the destination rank
* can properly pair btl modules, even in cases where Linux
* might do something unexpected with routing */
opal_socklen_t sockaddr_addrlen = sizeof(struct sockaddr_storage);
if (NULL != &btl_endpoint->endpoint_btl->tcp_ifaddr) {
if (bind(btl_endpoint->endpoint_sd, (struct sockaddr*) &btl_endpoint->endpoint_btl->tcp_ifaddr, sockaddr_addrlen) < 0) {
BTL_ERROR(("bind() failed: %s (%d)", strerror(opal_socket_errno), opal_socket_errno));

CLOSE_THE_SOCKET(btl_endpoint->endpoint_sd);
return OPAL_ERROR;
}
}

/* start the connect - will likely fail with EINPROGRESS */
mca_btl_tcp_proc_tosocks(btl_endpoint->endpoint_addr, &endpoint_addr);

Expand Down
13 changes: 8 additions & 5 deletions opal/mca/btl/tcp/btl_tcp_proc.c
Original file line number Diff line number Diff line change
Expand Up @@ -486,7 +486,7 @@ int mca_btl_tcp_proc_insert( mca_btl_tcp_proc_t* btl_proc,
}

/*
* in case the peer address has all intended connections,
* in case the peer address has created all intended connections,
* mark the complete peer interface as 'not available'
*/
if(endpoint_addr->addr_inuse >= mca_btl_tcp_component.tcp_num_links) {
Expand Down Expand Up @@ -812,10 +812,11 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
OPAL_THREAD_LOCK(&btl_proc->proc_lock);
for( size_t i = 0; i < btl_proc->proc_endpoint_count; i++ ) {
mca_btl_base_endpoint_t* btl_endpoint = btl_proc->proc_endpoints[i];
/* We are not here to make a decision about what is good socket
* and what is not. We simply check that this socket fit the endpoint
* end we prepare for the real decision function mca_btl_tcp_endpoint_accept. */
if( btl_endpoint->endpoint_addr->addr_family != addr->sa_family ) {
/* We are not here to make a decision about what is a good socket
* and what is not, we simply check two conditions to see if connection
* is acceptable: 1. It is the correct address family (IPv4 vs IPv6)
* and 2. A connection is not open on that endpoint*/
if( btl_endpoint->endpoint_addr->addr_family != addr->sa_family || btl_endpoint->endpoint_state != MCA_BTL_TCP_CLOSED ) {
continue;
}
switch (addr->sa_family) {
Expand Down Expand Up @@ -857,6 +858,8 @@ void mca_btl_tcp_proc_accept(mca_btl_tcp_proc_t* btl_proc, struct sockaddr* addr
;
}

/* Set state to CONNECTING to ensure that subsequent conenctions do not attempt to re-use endpoint in the num_links > 1 case*/
btl_endpoint->endpoint_state = MCA_BTL_TCP_CONNECTING;
(void)mca_btl_tcp_endpoint_accept(btl_endpoint, addr, sd);
OPAL_THREAD_UNLOCK(&btl_proc->proc_lock);
return;
Expand Down

0 comments on commit e20fc6c

Please sign in to comment.