Skip to content

Commit

Permalink
proxy; connection to LB recoverable
Browse files Browse the repository at this point in the history
Since nsm v1.13.2 the interface name generator no longer considers
the connection ID. Thus, there was not much point keeping the random
UUID prefix when creating the connection ID for proxy -> LB connection
requests. From now on old connections are looked up using nsm monitor
connection feature prior to requesting a connection to an LB.
  • Loading branch information
zolug committed Jul 18, 2024
1 parent 7e60e98 commit d28bd64
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 30 deletions.
12 changes: 7 additions & 5 deletions cmd/proxy/internal/client/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,16 +21,18 @@ import (
"net/url"
"time"

"github.com/networkservicemesh/api/pkg/api/networkservice"
"github.com/nordix/meridio/pkg/nsm"
)

// Config - configuration for network service client
type Config struct {
Name string
RequestTimeout time.Duration
ConnectTo url.URL
MaxTokenLifetime time.Duration
APIClient *nsm.APIClient
Name string
RequestTimeout time.Duration
ConnectTo url.URL
MaxTokenLifetime time.Duration
APIClient *nsm.APIClient
MonitorConnectionClient networkservice.MonitorConnectionClient
}

// IsValid - check if configuration is valid
Expand Down
79 changes: 61 additions & 18 deletions cmd/proxy/internal/client/fullmesh.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ import (
"time"

"github.com/go-logr/logr"
"github.com/google/uuid"
"github.com/networkservicemesh/api/pkg/api/networkservice"
"github.com/networkservicemesh/api/pkg/api/registry"
registryrefresh "github.com/networkservicemesh/sdk/pkg/registry/common/refresh"
Expand Down Expand Up @@ -134,37 +133,81 @@ func (fmnsc *FullMeshNetworkServiceClient) addNetworkServiceClient(networkServic
if fmnsc.serviceClosed || fmnsc.networkServiceClientExists(networkServiceEndpointName) {
return
}

var monitoredConnections map[string]*networkservice.Connection
networkServiceClient := NewSimpleNetworkServiceClient(fmnsc.ctx, fmnsc.config, fmnsc.client)
request := fmnsc.baseRequest.Clone()
request.Connection.NetworkServiceEndpointName = networkServiceEndpointName
// UUID part at the start of the conn id will be used by NSM to generate
// the interface name (we want it to be unique).
// The random part also secures that there should be no connections with
// the same connection id maintened by NSMgr (for example after a possible
// Proxy crash). Hence, no need for a NSM connection monitor to attempt
// connection recovery when trying to connect the first time.
// TODO: But the random part also implies, that after a crash the old IPs
// will be be leaked.
// TODO: Consider trying to recover a connection towards the new NSE via
// NSM Monitor Connection, before generating an ID for a new connection.
// That's because in case of an instable system NSEs might be reported
// unaivalble and then shortly available again. NSM Close() related the
// unavailable report might fail, thus leaving interfaces behind whose IPs
// could get freed up before the old interfaces would disappear. Luckily,
// the old interfaces should be removed from the bridge, thus hopefully not
// causing problem just confusion.
request.Connection.Id = fmt.Sprintf("%s-%s-%s-%s", uuid.New().String(),
// Note: Starting with NSM v1.13.2 the NSM interface name does NOT depend
// on the connection ID. Therefore, the random UUID part had been removed
// from the ID. This allows the proxy to attempt recovery of any previous
// connections after crash or temporary LB NSE unavailability (enabling
// recovery of assocaited IPs as well). This can be achieved by using NSM's
// connection monitor feature when connecting an LB NSE.
id := fmt.Sprintf("%s-%s-%s",
fmnsc.config.Name,
request.Connection.NetworkService,
request.Connection.NetworkServiceEndpointName,
)
request.Connection.Id = id
logger := fmnsc.logger.WithValues("func", "addNetworkServiceClient",
"name", networkServiceEndpointName,
"service", request.Connection.NetworkService,
"id", request.Connection.Id,
)
logger.Info("Add network service endpoint")

// Check if NSM already tracks a connection with the same ID, if it does
// then re-use the connection
if fmnsc.config.MonitorConnectionClient != nil {
monitorCli := fmnsc.config.MonitorConnectionClient
stream, err := monitorCli.MonitorConnections(fmnsc.ctx, &networkservice.MonitorScopeSelector{
PathSegments: []*networkservice.PathSegment{
{
Id: id,
},
},
})
if err != nil {
logger.Error(err, "Failed to create monitorConnectionClient")
} else {
event, err := stream.Recv()
if err != nil {
// probably running a really old NSM version, don't like this but let it continue anyways
// XXX: I guess nsmgr crash/upgrade would cause EOF here, after which it would
// make no sense trying again to fetch connections from an "empty" nsmgr...
logger.Error(err, "error from monitorConnection stream")
} else {
monitoredConnections = event.Connections
}
}
}

// Update request based on recovered connection(s) if any
for _, conn := range monitoredConnections {
path := conn.GetPath()
if path != nil && path.Index == 1 && path.PathSegments[0].Id == id && conn.Mechanism.Type == request.MechanismPreferences[0].Type {
logger.Info("Recovered connection", "connection", conn)
// TODO: consider merging any possible labels from baseRequest
request.Connection = conn
request.Connection.Path.Index = 0
request.Connection.Id = id
break
}
}

// Check if recovered connection indicates issue with control plane,
// if so request reselect. Otherwise, the connection request might fail
// if an old path segment (e.g. forwarder) was replaced in the meantime.
// (refer to https://github.com/networkservicemesh/cmd-nsc/pull/600)
if request.GetConnection().State == networkservice.State_DOWN ||
request.Connection.NetworkServiceEndpointName != networkServiceEndpointName {
logger.Info("Request reselect for recovered connection")
request.GetConnection().Mechanism = nil
request.GetConnection().NetworkServiceEndpointName = networkServiceEndpointName // must be a valid reselect request because fullmeshtracker in case of heal with reconnect would do the same
request.GetConnection().State = networkservice.State_RESELECT_REQUESTED
}

// Request would try forever, but what if the NetworkServiceEndpoint is removed in the meantime?
// The recv() method must not be blocked by a pending Request that might not ever succeed.
// Also, on NSE removal networkServiceClient must be capable of cancelling a pending request,
Expand Down
12 changes: 7 additions & 5 deletions cmd/proxy/internal/service/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,17 @@ func GetNSC(ctx context.Context,
config *config.Config,
nsmAPIClient *nsm.APIClient,
p *proxy.Proxy,
interfaceMonitorClient networkservice.NetworkServiceClient) client.NetworkServiceClient {
interfaceMonitorClient networkservice.NetworkServiceClient,
monitorConnectionClient networkservice.MonitorConnectionClient) client.NetworkServiceClient {

logger := log.FromContextOrGlobal(ctx).WithValues("func", "GetNSC")
logger.Info("Create Full Mesh NSC")
clientConfig := &client.Config{
Name: config.Name,
RequestTimeout: config.RequestTimeout,
ConnectTo: config.ConnectTo,
APIClient: nsmAPIClient,
Name: config.Name,
RequestTimeout: config.RequestTimeout,
ConnectTo: config.ConnectTo,
APIClient: nsmAPIClient,
MonitorConnectionClient: monitorConnectionClient,
}
// Note: naming the interface is left to NSM (refer to getNameFromConnection())
// However NSM does not seem to ensure uniqueness either. Might need to revisit...
Expand Down
2 changes: 1 addition & 1 deletion cmd/proxy/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ func main() {

// create and start NSC that connects all remote NSE belonging to the right service
interfaceMonitorClient := interfacemonitor.NewClient(interfaceMonitor, p, netUtils)
nsmClient := service.GetNSC(ctx, &config, nsmAPIClient, p, interfaceMonitorClient)
nsmClient := service.GetNSC(ctx, &config, nsmAPIClient, p, interfaceMonitorClient, monitorClient)
defer nsmClient.Close()
go func() {
service.StartNSC(nsmClient, config.NetworkServiceName)
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ require (
github.com/golang/mock v1.6.0
github.com/golang/protobuf v1.5.4
github.com/google/nftables v0.2.0
github.com/google/uuid v1.3.1
github.com/kelseyhightower/envconfig v1.4.0
github.com/networkservicemesh/api v1.13.2
github.com/networkservicemesh/sdk v1.13.2
Expand Down Expand Up @@ -79,6 +78,7 @@ require (
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect
github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect
github.com/google/uuid v1.3.1 // indirect
github.com/gorilla/websocket v1.5.0 // indirect
github.com/grpc-ecosystem/grpc-gateway/v2 v2.16.0 // indirect
github.com/imdario/mergo v0.3.12 // indirect
Expand Down
3 changes: 3 additions & 0 deletions pkg/nsm/fullmeshtracker/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ func (fmtc *fullMeshTrackerClient) Request(ctx context.Context, request *network

// Close -
// TODO: remove NSE from the map when they are removed (real close)
// Note related to above TODO: Heal with reconnect must be taken into consideration,
// because it would close the connection first and then try to reconnect by erasing
// NetworkServiceEndpointName, which could break proxy connecting to a specific LB.
func (fmtc *fullMeshTrackerClient) Close(ctx context.Context, conn *networkservice.Connection, opts ...grpc.CallOption) (*emptypb.Empty, error) {
_, err := next.Client(ctx).Close(ctx, conn, opts...)
return &emptypb.Empty{}, err
Expand Down

0 comments on commit d28bd64

Please sign in to comment.