Skip to content

Commit

Permalink
xds: prefer fed state gateway definitions if they're fresher (#11522) (
Browse files Browse the repository at this point in the history
…#11532)

Fixes an issue described in #10132, where if two DCs are WAN federated
over mesh gateways, and the gateway in the non-primary DC is terminated
and receives a new IP address (as is commonly the case when running them
on ephemeral compute instances) the primary DC is unable to re-establish
its connection until the agent running on its own gateway is restarted.

This was happening because we always preferred gateways discovered by
the `Internal.ServiceDump` RPC (which would fail because there's no way
to dial the remote DC) over those discovered in the federation state,
which is replicated as long as the primary DC's gateway is reachable.
  • Loading branch information
boxofrad authored Nov 9, 2021
1 parent 2c1f532 commit 4b38292
Show file tree
Hide file tree
Showing 7 changed files with 392 additions and 6 deletions.
3 changes: 3 additions & 0 deletions .changelog/11522.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:bug
xds: fixes a bug where replacing a mesh gateway node used for WAN federation (with another that has a different IP) could leave gateways in the other DC unable to re-establish the connection
```
51 changes: 51 additions & 0 deletions agent/proxycfg/testing.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"context"
"fmt"
"io/ioutil"
"math"
"path"
"path/filepath"
"sync"
Expand Down Expand Up @@ -1418,6 +1419,56 @@ func TestConfigSnapshotMeshGatewayUsingFederationStates(t testing.T) *ConfigSnap
return testConfigSnapshotMeshGateway(t, true, true)
}

func TestConfigSnapshotMeshGatewayNewerInformationInFederationStates(t testing.T) *ConfigSnapshot {
snap := TestConfigSnapshotMeshGateway(t)

// Create a duplicate entry in FedStateGateways, with a high ModifyIndex, to
// verify that fresh data in the federation state is preferred over stale data
// in GatewayGroups.
svc := structs.TestNodeServiceMeshGatewayWithAddrs(t,
"10.0.1.3", 8443,
structs.ServiceAddress{Address: "10.0.1.3", Port: 8443},
structs.ServiceAddress{Address: "198.18.1.3", Port: 443},
)
svc.RaftIndex.ModifyIndex = math.MaxUint64

snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{
"dc2": {
{
Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node,
Service: svc,
},
},
}

return snap
}

func TestConfigSnapshotMeshGatewayOlderInformationInFederationStates(t testing.T) *ConfigSnapshot {
snap := TestConfigSnapshotMeshGateway(t)

// Create a duplicate entry in FedStateGateways, with a low ModifyIndex, to
// verify that stale data in the federation state is ignored in favor of the
// fresher data in GatewayGroups.
svc := structs.TestNodeServiceMeshGatewayWithAddrs(t,
"10.0.1.3", 8443,
structs.ServiceAddress{Address: "10.0.1.3", Port: 8443},
structs.ServiceAddress{Address: "198.18.1.3", Port: 443},
)
svc.RaftIndex.ModifyIndex = 0

snap.MeshGateway.FedStateGateways = map[string]structs.CheckServiceNodes{
"dc2": {
{
Node: snap.MeshGateway.GatewayGroups["dc2"][0].Node,
Service: svc,
},
},
}

return snap
}

func TestConfigSnapshotMeshGatewayNoServices(t testing.T) *ConfigSnapshot {
return testConfigSnapshotMeshGateway(t, false, false)
}
Expand Down
3 changes: 3 additions & 0 deletions agent/structs/testing_catalog.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,9 @@ func TestNodeServiceMeshGatewayWithAddrs(t testing.T, address string, port int,
TaggedAddressLAN: lanAddr,
TaggedAddressWAN: wanAddr,
},
RaftIndex: RaftIndex{
ModifyIndex: 1,
},
}
}

Expand Down
55 changes: 49 additions & 6 deletions agent/xds/endpoints.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,56 @@ func (s *Server) endpointsFromSnapshotMeshGateway(cfgSnap *proxycfg.ConfigSnapsh
continue
}

endpoints, ok := cfgSnap.MeshGateway.GatewayGroups[dc]
if !ok {
endpoints, ok = cfgSnap.MeshGateway.FedStateGateways[dc]
if !ok { // not possible
s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", dc)
continue
// Mesh gateways in remote DCs are discovered in two ways:
//
// 1. Via an Internal.ServiceDump RPC in the remote DC (GatewayGroups).
// 2. In the federation state that is replicated from the primary DC (FedStateGateways).
//
// We determine which set to use based on whichever contains the highest
// raft ModifyIndex (and is therefore most up-to-date).
//
// Previously, GatewayGroups was always given presedence over FedStateGateways
// but this was problematic when using mesh gateways for WAN federation.
//
// Consider the following example:
//
// - Primary and Secondary DCs are WAN Federated via local mesh gateways.
//
// - Secondary DC's mesh gateway is running on an ephemeral compute instance
// and is abruptly terminated and rescheduled with a *new IP address*.
//
// - Primary DC's mesh gateway is no longer able to connect to the Secondary
// DC as its proxy is configured with the old IP address. Therefore any RPC
// from the Primary to the Secondary DC will fail (including the one to
// discover the gateway's new IP address).
//
// - Secondary DC performs its regular anti-entropy of federation state data
// to the Primary DC (this succeeds as there is still connectivity in this
// direction).
//
// - At this point the Primary DC's mesh gateway should observe the new IP
// address and reconfigure its proxy, however as we always prioritised
// GatewayGroups this didn't happen and the connection remained severed.
maxModifyIndex := func(vals structs.CheckServiceNodes) uint64 {
var max uint64
for _, v := range vals {
if i := v.Service.RaftIndex.ModifyIndex; i > max {
max = i
}
}
return max
}

endpoints := cfgSnap.MeshGateway.GatewayGroups[dc]
fedStateEndpoints := cfgSnap.MeshGateway.FedStateGateways[dc]

if maxModifyIndex(fedStateEndpoints) > maxModifyIndex(endpoints) {
endpoints = fedStateEndpoints
}

if len(endpoints) == 0 {
s.Logger.Error("skipping mesh gateway endpoints because no definition found", "datacenter", dc)
continue
}

{ // standard connect
Expand Down
8 changes: 8 additions & 0 deletions agent/xds/endpoints_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,14 @@ func Test_endpointsFromSnapshot(t *testing.T) {
create: proxycfg.TestConfigSnapshotMeshGatewayUsingFederationStates,
setup: nil,
},
{
name: "mesh-gateway-newer-information-in-federation-states",
create: proxycfg.TestConfigSnapshotMeshGatewayNewerInformationInFederationStates,
},
{
name: "mesh-gateway-older-information-in-federation-states",
create: proxycfg.TestConfigSnapshotMeshGatewayOlderInformationInFederationStates,
},
{
name: "mesh-gateway-no-services",
create: proxycfg.TestConfigSnapshotMeshGatewayNoServices,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
{
"versionInfo": "00000001",
"resources": [
{
"@type": "type.googleapis.com/envoy.api.v2.ClusterLoadAssignment",
"clusterName": "bar.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.6",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.7",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.8",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
},
{
"@type": "type.googleapis.com/envoy.api.v2.ClusterLoadAssignment",
"clusterName": "dc2.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "198.18.1.3",
"portValue": 443
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
},
{
"@type": "type.googleapis.com/envoy.api.v2.ClusterLoadAssignment",
"clusterName": "foo.default.dc1.internal.11111111-2222-3333-4444-555555555555.consul",
"endpoints": [
{
"lbEndpoints": [
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.3",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.4",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.5",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
},
{
"endpoint": {
"address": {
"socketAddress": {
"address": "172.16.1.9",
"portValue": 2222
}
}
},
"healthStatus": "HEALTHY",
"loadBalancingWeight": 1
}
]
}
]
}
],
"typeUrl": "type.googleapis.com/envoy.api.v2.ClusterLoadAssignment",
"nonce": "00000001"
}
Loading

0 comments on commit 4b38292

Please sign in to comment.