From e0692e0bd23302467f20dc804b14faa3e29512bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Julian=20T=C3=B6lle?= Date: Tue, 18 Jul 2023 12:26:22 +0200 Subject: [PATCH] fix(routes): deleting wrong routes when other server has same private IP In case another server in the project has the same Private IP (in another network) as one of our cluster nodes, hccm would happily delete&recreate the route on every route reconciliation. This fixes the bug by only adding the Private IPs in the correct network to the AllServersCache ByPrivateIP lookup map. Fixes #470 Co-authored-by: Jonas Lammler --- hcloud/routes.go | 11 ++++++--- internal/hcops/server.go | 13 ++++++++++ internal/hcops/server_test.go | 46 +++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/hcloud/routes.go b/hcloud/routes.go index 4883dcd19..9221265d6 100644 --- a/hcloud/routes.go +++ b/hcloud/routes.go @@ -35,9 +35,14 @@ func newRoutes(client *hcloud.Client, networkID int64) (*routes, error) { } return &routes{ - client: client, - network: networkObj, - serverCache: &hcops.AllServersCache{LoadFunc: client.Server.All}, + client: client, + network: networkObj, + serverCache: &hcops.AllServersCache{ + // client.Server.All will load ALL the servers in the project, even those + // that are not part of the Kubernetes cluster. + LoadFunc: client.Server.All, + Network: networkObj, + }, }, nil } diff --git a/internal/hcops/server.go b/internal/hcops/server.go index 6a38031ad..7c7dc9280 100644 --- a/internal/hcops/server.go +++ b/internal/hcops/server.go @@ -9,6 +9,7 @@ import ( "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics" "github.com/hetznercloud/hcloud-go/v2/hcloud" + "k8s.io/klog/v2" ) // AllServersCache caches the result of the LoadFunc and provides random access @@ -21,6 +22,9 @@ type AllServersCache struct { LoadTimeout time.Duration MaxAge time.Duration + // If set, only IPs in this network will be considered for [ByPrivateIP] + Network *hcloud.Network + lastRefresh time.Time byPrivIP map[string]*hcloud.Server byName map[string]*hcloud.Server @@ -102,9 +106,18 @@ func (c *AllServersCache) getCache(getSrv func() (*hcloud.Server, bool)) (*hclou for _, srv := range srvs { // Index servers by the IPs of their private networks for _, n := range srv.PrivateNet { + if c.Network != nil { + if n.Network.ID != c.Network.ID { + // Only consider private IPs in the right network + continue + } + } if n.IP == nil { continue } + if _, ok := c.byPrivIP[n.IP.String()]; ok { + klog.Warningf("Overriding AllServersCache entry for private ip %s with server %s\n", n.IP.String(), srv.Name) + } c.byPrivIP[n.IP.String()] = srv } diff --git a/internal/hcops/server_test.go b/internal/hcops/server_test.go index 53c5b9b5a..9ca9fe95c 100644 --- a/internal/hcops/server_test.go +++ b/internal/hcops/server_test.go @@ -197,6 +197,52 @@ func TestAllServersCache_ClientError(t *testing.T) { runAllServersCacheTests(t, "Not found", tmpl, cacheOps) } +func TestAllServersCache_DuplicatePrivateIP(t *testing.T) { + // Regression test for https://github.com/hetznercloud/hcloud-cloud-controller-manager/issues/470 + + network := &hcloud.Network{ + ID: 12345, + Name: "cluster-network", + } + srv := &hcloud.Server{ + ID: 101010, + Name: "cluster-node", + PrivateNet: []hcloud.ServerPrivateNet{ + { + IP: net.ParseIP("10.0.0.4"), + Network: network, + }, + }, + } + srvInvalid := &hcloud.Server{ + ID: 101012, + Name: "invalid-node", + PrivateNet: []hcloud.ServerPrivateNet{ + { + IP: net.ParseIP("10.0.0.4"), + Network: &hcloud.Network{ + ID: 54321, + Name: "invalid-network", + }, + }, + }, + } + + cacheOps := newAllServersCacheOps(t, srv) + tmpl := allServersCacheTestCase{ + SetUp: func(t *testing.T, tt *allServersCacheTestCase) { + tt.Cache.Network = network + + tt.ServerClient. + On("All", mock.Anything). + Return([]*hcloud.Server{srv, srvInvalid}, nil) + }, + Expected: srv, + } + + runAllServersCacheTests(t, "DuplicatePrivateIP", tmpl, cacheOps) +} + type allServersCacheOp func(c *hcops.AllServersCache) (*hcloud.Server, error) func newAllServersCacheOps(t *testing.T, srv *hcloud.Server) map[string]allServersCacheOp {