Skip to content

Commit

Permalink
fix(routes): many requests for outdated routes by rate limiting (#675)
Browse files Browse the repository at this point in the history
Fixes #673

In `routes.ListRoutes()` we have to find the matching server/node for
every route in the network. We find the server by utilizing a cache that
maps every private IP to the corresponding server.

This cache has a feature that refreshes the list of servers if an entry
can not be found. This is sensible, as the server might have been just
created. This is also fatal, as this refresh happens for every single
cache access. If there are a lot of routes in the network that do not
belong to any server, we refresh the cache many times for each
`ListRoutes()`. This is even more serious, as `ListRoutes()` is being
called every 10-30 seconds (see #395).

This commit introduces a `rate.Limiter` in the `AllServersCache` which
only allows the refresh to happen every 30 seconds.
  • Loading branch information
apricote authored Jul 5, 2024
1 parent d018087 commit e283b7d
Show file tree
Hide file tree
Showing 4 changed files with 125 additions and 6 deletions.
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ require (
github.com/spf13/pflag v1.0.5
github.com/stretchr/testify v1.9.0
github.com/syself/hrobot-go v0.2.5
golang.org/x/time v0.3.0
k8s.io/api v0.30.2
k8s.io/apimachinery v0.30.2
k8s.io/client-go v0.30.2
Expand Down Expand Up @@ -90,7 +91,6 @@ require (
golang.org/x/sys v0.18.0 // indirect
golang.org/x/term v0.18.0 // indirect
golang.org/x/text v0.14.0 // indirect
golang.org/x/time v0.3.0 // indirect
google.golang.org/appengine v1.6.7 // indirect
google.golang.org/genproto v0.0.0-20230803162519-f966b187b2e5 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20230726155614-23370e0ffb3e // indirect
Expand Down
10 changes: 8 additions & 2 deletions hcloud/routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"net"
"time"

"golang.org/x/time/rate"
"k8s.io/apimachinery/pkg/types"
cloudprovider "k8s.io/cloud-provider"
"k8s.io/klog/v2"
Expand All @@ -16,6 +17,10 @@ import (
"github.com/hetznercloud/hcloud-go/v2/hcloud"
)

var (
serversCacheMissRefreshRate = rate.Every(30 * time.Second)
)

type routes struct {
client *hcloud.Client
network *hcloud.Network
Expand All @@ -40,8 +45,9 @@ func newRoutes(client *hcloud.Client, networkID int64) (*routes, error) {
serverCache: &hcops.AllServersCache{
// client.Server.All will load ALL the servers in the project, even those
// that are not part of the Kubernetes cluster.
LoadFunc: client.Server.All,
Network: networkObj,
LoadFunc: client.Server.All,
Network: networkObj,
CacheMissRefreshLimiter: rate.NewLimiter(serversCacheMissRefreshRate, 1),
},
}, nil
}
Expand Down
7 changes: 5 additions & 2 deletions internal/hcops/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"sync"
"time"

"golang.org/x/time/rate"
"k8s.io/klog/v2"

"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/metrics"
Expand All @@ -22,6 +23,8 @@ type AllServersCache struct {
LoadFunc func(context.Context) ([]*hcloud.Server, error)
LoadTimeout time.Duration
MaxAge time.Duration
// Set to limit the amount of refreshes due to cache misses
CacheMissRefreshLimiter *rate.Limiter

// If set, only IPs in this network will be considered for [ByPrivateIP]
Network *hcloud.Network
Expand Down Expand Up @@ -97,8 +100,8 @@ func (c *AllServersCache) getFromCache(retrieveFromCacheMaps func() (*hcloud.Ser
return server, nil
}

// If the server was not in the cache, we want to refresh if we did not already in this call.
if !cacheRefreshed {
// If the server was not in the cache, we want to refresh if we did not already in this call and if there is available limit.
if !cacheRefreshed && c.CacheMissRefreshLimiter.Allow() {
if err := c.refreshCache(); err != nil {
return nil, fmt.Errorf("%s: %w", op, err)
}
Expand Down
112 changes: 111 additions & 1 deletion internal/hcops/server_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/mock"
"golang.org/x/time/rate"

"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/hcops"
"github.com/hetznercloud/hcloud-cloud-controller-manager/internal/mocks"
Expand Down Expand Up @@ -150,6 +151,81 @@ func TestAllServersCache_CacheRefresh(t *testing.T) {
runAllServersCacheTests(t, "Cache refresh", tmpl, cacheOps)
}

func TestAllServersCache_CacheMissRefresh(t *testing.T) {
srv := &hcloud.Server{
ID: 1337,
Name: "cache-miss-refresh",
PrivateNet: []hcloud.ServerPrivateNet{
{
IP: net.ParseIP("10.0.0.9"),
},
},
}
cacheOps := newAllServersCacheOps(t, srv)
tmpl := allServersCacheTestCase{
SetUp: func(t *testing.T, tt *allServersCacheTestCase) {
tt.ServerClient.
On("All", mock.Anything).
Return([]*hcloud.Server{}, nil).Once()

tt.ServerClient.
On("All", mock.Anything).
Return([]*hcloud.Server{srv}, nil).Once()

// Setup initial cache
result, err := tt.Cache.ByName(srv.Name)
assert.Error(t, err)
assert.Nil(t, result)
},
Assert: func(t *testing.T, tt *allServersCacheTestCase) {
// All must be called only twice. Once during set-up because the cache is expired,
// and then again during the cache retrieval because the server was not found.
tt.ServerClient.AssertNumberOfCalls(t, "All", 2)
},
Expected: srv,
}

runAllServersCacheTests(t, "Cache refresh", tmpl, cacheOps)
}

func TestAllServersCache_CacheRefreshLimited(t *testing.T) {
srv := &hcloud.Server{
ID: 1337,
Name: "cache-refresh-limited",
PrivateNet: []hcloud.ServerPrivateNet{
{
IP: net.ParseIP("10.0.0.9"),
},
},
}
cacheOps := newAllServersCacheOps(t, srv)
tmpl := allServersCacheTestCase{
SetUp: func(t *testing.T, tt *allServersCacheTestCase) {
tt.ServerClient.
On("All", mock.Anything).
Return([]*hcloud.Server{}, nil)

result, err := tt.Cache.ByName(srv.Name)
assert.Error(t, err)
assert.Nil(t, result)

result, err = tt.Cache.ByName(srv.Name)
assert.Error(t, err)
assert.Nil(t, result)
},
Assert: func(t *testing.T, tt *allServersCacheTestCase) {
// All must be called only twice. Once during set-up because the cache is expired,
// and then again during set-up to trigger the limiter.

tt.ServerClient.AssertNumberOfCalls(t, "All", 2)
},
Expected: nil,
ExpectedErr: hcops.ErrNotFound,
}

runAllServersCacheTests(t, "Cache refresh", tmpl, cacheOps)
}

func TestAllServersCache_NotFound(t *testing.T) {
srv := &hcloud.Server{
ID: 101010,
Expand Down Expand Up @@ -197,6 +273,39 @@ func TestAllServersCache_ClientError(t *testing.T) {
runAllServersCacheTests(t, "Not found", tmpl, cacheOps)
}

func TestAllServersCache_CacheMissRefreshClientError(t *testing.T) {
expectedErr := errors.New("cache-miss-refresh-client-error")
srv := &hcloud.Server{
ID: 202020,
Name: "cache-miss-refresh-client-error",
PrivateNet: []hcloud.ServerPrivateNet{
{
IP: net.ParseIP("10.0.0.5"),
},
},
}
cacheOps := newAllServersCacheOps(t, srv)
tmpl := allServersCacheTestCase{
SetUp: func(_ *testing.T, tt *allServersCacheTestCase) {
tt.ServerClient.
On("All", mock.Anything).
Return([]*hcloud.Server{}, nil).Once()

// Load the cache once
result, err := tt.Cache.ByName(srv.Name)
assert.Error(t, err)
assert.Nil(t, result)

tt.ServerClient.
On("All", mock.Anything).
Return(nil, expectedErr).Once()
},
ExpectedErr: expectedErr,
}

runAllServersCacheTests(t, "Not found", tmpl, cacheOps)
}

func TestAllServersCache_DuplicatePrivateIP(t *testing.T) {
// Regression test for https://github.com/hetznercloud/hcloud-cloud-controller-manager/issues/470

Expand Down Expand Up @@ -284,7 +393,8 @@ type allServersCacheTestCase struct {
func (tt *allServersCacheTestCase) run(t *testing.T) {
tt.ServerClient = mocks.NewServerClient(t)
tt.Cache = &hcops.AllServersCache{
LoadFunc: tt.ServerClient.All,
LoadFunc: tt.ServerClient.All,
CacheMissRefreshLimiter: rate.NewLimiter(rate.Every(1*time.Minute), 1),
}

if tt.SetUp != nil {
Expand Down

0 comments on commit e283b7d

Please sign in to comment.