From bdffd66ca2b3906c428313ef36578aca58159131 Mon Sep 17 00:00:00 2001 From: Janis Kemper Date: Wed, 15 Nov 2023 09:53:29 +0100 Subject: [PATCH] feat(robot): handle ratelimiting with constant backoff Add a constant backoff in case we are being rate limited by the Robot API. This is done through an opaque wrapping of the robot.Client interface, so callers do not need to do this manually. --- hcloud/cloud.go | 6 +- internal/config/config.go | 22 +++--- internal/config/config_test.go | 14 ++-- internal/robot/{client.go => cache.go} | 5 +- internal/robot/ratelimit.go | 93 ++++++++++++++++++++++++++ internal/robot/ratelimit_test.go | 68 +++++++++++++++++++ 6 files changed, 191 insertions(+), 17 deletions(-) rename internal/robot/{client.go => cache.go} (92%) create mode 100644 internal/robot/ratelimit.go create mode 100644 internal/robot/ratelimit_test.go diff --git a/hcloud/cloud.go b/hcloud/cloud.go index bd467e6dd..17f43f8ea 100644 --- a/hcloud/cloud.go +++ b/hcloud/cloud.go @@ -85,7 +85,11 @@ func newCloud(_ io.Reader) (cloudprovider.Interface, error) { var robotClient robot.Client if cfg.Robot.Enabled { c := hrobot.NewBasicAuthClient(cfg.Robot.User, cfg.Robot.Password) - robotClient = robot.NewClient(c, cfg.Robot.CacheTimeout) + + robotClient = robot.NewRateLimitedClient( + cfg.Robot.RateLimitWaitTime, + robot.NewCachedClient(cfg.Robot.CacheTimeout, c), + ) } var networkID int64 diff --git a/internal/config/config.go b/internal/config/config.go index dacfab4fe..9c97e2a62 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -14,10 +14,11 @@ const ( hcloudNetwork = "HCLOUD_NETWORK" hcloudDebug = "HCLOUD_DEBUG" - robotEnabled = "ROBOT_ENABLED" - robotUser = "ROBOT_USER" - robotPassword = "ROBOT_PASSWORD" - robotCacheTimeout = "ROBOT_CACHE_TIMEOUT" + robotEnabled = "ROBOT_ENABLED" + robotUser = "ROBOT_USER" + robotPassword = "ROBOT_PASSWORD" + robotCacheTimeout = "ROBOT_CACHE_TIMEOUT" + robotRateLimitWaitTime = "ROBOT_RATE_LIMIT_WAIT_TIME" hcloudInstancesAddressFamily = "HCLOUD_INSTANCES_ADDRESS_FAMILY" @@ -43,10 +44,11 @@ type HCloudClientConfiguration struct { } type RobotConfiguration struct { - Enabled bool - User string - Password string - CacheTimeout time.Duration + Enabled bool + User string + Password string + CacheTimeout time.Duration + RateLimitWaitTime time.Duration } type MetricsConfiguration struct { @@ -124,6 +126,10 @@ func Read() (HCCMConfiguration, error) { if cfg.Robot.CacheTimeout == 0 { cfg.Robot.CacheTimeout = 5 * time.Minute } + cfg.Robot.RateLimitWaitTime, err = getEnvDuration(robotRateLimitWaitTime) + if err != nil { + errs = append(errs, err) + } cfg.Metrics.Enabled, err = getEnvBool(hcloudMetricsEnabled, true) if err != nil { diff --git a/internal/config/config_test.go b/internal/config/config_test.go index 860e4647c..367b640d4 100644 --- a/internal/config/config_test.go +++ b/internal/config/config_test.go @@ -74,14 +74,16 @@ func TestRead(t *testing.T) { "ROBOT_ENABLED", "true", "ROBOT_USER", "foobar", "ROBOT_PASSWORD", "secret-password", + "ROBOT_RATE_LIMIT_WAIT_TIME", "5m", "ROBOT_CACHE_TIMEOUT", "1m", }, want: HCCMConfiguration{ Robot: RobotConfiguration{ - Enabled: true, - User: "foobar", - Password: "secret-password", - CacheTimeout: 1 * time.Minute, + Enabled: true, + User: "foobar", + Password: "secret-password", + CacheTimeout: 1 * time.Minute, + RateLimitWaitTime: 5 * time.Minute, }, Metrics: MetricsConfiguration{Enabled: true, Address: ":8233"}, Instance: InstanceConfiguration{AddressFamily: AddressFamilyIPv4}, @@ -194,8 +196,10 @@ failed to parse HCLOUD_NETWORK_ROUTES_ENABLED: strconv.ParseBool: parsing "si": name: "error parsing duration values", env: []string{ "ROBOT_CACHE_TIMEOUT", "biweekly", + "ROBOT_RATE_LIMIT_WAIT_TIME", "42fortnights", }, - wantErr: errors.New(`failed to parse ROBOT_CACHE_TIMEOUT: time: invalid duration "biweekly"`), + wantErr: errors.New(`failed to parse ROBOT_CACHE_TIMEOUT: time: invalid duration "biweekly" +failed to parse ROBOT_RATE_LIMIT_WAIT_TIME: time: unknown unit "fortnights" in duration "42fortnights"`), }, } diff --git a/internal/robot/client.go b/internal/robot/cache.go similarity index 92% rename from internal/robot/client.go rename to internal/robot/cache.go index aa3522bf2..2bb96c71c 100644 --- a/internal/robot/client.go +++ b/internal/robot/cache.go @@ -4,12 +4,11 @@ import ( "sync" "time" - hrobot "github.com/syself/hrobot-go" hrobotmodels "github.com/syself/hrobot-go/models" ) type cacheRobotClient struct { - robotClient hrobot.RobotClient + robotClient Client timeout time.Duration lastUpdate time.Time @@ -21,7 +20,7 @@ type cacheRobotClient struct { serversByID map[int]*hrobotmodels.Server } -func NewClient(robotClient hrobot.RobotClient, cacheTimeout time.Duration) Client { +func NewCachedClient(cacheTimeout time.Duration, robotClient Client) Client { return &cacheRobotClient{ timeout: cacheTimeout, robotClient: robotClient, diff --git a/internal/robot/ratelimit.go b/internal/robot/ratelimit.go new file mode 100644 index 000000000..81dfd8695 --- /dev/null +++ b/internal/robot/ratelimit.go @@ -0,0 +1,93 @@ +package robot + +import ( + "fmt" + "strings" + "time" + + hrobotmodels "github.com/syself/hrobot-go/models" +) + +type rateLimitClient struct { + robotClient Client + + waitTime time.Duration + exceeded bool + lastChecked time.Time +} + +func NewRateLimitedClient(rateLimitWaitTime time.Duration, robotClient Client) Client { + return &rateLimitClient{ + robotClient: robotClient, + + waitTime: rateLimitWaitTime, + } +} + +func (c *rateLimitClient) ServerGet(id int) (*hrobotmodels.Server, error) { + if c.isExceeded() { + return nil, c.getRateLimitError() + } + + server, err := c.robotClient.ServerGet(id) + c.handleError(err) + return server, err +} + +func (c *rateLimitClient) ServerGetList() ([]hrobotmodels.Server, error) { + if c.isExceeded() { + return nil, c.getRateLimitError() + } + + servers, err := c.robotClient.ServerGetList() + c.handleError(err) + return servers, err +} + +func (c *rateLimitClient) ResetGet(id int) (*hrobotmodels.Reset, error) { + if c.isExceeded() { + return nil, c.getRateLimitError() + } + + reset, err := c.robotClient.ResetGet(id) + c.handleError(err) + return reset, err +} + +func (c *rateLimitClient) set() { + c.exceeded = true + c.lastChecked = time.Now() +} + +func (c *rateLimitClient) isExceeded() bool { + if !c.exceeded { + return false + } + + if time.Now().Before(c.lastChecked.Add(c.waitTime)) { + return true + } + // Waiting time is over. Should try again + c.exceeded = false + c.lastChecked = time.Time{} + return false +} + +func (c *rateLimitClient) handleError(err error) { + if err == nil { + return + } + + if hrobotmodels.IsError(err, hrobotmodels.ErrorCodeRateLimitExceeded) || strings.Contains(err.Error(), "server responded with status code 403") { + c.set() + } +} + +func (c *rateLimitClient) getRateLimitError() error { + if !c.isExceeded() { + return nil + } + + nextPossibleCall := c.lastChecked.Add(c.waitTime) + return fmt.Errorf("rate limit exceeded, next try at %q", nextPossibleCall.String()) +} diff --git a/internal/robot/ratelimit_test.go b/internal/robot/ratelimit_test.go new file mode 100644 index 000000000..aac711c36 --- /dev/null +++ b/internal/robot/ratelimit_test.go @@ -0,0 +1,68 @@ +package robot + +import ( + "strings" + "testing" + "time" + + "github.com/stretchr/testify/assert" + hrobotmodels "github.com/syself/hrobot-go/models" + + "github.com/hetznercloud/hcloud-cloud-controller-manager/internal/mocks" +) + +func TestRateLimit(t *testing.T) { + mock := mocks.RobotClient{} + mock.On("ServerGetList").Return([]hrobotmodels.Server{}, nil).Once() + + client := NewRateLimitedClient(5*time.Minute, &mock) + + servers, err := client.ServerGetList() + assert.NoError(t, err) + assert.Len(t, servers, 0) + mock.AssertNumberOfCalls(t, "ServerGetList", 1) + + mock.On("ServerGetList").Return(nil, hrobotmodels.Error{Code: hrobotmodels.ErrorCodeRateLimitExceeded, Message: "Rate limit exceeded"}).Once() + _, err = client.ServerGetList() + assert.Error(t, err) + mock.AssertNumberOfCalls(t, "ServerGetList", 2) + + // No further call should be made + _, err = client.ServerGetList() + assert.Error(t, err) + mock.AssertNumberOfCalls(t, "ServerGetList", 2) +} + +func TestRateLimitIsExceeded(t *testing.T) { + client := rateLimitClient{ + waitTime: 5 * time.Minute, + exceeded: true, + lastChecked: time.Now(), + } + // Just exceeded + assert.True(t, client.isExceeded()) + + // Exceeded longer than wait time ago + client.lastChecked = time.Now().Add(-6 * time.Minute) + assert.False(t, client.isExceeded()) + + // Not exceeded ever + client.exceeded = false + client.lastChecked = time.Time{} + assert.False(t, client.isExceeded()) +} + +func TestRateLimitGetRateLimitError(t *testing.T) { + client := rateLimitClient{ + waitTime: 5 * time.Minute, + } + err := client.getRateLimitError() + assert.NoError(t, err) + + client.exceeded = true + client.lastChecked = time.Now() + + err = client.getRateLimitError() + assert.Error(t, err) + assert.True(t, strings.HasPrefix(err.Error(), "rate limit exceeded, next try at ")) +}