Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Configurable stale job data timeout #417

Merged
merged 1 commit into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ func TestReadAndParseConfig(t *testing.T) {
ImagePullBackOffGracePeriod: 60 * time.Second,
JobCancelCheckerPollInterval: 10 * time.Second,
PollInterval: 5 * time.Second,
StaleJobDataTimeout: 10 * time.Second,
MaxInFlight: 100,
Namespace: "my-buildkite-ns",
Org: "my-buildkite-org",
Expand Down
1 change: 1 addition & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ job-ttl: 5m
image-pull-backoff-grace-period: 60s
job-cancel-checker-poll-interval: 10s
poll-interval: 5s
stale-job-data-timeout: 10s
max-in-flight: 100
namespace: my-buildkite-ns
org: my-buildkite-org
Expand Down
25 changes: 13 additions & 12 deletions internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,18 +25,19 @@ var DefaultAgentImage = "ghcr.io/buildkite/agent:" + version.Version()
// mapstructure (the module) supports switching the struct tag to "json", viper does not. So we have
// to have the `mapstructure` tag for viper and the `json` tag is used by the mapstructure!
type Config struct {
Debug bool `json:"debug"`
JobTTL time.Duration `json:"job-ttl"`
PollInterval time.Duration `json:"poll-interval"`
AgentTokenSecret string `json:"agent-token-secret" validate:"required"`
BuildkiteToken string `json:"buildkite-token" validate:"required"`
Image string `json:"image" validate:"required"`
MaxInFlight int `json:"max-in-flight" validate:"min=0"`
Namespace string `json:"namespace" validate:"required"`
Org string `json:"org" validate:"required"`
Tags stringSlice `json:"tags" validate:"min=1"`
ProfilerAddress string `json:"profiler-address" validate:"omitempty,hostname_port"`
GraphQLEndpoint string `json:"graphql-endpoint" validate:"omitempty"`
Debug bool `json:"debug"`
JobTTL time.Duration `json:"job-ttl"`
PollInterval time.Duration `json:"poll-interval"`
StaleJobDataTimeout time.Duration `json:"stale-job-data-timeout" validate:"omitempty"`
AgentTokenSecret string `json:"agent-token-secret" validate:"required"`
BuildkiteToken string `json:"buildkite-token" validate:"required"`
Image string `json:"image" validate:"required"`
MaxInFlight int `json:"max-in-flight" validate:"min=0"`
Namespace string `json:"namespace" validate:"required"`
Org string `json:"org" validate:"required"`
Tags stringSlice `json:"tags" validate:"min=1"`
ProfilerAddress string `json:"profiler-address" validate:"omitempty,hostname_port"`
GraphQLEndpoint string `json:"graphql-endpoint" validate:"omitempty"`
// Agent endpoint is set in agent-config.

// ClusterUUID field is mandatory for most new orgs.
Expand Down
17 changes: 9 additions & 8 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,14 +38,15 @@ func Run(
}

m, err := monitor.New(logger.Named("monitor"), k8sClient, monitor.Config{
GraphQLEndpoint: cfg.GraphQLEndpoint,
Namespace: cfg.Namespace,
Org: cfg.Org,
ClusterUUID: cfg.ClusterUUID,
MaxInFlight: cfg.MaxInFlight,
PollInterval: cfg.PollInterval,
Tags: cfg.Tags,
Token: cfg.BuildkiteToken,
GraphQLEndpoint: cfg.GraphQLEndpoint,
Namespace: cfg.Namespace,
Org: cfg.Org,
ClusterUUID: cfg.ClusterUUID,
MaxInFlight: cfg.MaxInFlight,
PollInterval: cfg.PollInterval,
StaleJobDataTimeout: cfg.StaleJobDataTimeout,
Tags: cfg.Tags,
Token: cfg.BuildkiteToken,
})
if err != nil {
logger.Fatal("failed to create monitor", zap.Error(err))
Expand Down
28 changes: 17 additions & 11 deletions internal/controller/monitor/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,15 @@ type Monitor struct {
}

type Config struct {
GraphQLEndpoint string
Namespace string
Token string
ClusterUUID string
MaxInFlight int
PollInterval time.Duration
Org string
Tags []string
GraphQLEndpoint string
Namespace string
Token string
ClusterUUID string
MaxInFlight int
PollInterval time.Duration
StaleJobDataTimeout time.Duration
Org string
Tags []string
}

type Job struct {
Expand All @@ -47,8 +48,12 @@ type JobHandler interface {
func New(logger *zap.Logger, k8s kubernetes.Interface, cfg Config) (*Monitor, error) {
graphqlClient := api.NewClient(cfg.Token, cfg.GraphQLEndpoint)

if cfg.PollInterval < time.Second {
cfg.PollInterval = time.Second
// Poll no more frequently than every 1s (please don't DoS us).
// This must be max, not min: max enforces a 1s floor (shorter, zero, or
// negative intervals are raised to 1s; longer ones are kept as configured),
// matching the `if cfg.PollInterval < time.Second` check it replaces.
// min would instead cap every interval at 1s and let sub-second values through.
cfg.PollInterval = max(cfg.PollInterval, time.Second)

// Default StaleJobDataTimeout to 10s.
if cfg.StaleJobDataTimeout <= 0 {
cfg.StaleJobDataTimeout = 10 * time.Second
}

return &Monitor{
Expand Down Expand Up @@ -179,7 +184,7 @@ func (m *Monitor) createJobs(ctx context.Context, logger *zap.Logger, handler Jo
// Why not pass directly to handler.Create? Because that might
// interrupt scheduling a pod, when all we want is to bound the
// time spent waiting for the limiter.
staleCtx, staleCancel := context.WithTimeout(ctx, m.cfg.PollInterval)
staleCtx, staleCancel := context.WithTimeout(ctx, m.cfg.StaleJobDataTimeout)
defer staleCancel()

// TODO: sort by ScheduledAt in the API
Expand Down Expand Up @@ -209,6 +214,7 @@ func (m *Monitor) createJobs(ctx context.Context, logger *zap.Logger, handler Jo

logger.Debug("creating job", zap.String("uuid", j.Uuid))
if err := handler.Create(ctx, job); err != nil {
// Note: this check is for the original context, not staleCtx.
if ctx.Err() != nil {
return
}
Expand Down