Skip to content

Commit

Permalink
[Feature] Parametrize Scheduling Graceful Duration (#1641)
Browse files Browse the repository at this point in the history
  • Loading branch information
ajanikow authored Apr 8, 2024
1 parent a4d7331 commit 1d86f4e
Show file tree
Hide file tree
Showing 6 changed files with 39 additions and 1 deletion.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
- (Maintenance) Update Go to 1.22.2
- (Feature) Object Checksum
- (Bugfix) Use Rendered Spec in case of scheduling compare
- (Feature) Parametrize Scheduling Graceful Duration

## [1.2.39](https://github.com/arangodb/kube-arangodb/tree/1.2.39) (2024-03-11)
- (Feature) Extract Scheduler API
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ Flags:
--timeout.backup-upload duration The request timeout to the ArangoDB during uploading files (default 5m0s)
--timeout.force-delete-pod-grace-period duration Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals (default 15m0s)
--timeout.k8s duration The request timeout to the kubernetes (default 2s)
--timeout.pod-scheduling-grace-period duration Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable (default 15s)
--timeout.reconciliation duration The reconciliation timeout to the ArangoDB CR (default 1m0s)
--timeout.shard-rebuild duration Timeout after which particular out-synced shard is considered as failed and rebuild is triggered (default 1h0m0s)
--timeout.shard-rebuild-retry duration Timeout after which rebuild shards retry flow is triggered (default 4h0m0s)
Expand Down
3 changes: 3 additions & 0 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,7 @@ var (
backupArangoD time.Duration
backupUploadArangoD time.Duration
forcePodDeletionGracePeriod time.Duration
podSchedulingGracePeriod time.Duration
}
operatorImageDiscovery struct {
timeout time.Duration
Expand Down Expand Up @@ -226,6 +227,7 @@ func init() {
f.DurationVar(&operatorTimeouts.backupArangoD, "timeout.backup-arangod", globals.BackupDefaultArangoClientTimeout, "The request timeout to the ArangoDB during backup calls")
f.DurationVar(&operatorTimeouts.backupUploadArangoD, "timeout.backup-upload", globals.BackupUploadArangoClientTimeout, "The request timeout to the ArangoDB during uploading files")
f.DurationVar(&operatorTimeouts.forcePodDeletionGracePeriod, "timeout.force-delete-pod-grace-period", globals.DefaultForcePodDeletionGracePeriodTimeout, "Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals")
f.DurationVar(&operatorTimeouts.podSchedulingGracePeriod, "timeout.pod-scheduling-grace-period", globals.DefaultPodSchedulingGracePeriod, "Default period when ArangoDB Pod should be deleted in case of scheduling info change - set to 0 to disable")
f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers")
f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers")
f.DurationVar(&operatorReconciliationRetry.delay, "operator.reconciliation.retry.delay", globals.DefaultOperatorUpdateRetryDelay, "Delay between Object Update operations in the Reconciliation loop")
Expand Down Expand Up @@ -294,6 +296,7 @@ func executeMain(cmd *cobra.Command, args []string) {
globals.GetGlobalTimeouts().BackupArangoClientTimeout().Set(operatorTimeouts.backupArangoD)
globals.GetGlobalTimeouts().BackupArangoClientUploadTimeout().Set(operatorTimeouts.backupUploadArangoD)
globals.GetGlobalTimeouts().ForcePodDeletionGracePeriodTimeout().Set(operatorTimeouts.forcePodDeletionGracePeriod)
globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Set(operatorTimeouts.podSchedulingGracePeriod)

globals.GetGlobals().Retry().OperatorUpdateRetryDelay().Set(operatorReconciliationRetry.delay)
globals.GetGlobals().Retry().OperatorUpdateRetryCount().Set(operatorReconciliationRetry.count)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,14 @@ package reconcile
import (
"context"
"reflect"
"time"

core "k8s.io/api/core/v1"

api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
"github.com/arangodb/kube-arangodb/pkg/deployment/actions"
"github.com/arangodb/kube-arangodb/pkg/util"
"github.com/arangodb/kube-arangodb/pkg/util/globals"
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
)

Expand All @@ -38,6 +40,12 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
_ k8sutil.APIObject, spec api.DeploymentSpec, status api.DeploymentStatus, context PlanBuilderContext) api.Plan {

var p api.Plan

if globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() == 0 {
// Scheduling grace period is not enabled
return nil
}

if !status.Conditions.IsTrue(api.ConditionTypePodSchedulingFailure) {
return p
}
Expand All @@ -55,6 +63,19 @@ func (r *Reconciler) createMemberPodSchedulingFailurePlan(ctx context.Context,
continue
}

if c, ok := m.Member.Conditions.Get(api.ConditionTypeScheduled); !ok {
// Action can't proceed if pod is not scheduled
continue
} else if c.LastTransitionTime.IsZero() {
// LastTransitionTime is not set
continue
} else {
if time.Since(c.LastTransitionTime.Time) <= globals.GetGlobalTimeouts().PodSchedulingGracePeriod().Get() {
// In grace period
continue
}
}

imageInfo, imageFound := context.SelectImageForMember(spec, status, m.Member)
if !imageFound {
l.Warn("could not find image for already created member")
Expand Down
5 changes: 5 additions & 0 deletions pkg/deployment/resources/pod_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,11 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
}
} else {
if memberStatus.Conditions.Update(api.ConditionTypeScheduled, false, "Pod is not scheduled", "") {
updateMemberStatusNeeded = true
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
}

if k8sutil.IsPodNotScheduledFor(pod, podScheduleTimeout) {
// Pod cannot be scheduled for too long
log.Str("pod-name", pod.GetName()).Debug("Pod scheduling timeout")
Expand Down
9 changes: 8 additions & 1 deletion pkg/util/globals/global.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const (
DefaultArangoDCheckTimeout = time.Second * 2
DefaultReconciliationTimeout = time.Minute
DefaultForcePodDeletionGracePeriodTimeout = 15 * time.Minute
DefaultPodSchedulingGracePeriod = 15 * time.Second

BackupDefaultArangoClientTimeout = 30 * time.Second
BackupUploadArangoClientTimeout = 300 * time.Second
Expand Down Expand Up @@ -61,6 +62,7 @@ var globalObj = &globals{
backupArangoClientTimeout: NewTimeout(BackupDefaultArangoClientTimeout),
backupArangoClientUploadTimeout: NewTimeout(BackupUploadArangoClientTimeout),
forcePodDeletionGracePeriodTimeout: NewTimeout(DefaultForcePodDeletionGracePeriodTimeout),
podSchedulingGracePeriod: NewTimeout(DefaultPodSchedulingGracePeriod),
},
kubernetes: &globalKubernetes{
requestBatchSize: NewInt64(DefaultKubernetesRequestBatchSize),
Expand Down Expand Up @@ -147,6 +149,7 @@ type GlobalTimeouts interface {
Agency() Timeout

ForcePodDeletionGracePeriodTimeout() Timeout
PodSchedulingGracePeriod() Timeout

BackupArangoClientTimeout() Timeout
BackupArangoClientUploadTimeout() Timeout
Expand All @@ -156,13 +159,17 @@ type globalTimeouts struct {
requests, arangod, reconciliation, arangodCheck, agency, shardRebuild, shardRebuildRetry Timeout
backupArangoClientTimeout Timeout
backupArangoClientUploadTimeout Timeout
forcePodDeletionGracePeriodTimeout Timeout
forcePodDeletionGracePeriodTimeout, podSchedulingGracePeriod Timeout
}

// ForcePodDeletionGracePeriodTimeout returns the Timeout stored in the
// forcePodDeletionGracePeriodTimeout field of the receiver.
func (g *globalTimeouts) ForcePodDeletionGracePeriodTimeout() Timeout {
return g.forcePodDeletionGracePeriodTimeout
}

// PodSchedulingGracePeriod returns the Timeout stored in the
// podSchedulingGracePeriod field of the receiver. The package-level globals
// object initialises this field from DefaultPodSchedulingGracePeriod; the
// scheduling-failure plan builder treats a value of 0 as "disabled".
func (g *globalTimeouts) PodSchedulingGracePeriod() Timeout {
return g.podSchedulingGracePeriod
}

// Agency returns the Timeout stored in the agency field of the receiver.
func (g *globalTimeouts) Agency() Timeout {
return g.agency
}
Expand Down

0 comments on commit 1d86f4e

Please sign in to comment.