Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Feature] Parametrize ForceDelete timeout #1632

Merged
merged 1 commit into from
Mar 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
- (Bugfix) Remove ImagePullSecrets Reference from Container
- (Feature) DebugPackage ArangoProfiles
- (Feature) Scheduler CLI
- (Feature) Parametrize ForceDelete timeout

## [1.2.39](https://github.com/arangodb/kube-arangodb/tree/1.2.39) (2024-03-11)
- (Feature) Extract Scheduler API
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ Flags:
--kubernetes.max-batch-size int Size of batch during objects read (default 256)
--kubernetes.qps float32 Number of queries per second for k8s API (default 15)
--log.format string Set log format. Allowed values: 'pretty', 'JSON'. If empty, default format is used (default "pretty")
--log.level stringArray Set log levels in format <level> or <logger>=<level>. Possible loggers: action, agency, api-server, assertion, backup-operator, chaos-monkey, crd, deployment, deployment-ci, deployment-reconcile, deployment-replication, deployment-resilience, deployment-resources, deployment-storage, deployment-storage-pc, deployment-storage-service, http, inspector, integrations, k8s-client, ml-batchjob-operator, ml-cronjob-operator, ml-extension-operator, ml-extension-shutdown, ml-storage-operator, monitor, operator, operator-arangojob-handler, operator-v2, operator-v2-event, operator-v2-worker, panics, pod_compare, root, root-event-recorder, server, server-authentication (default [info])
--log.level stringArray Set log levels in format <level> or <logger>=<level>. Possible loggers: action, agency, api-server, assertion, backup-operator, chaos-monkey, crd, deployment, deployment-ci, deployment-reconcile, deployment-replication, deployment-resilience, deployment-resources, deployment-storage, deployment-storage-pc, deployment-storage-service, http, inspector, integrations, k8s-client, ml-batchjob-operator, ml-cronjob-operator, ml-extension-operator, ml-extension-shutdown, ml-storage-operator, monitor, operator, operator-arangojob-handler, operator-v2, operator-v2-event, operator-v2-worker, panics, pod_compare, root, root-event-recorder, scheduler, server, server-authentication (default [info])
--log.sampling If true, operator will try to minimize duplication of logging events (default true)
--memory-limit uint Define memory limit for hard shutdown and the dump of goroutines. Used for testing
--metrics.excluded-prefixes stringArray List of the excluded metrics prefixes
Expand Down Expand Up @@ -196,6 +196,7 @@ Flags:
--timeout.arangod-check duration The version check request timeout to the ArangoDB (default 2s)
--timeout.backup-arangod duration The request timeout to the ArangoDB during backup calls (default 30s)
--timeout.backup-upload duration The request timeout to the ArangoDB during uploading files (default 5m0s)
--timeout.force-delete-pod-grace-period duration Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals (default 15m0s)
--timeout.k8s duration The request timeout to the kubernetes (default 2s)
--timeout.reconciliation duration The reconciliation timeout to the ArangoDB CR (default 1m0s)
--timeout.shard-rebuild duration Timeout after which particular out-synced shard is considered as failed and rebuild is triggered (default 1h0m0s)
Expand Down
21 changes: 12 additions & 9 deletions cmd/cmd.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,15 +147,16 @@ var (
concurrentUploads int
}
operatorTimeouts struct {
k8s time.Duration
arangoD time.Duration
arangoDCheck time.Duration
reconciliation time.Duration
agency time.Duration
shardRebuild time.Duration
shardRebuildRetry time.Duration
backupArangoD time.Duration
backupUploadArangoD time.Duration
k8s time.Duration
arangoD time.Duration
arangoDCheck time.Duration
reconciliation time.Duration
agency time.Duration
shardRebuild time.Duration
shardRebuildRetry time.Duration
backupArangoD time.Duration
backupUploadArangoD time.Duration
forcePodDeletionGracePeriod time.Duration
}
operatorImageDiscovery struct {
timeout time.Duration
Expand Down Expand Up @@ -224,6 +225,7 @@ func init() {
f.DurationVar(&operatorTimeouts.shardRebuildRetry, "timeout.shard-rebuild-retry", globals.DefaultOutSyncedShardRebuildRetryTimeout, "Timeout after which rebuild shards retry flow is triggered")
f.DurationVar(&operatorTimeouts.backupArangoD, "timeout.backup-arangod", globals.BackupDefaultArangoClientTimeout, "The request timeout to the ArangoDB during backup calls")
f.DurationVar(&operatorTimeouts.backupUploadArangoD, "timeout.backup-upload", globals.BackupUploadArangoClientTimeout, "The request timeout to the ArangoDB during uploading files")
f.DurationVar(&operatorTimeouts.forcePodDeletionGracePeriod, "timeout.force-delete-pod-grace-period", globals.DefaultForcePodDeletionGracePeriodTimeout, "Default period when ArangoDB Pod should be forcefully removed after all containers were stopped - set to 0 to disable forceful removals")
f.DurationVar(&shutdownOptions.delay, "shutdown.delay", defaultShutdownDelay, "The delay before running shutdown handlers")
f.DurationVar(&shutdownOptions.timeout, "shutdown.timeout", defaultShutdownTimeout, "Timeout for shutdown handlers")
f.DurationVar(&operatorReconciliationRetry.delay, "operator.reconciliation.retry.delay", globals.DefaultOperatorUpdateRetryDelay, "Delay between Object Update operations in the Reconciliation loop")
Expand Down Expand Up @@ -291,6 +293,7 @@ func executeMain(cmd *cobra.Command, args []string) {
globals.GetGlobalTimeouts().ShardRebuildRetry().Set(operatorTimeouts.shardRebuildRetry)
globals.GetGlobalTimeouts().BackupArangoClientTimeout().Set(operatorTimeouts.backupArangoD)
globals.GetGlobalTimeouts().BackupArangoClientUploadTimeout().Set(operatorTimeouts.backupUploadArangoD)
globals.GetGlobalTimeouts().ForcePodDeletionGracePeriodTimeout().Set(operatorTimeouts.forcePodDeletionGracePeriod)

globals.GetGlobals().Retry().OperatorUpdateRetryDelay().Set(operatorReconciliationRetry.delay)
globals.GetGlobals().Retry().OperatorUpdateRetryCount().Set(operatorReconciliationRetry.count)
Expand Down
17 changes: 9 additions & 8 deletions pkg/deployment/resources/pod_inspector.go
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ const (
// we will mark the pod as scheduled for termination
recheckSoonPodInspectorInterval = util.Interval(time.Second) // Time between Pod inspection if we think something will change soon
maxPodInspectorInterval = util.Interval(time.Hour) // Maximum time between Pod inspection (if nothing else happens)
forcePodDeletionGracePeriod = 15 * time.Minute
)

func (r *Resources) handleRestartedPod(pod *core.Pod, memberStatus *api.MemberStatus, wasTerminated, markAsTerminated *bool) {
Expand Down Expand Up @@ -426,13 +425,15 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
var gps int64 = 10

forceDelete := false
if t := k8sutil.PodStopTime(pod); !t.IsZero() {
if time.Since(t) > forcePodDeletionGracePeriod {
forceDelete = true
}
} else if t := pod.DeletionTimestamp; t != nil {
if time.Since(t.Time) > forcePodDeletionGracePeriod {
forceDelete = true
if gracePeriod := globals.GetGlobalTimeouts().ForcePodDeletionGracePeriodTimeout().Get(); gracePeriod > 0 {
if t := k8sutil.PodStopTime(pod); !t.IsZero() {
if time.Since(t) > gracePeriod {
forceDelete = true
}
} else if t := pod.DeletionTimestamp; t != nil {
if time.Since(t.Time) > gracePeriod {
forceDelete = true
}
}
}

Expand Down
39 changes: 24 additions & 15 deletions pkg/util/globals/global.go
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//
// DISCLAIMER
//
// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany
// Copyright 2016-2024 ArangoDB GmbH, Cologne, Germany
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
Expand All @@ -23,11 +23,12 @@ package globals
import "time"

const (
DefaultKubernetesTimeout = 2 * time.Second
DefaultArangoDTimeout = time.Second * 5
DefaultArangoDAgencyTimeout = time.Second * 10
DefaultArangoDCheckTimeout = time.Second * 2
DefaultReconciliationTimeout = time.Minute
DefaultKubernetesTimeout = 2 * time.Second
DefaultArangoDTimeout = time.Second * 5
DefaultArangoDAgencyTimeout = time.Second * 10
DefaultArangoDCheckTimeout = time.Second * 2
DefaultReconciliationTimeout = time.Minute
DefaultForcePodDeletionGracePeriodTimeout = 15 * time.Minute

BackupDefaultArangoClientTimeout = 30 * time.Second
BackupUploadArangoClientTimeout = 300 * time.Second
Expand All @@ -50,15 +51,16 @@ const (

var globalObj = &globals{
timeouts: &globalTimeouts{
requests: NewTimeout(DefaultKubernetesTimeout),
arangod: NewTimeout(DefaultArangoDTimeout),
arangodCheck: NewTimeout(DefaultArangoDCheckTimeout),
reconciliation: NewTimeout(DefaultReconciliationTimeout),
agency: NewTimeout(DefaultArangoDAgencyTimeout),
shardRebuild: NewTimeout(DefaultOutSyncedShardRebuildTimeout),
shardRebuildRetry: NewTimeout(DefaultOutSyncedShardRebuildRetryTimeout),
backupArangoClientTimeout: NewTimeout(BackupDefaultArangoClientTimeout),
backupArangoClientUploadTimeout: NewTimeout(BackupUploadArangoClientTimeout),
requests: NewTimeout(DefaultKubernetesTimeout),
arangod: NewTimeout(DefaultArangoDTimeout),
arangodCheck: NewTimeout(DefaultArangoDCheckTimeout),
reconciliation: NewTimeout(DefaultReconciliationTimeout),
agency: NewTimeout(DefaultArangoDAgencyTimeout),
shardRebuild: NewTimeout(DefaultOutSyncedShardRebuildTimeout),
shardRebuildRetry: NewTimeout(DefaultOutSyncedShardRebuildRetryTimeout),
backupArangoClientTimeout: NewTimeout(BackupDefaultArangoClientTimeout),
backupArangoClientUploadTimeout: NewTimeout(BackupUploadArangoClientTimeout),
forcePodDeletionGracePeriodTimeout: NewTimeout(DefaultForcePodDeletionGracePeriodTimeout),
},
kubernetes: &globalKubernetes{
requestBatchSize: NewInt64(DefaultKubernetesRequestBatchSize),
Expand Down Expand Up @@ -144,6 +146,8 @@ type GlobalTimeouts interface {
ArangoDCheck() Timeout
Agency() Timeout

ForcePodDeletionGracePeriodTimeout() Timeout

BackupArangoClientTimeout() Timeout
BackupArangoClientUploadTimeout() Timeout
}
Expand All @@ -152,6 +156,11 @@ type globalTimeouts struct {
requests, arangod, reconciliation, arangodCheck, agency, shardRebuild, shardRebuildRetry Timeout
backupArangoClientTimeout Timeout
backupArangoClientUploadTimeout Timeout
forcePodDeletionGracePeriodTimeout Timeout
}

// ForcePodDeletionGracePeriodTimeout exposes the Timeout that holds the grace
// period after which a stopped or terminating Pod may be force-deleted.
// Pod inspection only enables force deletion when the stored value is
// greater than zero.
func (g *globalTimeouts) ForcePodDeletionGracePeriodTimeout() Timeout {
	gracePeriod := g.forcePodDeletionGracePeriodTimeout

	return gracePeriod
}

func (g *globalTimeouts) Agency() Timeout {
Expand Down