Skip to content

Commit

Permalink
feat(execution): Avoid using activeDeadlineSeconds to kill tasks, use…
Browse files Browse the repository at this point in the history
… deletion instead
  • Loading branch information
irvinlim committed Jun 3, 2022
1 parent 9e17087 commit 53774f9
Show file tree
Hide file tree
Showing 23 changed files with 417 additions and 736 deletions.
15 changes: 13 additions & 2 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -1,8 +1,19 @@
# More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
# Ignore build and test binaries.
.dockerignore
.gitignore
.github/
.local/
bin/
testbin/
build/
dist/
yamls/
.local
config/
docs/
hack/
Makefile
LICENSE
*.md
*.yaml
*.yml
*.cov
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ dev: dev-build dev-push dev-deploy ## Builds local Docker images of all componen

.PHONY: dev-build
dev-build: goreleaser ## Builds local Docker images of all components. Use the DEV_IMAGE_TAG environment variable to override the image tag that is built.
./hack/build-images.sh "$(DEV_IMAGE_TAG)"
./hack/build-images.sh "$(IMAGE_NAME_PREFIX)" "$(DEV_IMAGE_TAG)"

.PHONY: dev-push
dev-push: ## Pushes all Docker images to the specified registry. Use the DEV_IMAGE_TAG environment variable to override the image tag, and DEV_IMAGE_REGISTRY to override the image registry to use (usually a local one).
Expand Down
16 changes: 4 additions & 12 deletions apis/config/v1alpha1/dynamicconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,22 +43,14 @@ type JobExecutionConfig struct {
// +optional
DefaultPendingTimeoutSeconds *int64 `json:"defaultPendingTimeoutSeconds,omitempty"`

// DeleteKillingTasksTimeoutSeconds is the duration we delete the task to kill
// it instead of using active deadline, if previous efforts were ineffective.
// Set this value to 0 to immediately use deletion.
//
// Default: 180
// +optional
DeleteKillingTasksTimeoutSeconds *int64 `json:"deleteKillingTasksTimeoutSeconds,omitempty"`

// ForceDeleteKillingTasksTimeoutSeconds is the duration before we use force
// deletion instead of normal deletion. This timeout is computed from the
// ForceDeleteTaskTimeoutSeconds is the duration before we use force deletion
// instead of normal deletion. This timeout is computed from the
// deletionTimestamp of the object, which may also include an additional delay
// of deletionGracePeriodSeconds. Set this value to 0 to disable force deletion.
//
// Default: 120
// Default: 900
// +optional
ForceDeleteKillingTasksTimeoutSeconds *int64 `json:"forceDeleteKillingTasksTimeoutSeconds,omitempty"`
ForceDeleteTaskTimeoutSeconds *int64 `json:"forceDeleteTaskTimeoutSeconds,omitempty"`
}

// +kubebuilder:object:root=true
Expand Down
9 changes: 2 additions & 7 deletions apis/config/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 0 additions & 4 deletions apis/execution/v1alpha1/job_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -647,10 +647,6 @@ const (
// specified pending timeout.
TaskPendingTimeout TaskResult = "PendingTimeout"

// TaskDeadlineExceeded means that the task had failed to terminate within its
// active deadline and has now been terminated.
TaskDeadlineExceeded TaskResult = "DeadlineExceeded"

// TaskKilled means that the task is killed externally and now terminated.
TaskKilled TaskResult = "Killed"
)
Expand Down
76 changes: 38 additions & 38 deletions config/execution/dynamic/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,45 +14,47 @@ data:
apiVersion: config.furiko.io/v1alpha1
kind: JobExecutionConfig
# defaultTTLSecondsAfterFinished is the default time-to-live (TTL) for a Job
# after it has finished. Lower this value to reduce the strain on the
# cluster/kubelet. Set to 0 to delete immediately after the Job is finished.
# The default time-to-live (TTL) for a Job after it has finished. Lower this
# value to reduce the strain on the cluster/kubelet. Set to 0 to delete immediately
# after the Job is finished.
defaultTTLSecondsAfterFinished: 3600
# defaultPendingTimeoutSeconds is default timeout to use if job does not
# specify the pending timeout. By default, this is a non-zero value to prevent
# permanently stuck jobs. To disable default pending timeout, set this to 0.
# The default timeout for a task to remain in a pending state. Defaults to 15 minutes
# in order to prevent jobs from retrying indefinitely.
#
# To prevent setting a default pending timeout globally, set this to 0. Individual jobs
# can still specify spec.taskPendingTimeoutSeconds in the JobTemplate to override this
# global default value.
defaultPendingTimeoutSeconds: 900
# deleteKillingTasksTimeoutSeconds is the duration we delete the task to kill
# it instead of using active deadline, if previous efforts were ineffective.
# Set this value to 0 to immediately use deletion.
deleteKillingTasksTimeoutSeconds: 180
# forceDeleteKillingTasksTimeoutSeconds is the duration before we use force
# deletion instead of normal deletion. This timeout is computed from the
# deletionTimestamp of the object, which may also include an additional delay
# of deletionGracePeriodSeconds. Set this value to 0 to disable force deletion.
forceDeleteKillingTasksTimeoutSeconds: 120
# The duration before the controller uses force deletion instead of normal deletion.
# This timeout is computed from the deletionTimestamp of the object, which may also include
# an additional delay of deletionGracePeriodSeconds.
#
# Force deletion causes the task to be deleted without confirmation that the task has already
# terminated. When pod is used for taskTemplate, this means that
#
# Set this value to 0 to disable force deletion globally. Individual jobs can also specify
# spec.forbidTaskForceDeletion in the JobTemplate to disable force deletion if this
# behavior is not desired.
forceDeleteTaskTimeoutSeconds: 900
jobConfigs: |
apiVersion: config.furiko.io/v1alpha1
kind: JobConfigExecutionConfig
# maxEnqueuedJobs is the global maximum enqueued jobs that can be enqueued for
# a single JobConfig.
# The global maximum enqueued jobs that can be enqueued for a single JobConfig.
maxEnqueuedJobs: 20
cron: |
apiVersion: config.furiko.io/v1alpha1
kind: CronExecutionConfig
# cronFormat specifies the format used to parse cron expressions. Select
# between "standard" (default) or "quartz".
# Specifies the format used to parse cron expressions. Select between "standard"
# (default) or "quartz".
cronFormat: "standard"
# cronHashNames specifies if cron expressions should be hashed using the
# JobConfig's name.
# Specifies if cron expressions should be hashed using the JobConfig's name.
#
# This enables "hash cron expressions", which looks like `0 H * * *`. This
# particular example means to run once a day on the 0th minute of some hour,
Expand All @@ -63,9 +65,8 @@ data:
# If disabled, any JobConfigs that use the `H` syntax will throw a parse error.
cronHashNames: true
# cronHashSecondsByDefault specifies if the seconds field of a cron expression
# should be a `H` or `0` by default. If enabled, it will be `H`, otherwise it
# will default to `0`.
# Specifies if the seconds field of a cron expression should be a `H` or `0`
# by default. If enabled, it will be `H`, otherwise it will default to `0`.
#
# For JobConfigs which use a short cron expression format (i.e. 5 or 6 tokens
# long), the seconds field is omitted and is typically assumed to be `0` (e.g.
Expand All @@ -77,27 +78,26 @@ data:
# this would be `0 5 10 * * * *`.
cronHashSecondsByDefault: false
# cronHashFields specifies if the fields should be hashed along with the
# JobConfig's name.
# Specifies if the fields should be hashed along with the JobConfig's name.
#
# For example, `H H * * * * *` will always hash the seconds and minutes to the
# same value, for example 00:37:37, 01:37:37, etc. Enabling this option will
# append additional keys to be hashed to introduce additional non-determinism.
cronHashFields: true
# defaultTimezone defines a default timezone to use for JobConfigs that do not
# specify a timezone. If left empty, UTC will be used as the default timezone.
# Defines a default timezone to use for JobConfigs that do not specify a timezone.
# If left empty, UTC will be used as the default timezone.
defaultTimezone: "UTC"
# maxMissedSchedules defines a maximum number of jobs that the controller
# should back-schedule, or attempt to create after coming back up from
# downtime. Having a sane value here would prevent a thundering herd of jobs
# being scheduled that would exhaust resources in the cluster. Set this to 0 to
# disable back-scheduling.
# Defines the maximum number of jobs that the controller should back-schedule,
# or attempt to create after coming back up from downtime. Having a sane value
# here would prevent a thundering herd of jobs being scheduled that would exhaust
# resources in the cluster.
#
# Set this to 0 to disable back-scheduling.
maxMissedSchedules: 5
# maxDowntimeThresholdSeconds defines the maximum downtime that the controller
# can tolerate. If the controller was intentionally shut down for an extended
# period of time, we should not attempt to back-schedule jobs once it was
# started.
# Defines the maximum downtime that the controller can tolerate. If the controller
# was shut down for an extended period of time, any jobs that should have been created
# beyond the maximum downtime threshold would not be back-scheduled once it is started again.
maxDowntimeThresholdSeconds: 300
20 changes: 16 additions & 4 deletions hack/build-images.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,31 @@ set -euo pipefail
## Builds all Docker images to a specified image tag, but does not push them.
## Uses Dockerfile.dev which will build all entrypoints in the same image.

if [ $# -ne 1 ]
if [ $# -lt 2 ]
then
echo 'Usage:'
echo ' ./build-images.sh IMAGE_TAG'
echo ' ./build-images.sh IMAGE_NAME_PREFIX IMAGE_TAG'
exit 1
fi

# Positional arguments.
IMAGE_TAG="$1"
IMAGE_NAME_PREFIX="$1"
if [[ -z "${IMAGE_NAME_PREFIX}" ]]
then
echo 'Error: IMAGE_NAME_PREFIX cannot be empty'
exit 2
fi

IMAGE_TAG="$2"
if [[ -z "${IMAGE_TAG}" ]]
then
echo 'Error: IMAGE_TAG cannot be empty'
exit 2
fi

# Build all images.
# Note that in the development build, all entrypoints are bundled in the same image.
while IFS= read -r IMAGE; do
TARGET_IMAGE="${IMAGE}:${IMAGE_TAG}"
TARGET_IMAGE="${IMAGE_NAME_PREFIX}/${IMAGE}:${IMAGE_TAG}"
docker build --file=Dockerfile.dev -t "${TARGET_IMAGE}" .
done < ./hack/docker-images.txt
7 changes: 3 additions & 4 deletions pkg/config/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,9 @@ import (

var (
DefaultJobExecutionConfig = &configv1alpha1.JobExecutionConfig{
DefaultTTLSecondsAfterFinished: pointer.Int64(3600),
DefaultPendingTimeoutSeconds: pointer.Int64(900),
DeleteKillingTasksTimeoutSeconds: pointer.Int64(180),
ForceDeleteKillingTasksTimeoutSeconds: pointer.Int64(120),
DefaultTTLSecondsAfterFinished: pointer.Int64(3600),
DefaultPendingTimeoutSeconds: pointer.Int64(900),
ForceDeleteTaskTimeoutSeconds: pointer.Int64(900),
}

DefaultJobConfigExecutionConfig = &configv1alpha1.JobConfigExecutionConfig{
Expand Down
71 changes: 52 additions & 19 deletions pkg/execution/controllers/jobcontroller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ package jobcontroller_test

import (
"fmt"
"strconv"
"time"

corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -98,6 +97,49 @@ var (
// Job with pod pending.
fakeJobPending = generateJobStatusFromPod(fakeJob, fakePodPending)

// Job with pod in killing.
fakeJobWithPodInKilling = func() *execution.Job {
newJob := fakeJobWithKillTimestamp.DeepCopy()
// NOTE(irvinlim): The State is not yet updated here because the Pod is not yet reconciled
newJob.Status.Phase = execution.JobKilling
newJob.Status.Condition.Waiting = &execution.JobConditionWaiting{
Reason: "DeletingTasks",
Message: "Waiting for 1 out of 1 task(s) to be deleted",
}
newJob.Status.Tasks[0].DeletedStatus = &execution.TaskStatus{
State: execution.TaskTerminated,
Result: execution.TaskKilled,
}
return newJob
}()

// Job with pod in killing after update.
fakeJobWithPodInKillingAfterUpdate = func() *execution.Job {
newJob := fakeJobWithPodInKilling.DeepCopy()
newJob.Status.Tasks[0].Status.State = execution.TaskKilling
return newJob
}()

// Job with pod reaching pending timeout.
fakeJobWithPodInPendingTimeout = func() *execution.Job {
newJob := fakeJobPending.DeepCopy()
// NOTE(irvinlim): The State is not yet updated here because the Pod is not yet reconciled
newJob.Status.Tasks[0].DeletedStatus = &execution.TaskStatus{
State: execution.TaskTerminated,
Result: execution.TaskPendingTimeout,
Reason: "PendingTimeout",
Message: "Task exceeded pending timeout of 15m0s",
}
return newJob
}()

// Job with pod reaching pending timeout after update.
fakeJobWithPodInPendingTimeoutAfterUpdate = func() *execution.Job {
newJob := fakeJobWithPodInPendingTimeout.DeepCopy()
newJob.Status.Tasks[0].Status.State = execution.TaskKilling
return newJob
}()

// Job with kill timestamp.
fakeJobWithKillTimestamp = func() *execution.Job {
newJob := fakeJobPending.DeepCopy()
Expand Down Expand Up @@ -165,7 +207,7 @@ var (

// Job with pod being deleted.
fakeJobPodDeleting = func() *execution.Job {
newJob := generateJobStatusFromPod(fakeJobWithKillTimestamp, fakePodTerminating)
newJob := generateJobStatusFromPod(fakeJobWithKillTimestamp, fakePodPendingDeleting)
newJob.Status.Tasks[0].DeletedStatus = &execution.TaskStatus{
State: execution.TaskTerminated,
Result: execution.TaskKilled,
Expand All @@ -184,7 +226,7 @@ var (

// Job with pod being force deleted.
fakeJobPodForceDeleting = func() *execution.Job {
newJob := generateJobStatusFromPod(fakeJobWithKillTimestamp, fakePodTerminating)
newJob := generateJobStatusFromPod(fakeJobWithKillTimestamp, fakePodPendingDeleting)
newJob.Status.Tasks[0].DeletedStatus = &execution.TaskStatus{
State: execution.TaskTerminated,
Result: execution.TaskKilled,
Expand Down Expand Up @@ -250,26 +292,18 @@ var (
return newPod
}()

// Pod that is in Pending state and is in the process of being killed.
fakePodTerminating = killPod(fakePodPending, testutils.Mktime(killTime))
// Pod that is in Pending state and is in the process of being deleted.
fakePodPendingDeleting = deletePod(fakePodPending, testutils.Mkmtime(killTime))

// Pod that is terminating and has deletion timestamp.
fakePodDeleting = func() *corev1.Pod {
newPod := fakePodTerminating.DeepCopy()
newPod := fakePodPendingDeleting.DeepCopy()
deleteTime := metav1.NewTime(testutils.Mkmtimep(killTime).
Add(time.Duration(*config.DefaultJobExecutionConfig.DeleteKillingTasksTimeoutSeconds) * time.Second))
Add(time.Duration(*config.DefaultJobExecutionConfig.ForceDeleteTaskTimeoutSeconds) * time.Second))
newPod.DeletionTimestamp = &deleteTime
return newPod
}()

// Pod that is in Pending state and is in the process of being killed by pending
// timeout.
fakePodPendingTimeoutTerminating = func() *corev1.Pod {
newPod := killPod(fakePodPending, testutils.Mktime(later15m))
meta.SetAnnotation(newPod, podtaskexecutor.LabelKeyKilledFromPendingTimeout, "1")
return newPod
}()

// Pod that is Succeeded.
fakePodFinished = func() *corev1.Pod {
newPod := fakePodResult.DeepCopy()
Expand Down Expand Up @@ -439,10 +473,9 @@ func generateJobStatusFromPod(rj *execution.Job, pods ...*corev1.Pod) *execution
return newRj
}

// killPod returns a new Pod after setting the kill timestamp.
func killPod(pod *corev1.Pod, ts time.Time) *corev1.Pod {
// deletePod returns a new Pod after setting the deletion timestamp.
func deletePod(pod *corev1.Pod, ts metav1.Time) *corev1.Pod {
newPod := pod.DeepCopy()
meta.SetAnnotation(newPod, podtaskexecutor.LabelKeyTaskKillTimestamp, strconv.Itoa(int(ts.Unix())))
newPod.Spec.ActiveDeadlineSeconds = pointer.Int64(int64(ts.Sub(newPod.Status.StartTime.Time).Seconds()))
newPod.DeletionTimestamp = &ts
return newPod
}
Loading

0 comments on commit 53774f9

Please sign in to comment.