feat(execution): Avoid using activeDeadlineSeconds to kill tasks, use…

… deletion instead
furiko-io · Jun 3, 2022 · 53774f9 · 53774f9
1 parent 9e17087
commit 53774f9
Show file tree

Hide file tree

Showing 23 changed files with 417 additions and 736 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -1,8 +1,19 @@
 # More info: https://docs.docker.com/engine/reference/builder/#dockerignore-file
-# Ignore build and test binaries.
+.dockerignore
+.gitignore
+.github/
+.local/
 bin/
 testbin/
 build/
 dist/
 yamls/
-.local
+config/
+docs/
+hack/
+Makefile
+LICENSE
+*.md
+*.yaml
+*.yml
+*.cov
diff --git a/Makefile b/Makefile
@@ -186,7 +186,7 @@ dev: dev-build dev-push dev-deploy ## Builds local Docker images of all componen
 
 .PHONY: dev-build
 dev-build: goreleaser ## Builds local Docker images of all components. Use the DEV_IMAGE_TAG environment variable to override the image tag that is built.
-	./hack/build-images.sh "$(DEV_IMAGE_TAG)"
+	./hack/build-images.sh "$(IMAGE_NAME_PREFIX)" "$(DEV_IMAGE_TAG)"
 
 .PHONY: dev-push
 dev-push: ## Pushes all Docker images to the specified registry. Use the DEV_IMAGE_TAG environment variable to override the image tag, and DEV_IMAGE_REGISTRY to override the image registry to use (usually a local one).

diff --git a/apis/config/v1alpha1/dynamicconfig_types.go b/apis/config/v1alpha1/dynamicconfig_types.go
@@ -43,22 +43,14 @@ type JobExecutionConfig struct {
 	// +optional
 	DefaultPendingTimeoutSeconds *int64 `json:"defaultPendingTimeoutSeconds,omitempty"`
 
-	// DeleteKillingTasksTimeoutSeconds is the duration we delete the task to kill
-	// it instead of using active deadline, if previous efforts were ineffective.
-	// Set this value to 0 to immediately use deletion.
-	//
-	// Default: 180
-	// +optional
-	DeleteKillingTasksTimeoutSeconds *int64 `json:"deleteKillingTasksTimeoutSeconds,omitempty"`
-
-	// ForceDeleteKillingTasksTimeoutSeconds is the duration before we use force
-	// deletion instead of normal deletion. This timeout is computed from the
+	// ForceDeleteTaskTimeoutSeconds is the duration before we use force deletion
+	// instead of normal deletion. This timeout is computed from the
 	// deletionTimestamp of the object, which may also include an additional delay
 	// of deletionGracePeriodSeconds. Set this value to 0 to disable force deletion.
 	//
-	// Default: 120
+	// Default: 900
 	// +optional
-	ForceDeleteKillingTasksTimeoutSeconds *int64 `json:"forceDeleteKillingTasksTimeoutSeconds,omitempty"`
+	ForceDeleteTaskTimeoutSeconds *int64 `json:"forceDeleteTaskTimeoutSeconds,omitempty"`
 }
 
 // +kubebuilder:object:root=true

diff --git a/apis/config/v1alpha1/zz_generated.deepcopy.go b/apis/config/v1alpha1/zz_generated.deepcopy.go
diff --git a/apis/execution/v1alpha1/job_types.go b/apis/execution/v1alpha1/job_types.go
@@ -647,10 +647,6 @@ const (
 	// specified pending timeout.
 	TaskPendingTimeout TaskResult = "PendingTimeout"
 
-	// TaskDeadlineExceeded means that the task had failed to terminate within its
-	// active deadline and has now been terminated.
-	TaskDeadlineExceeded TaskResult = "DeadlineExceeded"
-
 	// TaskKilled means that the task is killed externally and now terminated.
 	TaskKilled TaskResult = "Killed"
 )

diff --git a/config/execution/dynamic/config.yaml b/config/execution/dynamic/config.yaml
@@ -14,45 +14,47 @@ data:
     apiVersion: config.furiko.io/v1alpha1
     kind: JobExecutionConfig
 
-    # defaultTTLSecondsAfterFinished is the default time-to-live (TTL) for a Job
-    # after it has finished. Lower this value to reduce the strain on the
-    # cluster/kubelet. Set to 0 to delete immediately after the Job is finished.
+    # The default time-to-live (TTL) for a Job after it has finished. Lower this
+    # value to reduce the strain on the cluster/kubelet. Set to 0 to delete immediately
+    # after the Job is finished.
     defaultTTLSecondsAfterFinished: 3600
 
-    # defaultPendingTimeoutSeconds is default timeout to use if job does not
-    # specify the pending timeout. By default, this is a non-zero value to prevent
-    # permanently stuck jobs. To disable default pending timeout, set this to 0.
+    # The default timeout for a task to remain in a pending state. Defaults to 15 minutes
+    # in order to prevent jobs from retrying indefinitely.
+    #
+    # To prevent setting a default pending timeout globally, set this to 0. Individual jobs
+    # can still specify spec.taskPendingTimeoutSeconds in the JobTemplate to override this
+    # global default value.
     defaultPendingTimeoutSeconds: 900
 
-    # deleteKillingTasksTimeoutSeconds is the duration we delete the task to kill
-    # it instead of using active deadline, if previous efforts were ineffective.
-    # Set this value to 0 to immediately use deletion.
-    deleteKillingTasksTimeoutSeconds: 180
-
-    # forceDeleteKillingTasksTimeoutSeconds is the duration before we use force
-    # deletion instead of normal deletion. This timeout is computed from the
-    # deletionTimestamp of the object, which may also include an additional delay
-    # of deletionGracePeriodSeconds. Set this value to 0 to disable force deletion.
-    forceDeleteKillingTasksTimeoutSeconds: 120
+    # The duration before the controller uses force deletion instead of normal deletion.
+    # This timeout is computed from the deletionTimestamp of the object, which may also include
+    # an additional delay of deletionGracePeriodSeconds.
+    #
+    # Force deletion causes the task to be deleted without confirmation that the task has already
+    # terminated. When pod is used for taskTemplate, this means that
+    #
+    # Set this value to 0 to disable force deletion globally. Individual jobs can also specify
+    # spec.forbidTaskForceDeletion in the JobTemplate to disable force deletion if this
+    # behavior is not desired.
+    forceDeleteTaskTimeoutSeconds: 900
 
   jobConfigs: |
     apiVersion: config.furiko.io/v1alpha1
     kind: JobConfigExecutionConfig
 
-    # maxEnqueuedJobs is the global maximum enqueued jobs that can be enqueued for
-    # a single JobConfig.
+    # The global maximum enqueued jobs that can be enqueued for a single JobConfig.
     maxEnqueuedJobs: 20
 
   cron: |
     apiVersion: config.furiko.io/v1alpha1
     kind: CronExecutionConfig
 
-    # cronFormat specifies the format used to parse cron expressions. Select
-    # between "standard" (default) or "quartz".
+    # Specifies the format used to parse cron expressions. Select between "standard"
+    # (default) or "quartz".
     cronFormat: "standard"
 
-    # cronHashNames specifies if cron expressions should be hashed using the
-    # JobConfig's name.
+    # Specifies if cron expressions should be hashed using the JobConfig's name.
     #
     # This enables "hash cron expressions", which looks like `0 H * * *`. This
     # particular example means to run once a day on the 0th minute of some hour,
@@ -63,9 +65,8 @@ data:
     # If disabled, any JobConfigs that use the `H` syntax will throw a parse error.
     cronHashNames: true
 
-    # cronHashSecondsByDefault specifies if the seconds field of a cron expression
-    # should be a `H` or `0` by default. If enabled, it will be `H`, otherwise it
-    # will default to `0`.
+    # Specifies if the seconds field of a cron expression should be a `H` or `0`
+    # by default. If enabled, it will be `H`, otherwise it will default to `0`.
     #
     # For JobConfigs which use a short cron expression format (i.e. 5 or 6 tokens
     # long), the seconds field is omitted and is typically assumed to be `0` (e.g.
@@ -77,27 +78,26 @@ data:
     # this would be `0 5 10 * * * *`.
     cronHashSecondsByDefault: false
 
-    # cronHashFields specifies if the fields should be hashed along with the
-    # JobConfig's name.
+    # Specifies if the fields should be hashed along with the JobConfig's name.
     #
     # For example, `H H * * * * *` will always hash the seconds and minutes to the
     # same value, for example 00:37:37, 01:37:37, etc. Enabling this option will
     # append additional keys to be hashed to introduce additional non-determinism.
     cronHashFields: true
 
-    # defaultTimezone defines a default timezone to use for JobConfigs that do not
-    # specify a timezone. If left empty, UTC will be used as the default timezone.
+    # Defines a default timezone to use for JobConfigs that do not specify a timezone.
+    # If left empty, UTC will be used as the default timezone.
     defaultTimezone: "UTC"
 
-    # maxMissedSchedules defines a maximum number of jobs that the controller
-    # should back-schedule, or attempt to create after coming back up from
-    # downtime. Having a sane value here would prevent a thundering herd of jobs
-    # being scheduled that would exhaust resources in the cluster. Set this to 0 to
-    # disable back-scheduling.
+    # Defines the maximum number of jobs that the controller should back-schedule,
+    # or attempt to create after coming back up from downtime. Having a sane value
+    # here would prevent a thundering herd of jobs being scheduled that would exhaust
+    # resources in the cluster.
+    #
+    # Set this to 0 to disable back-scheduling.
     maxMissedSchedules: 5
 
-    # maxDowntimeThresholdSeconds defines the maximum downtime that the controller
-    # can tolerate. If the controller was intentionally shut down for an extended
-    # period of time, we should not attempt to back-schedule jobs once it was
-    # started.
+    # Defines the maximum downtime that the controller can tolerate. If the controller
+    # was shut down for an extended period of time, any jobs that should have been created
+    # beyond the maximum downtime threshold would not be back-scheduled once it is started again.
     maxDowntimeThresholdSeconds: 300
diff --git a/hack/build-images.sh b/hack/build-images.sh
@@ -21,19 +21,31 @@ set -euo pipefail
 ## Builds all Docker images to a specified image tag, but does not push them.
 ## Uses Dockerfile.dev which will build all entrypoints in the same image.
 
-if [ $# -ne 1 ]
+if [ $# -lt 2 ]
 then
   echo 'Usage:'
-  echo '  ./build-images.sh IMAGE_TAG'
+  echo '  ./build-images.sh IMAGE_NAME_PREFIX IMAGE_TAG'
   exit 1
 fi
 
 # Positional arguments.
-IMAGE_TAG="$1"
+IMAGE_NAME_PREFIX="$1"
+if [[ -z "${IMAGE_NAME_PREFIX}" ]]
+then
+  echo 'Error: IMAGE_NAME_PREFIX cannot be empty'
+  exit 2
+fi
+
+IMAGE_TAG="$2"
+if [[ -z "${IMAGE_TAG}" ]]
+then
+  echo 'Error: IMAGE_TAG cannot be empty'
+  exit 2
+fi
 
 # Build all images.
 # Note that in the development build, all entrypoints are bundled in the same image.
 while IFS= read -r IMAGE; do
-  TARGET_IMAGE="${IMAGE}:${IMAGE_TAG}"
+  TARGET_IMAGE="${IMAGE_NAME_PREFIX}/${IMAGE}:${IMAGE_TAG}"
   docker build --file=Dockerfile.dev -t "${TARGET_IMAGE}" .
 done < ./hack/docker-images.txt
diff --git a/pkg/config/defaults.go b/pkg/config/defaults.go
@@ -24,10 +24,9 @@ import (
 
 var (
 	DefaultJobExecutionConfig = &configv1alpha1.JobExecutionConfig{
-		DefaultTTLSecondsAfterFinished:        pointer.Int64(3600),
-		DefaultPendingTimeoutSeconds:          pointer.Int64(900),
-		DeleteKillingTasksTimeoutSeconds:      pointer.Int64(180),
-		ForceDeleteKillingTasksTimeoutSeconds: pointer.Int64(120),
+		DefaultTTLSecondsAfterFinished: pointer.Int64(3600),
+		DefaultPendingTimeoutSeconds:   pointer.Int64(900),
+		ForceDeleteTaskTimeoutSeconds:  pointer.Int64(900),
 	}
 
 	DefaultJobConfigExecutionConfig = &configv1alpha1.JobConfigExecutionConfig{

diff --git a/pkg/execution/controllers/jobcontroller/controller_test.go b/pkg/execution/controllers/jobcontroller/controller_test.go
@@ -18,7 +18,6 @@ package jobcontroller_test
 
 import (
 	"fmt"
-	"strconv"
 	"time"
 
 	corev1 "k8s.io/api/core/v1"
@@ -98,6 +97,49 @@ var (
 	// Job with pod pending.
 	fakeJobPending = generateJobStatusFromPod(fakeJob, fakePodPending)
 
+	// Job with pod in killing.
+	fakeJobWithPodInKilling = func() *execution.Job {
+		newJob := fakeJobWithKillTimestamp.DeepCopy()
+		// NOTE(irvinlim): The State is not yet updated here because the Pod is not yet reconciled
+		newJob.Status.Phase = execution.JobKilling
+		newJob.Status.Condition.Waiting = &execution.JobConditionWaiting{
+			Reason:  "DeletingTasks",
+			Message: "Waiting for 1 out of 1 task(s) to be deleted",
+		}
+		newJob.Status.Tasks[0].DeletedStatus = &execution.TaskStatus{
+			State:  execution.TaskTerminated,
+			Result: execution.TaskKilled,
+		}
+		return newJob
+	}()
+
+	// Job with pod in killing after update.
+	fakeJobWithPodInKillingAfterUpdate = func() *execution.Job {
+		newJob := fakeJobWithPodInKilling.DeepCopy()
+		newJob.Status.Tasks[0].Status.State = execution.TaskKilling
+		return newJob
+	}()
+
+	// Job with pod reaching pending timeout.
+	fakeJobWithPodInPendingTimeout = func() *execution.Job {
+		newJob := fakeJobPending.DeepCopy()
+		// NOTE(irvinlim): The State is not yet updated here because the Pod is not yet reconciled
+		newJob.Status.Tasks[0].DeletedStatus = &execution.TaskStatus{
+			State:   execution.TaskTerminated,
+			Result:  execution.TaskPendingTimeout,
+			Reason:  "PendingTimeout",
+			Message: "Task exceeded pending timeout of 15m0s",
+		}
+		return newJob
+	}()
+
+	// Job with pod reaching pending timeout after update.
+	fakeJobWithPodInPendingTimeoutAfterUpdate = func() *execution.Job {
+		newJob := fakeJobWithPodInPendingTimeout.DeepCopy()
+		newJob.Status.Tasks[0].Status.State = execution.TaskKilling
+		return newJob
+	}()
+
 	// Job with kill timestamp.
 	fakeJobWithKillTimestamp = func() *execution.Job {
 		newJob := fakeJobPending.DeepCopy()
@@ -165,7 +207,7 @@ var (
 
 	// Job with pod being deleted.
 	fakeJobPodDeleting = func() *execution.Job {
-		newJob := generateJobStatusFromPod(fakeJobWithKillTimestamp, fakePodTerminating)
+		newJob := generateJobStatusFromPod(fakeJobWithKillTimestamp, fakePodPendingDeleting)
 		newJob.Status.Tasks[0].DeletedStatus = &execution.TaskStatus{
 			State:   execution.TaskTerminated,
 			Result:  execution.TaskKilled,
@@ -184,7 +226,7 @@ var (
 
 	// Job with pod being force deleted.
 	fakeJobPodForceDeleting = func() *execution.Job {
-		newJob := generateJobStatusFromPod(fakeJobWithKillTimestamp, fakePodTerminating)
+		newJob := generateJobStatusFromPod(fakeJobWithKillTimestamp, fakePodPendingDeleting)
 		newJob.Status.Tasks[0].DeletedStatus = &execution.TaskStatus{
 			State:   execution.TaskTerminated,
 			Result:  execution.TaskKilled,
@@ -250,26 +292,18 @@ var (
 		return newPod
 	}()
 
-	// Pod that is in Pending state and is in the process of being killed.
-	fakePodTerminating = killPod(fakePodPending, testutils.Mktime(killTime))
+	// Pod that is in Pending state and is in the process of being deleted.
+	fakePodPendingDeleting = deletePod(fakePodPending, testutils.Mkmtime(killTime))
 
 	// Pod that is terminating and has deletion timestamp.
 	fakePodDeleting = func() *corev1.Pod {
-		newPod := fakePodTerminating.DeepCopy()
+		newPod := fakePodPendingDeleting.DeepCopy()
 		deleteTime := metav1.NewTime(testutils.Mkmtimep(killTime).
-			Add(time.Duration(*config.DefaultJobExecutionConfig.DeleteKillingTasksTimeoutSeconds) * time.Second))
+			Add(time.Duration(*config.DefaultJobExecutionConfig.ForceDeleteTaskTimeoutSeconds) * time.Second))
 		newPod.DeletionTimestamp = &deleteTime
 		return newPod
 	}()
 
-	// Pod that is in Pending state and is in the process of being killed by pending
-	// timeout.
-	fakePodPendingTimeoutTerminating = func() *corev1.Pod {
-		newPod := killPod(fakePodPending, testutils.Mktime(later15m))
-		meta.SetAnnotation(newPod, podtaskexecutor.LabelKeyKilledFromPendingTimeout, "1")
-		return newPod
-	}()
-
 	// Pod that is Succeeded.
 	fakePodFinished = func() *corev1.Pod {
 		newPod := fakePodResult.DeepCopy()
@@ -439,10 +473,9 @@ func generateJobStatusFromPod(rj *execution.Job, pods ...*corev1.Pod) *execution
 	return newRj
 }
 
-// killPod returns a new Pod after setting the kill timestamp.
-func killPod(pod *corev1.Pod, ts time.Time) *corev1.Pod {
+// deletePod returns a new Pod after setting the deletion timestamp.
+func deletePod(pod *corev1.Pod, ts metav1.Time) *corev1.Pod {
 	newPod := pod.DeepCopy()
-	meta.SetAnnotation(newPod, podtaskexecutor.LabelKeyTaskKillTimestamp, strconv.Itoa(int(ts.Unix())))
-	newPod.Spec.ActiveDeadlineSeconds = pointer.Int64(int64(ts.Sub(newPod.Status.StartTime.Time).Seconds()))
+	newPod.DeletionTimestamp = &ts
 	return newPod
 }