Skip to content

Commit

Permalink
TEP-0121: Implement Retries in TaskRun
Browse files Browse the repository at this point in the history
This commit implements `Retries` in `TaskRun`, and removes the
logic that PipelineRun controller relies on `RetriesStatus` to
determine the termination of a TaskRun or CustomRun/Run.

Key Changes:
- New `Retries` field in both `v1beta1.TaskRun` and `v1.TaskRun`
- Archive the retry attempt history of a failed `TaskRun` in
  `RetriesStatus` before the Kubernetes and cloud events are sent at
  the end of a reconcile loop.
- Unit Tests to test the `TaskRun` object changes, especially the
  changes on `status.conditions` and `status.retriesStatus` after
  being reconciled once (one reconcile loop).
  • Loading branch information
XinruZhang committed Nov 29, 2022
1 parent 4ee22f0 commit 819e26e
Show file tree
Hide file tree
Showing 27 changed files with 3,353 additions and 1,755 deletions.
50 changes: 49 additions & 1 deletion docs/pipeline-api.md
Original file line number Diff line number Diff line change
Expand Up @@ -1148,6 +1148,18 @@ TaskRunSpecStatusMessage
</tr>
<tr>
<td>
<code>retries</code><br/>
<em>
int
</em>
</td>
<td>
<em>(Optional)</em>
<p>Retries represents how many times this task run should be retried in case of task failure.</p>
</td>
</tr>
<tr>
<td>
<code>timeout</code><br/>
<em>
<a href="https://godoc.org/k8s.io/apimachinery/pkg/apis/meta/v1#Duration">
Expand Down Expand Up @@ -4781,6 +4793,18 @@ TaskRunSpecStatusMessage
</tr>
<tr>
<td>
<code>retries</code><br/>
<em>
int
</em>
</td>
<td>
<em>(Optional)</em>
<p>Retries represents how many times this task run should be retried in case of task failure.</p>
</td>
</tr>
<tr>
<td>
<code>timeout</code><br/>
<em>
<a href="https://godoc.org/k8s.io/apimachinery/pkg/apis/meta/v1#Duration">
Expand Down Expand Up @@ -7906,6 +7930,18 @@ TaskRunSpecStatusMessage
</tr>
<tr>
<td>
<code>retries</code><br/>
<em>
int
</em>
</td>
<td>
<em>(Optional)</em>
<p>Retries represents how many times this task run should be retried in case of task failure.</p>
</td>
</tr>
<tr>
<td>
<code>timeout</code><br/>
<em>
<a href="https://godoc.org/k8s.io/apimachinery/pkg/apis/meta/v1#Duration">
Expand Down Expand Up @@ -12830,6 +12866,18 @@ TaskRunSpecStatusMessage
</tr>
<tr>
<td>
<code>retries</code><br/>
<em>
int
</em>
</td>
<td>
<em>(Optional)</em>
<p>Retries represents how many times this task run should be retried in case of task failure.</p>
</td>
</tr>
<tr>
<td>
<code>timeout</code><br/>
<em>
<a href="https://godoc.org/k8s.io/apimachinery/pkg/apis/meta/v1#Duration">
Expand Down Expand Up @@ -13824,7 +13872,7 @@ string
<h3 id="tekton.dev/v1beta1.CustomRunStatus">CustomRunStatus
</h3>
<p>
(<em>Appears on:</em><a href="#tekton.dev/v1beta1.CustomRun">CustomRun</a>)
(<em>Appears on:</em><a href="#tekton.dev/v1beta1.CustomRun">CustomRun</a>, <a href="#tekton.dev/v1beta1.CustomRunStatusFields">CustomRunStatusFields</a>)
</p>
<div>
<p>CustomRunStatus defines the observed state of CustomRun</p>
Expand Down
14 changes: 14 additions & 0 deletions docs/taskruns.md
Original file line number Diff line number Diff line change
Expand Up @@ -698,6 +698,20 @@ object(s), if present. Any `Request` or `Limit` specified by the user (on `Task`

For more information, see the [`LimitRange` support in Pipeline](./compute-resources.md#limitrange-support).

### Specifying `Retries`
You can use the `retries` field to set how many times you want to retry on a failed TaskRun.
All TaskRun failures are retriable except for `Cancellation`.

For a retriable `TaskRun`, when an error occurs:
- The error status is archived in `.status.RetriesStatus`
- The `Succeeded` condition in `.status` is updated:
```
Type: Succeeded
Status: Unknown
Reason: CompleteWithRetries
```
- `status.StartTime` and `status.PodName` are unset to trigger another retry attempt.

### Configuring the failure timeout

You can use the `timeout` field to set the `TaskRun's` desired timeout value. If you do not specify this
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/pipeline/v1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pkg/apis/pipeline/v1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -1787,6 +1787,11 @@
"description": "PodTemplate holds pod specific configuration",
"$ref": "#/definitions/pod.Template"
},
"retries": {
"description": "Retries represents how many times this task run should be retried in case of task failure.",
"type": "integer",
"format": "int32"
},
"serviceAccountName": {
"type": "string",
"default": ""
Expand Down
3 changes: 3 additions & 0 deletions pkg/apis/pipeline/v1/taskrun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,9 @@ type TaskRunSpec struct {
// Status message for cancellation.
// +optional
StatusMessage TaskRunSpecStatusMessage `json:"statusMessage,omitempty"`
// Retries represents how many times this task run should be retried in case of task failure.
// +optional
Retries int `json:"retries,omitempty"`
// Time after which the build times out. Defaults to 1 hour.
// Specified build timeout should be less than 24h.
// Refer Go's ParseDuration documentation for expected format: https://golang.org/pkg/time/#ParseDuration
Expand Down
7 changes: 7 additions & 0 deletions pkg/apis/pipeline/v1beta1/openapi_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions pkg/apis/pipeline/v1beta1/swagger.json
Original file line number Diff line number Diff line change
Expand Up @@ -2793,6 +2793,11 @@
"resources": {
"$ref": "#/definitions/v1beta1.TaskRunResources"
},
"retries": {
"description": "Retries represents how many times this task run should be retried in case of task failure.",
"type": "integer",
"format": "int32"
},
"serviceAccountName": {
"type": "string",
"default": ""
Expand Down
2 changes: 2 additions & 0 deletions pkg/apis/pipeline/v1beta1/taskrun_conversion.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ func (trs *TaskRunSpec) ConvertTo(ctx context.Context, sink *v1.TaskRunSpec) err
}
sink.Status = v1.TaskRunSpecStatus(trs.Status)
sink.StatusMessage = v1.TaskRunSpecStatusMessage(trs.StatusMessage)
sink.Retries = trs.Retries
sink.Timeout = trs.Timeout
sink.PodTemplate = trs.PodTemplate
sink.Workspaces = nil
Expand Down Expand Up @@ -141,6 +142,7 @@ func (trs *TaskRunSpec) ConvertFrom(ctx context.Context, source *v1.TaskRunSpec)
}
trs.Status = TaskRunSpecStatus(source.Status)
trs.StatusMessage = TaskRunSpecStatusMessage(source.StatusMessage)
trs.Retries = source.Retries
trs.Timeout = source.Timeout
trs.PodTemplate = source.PodTemplate
trs.Workspaces = nil
Expand Down
12 changes: 12 additions & 0 deletions pkg/apis/pipeline/v1beta1/taskrun_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ type TaskRunSpec struct {
// Status message for cancellation.
// +optional
StatusMessage TaskRunSpecStatusMessage `json:"statusMessage,omitempty"`
// Retries represents how many times this task run should be retried in case of task failure.
// +optional
Retries int `json:"retries,omitempty"`
// Time after which the build times out. Defaults to 1 hour.
// Specified build timeout should be less than 24h.
// Refer Go's ParseDuration documentation for expected format: https://golang.org/pkg/time/#ParseDuration
Expand Down Expand Up @@ -471,6 +474,15 @@ func (tr *TaskRun) IsTaskRunResultDone() bool {
return !tr.Status.GetCondition(apis.ConditionType(TaskRunConditionResultsVerified.String())).IsUnknown()
}

// IsRetriable reports whether the TaskRun still has retry attempts left:
// it returns true while the number of attempts archived in
// Status.RetriesStatus is smaller than Spec.Retries.
func (tr *TaskRun) IsRetriable() bool {
	return len(tr.Status.RetriesStatus) < tr.Spec.Retries
}

// HasTimedOut returns true if the TaskRun runtime is beyond the allowed timeout
func (tr *TaskRun) HasTimedOut(ctx context.Context, c clock.PassiveClock) bool {
if tr.Status.StartTime.IsZero() {
Expand Down
25 changes: 25 additions & 0 deletions pkg/reconciler/events/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ const (
EventReasonStarted = "Started"
// EventReasonError is the reason set for events related to TaskRuns / PipelineRuns reconcile errors
EventReasonError = "Error"
// EventReasonRetry is the reason set for events related to retrying a runtime object.
EventReasonRetry = "Retry"
)

// Emit emits events for object
Expand Down Expand Up @@ -123,3 +125,26 @@ func EmitError(c record.EventRecorder, err error, object runtime.Object) {
c.Event(object, corev1.EventTypeWarning, EventReasonError, err.Error())
}
}

// EmitOnRetry announces that the given object is about to be retried.
//
// Two kinds of events may be produced:
//   - A Kubernetes event of type Normal with reason EventReasonRetry and an
//     empty message is always recorded.
//   - A cloud event is sent only when Defaults.DefaultCloudEventsSink is
//     non-empty in the configuration taken from ctx.
//
// A failure to send the cloud event is logged as a warning and is not
// reported back to the caller.
func EmitOnRetry(ctx context.Context, object runtime.Object) {
	eventRecorder := controller.GetEventRecorder(ctx)
	log := logging.FromContext(ctx)
	cfg := config.FromContextOrDefaults(ctx)

	// The Kubernetes event does not depend on any configuration.
	eventRecorder.Event(object, corev1.EventTypeNormal, EventReasonRetry, "")

	// Without a configured sink there is nowhere to deliver a cloud event.
	if cfg.Defaults.DefaultCloudEventsSink == "" {
		return
	}

	targetCtx := cloudevents.ContextWithTarget(ctx, cfg.Defaults.DefaultCloudEventsSink)
	if err := cloudevent.SendCloudEventWithRetries(targetCtx, object); err != nil {
		log.Warnf("Failed to emit cloud events %v", err.Error())
	}
}
31 changes: 2 additions & 29 deletions pkg/reconciler/pipelinerun/pipelinerun.go
Original file line number Diff line number Diff line change
Expand Up @@ -862,26 +862,10 @@ func (c *Reconciler) createTaskRuns(ctx context.Context, rpt *resources.Resolved
func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, params []v1beta1.Param, rpt *resources.ResolvedPipelineTask, pr *v1beta1.PipelineRun, storageBasePath string) (*v1beta1.TaskRun, error) {
logger := logging.FromContext(ctx)

tr, _ := c.taskRunLister.TaskRuns(pr.Namespace).Get(taskRunName)
if tr != nil {
// retry should happen only when the taskrun has failed
if !tr.Status.GetCondition(apis.ConditionSucceeded).IsFalse() {
return tr, nil
}
// Don't modify the lister cache's copy.
tr = tr.DeepCopy()
// is a retry
addRetryHistory(tr)
clearStatus(tr)
tr.Status.MarkResourceOngoing("", "")
logger.Infof("Updating taskrun %s with cleared status and retry history (length: %d).", tr.GetName(), len(tr.Status.RetriesStatus))
return c.PipelineClientSet.TektonV1beta1().TaskRuns(pr.Namespace).UpdateStatus(ctx, tr, metav1.UpdateOptions{})
}

rpt.PipelineTask = resources.ApplyPipelineTaskContexts(rpt.PipelineTask)
taskRunSpec := pr.GetTaskRunSpec(rpt.PipelineTask.Name)
params = append(params, rpt.PipelineTask.Params...)
tr = &v1beta1.TaskRun{
tr := &v1beta1.TaskRun{
ObjectMeta: metav1.ObjectMeta{
Name: taskRunName,
Namespace: pr.Namespace,
Expand All @@ -890,6 +874,7 @@ func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, para
Annotations: combineTaskRunAndTaskSpecAnnotations(pr, rpt.PipelineTask),
},
Spec: v1beta1.TaskRunSpec{
Retries: rpt.PipelineTask.Retries,
Params: params,
ServiceAccountName: taskRunSpec.TaskServiceAccountName,
PodTemplate: taskRunSpec.TaskPodTemplate,
Expand Down Expand Up @@ -1104,18 +1089,6 @@ func combinedSubPath(workspaceSubPath string, pipelineTaskSubPath string) string
return filepath.Join(workspaceSubPath, pipelineTaskSubPath)
}

// addRetryHistory appends a deep copy of the TaskRun's current status to
// Status.RetriesStatus. The copy's own RetriesStatus is set to nil before it
// is appended, so archived entries do not carry the earlier history.
func addRetryHistory(tr *v1beta1.TaskRun) {
	snapshot := tr.Status.DeepCopy()
	snapshot.RetriesStatus = nil
	tr.Status.RetriesStatus = append(tr.Status.RetriesStatus, *snapshot)
}

// clearStatus resets the PodName, StartTime and CompletionTime fields of the
// TaskRun's status to their zero values; all other status fields are left
// untouched.
func clearStatus(tr *v1beta1.TaskRun) {
	status := &tr.Status
	status.PodName = ""
	status.StartTime, status.CompletionTime = nil, nil
}

func getTaskrunAnnotations(pr *v1beta1.PipelineRun) map[string]string {
// Propagate annotations from PipelineRun to TaskRun.
annotations := make(map[string]string, len(pr.ObjectMeta.Annotations)+1)
Expand Down
Loading

0 comments on commit 819e26e

Please sign in to comment.