From 6859fb8b40a1da0357ca4f04e4df7fe07fb13eef Mon Sep 17 00:00:00 2001 From: Andrea Frittoli Date: Sun, 5 Apr 2020 11:21:25 +0100 Subject: [PATCH] Emit events for all TaskRun lifecycle events Start emitting events for additional TaskRun lifecycle events: - taskrun started - taskrun timeout Introduce pre-run and post-run functions that are invoked asynchronously when the taskrun starts and completes, to emit events. These same functions shall be used to trigger any other async behaviour on start/stop of taskruns. Add documentation on events. Fixes #2328 Work towards #2082 --- docs/events.md | 39 ++++++++++ docs/pipelineruns.md | 1 + docs/taskruns.md | 6 +- pkg/reconciler/event.go | 4 + pkg/reconciler/event_test.go | 8 ++ pkg/reconciler/taskrun/cancel.go | 77 ------------------- .../cloudevent/cloud_event_controller.go | 3 +- pkg/reconciler/taskrun/taskrun.go | 67 +++++++++------- 8 files changed, 95 insertions(+), 110 deletions(-) create mode 100644 docs/events.md delete mode 100644 pkg/reconciler/taskrun/cancel.go diff --git a/docs/events.md b/docs/events.md new file mode 100644 index 00000000000..d3e5f51a181 --- /dev/null +++ b/docs/events.md @@ -0,0 +1,39 @@ + +# Events + +Tekton runtime resources, specifically `TaskRuns` and `PipelineRuns`, +emit events when they are executed, so that users can monitor their lifecycle +and react to it. Tekton emits [kubernetes events](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.18/#event-v1-core), that can be retrieved from the resource via +`kubectl describe [resource]`. + +No events are emitted for `Conditions` today. + +## TaskRuns + +`TaskRun` events are generated for the following `Reasons`: +- `Started`: this is triggered the first time the `TaskRun` is picked by the + reconciler from its work queue, so it only happens if web-hook validation was + successful. 
Note that this event does not imply that a step started executing, + as several conditions must be met first: + - task and bound resource validation must be successful + - attached conditions must run successfully + - the `Pod` associated to the `TaskRun` must be successfully scheduled +- `Succeeded`: this is triggered once all steps in the `TaskRun` are executed + successfully, including post-steps injected by Tekton. +- `Failed`: this is triggered if the `TaskRun` is completed, but not successfully. + Causes of failure may be: one of the steps failed, the `TaskRun` was cancelled or + the `TaskRun` timed out. + +## PipelineRuns + +`PipelineRun` events are generated for the following `Reasons`: +- `Succeeded`: this is triggered once all `Tasks` reachable via the DAG are + executed successfully. +- `Failed`: this is triggered if the `PipelineRun` is completed, but not + successfully. Causes of failure may be: one of the `Tasks` failed or the + `PipelineRun` was cancelled. diff --git a/docs/pipelineruns.md b/docs/pipelineruns.md index ce81253d185..a6c682295c5 100644 --- a/docs/pipelineruns.md +++ b/docs/pipelineruns.md @@ -29,6 +29,7 @@ Creation of a `PipelineRun` will trigger the creation of - [Workspaces](#workspaces) - [Cancelling a PipelineRun](#cancelling-a-pipelinerun) - [LimitRanges](#limitranges) + - [Events](events.md#pipelineruns) ## Syntax diff --git a/docs/taskruns.md b/docs/taskruns.md index 880dfd81b29..017fba5c3ca 100644 --- a/docs/taskruns.md +++ b/docs/taskruns.md @@ -30,14 +30,14 @@ A `TaskRun` runs until all `steps` have completed or until a failure occurs. 
- [Steps](#steps) - [Results](#results) - [Cancelling a TaskRun](#cancelling-a-taskrun) + - [Sidecars](#sidecars) + - [LimitRanges](#limitranges) + - [Events](events.md#taskruns) - [Examples](#examples) - [Example TaskRun](#example-taskrun) - [Example with embedded specs](#example-with-embedded-specs) - [Example Task Reuse](#example-task-reuse) - [Using a `ServiceAccount`](#using-a-serviceaccount) - - [Sidecars](#sidecars) - - [LimitRanges](#limitranges) - --- ## Syntax diff --git a/pkg/reconciler/event.go b/pkg/reconciler/event.go index 5341ad73c87..08b4c0fc8fa 100644 --- a/pkg/reconciler/event.go +++ b/pkg/reconciler/event.go @@ -31,6 +31,10 @@ func EmitEvent(c record.EventRecorder, beforeCondition *apis.Condition, afterCon c.Event(object, corev1.EventTypeNormal, "Succeeded", afterCondition.Message) } else if afterCondition.Status == corev1.ConditionFalse { c.Event(object, corev1.EventTypeWarning, "Failed", afterCondition.Message) + } else { + if beforeCondition == nil { + c.Event(object, corev1.EventTypeNormal, "Started", "") + } } } } diff --git a/pkg/reconciler/event_test.go b/pkg/reconciler/event_test.go index f6c464c23c7..091492acaa0 100644 --- a/pkg/reconciler/event_test.go +++ b/pkg/reconciler/event_test.go @@ -80,6 +80,14 @@ func TestEmitEvent(t *testing.T) { Status: corev1.ConditionTrue, }, expectEvent: true, + }, { + name: "nil to unknown", + before: nil, + after: &apis.Condition{ + Type: apis.ConditionSucceeded, + Status: corev1.ConditionUnknown, + }, + expectEvent: true, }} for _, ts := range testcases { diff --git a/pkg/reconciler/taskrun/cancel.go b/pkg/reconciler/taskrun/cancel.go deleted file mode 100644 index 8eff915d7da..00000000000 --- a/pkg/reconciler/taskrun/cancel.go +++ /dev/null @@ -1,77 +0,0 @@ -/* -Copyright 2019 The Tekton Authors - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package taskrun - -import ( - "fmt" - "time" - - "github.com/tektoncd/pipeline/pkg/apis/pipeline/v1alpha1" - podconvert "github.com/tektoncd/pipeline/pkg/pod" - corev1 "k8s.io/api/core/v1" - "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/client-go/kubernetes" - "knative.dev/pkg/apis" -) - -type logger interface { - Warn(args ...interface{}) - Warnf(template string, args ...interface{}) - Infof(template string, args ...interface{}) -} - -func killTaskRun(tr *v1alpha1.TaskRun, clientSet kubernetes.Interface, - logger logger, reason, message string) error { - - logger.Warn("stopping task run %q because of %q", tr.Name, reason) - tr.Status.SetCondition(&apis.Condition{ - Type: apis.ConditionSucceeded, - Status: corev1.ConditionFalse, - Reason: reason, - Message: message, - }) - - // update tr completed time - tr.Status.CompletionTime = &metav1.Time{Time: time.Now()} - - if tr.Status.PodName == "" { - logger.Warnf("task run %q has no pod running yet", tr.Name) - return nil - } - - // tr.Status.PodName will be empty if the pod was never successfully created. This condition - // can be reached, for example, by the pod never being schedulable due to limits imposed by - // a namespace's ResourceQuota. - err := clientSet.CoreV1().Pods(tr.Namespace).Delete(tr.Status.PodName, &metav1.DeleteOptions{}) - if err != nil && !errors.IsNotFound(err) { - logger.Warnf("Failed to terminate pod: %v", err) - return err - } - return nil -} - -// cancelTaskRun marks the TaskRun as cancelled and delete pods linked to it. 
-func cancelTaskRun(tr *v1alpha1.TaskRun, clientSet kubernetes.Interface, logger logger) error { - message := fmt.Sprintf("TaskRun %q was cancelled", tr.Name) - return killTaskRun(tr, clientSet, logger, "TaskRunCancelled", message) -} - -func timeoutTaskRun(tr *v1alpha1.TaskRun, clientSet kubernetes.Interface, logger logger) error { - message := fmt.Sprintf("TaskRun %q failed to finish within %q", tr.Name, tr.Spec.Timeout.Duration.String()) - return killTaskRun(tr, clientSet, logger, podconvert.ReasonTimedOut, message) -} diff --git a/pkg/reconciler/taskrun/resources/cloudevent/cloud_event_controller.go b/pkg/reconciler/taskrun/resources/cloudevent/cloud_event_controller.go index 04fde4fbc85..694a3ebe2fc 100644 --- a/pkg/reconciler/taskrun/resources/cloudevent/cloud_event_controller.go +++ b/pkg/reconciler/taskrun/resources/cloudevent/cloud_event_controller.go @@ -66,8 +66,7 @@ func cloudEventDeliveryFromTargets(targets []string) []v1alpha1.CloudEventDelive } // SendCloudEvents is used by the TaskRun controller to send cloud events once -// the TaskRun is complete. `tr` is used to obtain the list of targets but also -// to construct the body of the +// the TaskRun is complete. `tr` is used to obtain the list of targets func SendCloudEvents(tr *v1alpha1.TaskRun, ceclient CEClient, logger *zap.SugaredLogger) error { logger = logger.With(zap.String("taskrun", tr.Name)) diff --git a/pkg/reconciler/taskrun/taskrun.go b/pkg/reconciler/taskrun/taskrun.go index 0ab304c0ddf..27461239b66 100644 --- a/pkg/reconciler/taskrun/taskrun.go +++ b/pkg/reconciler/taskrun/taskrun.go @@ -106,11 +106,15 @@ func (c *Reconciler) Reconcile(ctx context.Context, key string) error { // If the TaskRun is just starting, this will also set the starttime, // from which the timeout will immediately begin counting down. - tr.Status.InitializeConditions() - // In case node time was not synchronized, when controller has been scheduled to other nodes. 
- if tr.Status.StartTime.Sub(tr.CreationTimestamp.Time) < 0 { - c.Logger.Warnf("TaskRun %s createTimestamp %s is after the taskRun started %s", tr.GetRunKey(), tr.CreationTimestamp, tr.Status.StartTime) - tr.Status.StartTime = &tr.CreationTimestamp + if !tr.HasStarted() { + tr.Status.InitializeConditions() + // In case node time was not synchronized, when controller has been scheduled to other nodes. + if tr.Status.StartTime.Sub(tr.CreationTimestamp.Time) < 0 { + c.Logger.Warnf("TaskRun %s createTimestamp %s is after the taskRun started %s", tr.GetRunKey(), tr.CreationTimestamp, tr.Status.StartTime) + tr.Status.StartTime = &tr.CreationTimestamp + } + // Run asnyc startup hooks + go c.preRunAsyncHook(ctx, tr) } // If the TaskRun is complete, run some post run fixtures when applicable @@ -164,36 +168,20 @@ func (c *Reconciler) Reconcile(ctx context.Context, key string) error { // If the TaskRun is cancelled, kill resources and update status if tr.IsCancelled() { before := tr.Status.GetCondition(apis.ConditionSucceeded) -<<<<<<< HEAD message := fmt.Sprintf("TaskRun %q was cancelled", tr.Name) err := c.failTaskRun(tr, v1beta1.TaskRunReasonCancelled, message) - after := tr.Status.GetCondition(apis.ConditionSucceeded) - reconciler.EmitEvent(c.Recorder, before, after, tr) + go c.postRunAsyncHook(ctx, tr, before) return multierror.Append(err, c.updateStatusLabelsAndAnnotations(tr, original)).ErrorOrNil() -======= - err := cancelTaskRun(tr, c.KubeClientSet, c.Logger) - after := tr.Status.GetCondition(apis.ConditionSucceeded) - reconciler.EmitEvent(c.Recorder, before, after, tr) - return err ->>>>>>> Consolidate cancel and timeout logic } // Check if the TaskRun has timed out; if it is, this will set its status // accordingly. 
if tr.HasTimedOut() { before := tr.Status.GetCondition(apis.ConditionSucceeded) -<<<<<<< HEAD message := fmt.Sprintf("TaskRun %q failed to finish within %q", tr.Name, tr.GetTimeout()) err := c.failTaskRun(tr, podconvert.ReasonTimedOut, message) - after := tr.Status.GetCondition(apis.ConditionSucceeded) - reconciler.EmitEvent(c.Recorder, before, after, tr) + go c.postRunAsyncHook(ctx, tr, before) return multierror.Append(err, c.updateStatusLabelsAndAnnotations(tr, original)).ErrorOrNil() -======= - err := timeoutTaskRun(tr, c.KubeClientSet, c.Logger) - after := tr.Status.GetCondition(apis.ConditionSucceeded) - reconciler.EmitEvent(c.Recorder, before, after, tr) - return err ->>>>>>> Consolidate cancel and timeout logic } // Reconcile this copy of the task run and then write back any status @@ -205,15 +193,31 @@ func (c *Reconciler) Reconcile(ctx context.Context, key string) error { return multierror.Append(merr, c.updateStatusLabelsAndAnnotations(tr, original)).ErrorOrNil() } +// Run any async logic that may be required at start-up time. 
This method is used +// to emit events, notifications or any other async operation +func (c *Reconciler) preRunAsyncHook(ctx context.Context, tr *v1alpha1.TaskRun) { + c.Logger.Infof("preRunAsyncHook: %s", tr.Name) + + // Emit event + afterCondition := tr.Status.GetCondition(apis.ConditionSucceeded) + reconciler.EmitEvent(c.Recorder, nil, afterCondition, tr) +} + +// Run any async logic that may be required once the tr is successfully reconciled +// This method is used to emit events, notifications or any other async operation +func (c *Reconciler) postRunAsyncHook(ctx context.Context, tr *v1alpha1.TaskRun, beforeCondition *apis.Condition) { + c.Logger.Infof("postRunAsyncHook: %s", tr.Name) + + // Emit event + afterCondition := tr.Status.GetCondition(apis.ConditionSucceeded) + reconciler.EmitEvent(c.Recorder, beforeCondition, afterCondition, tr) +} + func (c *Reconciler) reconcile(ctx context.Context, tr *v1alpha1.TaskRun) error { // We may be reading a version of the object that was stored at an older version // and may not have had all of the assumed default specified. tr.SetDefaults(contexts.WithUpgradeViaDefaulting(ctx)) - if tr.Spec.Timeout == nil { - tr.Spec.Timeout = &metav1.Duration{Duration: config.DefaultTimeoutMinutes * time.Minute} - } - if err := tr.ConvertTo(ctx, &v1beta1.TaskRun{}); err != nil { if ce, ok := err.(*v1beta1.CannotConvertError); ok { tr.Status.MarkResourceNotConvertible(ce) @@ -366,7 +370,14 @@ func (c *Reconciler) reconcile(ctx context.Context, tr *v1alpha1.TaskRun) error after := tr.Status.GetCondition(apis.ConditionSucceeded) - reconciler.EmitEvent(c.Recorder, before, after, tr) + // If after is different from before and status is not Unknown, the taskrun + // has completed its work - except for post-run tasks like emitting events, + // recording metrics, sending cloud events. 
+ // Once tr.isDone becomes true, even when this key is queued, `reconcile` + // won't be invoked so we won't pass through here again + if tr.IsDone() && after != before { + go c.postRunAsyncHook(ctx, tr, before) + } c.Logger.Infof("Successfully reconciled taskrun %s/%s with status: %#v", tr.Name, tr.Namespace, after) return nil