Skip to content

Commit

Permalink
TEP-0121: Move metrics recorder out of reconcile() to ReconcileKind()
Browse files Browse the repository at this point in the history
TEP-0121: Rewrite metric recorder for TaskRun Retries

Prior to this commit, metrics recording durationAndCountMetrics()
is in reconcile() (instead of ReconcileKind()), which only counts
the taskrun numbers where the taskruns that have no preparation
error / not cancelled right after scheduled / not timed out before
calling reconcile() (for example, be pending for a long time.).

This commit moves durationAndCountMetrics() into ReconcileKind()
to address the issue above.

It's worth mentioning that the recording metrics logic was
originally in the ReconcileKind() function, but the commit 1375219,
aming at addressing the recount issue, **moves that logic to `reconcile()`**
[1] (probably for better code style? 🤔).

[1] 1375219#diff-6e67e9c647bbe2a08807ff5ccbdd7dc9036df373e56b9774d3996f92ab7ceabaL138-L145
  • Loading branch information
XinruZhang committed Dec 13, 2022
1 parent a127323 commit 49224af
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 2 deletions.
11 changes: 9 additions & 2 deletions pkg/reconciler/taskrun/taskrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
"fmt"
"reflect"
"strings"
"sync"
"time"

"github.com/hashicorp/go-multierror"
Expand Down Expand Up @@ -89,13 +90,16 @@ type Reconciler struct {

// Check that our Reconciler implements taskrunreconciler.Interface
var (
_ taskrunreconciler.Interface = (*Reconciler)(nil)
_ taskrunreconciler.Interface = (*Reconciler)(nil)
wg = sync.WaitGroup{}
)

// ReconcileKind compares the actual state with the desired, and attempts to
// converge the two. It then updates the Status block of the Task Run
// resource with the current status of the resource.
func (c *Reconciler) ReconcileKind(ctx context.Context, tr *v1beta1.TaskRun) pkgreconciler.Event {
wg.Add(1)
defer c.durationAndCountMetrics(ctx, tr)
logger := logging.FromContext(ctx)
ctx = cloudevent.ToContext(ctx, c.cloudEventClient)
// By this time, params and workspaces should not be propagated for embedded tasks so we cannot
Expand Down Expand Up @@ -222,6 +226,7 @@ func (c *Reconciler) checkPodFailed(tr *v1beta1.TaskRun) (bool, v1beta1.TaskRunR

func (c *Reconciler) durationAndCountMetrics(ctx context.Context, tr *v1beta1.TaskRun) {
logger := logging.FromContext(ctx)
logger.Errorf("----------------------------------------")
if tr.IsDone() {
newTr, err := c.taskRunLister.TaskRuns(tr.Namespace).Get(tr.Name)
if err != nil && !k8serrors.IsNotFound(err) {
Expand All @@ -233,14 +238,17 @@ func (c *Reconciler) durationAndCountMetrics(ctx context.Context, tr *v1beta1.Ta
}

before := newTr.Status.GetCondition(apis.ConditionSucceeded)
wg.Add(1)
go func(metrics *taskrunmetrics.Recorder) {
if err := metrics.DurationAndCount(ctx, tr, before); err != nil {
logger.Warnf("Failed to log the metrics : %v", err)
}
if err := metrics.CloudEvents(ctx, tr); err != nil {
logger.Warnf("Failed to log the metrics : %v", err)
}
wg.Done()
}(c.metrics)
wg.Done()
}
}

Expand Down Expand Up @@ -445,7 +453,6 @@ func (c *Reconciler) prepare(ctx context.Context, tr *v1beta1.TaskRun) (*v1beta1
// error but it does not sync updates back to etcd. It does not emit events.
// `reconcile` consumes spec and resources returned by `prepare`
func (c *Reconciler) reconcile(ctx context.Context, tr *v1beta1.TaskRun, rtr *resources.ResolvedTaskResources) error {
defer c.durationAndCountMetrics(ctx, tr)
logger := logging.FromContext(ctx)
recorder := controller.GetEventRecorder(ctx)
var err error
Expand Down
174 changes: 174 additions & 0 deletions pkg/reconciler/taskrun/taskrun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ import (
ttesting "github.com/tektoncd/pipeline/pkg/reconciler/testing"
"github.com/tektoncd/pipeline/pkg/reconciler/volumeclaim"
resolutioncommon "github.com/tektoncd/pipeline/pkg/resolution/common"
"github.com/tektoncd/pipeline/pkg/taskrunmetrics"
"github.com/tektoncd/pipeline/pkg/workspace"
"github.com/tektoncd/pipeline/test"
"github.com/tektoncd/pipeline/test/diff"
Expand All @@ -67,6 +68,7 @@ import (
"knative.dev/pkg/controller"
"knative.dev/pkg/kmeta"
"knative.dev/pkg/logging"
"knative.dev/pkg/metrics/metricstest"
"knative.dev/pkg/ptr"

pkgreconciler "knative.dev/pkg/reconciler"
Expand Down Expand Up @@ -5232,3 +5234,175 @@ status:
})
}
}

func TestReconcile_Metrics(t *testing.T) {
var (
toBeCanceledTaskRun = parse.MustParseV1beta1TaskRun(t, `
metadata:
name: test-taskrun-canceled
namespace: foo
spec:
status: TaskRunCancelled
taskRef:
name: test-task
status:
startTime: "2021-12-31T23:59:59Z"
conditions:
- reason: Running
status: Unknown
type: Succeeded
`)
toBeTimedOutTaskRun = parse.MustParseV1beta1TaskRun(t, `
metadata:
name: test-taskrun-timedout
namespace: foo
spec:
taskRef:
name: test-task
status:
startTime: "2021-12-31T00:00:00Z"
conditions:
- reason: Running
status: Unknown
type: Succeeded
`)
toFailOnPodFailureTaskRun = parse.MustParseV1beta1TaskRun(t, `
metadata:
name: test-taskrun-pod-failure
namespace: foo
spec:
taskRef:
name: test-task
status:
startTime: "2021-12-31T23:59:59Z"
podName: test-taskrun-run-retry-pod-failure-pod
steps:
- container: step-unamed-0
name: unamed-0
waiting:
reason: "ImagePullBackOff"
`)
failedPod = &corev1.Pod{
ObjectMeta: metav1.ObjectMeta{Name: "test-taskrun-pod-failure-pod"},
Status: corev1.PodStatus{Conditions: []corev1.PodCondition{{
Type: corev1.PodReady,
Status: "False",
Reason: "PodFailed",
}}},
}
toFailOnPrepareTaskRun = parse.MustParseV1beta1TaskRun(t, `
metadata:
name: test-taskrun-prepare-failure
namespace: foo
spec:
taskRef:
name: test-task
status:
startTime: "2021-12-31T23:59:59Z"
conditions:
- reason: Running
status: Unknown
type: Succeeded
`)
)
for _, tc := range []struct {
name string
tr *v1beta1.TaskRun
task *v1beta1.Task
pod *corev1.Pod
// expectedDurationTags map[string]string
// expectedCountTags map[string]string
expectedCount int64
}{{
name: "to be cancelled taskrun",
tr: toBeCanceledTaskRun,
task: simpleTask,
// expectedCountTags: map[string]string{
// "status": "success",
// },
expectedCount: 0,
}, {
name: "to be timedout taskrun",
tr: toBeTimedOutTaskRun,
task: simpleTask,
// expectedDurationTags: map[string]string{
// "task": "test-task",
// "taskrun": "test-taskrun-timedout",
// "namespace": "foo",
// "status": "failed",
// },
expectedCount: 1,
}, {
name: "pod failure taskrun",
tr: toFailOnPodFailureTaskRun,
task: simpleTask,
pod: failedPod,
// expectedDurationTags: map[string]string{
// "task": "test-task",
// "taskrun": "test-taskrun-pod-failure",
// "namespace": "foo",
// "status": "failed",
// },
expectedCount: 1,
}, {
name: "pod failure taskrun",
tr: toFailOnPrepareTaskRun,
// expectedDurationTags: map[string]string{
// "task": "test-task",
// "taskrun": "test-taskrun-prepare-failure",
// "namespace": "foo",
// "status": "failed",
// },
expectedCount: 1,
}} {
t.Run(tc.name, func(t *testing.T) {
d := test.Data{
TaskRuns: []*v1beta1.TaskRun{tc.tr},
ConfigMaps: []*corev1.ConfigMap{{
ObjectMeta: metav1.ObjectMeta{Namespace: system.Namespace(), Name: config.GetFeatureFlagsConfigName()},
Data: map[string]string{
"metrics.taskrun.level": config.TaskrunLevelAtTaskrun,
},
}},
}
if tc.task != nil {
d.Tasks = []*v1beta1.Task{tc.task}
}
if tc.pod != nil {
d.Pods = []*corev1.Pod{tc.pod}
}

testAssets, cancel := getTaskRunController(t, d)
defer cancel()

metrics, err := taskrunmetrics.NewRecorder(testAssets.Ctx)
if err != nil {
t.Fatalf("NewRecorder: %v", err)
}

createServiceAccount(t, testAssets, "default", tc.tr.Namespace)

// Use the test assets to create a *Reconciler directly for focused testing.
r := &Reconciler{
KubeClientSet: testAssets.Clients.Kube,
PipelineClientSet: testAssets.Clients.Pipeline,
Clock: testClock,
taskRunLister: testAssets.Informers.TaskRun.Lister(),
resourceLister: testAssets.Informers.PipelineResource.Lister(),
limitrangeLister: testAssets.Informers.LimitRange.Lister(),
cloudEventClient: testAssets.Clients.CloudEvents,
metrics: metrics,
entrypointCache: nil, // Not used
pvcHandler: volumeclaim.NewPVCHandler(testAssets.Clients.Kube, testAssets.Logger),
}
_ = r.ReconcileKind(testAssets.Ctx, tc.tr)

expectedCountTags := map[string]string{
"status": "success",
}
wg.Wait()

metricstest.CheckCountData(t, "taskrun_count", expectedCountTags, tc.expectedCount)
})
}
}

0 comments on commit 49224af

Please sign in to comment.