diff --git a/docs/metrics.md b/docs/metrics.md index 984dd2c3bb57..e6fddfd4291c 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -201,14 +201,25 @@ Metrics for the [Four Golden Signals](https://sre.google/sre-book/monitoring-dis Some metric attributes may have high cardinality and are marked with ⚠️ to warn you. You may need to disable this metric or disable the attribute. +#### `cronworkflows_concurrencypolicy_triggered` + +A counter of the number of times a CronWorkflow has triggered its `concurrencyPolicy` to limit the number of workflows running. + +| attribute | explanation | +|-------------|-------------------------------------------| +| `name` | ⚠️ The name of the CronWorkflow | +| `namespace` | The namespace of the CronWorkflow | +| `concurrency_policy` | The concurrency policy which was triggered, will be either `Forbid` or `Replace` | + #### `cronworkflows_triggered_total` -A counter of the number of times a CronWorkflow has been +A counter of the number of times a CronWorkflow has been triggered. +Suppressed runs due to `concurrencyPolicy: Forbid` will not be counted. | attribute | explanation | |-------------|-------------------------------------------| -| `name` | ⚠️ The name of the CronWorkflow. | -| `namespace` | The namespace in which the pod is running | +| `name` | ⚠️ The name of the CronWorkflow | +| `namespace` | The namespace of the CronWorkflow | #### `gauge` @@ -229,8 +240,8 @@ A counter of certain errors incurred by the controller. The currently tracked specific errors are - `OperationPanic` - the controller `panic()` on a programming bug -- `CronWorkflowSubmissionError` - A cron workflow failed submission -- `CronWorkflowSpecError` - A cron workflow has an invalid specification +- `CronWorkflowSubmissionError` - A CronWorkflow failed submission +- `CronWorkflowSpecError` - A CronWorkflow has an invalid specification #### `k8s_request_total` diff --git a/docs/upgrading.md b/docs/upgrading.md index 59b6fd03fc4f..d17f155899a8 100644 --- a/docs/upgrading.md +++ b/docs/upgrading.md @@ -29,6 +29,7 @@ These notes explain the differences in using the Prometheus `/metrics` endpoint The following are new metrics: +* `cronworkflows_concurrencypolicy_triggered` * `cronworkflows_triggered_total` * `is_leader` * `k8s_request_duration` diff --git a/test/e2e/metrics_test.go b/test/e2e/metrics_test.go index 4de7c2d79662..3dbb74ba431b 100644 --- a/test/e2e/metrics_test.go +++ b/test/e2e/metrics_test.go @@ -111,19 +111,37 @@ func (s *MetricsSuite) TestFailedMetric() { }) } -func (s *MetricsSuite) TestCronTriggeredCounter() { +func (s *MetricsSuite) TestCronCountersForbid() { s.Given(). - CronWorkflow(`@testdata/cronworkflow-metrics.yaml`). + CronWorkflow(`@testdata/cronworkflow-metrics-forbid.yaml`). When(). CreateCronWorkflow(). - Wait(1 * time.Minute). // This pattern is used in cron_test.go too + Wait(2 * time.Minute). // This pattern is used in cron_test.go too Then(). ExpectCron(func(t *testing.T, cronWf *wfv1.CronWorkflow) { s.e(s.T()).GET(""). Expect(). Status(200). Body(). - Contains(`cronworkflows_triggered_total{name="test-cron-metric",namespace="argo"} 1`) + Contains(`cronworkflows_triggered_total{name="test-cron-metric-forbid",namespace="argo"} 1`). // 2nd run was Forbid + Contains(`cronworkflows_concurrencypolicy_triggered{concurrency_policy="Forbid",name="test-cron-metric-forbid",namespace="argo"} 1`) + }) +} + +func (s *MetricsSuite) TestCronCountersReplace() { + s.Given(). + CronWorkflow(`@testdata/cronworkflow-metrics-replace.yaml`). + When(). + CreateCronWorkflow(). + Wait(2 * time.Minute). // This pattern is used in cron_test.go too + Then(). + ExpectCron(func(t *testing.T, cronWf *wfv1.CronWorkflow) { + s.e(s.T()).GET(""). + Expect(). + Status(200). + Body(). + Contains(`cronworkflows_triggered_total{name="test-cron-metric-replace",namespace="argo"} 2`). + Contains(`cronworkflows_concurrencypolicy_triggered{concurrency_policy="Replace",name="test-cron-metric-replace",namespace="argo"} 1`) }) } diff --git a/test/e2e/testdata/cronworkflow-metrics.yaml b/test/e2e/testdata/cronworkflow-metrics-forbid.yaml similarity index 61% rename from test/e2e/testdata/cronworkflow-metrics.yaml rename to test/e2e/testdata/cronworkflow-metrics-forbid.yaml index e3fdd9c13e9b..7e4cb97c4ad7 100644 --- a/test/e2e/testdata/cronworkflow-metrics.yaml +++ b/test/e2e/testdata/cronworkflow-metrics-forbid.yaml @@ -1,10 +1,10 @@ apiVersion: argoproj.io/v1alpha1 kind: CronWorkflow metadata: - name: test-cron-metric + name: test-cron-metric-forbid spec: schedule: "* * * * *" - concurrencyPolicy: "Allow" + concurrencyPolicy: "Forbid" startingDeadlineSeconds: 0 workflowSpec: metadata: @@ -12,8 +12,9 @@ spec: workflows.argoproj.io/test: "true" podGC: strategy: OnPodCompletion - entrypoint: whalesay + entrypoint: sleep templates: - - name: whalesay + - name: sleep container: - image: argoproj/argosay:v2 + image: alpine:latest + command: [sh, -c, "sleep 120"] diff --git a/test/e2e/testdata/cronworkflow-metrics-replace.yaml b/test/e2e/testdata/cronworkflow-metrics-replace.yaml new file mode 100644 index 000000000000..7b2a8534b24f --- /dev/null +++ b/test/e2e/testdata/cronworkflow-metrics-replace.yaml @@ -0,0 +1,20 @@ +apiVersion: argoproj.io/v1alpha1 +kind: CronWorkflow +metadata: + name: test-cron-metric-replace +spec: + schedule: "* * * * *" + concurrencyPolicy: "Replace" + startingDeadlineSeconds: 0 + workflowSpec: + metadata: + labels: + workflows.argoproj.io/test: "true" + podGC: + strategy: OnPodCompletion + entrypoint: sleep + templates: + - name: sleep + container: + image: alpine:latest + command: [sh, -c, "sleep 120"] diff --git a/util/telemetry/attributes.go b/util/telemetry/attributes.go index fad80c8bec07..562f92f69a3d 100644 --- a/util/telemetry/attributes.go +++ b/util/telemetry/attributes.go @@ -10,7 +10,8 @@ const ( AttribBuildGitTreeState string = `git_treestate` AttribBuildGitTag string = `git_tag` - AttribCronWFName string = `name` + AttribCronWFName string = `name` + AttribConcurrencyPolicy string = `concurrency_policy` AttribErrorCause string = "cause" diff --git a/workflow/cron/operator.go b/workflow/cron/operator.go index 86a36cf81995..1644153b4c9c 100644 --- a/workflow/cron/operator.go +++ b/workflow/cron/operator.go @@ -86,7 +86,6 @@ func (woc *cronWfOperationCtx) run(ctx context.Context, scheduledRuntime time.Ti defer woc.persistUpdate(ctx) woc.log.Infof("Running %s", woc.name) - woc.metrics.CronWfTrigger(ctx, woc.name, woc.cronWf.ObjectMeta.Namespace) // If the cron workflow has a schedule that was just updated, update its annotation if woc.cronWf.IsUsingNewSchedule() { @@ -114,6 +113,8 @@ func (woc *cronWfOperationCtx) run(ctx context.Context, scheduledRuntime time.Ti return } + woc.metrics.CronWfTrigger(ctx, woc.name, woc.cronWf.ObjectMeta.Namespace) + wf := common.ConvertCronWorkflowToWorkflowWithProperties(woc.cronWf, getChildWorkflowName(woc.cronWf.Name, scheduledRuntime), scheduledRuntime) runWf, err := util.SubmitWorkflow(ctx, woc.wfClient, woc.wfClientset, woc.cronWf.Namespace, wf, &v1alpha1.SubmitOpts{}) @@ -289,11 +290,13 @@ func (woc *cronWfOperationCtx) enforceRuntimePolicy(ctx context.Context) (bool, // Do nothing case v1alpha1.ForbidConcurrent: if len(woc.cronWf.Status.Active) > 0 { + woc.metrics.CronWfPolicy(ctx, woc.name, woc.cronWf.ObjectMeta.Namespace, v1alpha1.ForbidConcurrent) woc.log.Infof("%s has 'ConcurrencyPolicy: Forbid' and has an active Workflow so it was not run", woc.name) return false, nil } case v1alpha1.ReplaceConcurrent: if len(woc.cronWf.Status.Active) > 0 { + woc.metrics.CronWfPolicy(ctx, woc.name, woc.cronWf.ObjectMeta.Namespace, v1alpha1.ReplaceConcurrent) woc.log.Infof("%s has 'ConcurrencyPolicy: Replace' and has active Workflows", woc.name) err := woc.terminateOutstandingWorkflows(ctx) if err != nil { diff --git a/workflow/metrics/counter_cronworkflow_policy.go b/workflow/metrics/counter_cronworkflow_policy.go new file mode 100644 index 000000000000..d77e5172dfeb --- /dev/null +++ b/workflow/metrics/counter_cronworkflow_policy.go @@ -0,0 +1,29 @@ +package metrics + +import ( + "context" + + wfv1 "github.com/argoproj/argo-workflows/v3/pkg/apis/workflow/v1alpha1" + "github.com/argoproj/argo-workflows/v3/util/telemetry" +) + +const ( + nameCronPolicy = `cronworkflows_concurrencypolicy_triggered` +) + +func addCronWfPolicyCounter(_ context.Context, m *Metrics) error { + return m.CreateInstrument(telemetry.Int64Counter, + nameCronPolicy, + "Total number of times CronWorkflow concurrencyPolicy has triggered", + "{cronworkflow}", + telemetry.WithAsBuiltIn(), + ) +} + +func (m *Metrics) CronWfPolicy(ctx context.Context, name, namespace string, policy wfv1.ConcurrencyPolicy) { + m.AddInt(ctx, nameCronPolicy, 1, telemetry.InstAttribs{ + {Name: telemetry.AttribCronWFName, Value: name}, + {Name: telemetry.AttribWorkflowNamespace, Value: namespace}, + {Name: telemetry.AttribConcurrencyPolicy, Value: string(policy)}, + }) +} diff --git a/workflow/metrics/metrics.go b/workflow/metrics/metrics.go index 622b3147a9c4..46f5b6195996 100644 --- a/workflow/metrics/metrics.go +++ b/workflow/metrics/metrics.go @@ -42,6 +42,7 @@ func New(ctx context.Context, serviceName, prometheusName string, config *teleme addPodPendingCounter, addWorkflowPhaseGauge, addCronWfTriggerCounter, + addCronWfPolicyCounter, addWorkflowPhaseCounter, addWorkflowTemplateCounter, addWorkflowTemplateHistogram,