From a81afb06de23034ff5261990a36f3077774ae37a Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Wed, 3 Mar 2021 17:05:39 -0500 Subject: [PATCH] jobs: add job metrics per-type to track success, failure, and cancel Fixes: #59711 Previously, there were only over all counters tracking how many jobs were completed, cancelled, or failed. This was inadequate because it didn't make it easy to tell in aggregate what job types they were. To address this, this patch will add counters for different job types for tracking success, failure, and cancellation. Release justification: Low risk change only adding a metric inside the crdb_internal.feature_usage table Release note: None --- pkg/jobs/metrics.go | 37 +++++++++++++++++++ pkg/jobs/registry.go | 4 ++ .../logictest/testdata/logic_test/alter_table | 11 ++++++ .../testdata/logic_test/distsql_stats | 15 ++++++++ pkg/sql/logictest/testdata/logic_test/jobs | 11 ++++++ pkg/sql/testdata/telemetry/schema | 1 + 6 files changed, 79 insertions(+) diff --git a/pkg/jobs/metrics.go b/pkg/jobs/metrics.go index 064ef97bdbd8..6c1c72a20793 100644 --- a/pkg/jobs/metrics.go +++ b/pkg/jobs/metrics.go @@ -16,6 +16,7 @@ import ( "time" "github.com/cockroachdb/cockroach/pkg/jobs/jobspb" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/util/metric" io_prometheus_client "github.com/prometheus/client_model/go" ) @@ -150,3 +151,39 @@ func (m *Metrics) init(histogramWindowInterval time.Duration) { // MakeChangefeedMetricsHook allows for registration of changefeed metrics from // ccl code. var MakeChangefeedMetricsHook func(time.Duration) metric.Struct + +// JobTelemetryMetrics is a telemetry metrics for individual job types. +type JobTelemetryMetrics struct { + Successful telemetry.Counter + Failed telemetry.Counter + Canceled telemetry.Counter +} + +// newJobTelemetryMetrics creates a new JobTelemetryMetrics object +// for a given job type name. +func newJobTelemetryMetrics(jobName string) *JobTelemetryMetrics { + return &JobTelemetryMetrics{ + Successful: telemetry.GetCounterOnce(fmt.Sprintf("sql.schema.job.%s.successful", jobName)), + Failed: telemetry.GetCounterOnce(fmt.Sprintf("sql.schema.job.%s.failed", jobName)), + Canceled: telemetry.GetCounterOnce(fmt.Sprintf("sql.schema.job.%s.canceled", jobName)), + } +} + +// getJobTelemetryMetricsArray initializes an array of job related telemetry +// metrics +func getJobTelemetryMetricsArray() [jobspb.NumJobTypes]*JobTelemetryMetrics { + var metrics [jobspb.NumJobTypes]*JobTelemetryMetrics + for i := 0; i < jobspb.NumJobTypes; i++ { + jt := jobspb.Type(i) + if jt == jobspb.TypeUnspecified { // do not track TypeUnspecified + continue + } + typeStr := strings.ToLower(strings.Replace(jt.String(), " ", "_", -1)) + metrics[i] = newJobTelemetryMetrics(typeStr) + } + return metrics +} + +// TelemetryMetrics contains telemetry metrics for different +// job types. +var TelemetryMetrics = getJobTelemetryMetricsArray() diff --git a/pkg/jobs/registry.go b/pkg/jobs/registry.go index 101cf0a3fb6e..cb02b5670c78 100644 --- a/pkg/jobs/registry.go +++ b/pkg/jobs/registry.go @@ -24,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/keys" "github.com/cockroachdb/cockroach/pkg/kv" "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog" @@ -1218,6 +1219,7 @@ func (r *Registry) stepThroughStateMachine( // restarted during the next adopt loop and reverting will be retried. return errors.Wrapf(err, "job %d: could not mark as canceled: %v", job.ID(), jobErr) } + telemetry.Inc(TelemetryMetrics[jobType].Canceled) return errors.WithSecondaryError(errors.Errorf("job %s", status), jobErr) case StatusSucceeded: if jobErr != nil { @@ -1232,6 +1234,7 @@ func (r *Registry) stepThroughStateMachine( // better. return r.stepThroughStateMachine(ctx, execCtx, resumer, job, StatusReverting, errors.Wrapf(err, "could not mark job %d as succeeded", job.ID())) } + telemetry.Inc(TelemetryMetrics[jobType].Successful) return nil case StatusReverting: if err := job.reverted(ctx, nil /* txn */, jobErr, nil /* fn */); err != nil { @@ -1286,6 +1289,7 @@ func (r *Registry) stepThroughStateMachine( // restarted during the next adopt loop and reverting will be retried. return errors.Wrapf(err, "job %d: could not mark as failed: %s", job.ID(), jobErr) } + telemetry.Inc(TelemetryMetrics[jobType].Failed) return jobErr default: return errors.NewAssertionErrorWithWrappedErrf(jobErr, diff --git a/pkg/sql/logictest/testdata/logic_test/alter_table b/pkg/sql/logictest/testdata/logic_test/alter_table index 0063789d8011..180cdb78d028 100644 --- a/pkg/sql/logictest/testdata/logic_test/alter_table +++ b/pkg/sql/logictest/testdata/logic_test/alter_table @@ -1688,3 +1688,14 @@ SELECT count(descriptor_id) WHERE descriptor_id = ('test.public.t45985'::REGCLASS)::INT8; ---- 0 + +# Validate that the schema_change_successful metric +query T +SELECT feature_name FROM crdb_internal.feature_usage +WHERE feature_name IN ('sql.schema.job.schema_change.successful', +'sql.schema.job.schema_change.failed') AND +usage_count > 0 +ORDER BY feature_name DESC +---- +sql.schema.job.schema_change.successful +sql.schema.job.schema_change.failed diff --git a/pkg/sql/logictest/testdata/logic_test/distsql_stats b/pkg/sql/logictest/testdata/logic_test/distsql_stats index cfb8ce4f3eb7..3cfe4f12a2f5 100644 --- a/pkg/sql/logictest/testdata/logic_test/distsql_stats +++ b/pkg/sql/logictest/testdata/logic_test/distsql_stats @@ -1065,3 +1065,18 @@ SHOW STATISTICS USING JSON FOR TABLE greeting_stats statement ok ALTER TABLE greeting_stats INJECT STATISTICS '$stats' + +# Validate that the schema_change_successful metric +query T +SELECT feature_name FROM crdb_internal.feature_usage +WHERE feature_name in ('sql.schema.job.typedesc_schema_change.successful', +'sql.schema.job.schema_change.successful', +'sql.schema.job.create_stats.successful', +'sql.schema.job.auto_create_stats.successful') AND +usage_count > 0 +ORDER BY feature_name DESC +---- +sql.schema.job.typedesc_schema_change.successful +sql.schema.job.schema_change.successful +sql.schema.job.create_stats.successful +sql.schema.job.auto_create_stats.successful diff --git a/pkg/sql/logictest/testdata/logic_test/jobs b/pkg/sql/logictest/testdata/logic_test/jobs index 21214acf5197..8266571ede86 100644 --- a/pkg/sql/logictest/testdata/logic_test/jobs +++ b/pkg/sql/logictest/testdata/logic_test/jobs @@ -130,3 +130,14 @@ user testuser # testuser should no longer have the ability to control jobs. statement error pq: user testuser does not have CONTROLJOB privilege PAUSE JOB (SELECT job_id FROM [SHOW JOBS] WHERE user_name = 'testuser2' AND job_type = 'SCHEMA CHANGE GC') + +user root + +# Validate that the schema_change_successful metric +query T +SELECT feature_name FROM crdb_internal.feature_usage +WHERE feature_name in ('sql.schema.job.schema_change.successful') AND +usage_count > 0 +ORDER BY feature_name DESC +---- +sql.schema.job.schema_change.successful diff --git a/pkg/sql/testdata/telemetry/schema b/pkg/sql/testdata/telemetry/schema index 3563a15fb29d..48f1c7240b61 100644 --- a/pkg/sql/testdata/telemetry/schema +++ b/pkg/sql/testdata/telemetry/schema @@ -55,6 +55,7 @@ sql.schema.alter_table sql.schema.alter_table.add_column sql.schema.alter_table.add_column.references sql.schema.alter_table.add_constraint +sql.schema.job.schema_change.successful sql.schema.new_column_type.int8 schema