From d8305f230ee0e4d4ff71fbd3e76d065241a24b41 Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Wed, 28 Jul 2021 14:12:24 -0400 Subject: [PATCH] sql: add metric for schema change job user/database errors Fixes: #68111 Previously, we had no way of tracking the reason a schema change job failed in our metrics. This was inadequate because we had no way of knowing why schema jobs failed. To address this, this patch will add metrics for tracking failures due to constraint violations and database errors. Release note (sql change): Add a new metrics to track schema job failure (sql.schema_changer.errors.all, sql.schema_changer.errors.constraint_violation, sql.schema_changer.errors.uncategorized), errors inside the crdb_internal.feature_usage table. --- .../logictest/testdata/logic_test/alter_table | 16 +++++++++++++ pkg/sql/schema_changer.go | 23 +++++++++++++++++++ pkg/sql/schema_changer_metrics.go | 10 +++++++- pkg/sql/schema_changer_test.go | 7 ++++++ pkg/sql/sqltelemetry/schema.go | 6 +++++ 5 files changed, 61 insertions(+), 1 deletion(-) diff --git a/pkg/sql/logictest/testdata/logic_test/alter_table b/pkg/sql/logictest/testdata/logic_test/alter_table index cb620c1b9966..55b85dbc9e63 100644 --- a/pkg/sql/logictest/testdata/logic_test/alter_table +++ b/pkg/sql/logictest/testdata/logic_test/alter_table @@ -1833,3 +1833,19 @@ CREATE TABLE public.t67234 ( FAMILY fam_0_k_a_b (k, a, b), CONSTRAINT t67234_c2 UNIQUE WITHOUT INDEX (b) WHERE a > 0:::INT8 ) + +# Sanity: Check the number of user errors and +# database errors in the test. +query I +SELECT count(usage_count) + FROM crdb_internal.feature_usage + WHERE feature_name = 'sql.schema_changer.errors.constraint_violation' and usage_count >= 13; +---- +1 + +query I +SELECT count(usage_count) + FROM crdb_internal.feature_usage + WHERE feature_name = 'sql.schema_changer.errors.uncategorized' and usage_count >= 4; +---- +1 diff --git a/pkg/sql/schema_changer.go b/pkg/sql/schema_changer.go index edec06905c2d..341b4bc1224d 100644 --- a/pkg/sql/schema_changer.go +++ b/pkg/sql/schema_changer.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangecache" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog" @@ -138,6 +139,18 @@ func NewSchemaChangerForTesting( } } +// IsConstraintError returns true if the error is considered as +// an error introduced by the user. For example a constraint +// violation. +func IsConstraintError(err error) bool { + pgCode := pgerror.GetPGCode(err) + return pgCode == pgcode.CheckViolation || + pgCode == pgcode.UniqueViolation || + pgCode == pgcode.ForeignKeyViolation || + pgCode == pgcode.NotNullViolation || + pgCode == pgcode.IntegrityConstraintViolation +} + // IsPermanentSchemaChangeError returns true if the error results in // a permanent failure of a schema change. This function is a allowlist // instead of a blocklist: only known safe errors are confirmed to not be @@ -2192,10 +2205,20 @@ func (r schemaChangeResumer) Resume(ctx context.Context, execCtx interface{}) er // including the schema change not having the first mutation in line. log.Warningf(ctx, "error while running schema change, retrying: %v", scErr) sc.metrics.RetryErrors.Inc(1) + if IsConstraintError(scErr) { + telemetry.Inc(sc.metrics.ConstraintErrors) + } else { + telemetry.Inc(sc.metrics.UncategorizedErrors) + } default: if ctx.Err() == nil { sc.metrics.PermanentErrors.Inc(1) } + if IsConstraintError(scErr) { + telemetry.Inc(sc.metrics.ConstraintErrors) + } else { + telemetry.Inc(sc.metrics.UncategorizedErrors) + } // All other errors lead to a failed job. return scErr } diff --git a/pkg/sql/schema_changer_metrics.go b/pkg/sql/schema_changer_metrics.go index 0d78fd88363c..bbf4e10a042b 100644 --- a/pkg/sql/schema_changer_metrics.go +++ b/pkg/sql/schema_changer_metrics.go @@ -10,7 +10,11 @@ package sql -import "github.com/cockroachdb/cockroach/pkg/util/metric" +import ( + "github.com/cockroachdb/cockroach/pkg/server/telemetry" + "github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry" + "github.com/cockroachdb/cockroach/pkg/util/metric" +) // TODO(ajwerner): Add many more metrics. @@ -47,6 +51,8 @@ type SchemaChangerMetrics struct { Successes *metric.Counter RetryErrors *metric.Counter PermanentErrors *metric.Counter + ConstraintErrors telemetry.Counter + UncategorizedErrors telemetry.Counter } // MetricStruct makes SchemaChangerMetrics a metric.Struct. @@ -61,5 +67,7 @@ func NewSchemaChangerMetrics() *SchemaChangerMetrics { Successes: metric.NewCounter(metaSuccesses), RetryErrors: metric.NewCounter(metaRetryErrors), PermanentErrors: metric.NewCounter(metaPermanentErrors), + ConstraintErrors: sqltelemetry.SchemaChangeErrorCounter("constraint_violation"), + UncategorizedErrors: sqltelemetry.SchemaChangeErrorCounter("uncategorized"), } } diff --git a/pkg/sql/schema_changer_test.go b/pkg/sql/schema_changer_test.go index c01c9a865ad6..05e4c097cb74 100644 --- a/pkg/sql/schema_changer_test.go +++ b/pkg/sql/schema_changer_test.go @@ -6282,6 +6282,13 @@ SELECT value WHERE name = 'sql.schema_changer.permanent_errors'; `).Scan(&permanentErrors)) require.Equal(t, 1, permanentErrors) + var userErrors int + require.NoError(t, sqlDB.QueryRow(` +SELECT usage_count + FROM crdb_internal.feature_usage + WHERE feature_name = 'sql.schema_changer.errors.constraint_violation'; +`).Scan(&userErrors)) + require.GreaterOrEqual(t, userErrors, 1) } t.Run("error-before-backfill", func(t *testing.T) { diff --git a/pkg/sql/sqltelemetry/schema.go b/pkg/sql/sqltelemetry/schema.go index 8f5967a180c8..45a9f51df404 100644 --- a/pkg/sql/sqltelemetry/schema.go +++ b/pkg/sql/sqltelemetry/schema.go @@ -160,3 +160,9 @@ var CreateUnloggedTableCounter = telemetry.GetCounterOnce("sql.schema.create_unl // SchemaRefreshMaterializedView is to be incremented every time a materialized // view is refreshed. var SchemaRefreshMaterializedView = telemetry.GetCounterOnce("sql.schema.refresh_materialized_view") + +// SchemaChangeErrorCounter is to be incremented for different types +// of errors. +func SchemaChangeErrorCounter(typ string) telemetry.Counter { + return telemetry.GetCounter(fmt.Sprintf("sql.schema_changer.errors.%s", typ)) +}