From 3375d8743522ecdc0d38d5b9ee8e215e4aa89ab5 Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Wed, 28 Jul 2021 14:12:24 -0400 Subject: [PATCH] sql: add metric for schema change job user/database errors Fixes: #68111 Previously, we had no way of tracking the reason a schema change job failed in our metrics. This was inadequate because we had no way of knowing why schema jobs failed. To address this, this patch will add metrics for tracking failures due to constraint violations and database errors. Release note (sql change): Add a new metrics to track schema job failure (sql.schema_changer.errors.all, sql.schema_changer.errors.constraint_violation, sql.schema_changer.errors.uncategorized), errors inside the crdb_internal.feature_usage table. --- .../logictest/testdata/logic_test/alter_table | 16 +++++++++++++ pkg/sql/schema_changer.go | 23 +++++++++++++++++++ pkg/sql/schema_changer_metrics.go | 10 +++++++- pkg/sql/schema_changer_test.go | 7 ++++++ pkg/sql/sqltelemetry/schema.go | 6 +++++ 5 files changed, 61 insertions(+), 1 deletion(-) diff --git a/pkg/sql/logictest/testdata/logic_test/alter_table b/pkg/sql/logictest/testdata/logic_test/alter_table index e25e089e645e..fd3267debb7b 100644 --- a/pkg/sql/logictest/testdata/logic_test/alter_table +++ b/pkg/sql/logictest/testdata/logic_test/alter_table @@ -1860,3 +1860,19 @@ CREATE TABLE public.t67234 ( FAMILY fam_0_k_a_b (k, a, b), CONSTRAINT t67234_c2 UNIQUE WITHOUT INDEX (b) WHERE a > 0:::INT8 ) + +# Sanity: Check the number of user errors and +# database errors in the test. +query I +SELECT count(usage_count) + FROM crdb_internal.feature_usage + WHERE feature_name = 'sql.schema_changer.errors.constraint_violation' and usage_count >= 13; +---- +1 + +query I +SELECT count(usage_count) + FROM crdb_internal.feature_usage + WHERE feature_name = 'sql.schema_changer.errors.uncategorized' and usage_count >= 4; +---- +1 diff --git a/pkg/sql/schema_changer.go b/pkg/sql/schema_changer.go index c2ba284096b0..664153387606 100644 --- a/pkg/sql/schema_changer.go +++ b/pkg/sql/schema_changer.go @@ -26,6 +26,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/kv/kvclient/rangecache" "github.com/cockroachdb/cockroach/pkg/roachpb" "github.com/cockroachdb/cockroach/pkg/security" + "github.com/cockroachdb/cockroach/pkg/server/telemetry" "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/sql/catalog" @@ -137,6 +138,18 @@ func NewSchemaChangerForTesting( } } +// IsConstraintError returns true if the error is considered as +// an error introduced by the user. For example a constraint +// violation. +func IsConstraintError(err error) bool { + pgCode := pgerror.GetPGCode(err) + return pgCode == pgcode.CheckViolation || + pgCode == pgcode.UniqueViolation || + pgCode == pgcode.ForeignKeyViolation || + pgCode == pgcode.NotNullViolation || + pgCode == pgcode.IntegrityConstraintViolation +} + // IsPermanentSchemaChangeError returns true if the error results in // a permanent failure of a schema change. This function is a allowlist // instead of a blocklist: only known safe errors are confirmed to not be @@ -2114,10 +2127,20 @@ func (r schemaChangeResumer) Resume(ctx context.Context, execCtx interface{}) er // including the schema change not having the first mutation in line. log.Warningf(ctx, "error while running schema change, retrying: %v", scErr) sc.metrics.RetryErrors.Inc(1) + if IsConstraintError(scErr) { + telemetry.Inc(sc.metrics.ConstraintErrors) + } else { + telemetry.Inc(sc.metrics.UncategorizedErrors) + } default: if ctx.Err() == nil { sc.metrics.PermanentErrors.Inc(1) } + if IsConstraintError(scErr) { + telemetry.Inc(sc.metrics.ConstraintErrors) + } else { + telemetry.Inc(sc.metrics.UncategorizedErrors) + } // All other errors lead to a failed job. return scErr } diff --git a/pkg/sql/schema_changer_metrics.go b/pkg/sql/schema_changer_metrics.go index 0d78fd88363c..bbf4e10a042b 100644 --- a/pkg/sql/schema_changer_metrics.go +++ b/pkg/sql/schema_changer_metrics.go @@ -10,7 +10,11 @@ package sql -import "github.com/cockroachdb/cockroach/pkg/util/metric" +import ( + "github.com/cockroachdb/cockroach/pkg/server/telemetry" + "github.com/cockroachdb/cockroach/pkg/sql/sqltelemetry" + "github.com/cockroachdb/cockroach/pkg/util/metric" +) // TODO(ajwerner): Add many more metrics. @@ -47,6 +51,8 @@ type SchemaChangerMetrics struct { Successes *metric.Counter RetryErrors *metric.Counter PermanentErrors *metric.Counter + ConstraintErrors telemetry.Counter + UncategorizedErrors telemetry.Counter } // MetricStruct makes SchemaChangerMetrics a metric.Struct. @@ -61,5 +67,7 @@ func NewSchemaChangerMetrics() *SchemaChangerMetrics { Successes: metric.NewCounter(metaSuccesses), RetryErrors: metric.NewCounter(metaRetryErrors), PermanentErrors: metric.NewCounter(metaPermanentErrors), + ConstraintErrors: sqltelemetry.SchemaChangeErrorCounter("constraint_violation"), + UncategorizedErrors: sqltelemetry.SchemaChangeErrorCounter("uncategorized"), } } diff --git a/pkg/sql/schema_changer_test.go b/pkg/sql/schema_changer_test.go index e0f19d21dc21..18748edd0d8e 100644 --- a/pkg/sql/schema_changer_test.go +++ b/pkg/sql/schema_changer_test.go @@ -6297,6 +6297,13 @@ SELECT value WHERE name = 'sql.schema_changer.permanent_errors'; `).Scan(&permanentErrors)) require.Equal(t, 1, permanentErrors) + var userErrors int + require.NoError(t, sqlDB.QueryRow(` +SELECT usage_count + FROM crdb_internal.feature_usage + WHERE feature_name = 'sql.schema_changer.errors.constraint_violation'; +`).Scan(&userErrors)) + require.GreaterOrEqual(t, userErrors, 1) } t.Run("error-before-backfill", func(t *testing.T) { diff --git a/pkg/sql/sqltelemetry/schema.go b/pkg/sql/sqltelemetry/schema.go index b4c1c80cea28..308262775111 100644 --- a/pkg/sql/sqltelemetry/schema.go +++ b/pkg/sql/sqltelemetry/schema.go @@ -165,3 +165,9 @@ var CreateUnloggedTableCounter = telemetry.GetCounterOnce("sql.schema.create_unl // SchemaRefreshMaterializedView is to be incremented every time a materialized // view is refreshed. var SchemaRefreshMaterializedView = telemetry.GetCounterOnce("sql.schema.refresh_materialized_view") + +// SchemaChangeErrorCounter is to be incremented for different types +// of errors. +func SchemaChangeErrorCounter(typ string) telemetry.Counter { + return telemetry.GetCounter(fmt.Sprintf("sql.schema_changer.errors.%s", typ)) +}