From 602b5dea1835d0621149f5f816203bf14c0b77f0 Mon Sep 17 00:00:00 2001 From: sumeerbhola Date: Wed, 20 Dec 2023 15:36:39 -0500 Subject: [PATCH] storage: disable crashing for single delete callbacks As elaborated in code comments, these can be false positives caused by delete-only compactions. The metrics are not disabled, under the rare chance that we see a problem in a cluster and one of the metrics (if it happens to not be a false positive) gives us a hint about the cause of the problem. For the same reason, we log such callbacks every 5 minutes, so we can correlate the key in such logs with an actual problem. False positives should be rare, especially for the invariant violation callback. Informs https://github.com/cockroachdb/cockroach/issues/115881 Informs https://github.com/cockroachdb/cockroach/issues/114421 Epic: none Release note (ops change): The cluster settings storage.single_delete.crash_on_invariant_violation.enabled and storage.single_delete.crash_on_ineffectual.enabled are disabled and must not be enabled. --- docs/generated/settings/settings.html | 2 - pkg/kv/kvserver/metrics.go | 2 + pkg/storage/engine.go | 4 + pkg/storage/metamorphic/BUILD.bazel | 1 - pkg/storage/metamorphic/generator.go | 16 ---- pkg/storage/pebble.go | 131 +++++++++++++++++++------- 6 files changed, 101 insertions(+), 55 deletions(-) diff --git a/docs/generated/settings/settings.html b/docs/generated/settings/settings.html index 6fc0aa9633d8..9b2f0f86b121 100644 --- a/docs/generated/settings/settings.html +++ b/docs/generated/settings/settings.html @@ -265,8 +265,6 @@
storage.ingest_split.enabled
booleanfalseset to true to use ingest-time splitting to lower write-amplification (experimental)Dedicated/Self-Hosted
storage.max_sync_duration
duration20smaximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crashServerless/Dedicated/Self-Hosted (read-only)
storage.max_sync_duration.fatal.enabled
booleantrueif true, fatal the process when a disk operation exceeds storage.max_sync_durationServerless/Dedicated/Self-Hosted -
storage.single_delete.crash_on_ineffectual.enabled
booleantrueset to true to crash if the single delete was ineffectualDedicated/Self-Hosted -
storage.single_delete.crash_on_invariant_violation.enabled
booleantrueset to true to crash if the single delete invariant is violatedDedicated/Self-Hosted
storage.value_blocks.enabled
booleantrueset to true to enable writing of value blocks in sstablesServerless/Dedicated/Self-Hosted
timeseries.storage.enabled
booleantrueif set, periodic timeseries data is stored within the cluster; disabling is not recommended unless you are storing the data elsewhereDedicated/Self-Hosted
timeseries.storage.resolution_10s.ttl
duration240h0m0sthe maximum age of time series data stored at the 10 second resolution. Data older than this is subject to rollup and deletion.Serverless/Dedicated/Self-Hosted (read-only) diff --git a/pkg/kv/kvserver/metrics.go b/pkg/kv/kvserver/metrics.go index caad08cb418c..3f36362e120e 100644 --- a/pkg/kv/kvserver/metrics.go +++ b/pkg/kv/kvserver/metrics.go @@ -781,12 +781,14 @@ bytes preserved during flushes and compactions over the lifetime of the process. Measurement: "Bytes", Unit: metric.Unit_BYTES, } + // TODO(sumeer): remove, since can fire due to delete-only compactions. metaStorageSingleDelInvariantViolationCount = metric.Metadata{ Name: "storage.single-delete.invariant-violation", Help: "Number of SingleDelete invariant violations", Measurement: "Events", Unit: metric.Unit_COUNT, } + // TODO(sumeer): remove, since can fire due to delete-only compactions. metaStorageSingleDelIneffectualCount = metric.Metadata{ Name: "storage.single-delete.ineffectual", Help: "Number of SingleDeletes that were ineffectual", diff --git a/pkg/storage/engine.go b/pkg/storage/engine.go index 49858ab6f68a..ee04a3caa488 100644 --- a/pkg/storage/engine.go +++ b/pkg/storage/engine.go @@ -1176,9 +1176,13 @@ type Metrics struct { // SingleDelInvariantViolationCount counts the number of times a // SingleDelete was found to violate the invariant that it should only be // used when there is at most one older Set for it to delete. + // + // TODO(sumeer): remove, since can fire due to delete-only compactions. SingleDelInvariantViolationCount int64 // SingleDelIneffectualCount counts the number of times a SingleDelete was // ineffectual, i.e., it was elided without deleting anything. + // + // TODO(sumeer): remove, since can fire due to delete-only compactions. SingleDelIneffectualCount int64 // SharedStorageWriteBytes counts the number of bytes written to shared storage. 
SharedStorageWriteBytes int64 diff --git a/pkg/storage/metamorphic/BUILD.bazel b/pkg/storage/metamorphic/BUILD.bazel index 9dd41dfa462a..c1bd89caabed 100644 --- a/pkg/storage/metamorphic/BUILD.bazel +++ b/pkg/storage/metamorphic/BUILD.bazel @@ -21,7 +21,6 @@ go_library( "//pkg/storage", "//pkg/storage/enginepb", "//pkg/util/hlc", - "//pkg/util/log", "//pkg/util/randutil", "//pkg/util/syncutil", "//pkg/util/uint128", diff --git a/pkg/storage/metamorphic/generator.go b/pkg/storage/metamorphic/generator.go index c83d22e372c2..4545084f9459 100644 --- a/pkg/storage/metamorphic/generator.go +++ b/pkg/storage/metamorphic/generator.go @@ -25,7 +25,6 @@ import ( "github.com/cockroachdb/cockroach/pkg/storage" "github.com/cockroachdb/cockroach/pkg/storage/enginepb" "github.com/cockroachdb/cockroach/pkg/util/hlc" - "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/pebble" "github.com/cockroachdb/pebble/vfs" ) @@ -62,21 +61,6 @@ func (e *engineConfig) create(path string, fs vfs.FS) (storage.Engine, error) { pebbleConfig.Opts.Cache = pebble.NewCache(1 << 20) defer pebbleConfig.Opts.Cache.Unref() - pebbleConfig.Opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) { - // TODO(sumeer): figure out what is causing these callbacks. 
- if false { - ek, ok := storage.DecodeEngineKey(userKey) - if !ok { - log.Fatalf(context.Background(), "unable to decode %s", roachpb.Key(userKey).String()) - } - ltk, err := ek.ToLockTableKey() - if err != nil { - log.Fatalf(context.Background(), "%s", err.Error()) - } - log.Fatalf(context.Background(), "Ineffectual SingleDel on k=%s str=%s txn=%s", - ltk.Key.String(), ltk.Strength.String(), ltk.TxnUUID.String()) - } - } return storage.NewPebble(context.Background(), pebbleConfig) } diff --git a/pkg/storage/pebble.go b/pkg/storage/pebble.go index 9d324e7e846c..519dd4f7602b 100644 --- a/pkg/storage/pebble.go +++ b/pkg/storage/pebble.go @@ -159,20 +159,76 @@ var IngestAsFlushable = settings.RegisterBoolSetting( util.ConstantWithMetamorphicTestBool( "storage.ingest_as_flushable.enabled", true)) +// DO NOT set storage.single_delete.crash_on_invariant_violation.enabled or +// storage.single_delete.crash_on_ineffectual.enabled to true. +// +// Pebble's delete-only compactions can cause a recent RANGEDEL to peek below +// an older SINGLEDEL and delete an arbitrary subset of data below that +// SINGLEDEL. When that SINGLEDEL gets compacted (without the RANGEDEL), any +// of these callbacks can happen, without it being a real correctness problem. +// +// Example 1: +// RANGEDEL [a, c)#10 in L0 +// SINGLEDEL b#5 in L1 +// SET b#3 in L6 +// +// If the L6 file containing the SET is narrow and the L1 file containing the +// SINGLEDEL is wide, a delete-only compaction can remove the file in L6 +// before the SINGLEDEL is compacted down. Then when the SINGLEDEL is +// compacted down, it will not find any SET to delete, resulting in the +// ineffectual callback. +// +// Example 2: +// RANGEDEL [a, z)#60 in L0 +// SINGLEDEL g#50 in L1 +// SET g#40 in L2 +// RANGEDEL [g,h)#30 in L3 +// SET g#20 in L6 +// +// In this example, the two SETs represent the same intent, and the RANGEDELs +// are caused by the CRDB range being dropped. 
That is, the transaction wrote +// the intent once, range was dropped, then added back, which caused the SET +// again, then the transaction committed, causing a SINGLEDEL, and then the +// range was dropped again. The older RANGEDEL can get fragmented due to +// compactions it has been part of. Say this L3 file containing the RANGEDEL +// is very narrow, while the L1, L2, L6 files are wider than the RANGEDEL in +// L0. Then the RANGEDEL in L3 can be dropped using a delete-only compaction, +// resulting in an LSM with state: +// +// RANGEDEL [a, z)#60 in L0 +// SINGLEDEL g#50 in L1 +// SET g#40 in L2 +// SET g#20 in L6 +// +// A multi-level compaction involving L1, L2, L6 will cause the invariant +// violation callback. This example doesn't need multi-level compactions: say +// there was a Pebble snapshot at g#21 preventing g#20 from being dropped when +// it meets g#40 in a compaction. That snapshot will not save RANGEDEL +// [g,h)#30, so we can have: +// +// SINGLEDEL g#50 in L1 +// SET g#40, SET g#20 in L6 +// +// And say the snapshot is removed and then the L1 and L6 compaction happens, +// resulting in the invariant violation callback. +// +// TODO(sumeer): remove these cluster settings or figure out a way to bring +// back some invariant checking. 
+ var SingleDeleteCrashOnInvariantViolation = settings.RegisterBoolSetting( settings.SystemOnly, "storage.single_delete.crash_on_invariant_violation.enabled", "set to true to crash if the single delete invariant is violated", - true, - settings.WithPublic, + false, + settings.WithVisibility(settings.Reserved), ) var SingleDeleteCrashOnIneffectual = settings.RegisterBoolSetting( settings.SystemOnly, "storage.single_delete.crash_on_ineffectual.enabled", "set to true to crash if the single delete was ineffectual", - true, - settings.WithPublic, + false, + settings.WithVisibility(settings.Reserved), ) // ShouldUseEFOS returns true if either of the UseEFOS or UseExciseForSnapshots @@ -875,6 +931,8 @@ type Pebble struct { storeIDPebbleLog *base.StoreIDContainer replayer *replay.WorkloadCollector + + singleDelLogEvery log.EveryN } // WorkloadCollector implements an workloadCollectorGetter and returns the @@ -1162,26 +1220,27 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) { storeProps := computeStoreProperties(ctx, cfg.Dir, opts.ReadOnly, encryptionEnv != nil /* encryptionEnabled */) p = &Pebble{ - FS: opts.FS, - readOnly: opts.ReadOnly, - path: cfg.Dir, - auxDir: auxDir, - ballastPath: ballastPath, - ballastSize: cfg.BallastSize, - maxSize: cfg.MaxSize, - attrs: cfg.Attrs, - properties: storeProps, - settings: cfg.Settings, - encryption: encryptionEnv, - fileLock: opts.Lock, - fileRegistry: fileRegistry, - unencryptedFS: unencryptedFS, - logger: opts.LoggerAndTracer, - logCtx: logCtx, - storeIDPebbleLog: storeIDContainer, - closer: filesystemCloser, - onClose: cfg.onClose, - replayer: replay.NewWorkloadCollector(cfg.StorageConfig.Dir), + FS: opts.FS, + readOnly: opts.ReadOnly, + path: cfg.Dir, + auxDir: auxDir, + ballastPath: ballastPath, + ballastSize: cfg.BallastSize, + maxSize: cfg.MaxSize, + attrs: cfg.Attrs, + properties: storeProps, + settings: cfg.Settings, + encryption: encryptionEnv, + fileLock: opts.Lock, + fileRegistry: 
fileRegistry, + unencryptedFS: unencryptedFS, + logger: opts.LoggerAndTracer, + logCtx: logCtx, + storeIDPebbleLog: storeIDContainer, + closer: filesystemCloser, + onClose: cfg.onClose, + replayer: replay.NewWorkloadCollector(cfg.StorageConfig.Dir), + singleDelLogEvery: log.Every(5 * time.Minute), } // In test builds, add a layer of VFS middleware that ensures users of an // Engine don't try to use the filesystem after the Engine has been closed. @@ -1197,24 +1256,24 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) { } opts.Experimental.SingleDeleteInvariantViolationCallback = func(userKey []byte) { - logFunc := log.Errorf + logFunc := func(ctx context.Context, format string, args ...interface{}) {} if SingleDeleteCrashOnInvariantViolation.Get(&cfg.Settings.SV) { logFunc = log.Fatalf + } else if p.singleDelLogEvery.ShouldLog() { + logFunc = log.Infof } - logFunc(logCtx, "SingleDel invariant violation on key %s", roachpb.Key(userKey)) + logFunc(logCtx, "SingleDel invariant violation callback (can be false positive) on key %s", roachpb.Key(userKey)) atomic.AddInt64(&p.singleDelInvariantViolationCount, 1) } - // TODO(sumeer): remove this nil check once the metamorphic test is fixed, - // and does not need to override. 
- if opts.Experimental.IneffectualSingleDeleteCallback == nil { - opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) { - logFunc := log.Errorf - if SingleDeleteCrashOnIneffectual.Get(&cfg.Settings.SV) { - logFunc = log.Fatalf - } - logFunc(logCtx, "Ineffectual SingleDel on key %s", roachpb.Key(userKey)) - atomic.AddInt64(&p.singleDelIneffectualCount, 1) + opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) { + logFunc := func(ctx context.Context, format string, args ...interface{}) {} + if SingleDeleteCrashOnIneffectual.Get(&cfg.Settings.SV) { + logFunc = log.Fatalf + } else if p.singleDelLogEvery.ShouldLog() { + logFunc = log.Infof } + logFunc(logCtx, "Ineffectual SingleDel callback (can be false positive) on key %s", roachpb.Key(userKey)) + atomic.AddInt64(&p.singleDelIneffectualCount, 1) } // MaxConcurrentCompactions can be set by multiple sources, but all the