Merge #116889
116889: storage: disable crashing for single delete callbacks r=sumeerbhola a=sumeerbhola

As elaborated in the code comments, these callbacks can be false positives caused by delete-only compactions. The metrics are not disabled, on the off chance that we see a problem in a cluster and one of the metrics (if it happens to not be a false positive) gives us a hint about the cause. For the same reason, we log such callbacks every 5 minutes, so that the key in such log entries can be correlated with an actual problem (see the sketch after the release note below). False positives should be rare, especially for the invariant violation callback.

Informs #115881
Informs #114421

Epic: none

Release note (ops change): The cluster settings
storage.single_delete.crash_on_invariant_violation.enabled and storage.single_delete.crash_on_ineffectual.enabled are disabled and must not be enabled.
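
For illustration only (not part of this commit): a minimal sketch of the every-5-minutes logging pattern referenced above, assuming cockroach's util/log rate limiter (log.Every / EveryN.ShouldLog, as used in the pebble.go change below); the rateLimitedReporter type and its names are hypothetical.

```go
// A minimal sketch of rate-limited callback logging, assuming
// cockroach's util/log package; rateLimitedReporter is a made-up name.
package main

import (
	"context"
	"time"

	"github.com/cockroachdb/cockroach/pkg/util/log"
)

type rateLimitedReporter struct {
	every log.EveryN // permits at most one log entry per interval
}

// report logs a (possibly false-positive) callback key at most once per
// interval, keeping logs quiet while preserving a key that can be
// correlated with any real problem.
func (r *rateLimitedReporter) report(ctx context.Context, userKey []byte) {
	if r.every.ShouldLog() {
		log.Infof(ctx, "SingleDel callback (can be false positive) on key %x", userKey)
	}
}

func main() {
	r := rateLimitedReporter{every: log.Every(5 * time.Minute)}
	r.report(context.Background(), []byte("k"))
}
```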

Co-authored-by: sumeerbhola <sumeer@cockroachlabs.com>
craig[bot] and sumeerbhola committed Dec 21, 2023
2 parents caf7928 + c1eac7f commit 4870247
Showing 6 changed files with 101 additions and 55 deletions.
2 changes: 0 additions & 2 deletions docs/generated/settings/settings.html
@@ -269,8 +269,6 @@
<tr><td><div id="setting-storage-ingest-split-enabled" class="anchored"><code>storage.ingest_split.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>set to true to use ingest-time splitting to lower write-amplification (experimental)</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-max-sync-duration" class="anchored"><code>storage.max_sync_duration</code></div></td><td>duration</td><td><code>20s</code></td><td>maximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crash</td><td>Serverless/Dedicated/Self-Hosted (read-only)</td></tr>
<tr><td><div id="setting-storage-max-sync-duration-fatal-enabled" class="anchored"><code>storage.max_sync_duration.fatal.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if true, fatal the process when a disk operation exceeds storage.max_sync_duration</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-single-delete-crash-on-ineffectual-enabled" class="anchored"><code>storage.single_delete.crash_on_ineffectual.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to crash if the single delete was ineffectual</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-single-delete-crash-on-invariant-violation-enabled" class="anchored"><code>storage.single_delete.crash_on_invariant_violation.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to crash if the single delete invariant is violated</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-value-blocks-enabled" class="anchored"><code>storage.value_blocks.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to enable writing of value blocks in sstables</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-timeseries-storage-enabled" class="anchored"><code>timeseries.storage.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, periodic timeseries data is stored within the cluster; disabling is not recommended unless you are storing the data elsewhere</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-timeseries-storage-resolution-10s-ttl" class="anchored"><code>timeseries.storage.resolution_10s.ttl</code></div></td><td>duration</td><td><code>240h0m0s</code></td><td>the maximum age of time series data stored at the 10 second resolution. Data older than this is subject to rollup and deletion.</td><td>Serverless/Dedicated/Self-Hosted (read-only)</td></tr>
2 changes: 2 additions & 0 deletions pkg/kv/kvserver/metrics.go
@@ -783,12 +783,14 @@ bytes preserved during flushes and compactions over the lifetime of the process.
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
+// TODO(sumeer): remove, since it can fire due to delete-only compactions.
metaStorageSingleDelInvariantViolationCount = metric.Metadata{
Name: "storage.single-delete.invariant-violation",
Help: "Number of SingleDelete invariant violations",
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
+// TODO(sumeer): remove, since it can fire due to delete-only compactions.
metaStorageSingleDelIneffectualCount = metric.Metadata{
Name: "storage.single-delete.ineffectual",
Help: "Number of SingleDeletes that were ineffectual",
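
As an aside (a hedged sketch, not part of this diff): metric.Metadata values like the two above are typically turned into live metrics with cockroach's util/metric constructors such as metric.NewCounter. The metaExample variable name below is mine; its fields are copied from the metadata above.

```go
// A minimal sketch, assuming cockroach's util/metric package.
package main

import (
	"fmt"

	"github.com/cockroachdb/cockroach/pkg/util/metric"
)

var metaExample = metric.Metadata{
	Name:        "storage.single-delete.ineffectual",
	Help:        "Number of SingleDeletes that were ineffectual",
	Measurement: "Events",
	Unit:        metric.Unit_COUNT,
}

func main() {
	c := metric.NewCounter(metaExample) // in real code, added to a metric.Registry
	c.Inc(1)                            // e.g., fed from the engine's ineffectual count
	fmt.Println(c.Count())
}
```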
4 changes: 4 additions & 0 deletions pkg/storage/engine.go
@@ -1196,9 +1196,13 @@ type Metrics struct {
// SingleDelInvariantViolationCount counts the number of times a
// SingleDelete was found to violate the invariant that it should only be
// used when there is at most one older Set for it to delete.
+//
+// TODO(sumeer): remove, since it can fire due to delete-only compactions.
SingleDelInvariantViolationCount int64
// SingleDelIneffectualCount counts the number of times a SingleDelete was
// ineffectual, i.e., it was elided without deleting anything.
+//
+// TODO(sumeer): remove, since it can fire due to delete-only compactions.
SingleDelIneffectualCount int64
// SharedStorageWriteBytes counts the number of bytes written to shared storage.
SharedStorageWriteBytes int64
1 change: 0 additions & 1 deletion pkg/storage/metamorphic/BUILD.bazel
@@ -22,7 +22,6 @@ go_library(
"//pkg/storage",
"//pkg/storage/enginepb",
"//pkg/util/hlc",
"//pkg/util/log",
"//pkg/util/randutil",
"//pkg/util/syncutil",
"//pkg/util/uint128",
16 changes: 0 additions & 16 deletions pkg/storage/metamorphic/generator.go
@@ -25,7 +25,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/vfs"
)
@@ -62,21 +61,6 @@ func (e *engineConfig) create(path string, fs vfs.FS) (storage.Engine, error) {
pebbleConfig.Opts.Cache = pebble.NewCache(1 << 20)
defer pebbleConfig.Opts.Cache.Unref()

-pebbleConfig.Opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) {
-// TODO(sumeer): figure out what is causing these callbacks.
-if false {
-ek, ok := storage.DecodeEngineKey(userKey)
-if !ok {
-log.Fatalf(context.Background(), "unable to decode %s", roachpb.Key(userKey).String())
-}
-ltk, err := ek.ToLockTableKey()
-if err != nil {
-log.Fatalf(context.Background(), "%s", err.Error())
-}
-log.Fatalf(context.Background(), "Ineffectual SingleDel on k=%s str=%s txn=%s",
-ltk.Key.String(), ltk.Strength.String(), ltk.TxnUUID.String())
-}
-}
return storage.NewPebble(context.Background(), pebbleConfig)
}

131 changes: 95 additions & 36 deletions pkg/storage/pebble.go
@@ -165,20 +165,76 @@ var IngestAsFlushable = settings.RegisterBoolSetting(
util.ConstantWithMetamorphicTestBool(
"storage.ingest_as_flushable.enabled", true))

+// DO NOT set storage.single_delete.crash_on_invariant_violation.enabled or
+// storage.single_delete.crash_on_ineffectual.enabled to true.
+//
+// Pebble's delete-only compactions can cause a recent RANGEDEL to peek below
+// an older SINGLEDEL and delete an arbitrary subset of data below that
+// SINGLEDEL. When that SINGLEDEL gets compacted (without the RANGEDEL), any
+// of these callbacks can happen, without it being a real correctness problem.
+//
+// Example 1:
+// RANGEDEL [a, c)#10 in L0
+// SINGLEDEL b#5 in L1
+// SET b#3 in L6
+//
+// If the L6 file containing the SET is narrow and the L1 file containing the
+// SINGLEDEL is wide, a delete-only compaction can remove the file in L6
+// before the SINGLEDEL is compacted down. Then when the SINGLEDEL is
+// compacted down, it will not find any SET to delete, resulting in the
+// ineffectual callback.
+//
+// Example 2:
+// RANGEDEL [a, z)#60 in L0
+// SINGLEDEL g#50 in L1
+// SET g#40 in L2
+// RANGEDEL [g,h)#30 in L3
+// SET g#20 in L6
+//
+// In this example, the two SETs represent the same intent, and the RANGEDELs
+// are caused by the CRDB range being dropped. That is, the transaction wrote
+// the intent once, the range was dropped and then added back, which caused
+// the SET again; then the transaction committed, causing a SINGLEDEL, and
+// then the range was dropped again. The older RANGEDEL can get fragmented
+// due to compactions it has been part of. Say this L3 file containing the
+// RANGEDEL is very narrow, while the L1, L2, L6 files are wider than the
+// RANGEDEL in L0. Then the RANGEDEL in L3 can be dropped using a delete-only
+// compaction, resulting in an LSM with state:
+//
+// RANGEDEL [a, z)#60 in L0
+// SINGLEDEL g#50 in L1
+// SET g#40 in L2
+// SET g#20 in L6
+//
+// A multi-level compaction involving L1, L2, L6 will cause the invariant
+// violation callback. This example doesn't need multi-level compactions: say
+// there was a Pebble snapshot at g#21 preventing g#20 from being dropped when
+// it meets g#40 in a compaction. That snapshot will not save RANGEDEL
+// [g,h)#30, so we can have:
+//
+// SINGLEDEL g#50 in L1
+// SET g#40, SET g#20 in L6
+//
+// And say the snapshot is removed and then the L1 and L6 compaction happens,
+// resulting in the invariant violation callback.
+//
+// TODO(sumeer): remove these cluster settings or figure out a way to bring
+// back some invariant checking.

var SingleDeleteCrashOnInvariantViolation = settings.RegisterBoolSetting(
settings.SystemOnly,
"storage.single_delete.crash_on_invariant_violation.enabled",
"set to true to crash if the single delete invariant is violated",
-true,
-settings.WithPublic,
+false,
+settings.WithVisibility(settings.Reserved),
)

var SingleDeleteCrashOnIneffectual = settings.RegisterBoolSetting(
settings.SystemOnly,
"storage.single_delete.crash_on_ineffectual.enabled",
"set to true to crash if the single delete was ineffectual",
-true,
-settings.WithPublic,
+false,
+settings.WithVisibility(settings.Reserved),
)

// ShouldUseEFOS returns true if either of the UseEFOS or UseExciseForSnapshots
@@ -885,6 +941,8 @@ type Pebble struct {

storeIDPebbleLog *base.StoreIDContainer
replayer *replay.WorkloadCollector

+singleDelLogEvery log.EveryN
}

// WorkloadCollector implements a workloadCollectorGetter and returns the
@@ -1196,26 +1254,27 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) {
storeProps := computeStoreProperties(ctx, cfg.Dir, opts.ReadOnly, encryptionEnv != nil /* encryptionEnabled */)

p = &Pebble{
-FS: opts.FS,
-readOnly: opts.ReadOnly,
-path: cfg.Dir,
-auxDir: auxDir,
-ballastPath: ballastPath,
-ballastSize: cfg.BallastSize,
-maxSize: cfg.MaxSize,
-attrs: cfg.Attrs,
-properties: storeProps,
-settings: cfg.Settings,
-encryption: encryptionEnv,
-fileLock: opts.Lock,
-fileRegistry: fileRegistry,
-unencryptedFS: unencryptedFS,
-logger: opts.LoggerAndTracer,
-logCtx: logCtx,
-storeIDPebbleLog: storeIDContainer,
-closer: filesystemCloser,
-onClose: cfg.onClose,
-replayer: replay.NewWorkloadCollector(cfg.StorageConfig.Dir),
+FS: opts.FS,
+readOnly: opts.ReadOnly,
+path: cfg.Dir,
+auxDir: auxDir,
+ballastPath: ballastPath,
+ballastSize: cfg.BallastSize,
+maxSize: cfg.MaxSize,
+attrs: cfg.Attrs,
+properties: storeProps,
+settings: cfg.Settings,
+encryption: encryptionEnv,
+fileLock: opts.Lock,
+fileRegistry: fileRegistry,
+unencryptedFS: unencryptedFS,
+logger: opts.LoggerAndTracer,
+logCtx: logCtx,
+storeIDPebbleLog: storeIDContainer,
+closer: filesystemCloser,
+onClose: cfg.onClose,
+replayer: replay.NewWorkloadCollector(cfg.StorageConfig.Dir),
+singleDelLogEvery: log.Every(5 * time.Minute),
}
// In test builds, add a layer of VFS middleware that ensures users of an
// Engine don't try to use the filesystem after the Engine has been closed.
@@ -1231,24 +1290,24 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) {
}

opts.Experimental.SingleDeleteInvariantViolationCallback = func(userKey []byte) {
-logFunc := log.Errorf
+logFunc := func(ctx context.Context, format string, args ...interface{}) {}
if SingleDeleteCrashOnInvariantViolation.Get(&cfg.Settings.SV) {
logFunc = log.Fatalf
+} else if p.singleDelLogEvery.ShouldLog() {
+logFunc = log.Infof
}
-logFunc(logCtx, "SingleDel invariant violation on key %s", roachpb.Key(userKey))
+logFunc(logCtx, "SingleDel invariant violation callback (can be false positive) on key %s", roachpb.Key(userKey))
atomic.AddInt64(&p.singleDelInvariantViolationCount, 1)
}
-// TODO(sumeer): remove this nil check once the metamorphic test is fixed,
-// and does not need to override.
-if opts.Experimental.IneffectualSingleDeleteCallback == nil {
-opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) {
-logFunc := log.Errorf
-if SingleDeleteCrashOnIneffectual.Get(&cfg.Settings.SV) {
-logFunc = log.Fatalf
-}
-logFunc(logCtx, "Ineffectual SingleDel on key %s", roachpb.Key(userKey))
-atomic.AddInt64(&p.singleDelIneffectualCount, 1)
-}
-}
+opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) {
+logFunc := func(ctx context.Context, format string, args ...interface{}) {}
+if SingleDeleteCrashOnIneffectual.Get(&cfg.Settings.SV) {
+logFunc = log.Fatalf
+} else if p.singleDelLogEvery.ShouldLog() {
+logFunc = log.Infof
+}
+logFunc(logCtx, "Ineffectual SingleDel callback (can be false positive) on key %s", roachpb.Key(userKey))
+atomic.AddInt64(&p.singleDelIneffectualCount, 1)
+}

// MaxConcurrentCompactions can be set by multiple sources, but all the
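
To make the callback wiring above concrete, here is a self-contained sketch against the Pebble options used in this diff (Options.Experimental.SingleDeleteInvariantViolationCallback and IneffectualSingleDeleteCallback, both taking the user key). It counts rather than crashes, matching the rationale in the pebble.go comment; a sketch assuming the Pebble version used by this commit, not CockroachDB code.

```go
// A minimal sketch of wiring Pebble's experimental single-delete
// callbacks as non-fatal counters.
package main

import (
	"fmt"
	"sync/atomic"

	"github.com/cockroachdb/pebble"
	"github.com/cockroachdb/pebble/vfs"
)

func main() {
	var ineffectual, violations int64
	opts := &pebble.Options{FS: vfs.NewMem()}
	// Count instead of crashing: delete-only compactions can trigger
	// these callbacks as false positives (see the pebble.go comment above).
	opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) {
		atomic.AddInt64(&ineffectual, 1)
	}
	opts.Experimental.SingleDeleteInvariantViolationCallback = func(userKey []byte) {
		atomic.AddInt64(&violations, 1)
	}

	db, err := pebble.Open("demo", opts)
	if err != nil {
		panic(err)
	}
	defer db.Close()

	// SingleDelete is only safe when the key was Set at most once since it
	// was last deleted; here it removes the single Set above.
	if err := db.Set([]byte("k"), []byte("v"), pebble.Sync); err != nil {
		panic(err)
	}
	if err := db.SingleDelete([]byte("k"), pebble.Sync); err != nil {
		panic(err)
	}
	fmt.Println("ineffectual:", atomic.LoadInt64(&ineffectual),
		"violations:", atomic.LoadInt64(&violations))
}
```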
