release-23.2.0-rc: storage: disable crashing for single delete callbacks #116897

Merged: 1 commit, Dec 20, 2023
2 changes: 0 additions & 2 deletions docs/generated/settings/settings.html
@@ -265,8 +265,6 @@
<tr><td><div id="setting-storage-ingest-split-enabled" class="anchored"><code>storage.ingest_split.enabled</code></div></td><td>boolean</td><td><code>false</code></td><td>set to true to use ingest-time splitting to lower write-amplification (experimental)</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-max-sync-duration" class="anchored"><code>storage.max_sync_duration</code></div></td><td>duration</td><td><code>20s</code></td><td>maximum duration for disk operations; any operations that take longer than this setting trigger a warning log entry or process crash</td><td>Serverless/Dedicated/Self-Hosted (read-only)</td></tr>
<tr><td><div id="setting-storage-max-sync-duration-fatal-enabled" class="anchored"><code>storage.max_sync_duration.fatal.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if true, fatal the process when a disk operation exceeds storage.max_sync_duration</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-single-delete-crash-on-ineffectual-enabled" class="anchored"><code>storage.single_delete.crash_on_ineffectual.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to crash if the single delete was ineffectual</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-single-delete-crash-on-invariant-violation-enabled" class="anchored"><code>storage.single_delete.crash_on_invariant_violation.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to crash if the single delete invariant is violated</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-storage-value-blocks-enabled" class="anchored"><code>storage.value_blocks.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>set to true to enable writing of value blocks in sstables</td><td>Serverless/Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-timeseries-storage-enabled" class="anchored"><code>timeseries.storage.enabled</code></div></td><td>boolean</td><td><code>true</code></td><td>if set, periodic timeseries data is stored within the cluster; disabling is not recommended unless you are storing the data elsewhere</td><td>Dedicated/Self-Hosted</td></tr>
<tr><td><div id="setting-timeseries-storage-resolution-10s-ttl" class="anchored"><code>timeseries.storage.resolution_10s.ttl</code></div></td><td>duration</td><td><code>240h0m0s</code></td><td>the maximum age of time series data stored at the 10 second resolution. Data older than this is subject to rollup and deletion.</td><td>Serverless/Dedicated/Self-Hosted (read-only)</td></tr>
2 changes: 2 additions & 0 deletions pkg/kv/kvserver/metrics.go
@@ -781,12 +781,14 @@ bytes preserved during flushes and compactions over the lifetime of the process.
Measurement: "Bytes",
Unit: metric.Unit_BYTES,
}
+// TODO(sumeer): remove, since can fire due to delete-only compactions.
metaStorageSingleDelInvariantViolationCount = metric.Metadata{
Name: "storage.single-delete.invariant-violation",
Help: "Number of SingleDelete invariant violations",
Measurement: "Events",
Unit: metric.Unit_COUNT,
}
+// TODO(sumeer): remove, since can fire due to delete-only compactions.
metaStorageSingleDelIneffectualCount = metric.Metadata{
Name: "storage.single-delete.ineffectual",
Help: "Number of SingleDeletes that were ineffectual",
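A note on the pattern: metric.Metadata only describes a metric. A minimal hedged sketch of how such metadata becomes a live counter with pkg/util/metric (NewCounter and Inc are assumed from that package's usual API; the variable name is illustrative):

// Sketch: materialize the metadata above as a counter and bump it.
ineffectualCount := metric.NewCounter(metaStorageSingleDelIneffectualCount)
ineffectualCount.Inc(1) // incremented from the ineffectual-SingleDelete callback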
4 changes: 4 additions & 0 deletions pkg/storage/engine.go
@@ -1176,9 +1176,13 @@ type Metrics struct {
// SingleDelInvariantViolationCount counts the number of times a
// SingleDelete was found to violate the invariant that it should only be
// used when there is at most one older Set for it to delete.
+//
+// TODO(sumeer): remove, since can fire due to delete-only compactions.
SingleDelInvariantViolationCount int64
// SingleDelIneffectualCount counts the number of times a SingleDelete was
// ineffectual, i.e., it was elided without deleting anything.
+//
+// TODO(sumeer): remove, since can fire due to delete-only compactions.
SingleDelIneffectualCount int64
// SharedStorageWriteBytes counts the number of bytes written to shared storage.
SharedStorageWriteBytes int64
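To make the documented invariant concrete: SingleDelete is only safe when at most one Set for the key can exist beneath it. A standalone hedged sketch against the Pebble API (Open, Set, SingleDelete, and the vfs package as imported elsewhere in this PR; whether the older value actually resurfaces depends on compaction timing):

// Sketch: the misuse that SingleDelInvariantViolationCount counts.
func singleDelMisuseDemo() error {
	db, err := pebble.Open("demo", &pebble.Options{FS: vfs.NewMem()})
	if err != nil {
		return err
	}
	defer db.Close()
	_ = db.Set([]byte("k"), []byte("v1"), pebble.Sync)
	_ = db.Set([]byte("k"), []byte("v2"), pebble.Sync) // second Set violates the invariant
	// SingleDelete is guaranteed to cancel only the newest Set; after
	// compactions "v1" can resurface, and the invariant-violation
	// callback may fire.
	return db.SingleDelete([]byte("k"), pebble.Sync)
}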
1 change: 0 additions & 1 deletion pkg/storage/metamorphic/BUILD.bazel
@@ -21,7 +21,6 @@ go_library(
"//pkg/storage",
"//pkg/storage/enginepb",
"//pkg/util/hlc",
"//pkg/util/log",
"//pkg/util/randutil",
"//pkg/util/syncutil",
"//pkg/util/uint128",
16 changes: 0 additions & 16 deletions pkg/storage/metamorphic/generator.go
@@ -25,7 +25,6 @@ import (
"github.com/cockroachdb/cockroach/pkg/storage"
"github.com/cockroachdb/cockroach/pkg/storage/enginepb"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/vfs"
)
@@ -62,21 +61,6 @@ func (e *engineConfig) create(path string, fs vfs.FS) (storage.Engine, error) {
pebbleConfig.Opts.Cache = pebble.NewCache(1 << 20)
defer pebbleConfig.Opts.Cache.Unref()

-pebbleConfig.Opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) {
-// TODO(sumeer): figure out what is causing these callbacks.
-if false {
-ek, ok := storage.DecodeEngineKey(userKey)
-if !ok {
-log.Fatalf(context.Background(), "unable to decode %s", roachpb.Key(userKey).String())
-}
-ltk, err := ek.ToLockTableKey()
-if err != nil {
-log.Fatalf(context.Background(), "%s", err.Error())
-}
-log.Fatalf(context.Background(), "Ineffectual SingleDel on k=%s str=%s txn=%s",
-ltk.Key.String(), ltk.Strength.String(), ltk.TxnUUID.String())
-}
-}
return storage.NewPebble(context.Background(), pebbleConfig)
}

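For reference, here is the decoding the deleted callback performed, restated as a self-contained hedged sketch (storage.DecodeEngineKey and EngineKey.ToLockTableKey are taken from the removed lines above; returning an error via cockroachdb/errors instead of log.Fatalf is an illustrative choice, not part of this PR):

// Sketch: decode a raw Pebble user key into a lock-table key, as the
// removed callback did, but reporting instead of crashing.
func describeLockTableKey(userKey []byte) (string, error) {
	ek, ok := storage.DecodeEngineKey(userKey)
	if !ok {
		return "", errors.Errorf("unable to decode %s", roachpb.Key(userKey))
	}
	ltk, err := ek.ToLockTableKey()
	if err != nil {
		return "", err
	}
	return fmt.Sprintf("k=%s str=%s txn=%s", ltk.Key, ltk.Strength, ltk.TxnUUID), nil
}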
131 changes: 95 additions & 36 deletions pkg/storage/pebble.go
@@ -159,20 +159,76 @@ var IngestAsFlushable = settings.RegisterBoolSetting(
util.ConstantWithMetamorphicTestBool(
"storage.ingest_as_flushable.enabled", true))

+// DO NOT set storage.single_delete.crash_on_invariant_violation.enabled or
+// storage.single_delete.crash_on_ineffectual.enabled to true.
+//
+// Pebble's delete-only compactions can cause a recent RANGEDEL to peek below
+// an older SINGLEDEL and delete an arbitrary subset of data below that
+// SINGLEDEL. When that SINGLEDEL gets compacted (without the RANGEDEL), any
+// of these callbacks can happen, without it being a real correctness problem.
+//
+// Example 1:
+// RANGEDEL [a, c)#10 in L0
+// SINGLEDEL b#5 in L1
+// SET b#3 in L6
+//
+// If the L6 file containing the SET is narrow and the L1 file containing the
+// SINGLEDEL is wide, a delete-only compaction can remove the file in L6
+// before the SINGLEDEL is compacted down. Then when the SINGLEDEL is
+// compacted down, it will not find any SET to delete, resulting in the
+// ineffectual callback.
+//
+// Example 2:
+// RANGEDEL [a, z)#60 in L0
+// SINGLEDEL g#50 in L1
+// SET g#40 in L2
+// RANGEDEL [g,h)#30 in L3
+// SET g#20 in L6
+//
+// In this example, the two SETs represent the same intent, and the RANGEDELs
+// are caused by the CRDB range being dropped. That is, the transaction wrote
+// the intent once, range was dropped, then added back, which caused the SET
+// again, then the transaction committed, causing a SINGLEDEL, and then the
+// range was dropped again. The older RANGEDEL can get fragmented due to
+// compactions it has been part of. Say this L3 file containing the RANGEDEL
+// is very narrow, while the L1, L2, L6 files are wider than the RANGEDEL in
+// L0. Then the RANGEDEL in L3 can be dropped using a delete-only compaction,
+// resulting in an LSM with state:
+//
+// RANGEDEL [a, z)#60 in L0
+// SINGLEDEL g#50 in L1
+// SET g#40 in L2
+// SET g#20 in L6
+//
+// A multi-level compaction involving L1, L2, L6 will cause the invariant
+// violation callback. This example doesn't need multi-level compactions: say
+// there was a Pebble snapshot at g#21 preventing g#20 from being dropped when
+// it meets g#40 in a compaction. That snapshot will not save RANGEDEL
+// [g,h)#30, so we can have:
+//
+// SINGLEDEL g#50 in L1
+// SET g#40, SET g#20 in L6
+//
+// And say the snapshot is removed and then the L1 and L6 compaction happens,
+// resulting in the invariant violation callback.
+//
+// TODO(sumeer): remove these cluster settings or figure out a way to bring
+// back some invariant checking.

var SingleDeleteCrashOnInvariantViolation = settings.RegisterBoolSetting(
settings.SystemOnly,
"storage.single_delete.crash_on_invariant_violation.enabled",
"set to true to crash if the single delete invariant is violated",
-true,
-settings.WithPublic,
+false,
+settings.WithVisibility(settings.Reserved),
)

var SingleDeleteCrashOnIneffectual = settings.RegisterBoolSetting(
settings.SystemOnly,
"storage.single_delete.crash_on_ineffectual.enabled",
"set to true to crash if the single delete was ineffectual",
-true,
-settings.WithPublic,
+false,
+settings.WithVisibility(settings.Reserved),
)
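
These two settings gate the Pebble callbacks that NewPebble wires up further down in this file. As a standalone hedged sketch, using only the Experimental option fields that appear in this diff (the pebble and vfs packages are assumed as imported elsewhere in the PR):

// Sketch: where the settings take effect (field names as in this PR).
opts := &pebble.Options{FS: vfs.NewMem()}
opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) {
	// Count the event; crash only if the now-reserved, default-false
	// storage.single_delete.crash_on_ineffectual.enabled setting is on.
}
opts.Experimental.SingleDeleteInvariantViolationCallback = func(userKey []byte) {
	// Likewise; a callback here can be a false positive caused by the
	// delete-only compaction interactions described above.
}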

// ShouldUseEFOS returns true if either of the UseEFOS or UseExciseForSnapshots
@@ -875,6 +931,8 @@ type Pebble struct {

storeIDPebbleLog *base.StoreIDContainer
replayer *replay.WorkloadCollector

+singleDelLogEvery log.EveryN
}

// WorkloadCollector implements a workloadCollectorGetter and returns the
@@ -1162,26 +1220,27 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) {
storeProps := computeStoreProperties(ctx, cfg.Dir, opts.ReadOnly, encryptionEnv != nil /* encryptionEnabled */)

p = &Pebble{
-FS: opts.FS,
-readOnly: opts.ReadOnly,
-path: cfg.Dir,
-auxDir: auxDir,
-ballastPath: ballastPath,
-ballastSize: cfg.BallastSize,
-maxSize: cfg.MaxSize,
-attrs: cfg.Attrs,
-properties: storeProps,
-settings: cfg.Settings,
-encryption: encryptionEnv,
-fileLock: opts.Lock,
-fileRegistry: fileRegistry,
-unencryptedFS: unencryptedFS,
-logger: opts.LoggerAndTracer,
-logCtx: logCtx,
-storeIDPebbleLog: storeIDContainer,
-closer: filesystemCloser,
-onClose: cfg.onClose,
-replayer: replay.NewWorkloadCollector(cfg.StorageConfig.Dir),
+FS: opts.FS,
+readOnly: opts.ReadOnly,
+path: cfg.Dir,
+auxDir: auxDir,
+ballastPath: ballastPath,
+ballastSize: cfg.BallastSize,
+maxSize: cfg.MaxSize,
+attrs: cfg.Attrs,
+properties: storeProps,
+settings: cfg.Settings,
+encryption: encryptionEnv,
+fileLock: opts.Lock,
+fileRegistry: fileRegistry,
+unencryptedFS: unencryptedFS,
+logger: opts.LoggerAndTracer,
+logCtx: logCtx,
+storeIDPebbleLog: storeIDContainer,
+closer: filesystemCloser,
+onClose: cfg.onClose,
+replayer: replay.NewWorkloadCollector(cfg.StorageConfig.Dir),
+singleDelLogEvery: log.Every(5 * time.Minute),
}
// In test builds, add a layer of VFS middleware that ensures users of an
// Engine don't try to use the filesystem after the Engine has been closed.
@@ -1197,24 +1256,24 @@ func NewPebble(ctx context.Context, cfg PebbleConfig) (p *Pebble, err error) {
}

opts.Experimental.SingleDeleteInvariantViolationCallback = func(userKey []byte) {
-logFunc := log.Errorf
+logFunc := func(ctx context.Context, format string, args ...interface{}) {}
if SingleDeleteCrashOnInvariantViolation.Get(&cfg.Settings.SV) {
logFunc = log.Fatalf
+} else if p.singleDelLogEvery.ShouldLog() {
+logFunc = log.Infof
}
logFunc(logCtx, "SingleDel invariant violation on key %s", roachpb.Key(userKey))
logFunc(logCtx, "SingleDel invariant violation callback (can be false positive) on key %s", roachpb.Key(userKey))
atomic.AddInt64(&p.singleDelInvariantViolationCount, 1)
}
-// TODO(sumeer): remove this nil check once the metamorphic test is fixed,
-// and does not need to override.
-if opts.Experimental.IneffectualSingleDeleteCallback == nil {
-opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) {
-logFunc := log.Errorf
-if SingleDeleteCrashOnIneffectual.Get(&cfg.Settings.SV) {
-logFunc = log.Fatalf
-}
-logFunc(logCtx, "Ineffectual SingleDel on key %s", roachpb.Key(userKey))
-atomic.AddInt64(&p.singleDelIneffectualCount, 1)
+opts.Experimental.IneffectualSingleDeleteCallback = func(userKey []byte) {
+logFunc := func(ctx context.Context, format string, args ...interface{}) {}
+if SingleDeleteCrashOnIneffectual.Get(&cfg.Settings.SV) {
+logFunc = log.Fatalf
+} else if p.singleDelLogEvery.ShouldLog() {
+logFunc = log.Infof
+}
+logFunc(logCtx, "Ineffectual SingleDel callback (can be false positive) on key %s", roachpb.Key(userKey))
+atomic.AddInt64(&p.singleDelIneffectualCount, 1)
}

// MaxConcurrentCompactions can be set by multiple sources, but all the
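The singleDelLogEvery field added above is the stock CockroachDB rate-limited-logging idiom; a minimal hedged sketch of the same pattern, using util/log's Every and ShouldLog exactly as this diff does (the function name and key parameter are illustrative):

// Sketch: emit at most one Info log per 5 minutes from a hot callback.
var every = log.Every(5 * time.Minute)

func onSingleDelCallback(ctx context.Context, key roachpb.Key) {
	if every.ShouldLog() {
		log.Infof(ctx, "SingleDel callback (can be false positive) on key %s", key)
	}
}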