forked from cockroachdb/pebble
-
Notifications
You must be signed in to change notification settings - Fork 0
/
options.go
2187 lines (2005 loc) · 86.8 KB
/
options.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2011 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package pebble
import (
"bytes"
"fmt"
"io"
"runtime"
"strconv"
"strings"
"time"
"unicode"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/fifo"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/cache"
"github.com/cockroachdb/pebble/internal/humanize"
"github.com/cockroachdb/pebble/internal/keyspan"
"github.com/cockroachdb/pebble/internal/manifest"
"github.com/cockroachdb/pebble/internal/testkeys"
"github.com/cockroachdb/pebble/objstorage/objstorageprovider"
"github.com/cockroachdb/pebble/objstorage/remote"
"github.com/cockroachdb/pebble/rangekey"
"github.com/cockroachdb/pebble/sstable"
"github.com/cockroachdb/pebble/sstable/block"
"github.com/cockroachdb/pebble/sstable/colblk"
"github.com/cockroachdb/pebble/vfs"
"github.com/cockroachdb/pebble/wal"
)
// Default values referenced by the option documentation in this file.
const (
// cacheDefaultSize is a size in bytes (8 << 20); it matches the 8 MB default
// cache size documented on Options.Cache.
cacheDefaultSize = 8 << 20 // 8 MB
// defaultLevelMultiplier matches the default of 10 documented on the
// LevelMultiplier option.
defaultLevelMultiplier = 10
)
// Compression exports the block.Compression type.
type Compression = block.Compression
// Exported Compression constants. Each is an alias for the constant of the same
// name in the block package.
const (
DefaultCompression = block.DefaultCompression
NoCompression = block.NoCompression
SnappyCompression = block.SnappyCompression
ZstdCompression = block.ZstdCompression
)
// FilterType exports the base.FilterType type.
type FilterType = base.FilterType
// Exported FilterType constants.
const (
// TableFilter exports the base.TableFilter constant.
TableFilter = base.TableFilter
)
// FilterWriter exports the base.FilterWriter type.
type FilterWriter = base.FilterWriter
// FilterPolicy exports the base.FilterPolicy type.
type FilterPolicy = base.FilterPolicy
// KeySchema exports the colblk.KeySchema type.
type KeySchema = colblk.KeySchema
// BlockPropertyCollector exports the sstable.BlockPropertyCollector type.
type BlockPropertyCollector = sstable.BlockPropertyCollector
// BlockPropertyFilter exports the base.BlockPropertyFilter type.
type BlockPropertyFilter = base.BlockPropertyFilter
// ShortAttributeExtractor exports the base.ShortAttributeExtractor type.
type ShortAttributeExtractor = base.ShortAttributeExtractor
// UserKeyPrefixBound exports the sstable.UserKeyPrefixBound type.
type UserKeyPrefixBound = sstable.UserKeyPrefixBound
// IterKeyType selects which kinds of keys (point keys, range keys, or both) an
// iterator surfaces.
type IterKeyType int8

// The supported IterKeyType values. The zero value is IterKeyTypePointsOnly.
const (
	// IterKeyTypePointsOnly configures an iterator to iterate over point keys
	// only.
	IterKeyTypePointsOnly IterKeyType = iota
	// IterKeyTypeRangesOnly configures an iterator to iterate over range keys
	// only.
	IterKeyTypeRangesOnly
	// IterKeyTypePointsAndRanges configures an iterator to iterate over both
	// point keys and range keys simultaneously.
	IterKeyTypePointsAndRanges
)

// String implements fmt.Stringer. It panics if t is not one of the declared
// IterKeyType constants.
func (t IterKeyType) String() string {
	if t == IterKeyTypePointsOnly {
		return "points-only"
	}
	if t == IterKeyTypeRangesOnly {
		return "ranges-only"
	}
	if t == IterKeyTypePointsAndRanges {
		return "points-and-ranges"
	}
	// Any other value indicates a programming error in the caller.
	panic(fmt.Sprintf("unknown key type %d", t))
}
// IterOptions holds the optional per-query parameters for NewIter.
//
// Like Options, a nil *IterOptions is valid and means to use the default
// values.
type IterOptions struct {
// LowerBound specifies the smallest key (inclusive) that the iterator will
// return during iteration. If the iterator is seeked or iterated past this
// boundary the iterator will return Valid()==false. Setting LowerBound
// effectively truncates the key space visible to the iterator.
LowerBound []byte
// UpperBound specifies the largest key (exclusive) that the iterator will
// return during iteration. If the iterator is seeked or iterated past this
// boundary the iterator will return Valid()==false. Setting UpperBound
// effectively truncates the key space visible to the iterator.
UpperBound []byte
// SkipPoint may be used to skip over point keys that don't match an
// arbitrary predicate during iteration. If set, the Iterator invokes
// SkipPoint for keys encountered. If SkipPoint returns true, the iterator
// will skip the key without yielding it to the iterator operation in
// progress.
//
// SkipPoint must be a pure function and always return the same result when
// provided the same arguments. The iterator may call SkipPoint multiple
// times for the same user key.
SkipPoint func(userKey []byte) bool
// PointKeyFilters can be used to avoid scanning tables and blocks in tables
// when iterating over point keys. This slice represents an intersection
// across all filters, i.e., all filters must indicate that the block is
// relevant.
//
// Performance note: When len(PointKeyFilters) > 0, the caller should ensure
// that cap(PointKeyFilters) is at least len(PointKeyFilters)+1. This helps
// avoid allocations in Pebble internal code that mutates the slice.
PointKeyFilters []BlockPropertyFilter
// RangeKeyFilters can be used to avoid scanning tables and blocks in tables
// when iterating over range keys. The same requirements that apply to
// PointKeyFilters apply here too.
RangeKeyFilters []BlockPropertyFilter
// KeyTypes configures which types of keys to iterate over: point keys,
// range keys, or both. The zero value is IterKeyTypePointsOnly.
KeyTypes IterKeyType
// RangeKeyMasking can be used to enable automatic masking of point keys by
// range keys. Range key masking is only supported during combined range key
// and point key iteration mode (IterKeyTypePointsAndRanges).
RangeKeyMasking RangeKeyMasking
// OnlyReadGuaranteedDurable is an advanced option that is only supported by
// the Reader implemented by DB. When set to true, only the guaranteed to be
// durable state is visible in the iterator.
// - This definition is made under the assumption that the FS implementation
// is providing a durability guarantee when data is synced.
// - The visible state represents a consistent point in the history of the
// DB.
// - The implementation is free to choose a conservative definition of what
// is guaranteed durable. For simplicity, the current implementation
// ignores memtables. A more sophisticated implementation could track the
// highest seqnum that is synced to the WAL and published and use that as
// the visible seqnum for an iterator. Note that the latter approach is
// not strictly better than the former since we can have DBs that are (a)
// synced more rarely than memtable flushes, (b) have no WAL. (a) is
// likely to be true in a future CockroachDB context where the DB
// containing the state machine may be rarely synced.
// NB: this current implementation relies on the fact that memtables are
// flushed in seqnum order, and any ingested sstables that happen to have a
// lower seqnum than a non-flushed memtable don't have any overlapping keys.
// This is the fundamental level invariant used in other code too, like when
// merging iterators.
//
// Semantically, using this option provides the caller a "snapshot" as of
// the time the most recent memtable was flushed. An alternate interface
// would be to add a NewSnapshot variant. Creating a snapshot is heavier
// weight than creating an iterator, so we have opted to support this
// iterator option.
OnlyReadGuaranteedDurable bool
// UseL6Filters allows the caller to opt into reading filter blocks for L6
// sstables. Helpful if a lot of SeekPrefixGEs are expected in quick
// succession, that are also likely to not yield a single key. Filter blocks in
// L6 can be relatively large, often larger than data blocks, so the benefit of
// loading them in the cache is minimized if the probability of the key
// existing is not low or if we just expect a one-time Seek (where loading the
// data block directly is better).
UseL6Filters bool
// Category is used for categorized iterator stats. This should not be
// changed by calling SetOptions.
Category sstable.Category
// DebugRangeKeyStack is presumably a debugging aid for the range-key
// iterator stack; its consumers are not defined alongside this struct.
// TODO(review): document the exact effect.
DebugRangeKeyStack bool
// Internal options.
// logger, when non-nil, is returned by getLogger in place of DefaultLogger.
logger Logger
// Layer corresponding to this file. Only passed in if constructed by a
// levelIter.
layer manifest.Layer
// disableLazyCombinedIteration is an internal testing option.
disableLazyCombinedIteration bool
// snapshotForHideObsoletePoints is specified for/by levelIter when opening
// files and is used to decide whether to hide obsolete points. A value of 0
// implies obsolete points should not be hidden.
snapshotForHideObsoletePoints base.SeqNum
// NB: If adding new Options, you must account for them in iterator
// construction and Iterator.SetOptions.
}
// GetLowerBound returns o.LowerBound. It is safe to call on a nil receiver, in
// which case it returns nil.
func (o *IterOptions) GetLowerBound() []byte {
	if o != nil {
		return o.LowerBound
	}
	return nil
}
// GetUpperBound returns o.UpperBound. It is safe to call on a nil receiver, in
// which case it returns nil.
func (o *IterOptions) GetUpperBound() []byte {
	if o != nil {
		return o.UpperBound
	}
	return nil
}
// pointKeys reports whether point keys should be iterated over. A nil receiver
// means default options, for which the answer is true; otherwise the answer is
// true when KeyTypes is IterKeyTypePointsOnly or IterKeyTypePointsAndRanges.
func (o *IterOptions) pointKeys() bool {
	if o == nil {
		return true
	}
	switch o.KeyTypes {
	case IterKeyTypePointsOnly, IterKeyTypePointsAndRanges:
		return true
	default:
		return false
	}
}
// rangeKeys reports whether range keys should be iterated over. A nil receiver
// means default options, for which the answer is false; otherwise the answer
// is true when KeyTypes is IterKeyTypeRangesOnly or
// IterKeyTypePointsAndRanges.
func (o *IterOptions) rangeKeys() bool {
	if o == nil {
		return false
	}
	switch o.KeyTypes {
	case IterKeyTypeRangesOnly, IterKeyTypePointsAndRanges:
		return true
	default:
		return false
	}
}
// getLogger returns the logger configured on o, falling back to DefaultLogger
// when o is nil or no logger has been set.
func (o *IterOptions) getLogger() Logger {
	if o != nil && o.logger != nil {
		return o.logger
	}
	return DefaultLogger
}
// SpanIterOptions creates a keyspan.SpanIterOptions from this IterOptions,
// carrying over RangeKeyFilters. A nil receiver yields the zero
// keyspan.SpanIterOptions.
func (o *IterOptions) SpanIterOptions() keyspan.SpanIterOptions {
	var spanOpts keyspan.SpanIterOptions
	if o != nil {
		spanOpts.RangeKeyFilters = o.RangeKeyFilters
	}
	return spanOpts
}
// scanInternalOptions is similar to IterOptions, meant for use with
// scanInternalIterator.
type scanInternalOptions struct {
// IterOptions is embedded, so all of its fields and methods are available
// directly on scanInternalOptions.
IterOptions
// category is an sstable.Category for the scan; presumably it plays the same
// role as IterOptions.Category (categorized stats) — TODO confirm.
category sstable.Category
// The visit* fields are callbacks, presumably invoked once per point key,
// range deletion, range key, shared sstable and external file encountered by
// the scan, respectively. Their call sites are defined elsewhere; confirm
// the exact contract (nil handling, error propagation) there.
visitPointKey func(key *InternalKey, value LazyValue, iterInfo IteratorLevel) error
visitRangeDel func(start, end []byte, seqNum SeqNum) error
visitRangeKey func(start, end []byte, keys []rangekey.Key) error
visitSharedFile func(sst *SharedSSTMeta) error
visitExternalFile func(sst *ExternalFile) error
// includeObsoleteKeys specifies whether keys shadowed by newer internal keys
// are exposed. If false, only one internal key per user key is exposed.
includeObsoleteKeys bool
// rateLimitFunc is used to limit the amount of bytes read per second.
rateLimitFunc func(key *InternalKey, value LazyValue) error
}
// RangeKeyMasking configures automatic hiding of point keys by range keys. A
// non-nil Suffix enables range-key masking. When enabled, range keys with
// suffixes ≥ Suffix behave as masks. All point keys that are contained within a
// masking range key's bounds and have suffixes greater than the range key's
// suffix are automatically skipped.
//
// Specifically, when configured with a RangeKeyMasking.Suffix _s_, and there
// exists a range key with suffix _r_ covering a point key with suffix _p_, and
//
// _s_ ≤ _r_ < _p_
//
// then the point key is elided.
//
// Range-key masking may only be used when iterating over both point keys and
// range keys with IterKeyTypePointsAndRanges.
type RangeKeyMasking struct {
// Suffix configures which range keys may mask point keys. Only range keys
// that are defined at suffixes greater than or equal to Suffix will mask
// point keys.
Suffix []byte
// Filter is an optional field that may be used to improve performance of
// range-key masking through a block-property filter defined over key
// suffixes. If non-nil, Filter is called by Pebble to construct a
// block-property filter mask at iterator creation. The filter is used to
// skip whole point-key blocks containing point keys with suffixes greater
// than a covering range-key's suffix.
//
// To use this functionality, the caller must create and configure (through
// Options.BlockPropertyCollectors) a block-property collector that records
// the maximum suffix contained within a block. The caller then must write
// and provide a BlockPropertyFilterMask implementation on that same
// property. See the BlockPropertyFilterMask type for more information.
Filter func() BlockPropertyFilterMask
}
// BlockPropertyFilterMask extends the BlockPropertyFilter interface for use
// with range-key masking. Unlike an ordinary block property filter, a
// BlockPropertyFilterMask's filtering criteria are allowed to change when
// Pebble invokes its SetSuffix method.
//
// When a Pebble iterator steps into a range key's bounds and the range key has
// a suffix greater than or equal to RangeKeyMasking.Suffix, the range key acts
// as a mask. The masking range key hides all point keys that fall within the
// range key's bounds and have suffixes > the range key's suffix. Without a
// filter mask configured, Pebble performs this hiding by stepping through point
// keys and comparing suffixes. If large numbers of point keys are masked, this
// requires Pebble to load, iterate through and discard a large number of
// sstable blocks containing masked point keys.
//
// If a block-property collector and a filter mask are configured, Pebble may
// skip loading some point-key blocks altogether. If a block's keys are known to
// all fall within the bounds of the masking range key and the block was
// annotated by a block-property collector with the maximal suffix, Pebble can
// ask the filter mask to compare the property to the current masking range
// key's suffix. If the mask reports no intersection, the block may be skipped.
//
// If unsuffixed and suffixed keys are written to the database, care must be
// taken to avoid unintentionally masking unsuffixed keys located in the same
// block as suffixed keys. One solution is to interpret unsuffixed keys as
// containing the maximal suffix value, ensuring that blocks containing
// unsuffixed keys are always loaded.
type BlockPropertyFilterMask interface {
// BlockPropertyFilter is embedded: a mask must also satisfy the ordinary
// block-property filter interface.
BlockPropertyFilter
// SetSuffix configures the mask with the suffix of a range key. The filter
// should return false from Intersects whenever it's provided with a
// property encoding a block's minimum suffix that's greater (according to
// Compare) than the provided suffix.
SetSuffix(suffix []byte) error
}
// WriteOptions holds the optional per-query parameters for Set and Delete
// operations.
//
// Like Options, a nil *WriteOptions is valid and means to use the default
// values.
type WriteOptions struct {
	// Sync is whether to sync writes through the OS buffer cache and down onto
	// the actual disk, if applicable. Setting Sync is required for durability
	// of individual write operations but can result in slower writes.
	//
	// If false, and the process or machine crashes, then a recent write may be
	// lost. This is due to the recently written data being buffered inside the
	// process running Pebble. This differs from the semantics of a write
	// system call in which the data is buffered in the OS buffer cache and
	// would thus survive a process crash.
	//
	// The default value is true.
	Sync bool
}

// Ready-made write options for the two common cases.
var (
	// Sync specifies the default write options for writes which synchronize
	// to disk.
	Sync = &WriteOptions{Sync: true}
	// NoSync specifies the default write options for writes which do not
	// synchronize to disk.
	NoSync = &WriteOptions{Sync: false}
)

// GetSync returns the Sync value, or true (the default) if the receiver is
// nil.
func (o *WriteOptions) GetSync() bool {
	if o == nil {
		return true
	}
	return o.Sync
}
// LevelOptions holds the optional per-level parameters. Fields left at
// non-positive (or nil) values are replaced with defaults by EnsureDefaults.
type LevelOptions struct {
// BlockRestartInterval is the number of keys between restart points
// for delta encoding of keys.
//
// The default value is 16.
BlockRestartInterval int
// BlockSize is the target uncompressed size in bytes of each table block.
// EnsureDefaults panics if it exceeds sstable.MaximumBlockSize.
//
// The default value is 4096.
BlockSize int
// BlockSizeThreshold finishes a block if the block size is larger than the
// specified percentage of the target block size and adding the next entry
// would cause the block to be larger than the target block size.
//
// The default value is 90.
BlockSizeThreshold int
// Compression defines the per-block compression to use.
//
// The default value (DefaultCompression) uses snappy compression. A nil
// func is replaced by one returning DefaultCompression.
Compression func() Compression
// FilterPolicy defines a filter algorithm (such as a Bloom filter) that can
// reduce disk reads for Get calls.
//
// One such implementation is bloom.FilterPolicy(10) from the pebble/bloom
// package.
//
// The default value means to use no filter.
FilterPolicy FilterPolicy
// FilterType defines whether an existing filter policy is applied at a
// block-level or table-level. Block-level filters use less memory to create,
// but are slower to access as a check for the key in the index must first be
// performed to locate the filter block. A table-level filter will require
// memory proportional to the number of keys in an sstable to create, but
// avoids the index lookup when determining if a key is present. Table-level
// filters should be preferred except under constrained memory situations.
FilterType FilterType
// IndexBlockSize is the target uncompressed size in bytes of each index
// block. When the index block size is larger than this target, two-level
// indexes are automatically enabled. Setting this option to a large value
// (such as math.MaxInt32) disables the automatic creation of two-level
// indexes.
//
// The default value is the value of BlockSize.
IndexBlockSize int
// TargetFileSize is the target file size, in bytes, for the level.
//
// The default value is 2 MB.
TargetFileSize int64
}
// EnsureDefaults fills in a default for every option that has not been given a
// usable value and returns the receiver. It is valid to call EnsureDefaults on
// a nil receiver: a fresh LevelOptions is allocated, populated and returned,
// so the result is never nil.
//
// EnsureDefaults panics if BlockSize exceeds sstable.MaximumBlockSize.
func (o *LevelOptions) EnsureDefaults() *LevelOptions {
	const defaultTargetFileSize = 2 << 20 // 2 MB

	if o == nil {
		o = new(LevelOptions)
	}
	if o.BlockRestartInterval <= 0 {
		o.BlockRestartInterval = base.DefaultBlockRestartInterval
	}
	switch {
	case o.BlockSize <= 0:
		o.BlockSize = base.DefaultBlockSize
	case o.BlockSize > sstable.MaximumBlockSize:
		panic(errors.Errorf("BlockSize %d exceeds MaximumBlockSize", o.BlockSize))
	}
	if o.BlockSizeThreshold <= 0 {
		o.BlockSizeThreshold = base.DefaultBlockSizeThreshold
	}
	// IndexBlockSize falls back to BlockSize, so this must run after BlockSize
	// has been defaulted above.
	if o.IndexBlockSize <= 0 {
		o.IndexBlockSize = o.BlockSize
	}
	if o.TargetFileSize <= 0 {
		o.TargetFileSize = defaultTargetFileSize
	}
	if o.Compression == nil {
		o.Compression = func() Compression { return DefaultCompression }
	}
	return o
}
// Options holds the optional parameters for configuring pebble. These options
// apply to the DB at large; per-query options are defined by the IterOptions
// and WriteOptions types.
type Options struct {
// Sync sstables periodically in order to smooth out writes to disk. This
// option does not provide any persistency guarantee, but is used to avoid
// latency spikes if the OS automatically decides to write out a large chunk
// of dirty filesystem buffers. This option only controls SSTable syncs; WAL
// syncs are controlled by WALBytesPerSync.
//
// The default value is 512KB.
BytesPerSync int
// Cache is used to cache uncompressed blocks from sstables.
//
// The default cache size is 8 MB.
Cache *cache.Cache
// LoadBlockSema, if set, is used to limit the number of blocks that can be
// loaded (i.e. read from the filesystem) in parallel. Each load acquires one
// unit from the semaphore for the duration of the read.
LoadBlockSema *fifo.Semaphore
// Cleaner cleans obsolete files.
//
// The default cleaner uses the DeleteCleaner.
Cleaner Cleaner
// Local contains option that pertain to files stored on the local filesystem.
Local struct {
// ReadaheadConfig is used to retrieve the current readahead mode; it is
// consulted whenever a read handle is initialized.
ReadaheadConfig *ReadaheadConfig
// TODO(radu): move BytesPerSync, LoadBlockSema, Cleaner here.
}
// Comparer defines a total ordering over the space of []byte keys: a 'less
// than' relationship. The same comparison algorithm must be used for reads
// and writes over the lifetime of the DB.
//
// The default value uses the same ordering as bytes.Compare.
Comparer *Comparer
// DebugCheck is invoked, if non-nil, whenever a new version is being
// installed. Typically, this is set to pebble.DebugCheckLevels in tests
// or tools only, to check invariants over all the data in the database.
DebugCheck func(*DB) error
// Disable the write-ahead log (WAL). Disabling the write-ahead log prohibits
// crash recovery, but can improve performance if crash recovery is not
// needed (e.g. when only temporary state is being stored in the database).
//
// TODO(peter): untested
DisableWAL bool
// ErrorIfExists causes an error on Open if the database already exists.
// The error can be checked with errors.Is(err, ErrDBAlreadyExists).
//
// The default value is false.
ErrorIfExists bool
// ErrorIfNotExists causes an error on Open if the database does not already
// exist. The error can be checked with errors.Is(err, ErrDBDoesNotExist).
//
// The default value is false which will cause a database to be created if it
// does not already exist.
ErrorIfNotExists bool
// ErrorIfNotPristine causes an error on Open if the database already exists
// and any operations have been performed on the database. The error can be
// checked with errors.Is(err, ErrDBNotPristine).
//
// Note that a database that contained keys that were all subsequently deleted
// may or may not trigger the error. Currently, we check if there are any live
// SSTs or log records to replay.
ErrorIfNotPristine bool
// EventListener provides hooks to listening to significant DB events such as
// flushes, compactions, and table deletion.
EventListener *EventListener
// Experimental contains experimental options which are off by default.
// These options are temporary and will eventually either be deleted, moved
// out of the experimental group, or made the non-adjustable default. These
// options may change at any time, so do not rely on them.
Experimental struct {
// The threshold of L0 read-amplification at which compaction concurrency
// is enabled (if CompactionDebtConcurrency was not already exceeded).
// Every multiple of this value enables another concurrent
// compaction up to MaxConcurrentCompactions.
L0CompactionConcurrency int
// CompactionDebtConcurrency controls the threshold of compaction debt
// at which additional compaction concurrency slots are added. For every
// multiple of this value in compaction debt bytes, an additional
// concurrent compaction is added. This works "on top" of
// L0CompactionConcurrency, so the higher of the count of compaction
// concurrency slots as determined by the two options is chosen.
CompactionDebtConcurrency uint64
// IngestSplit, if it returns true, allows for ingest-time splitting of
// existing sstables into two virtual sstables to allow ingestion sstables to
// slot into a lower level than they otherwise would have.
IngestSplit func() bool
// ReadCompactionRate controls the frequency of read triggered
// compactions by adjusting `AllowedSeeks` in manifest.FileMetadata:
//
// AllowedSeeks = FileSize / ReadCompactionRate
//
// From LevelDB:
// ```
// We arrange to automatically compact this file after
// a certain number of seeks. Let's assume:
// (1) One seek costs 10ms
// (2) Writing or reading 1MB costs 10ms (100MB/s)
// (3) A compaction of 1MB does 25MB of IO:
// 1MB read from this level
// 10-12MB read from next level (boundaries may be misaligned)
// 10-12MB written to next level
// This implies that 25 seeks cost the same as the compaction
// of 1MB of data. I.e., one seek costs approximately the
// same as the compaction of 40KB of data. We are a little
// conservative and allow approximately one seek for every 16KB
// of data before triggering a compaction.
// ```
ReadCompactionRate int64
// ReadSamplingMultiplier is a multiplier for the readSamplingPeriod in
// iterator.maybeSampleRead() to control the frequency of read sampling
// to trigger a read triggered compaction. A value of -1 prevents sampling
// and disables read triggered compactions. The default is 1 << 4. which
// gets multiplied with a constant of 1 << 16 to yield 1 << 20 (1MB).
ReadSamplingMultiplier int64
// NumDeletionsThreshold defines the minimum number of point tombstones
// that must be present in a single data block for that block to be
// considered tombstone-dense for the purposes of triggering a
// tombstone density compaction. Data blocks may also be considered
// tombstone-dense if they meet the criteria defined by
// DeletionSizeRatioThreshold below. Tombstone-dense blocks are identified
// when sstables are written, and so this is effectively an option for
// sstable writers. The default value is 100.
NumDeletionsThreshold int
// DeletionSizeRatioThreshold defines the minimum ratio of the size of
// point tombstones to the size of a data block that must be reached
// for that block to be considered tombstone-dense for the purposes of
// triggering a tombstone density compaction. Data blocks may also be
// considered tombstone-dense if they meet the criteria defined by
// NumDeletionsThreshold above. Tombstone-dense blocks are identified
// when sstables are written, and so this is effectively an option for
// sstable writers. The default value is 0.5.
DeletionSizeRatioThreshold float32
// TombstoneDenseCompactionThreshold is the minimum percent of data
// blocks in a table that must be tombstone-dense for that table to be
// eligible for a tombstone density compaction. It should be defined as a
// ratio out of 1. The default value is 0.10.
//
// If multiple tables are eligible for a tombstone density compaction, then
// tables with a higher percent of tombstone-dense blocks are still
// prioritized for compaction.
//
// A zero or negative value disables tombstone density compactions.
TombstoneDenseCompactionThreshold float64
// TableCacheShards is the number of shards per table cache.
// Reducing the value can reduce the number of idle goroutines per DB
// instance which can be useful in scenarios with a lot of DB instances
// and a large number of CPUs, but doing so can lead to higher contention
// in the table cache and reduced performance.
//
// The default value is the number of logical CPUs, which can be
// limited by runtime.GOMAXPROCS.
TableCacheShards int
// KeyValidationFunc is a function to validate a user key in an SSTable.
//
// Currently, this function is used to validate the smallest and largest
// keys in an SSTable undergoing compaction. In this case, returning an
// error from the validation function will result in a panic at runtime,
// given that there is rarely any way of recovering from malformed keys
// present in compacted files. By default, validation is not performed.
//
// Additional use-cases may be added in the future.
//
// NOTE: callers should take care to not mutate the key being validated.
KeyValidationFunc func(userKey []byte) error
// ValidateOnIngest schedules validation of sstables after they have
// been ingested.
//
// By default, this value is false.
ValidateOnIngest bool
// LevelMultiplier configures the size multiplier used to determine the
// desired size of each level of the LSM. Defaults to 10.
LevelMultiplier int
// MultiLevelCompactionHeuristic determines whether to add an additional
// level to a conventional two level compaction. If nil, a multilevel
// compaction will never get triggered.
MultiLevelCompactionHeuristic MultiLevelHeuristic
// MaxWriterConcurrency is used to indicate the maximum number of
// compression workers the compression queue is allowed to use. If
// MaxWriterConcurrency > 0, then the Writer will use parallelism, to
// compress and write blocks to disk. Otherwise, the writer will
// compress and write blocks to disk synchronously.
MaxWriterConcurrency int
// ForceWriterParallelism is used to force parallelism in the sstable
// Writer for the metamorphic tests. Even with the MaxWriterConcurrency
// option set, we only enable parallelism in the sstable Writer if there
// is enough CPU available, and this option bypasses that.
ForceWriterParallelism bool
// CPUWorkPermissionGranter should be set if Pebble should be given the
// ability to optionally schedule additional CPU. See the documentation
// for CPUWorkPermissionGranter for more details.
CPUWorkPermissionGranter CPUWorkPermissionGranter
// EnableColumnarBlocks is used to decide whether to enable writing
// TableFormatPebblev5 sstables. This setting is only respected by
// FormatColumnarBlocks. In lower format major versions, the
// TableFormatPebblev5 format is prohibited. If EnableColumnarBlocks is
// nil and the DB is at FormatColumnarBlocks, the DB defaults to not
// writing columnar blocks.
EnableColumnarBlocks func() bool
// EnableValueBlocks is used to decide whether to enable writing
// TableFormatPebblev3 sstables. This setting is only respected by a
// specific subset of format major versions: FormatSSTableValueBlocks,
// FormatFlushableIngest and FormatPrePebblev1MarkedCompacted. In lower
// format major versions, value blocks are never enabled. In higher
// format major versions, value blocks are always enabled.
EnableValueBlocks func() bool
// ShortAttributeExtractor is used iff EnableValueBlocks() returns true
// (else ignored). If non-nil, a ShortAttribute can be extracted from the
// value and stored with the key, when the value is stored elsewhere.
ShortAttributeExtractor ShortAttributeExtractor
// RequiredInPlaceValueBound specifies an optional span of user key
// prefixes that are not-MVCC, but have a suffix. For these the values
// must be stored with the key, since the concept of "older versions" is
// not defined. It is also useful for statically known exclusions to value
// separation. In CockroachDB, this will be used for the lock table key
// space that has non-empty suffixes, but those locks don't represent
// actual MVCC versions (the suffix ordering is arbitrary). We will also
// need to add support for dynamically configured exclusions (we want the
// default to be to allow Pebble to decide whether to separate the value
// or not, hence this is structured as exclusions), for example, for users
// of CockroachDB to dynamically exclude certain tables.
//
// Any change in exclusion behavior takes effect only on future written
// sstables, and does not start rewriting existing sstables.
//
// Even ignoring changes in this setting, exclusions are interpreted as
// guidance by Pebble, and not necessarily honored. Specifically, user
// keys with multiple Pebble-versions *may* have the older versions stored
// in value blocks.
RequiredInPlaceValueBound UserKeyPrefixBound
// DisableIngestAsFlushable disables lazy ingestion of sstables through
// a WAL write and memtable rotation. Only effectual if the format
// major version is at least `FormatFlushableIngest`.
DisableIngestAsFlushable func() bool
// RemoteStorage enables use of remote storage (e.g. S3) for storing
// sstables. Setting this option enables use of CreateOnShared option and
// allows ingestion of external files.
RemoteStorage remote.StorageFactory
// If CreateOnShared is non-zero, new sstables are created on remote storage
// (using CreateOnSharedLocator and with the appropriate
// CreateOnSharedStrategy). These sstables can be shared between different
// Pebble instances; the lifecycle of such objects is managed by the
// remote.Storage constructed by options.RemoteStorage.
//
// Can only be used when RemoteStorage is set (and recognizes
// CreateOnSharedLocator).
CreateOnShared remote.CreateOnSharedStrategy
CreateOnSharedLocator remote.Locator
// SecondaryCacheSizeBytes is the size of the on-disk block cache for objects
// on shared storage in bytes. If it is 0, no cache is used.
SecondaryCacheSizeBytes int64
// NB: DO NOT crash on SingleDeleteInvariantViolationCallback or
// IneffectualSingleDeleteCallback, since these can be false positives
// even if SingleDel has been used correctly.
//
// Pebble's delete-only compactions can cause a recent RANGEDEL to peek
// below an older SINGLEDEL and delete an arbitrary subset of data below
// that SINGLEDEL. When that SINGLEDEL gets compacted (without the
// RANGEDEL), any of these callbacks can happen, without it being a real
// correctness problem.
//
// Example 1:
// RANGEDEL [a, c)#10 in L0
// SINGLEDEL b#5 in L1
// SET b#3 in L6
//
// If the L6 file containing the SET is narrow and the L1 file containing
// the SINGLEDEL is wide, a delete-only compaction can remove the file in
// L6 before the SINGLEDEL is compacted down. Then when the SINGLEDEL is
// compacted down, it will not find any SET to delete, resulting in the
// ineffectual callback.
//
// Example 2:
// RANGEDEL [a, z)#60 in L0
// SINGLEDEL g#50 in L1
// SET g#40 in L2
// RANGEDEL [g,h)#30 in L3
// SET g#20 in L6
//
// In this example, the two SETs represent the same user write, and the
// RANGEDELs are caused by the CockroachDB range being dropped. That is,
// the user wrote to g once, range was dropped, then added back, which
// caused the SET again, then at some point g was validly deleted using a
// SINGLEDEL, and then the range was dropped again. The older RANGEDEL can
// get fragmented due to compactions it has been part of. Say this L3 file
// containing the RANGEDEL is very narrow, while the L1, L2, L6 files are
// wider than the RANGEDEL in L0. Then the RANGEDEL in L3 can be dropped
// using a delete-only compaction, resulting in an LSM with state:
//
// RANGEDEL [a, z)#60 in L0
// SINGLEDEL g#50 in L1
// SET g#40 in L2
// SET g#20 in L6
//
// A multi-level compaction involving L1, L2, L6 will cause the invariant
// violation callback. This example doesn't need multi-level compactions:
// say there was a Pebble snapshot at g#21 preventing g#20 from being
// dropped when it meets g#40 in a compaction. That snapshot will not save
// RANGEDEL [g,h)#30, so we can have:
//
// SINGLEDEL g#50 in L1
// SET g#40, SET g#20 in L6
//
// And say the snapshot is removed and then the L1 and L6 compaction
// happens, resulting in the invariant violation callback.
//
// TODO(sumeer): rename SingleDeleteInvariantViolationCallback to remove
// the word "invariant".
// IneffectualSingleDeleteCallback is called in compactions/flushes if any
// single delete is being elided without deleting a point set/merge.
IneffectualSingleDeleteCallback func(userKey []byte)
// SingleDeleteInvariantViolationCallback is called in compactions/flushes if any
// single delete has consumed a Set/Merge, and there is another immediately older
// Set/SetWithDelete/Merge. The user of Pebble has violated the invariant under
// which SingleDelete can be used correctly.
//
// Consider the sequence SingleDelete#3, Set#2, Set#1. There are three
// ways some of these keys can first meet in a compaction.
//
// - All 3 keys in the same compaction: this callback will detect the
// violation.
//
// - SingleDelete#3, Set#2 meet in a compaction first: Both keys will
// disappear. The violation will not be detected, and the DB will have
// Set#1 which is likely incorrect (from the user's perspective).
//
// - Set#2, Set#1 meet in a compaction first: The output will be Set#2,
// which will later be consumed by SingleDelete#3. The violation will
// not be detected and the DB will be correct.
SingleDeleteInvariantViolationCallback func(userKey []byte)
// EnableDeleteOnlyCompactionExcises enables delete-only compactions to also
// apply delete-only compaction hints on sstables that partially overlap
// with it. This application happens through an excise, similar to
// the excise phase of IngestAndExcise.
EnableDeleteOnlyCompactionExcises func() bool
}
// Filters is a map from filter policy name to filter policy. It is used for
// debugging tools which may be used on multiple databases configured with
// different filter policies. It is not necessary to populate this filters
// map during normal usage of a DB (it will be done automatically by
// EnsureDefaults).
Filters map[string]FilterPolicy
// FlushDelayDeleteRange configures how long the database should wait before
// forcing a flush of a memtable that contains a range deletion. Disk space
// cannot be reclaimed until the range deletion is flushed. No automatic
// flush occurs if zero.
FlushDelayDeleteRange time.Duration
// FlushDelayRangeKey configures how long the database should wait before
// forcing a flush of a memtable that contains a range key. Range keys in
// the memtable prevent lazy combined iteration, so it's desirable to flush
// range keys promptly. No automatic flush occurs if zero.
FlushDelayRangeKey time.Duration
// FlushSplitBytes denotes the target number of bytes per sublevel in
// each flush split interval (i.e. range between two flush split keys)
// in L0 sstables. When set to zero, only a single sstable is generated
// by each flush. When set to a non-zero value, flushes are split at
// points to meet L0's TargetFileSize, any grandparent-related overlap
// options, and at boundary keys of L0 flush split intervals (which are
// targeted to contain around FlushSplitBytes bytes in each sublevel
// between pairs of boundary keys). Splitting sstables during flush
// allows increased compaction flexibility and concurrency when those
// tables are compacted to lower levels.
FlushSplitBytes int64
// FormatMajorVersion sets the format of on-disk files. It is
// recommended to set the format major version to an explicit
// version, as the default may change over time.
//
// At Open if the existing database is formatted using a later
// format major version that is known to this version of Pebble,
// Pebble will continue to use the later format major version. If
// the existing database's version is unknown, the caller may use
// FormatMostCompatible and will be able to open the database
// regardless of its actual version.
//
// If the existing database is formatted using a format major
// version earlier than the one specified, Open will automatically
// ratchet the database to the specified format major version.
FormatMajorVersion FormatMajorVersion
// FS provides the interface for persistent file storage.
//
// The default value uses the underlying operating system's file system.
FS vfs.FS
// KeySchema is the name of the key schema that should be used when writing
// new sstables. There must be a key schema with this name defined in
// KeySchemas. If not set, colblk.DefaultKeySchema is used to construct a
// default key schema.
KeySchema string
// KeySchemas defines the set of known schemas of user keys. When columnar
// blocks are in use (see FormatColumnarBlocks), the user may specify how a
// key should be decomposed into columns. Each KeySchema must have a unique
// name. The schema named by Options.KeySchema is used while writing
// sstables during flushes and compactions.
//
// Multiple KeySchemas may be used over the lifetime of a database. Once a
// KeySchema is used, it must be provided in KeySchemas in subsequent calls
// to Open for perpetuity.
KeySchemas sstable.KeySchemas
// Lock, if set, must be a database lock acquired through LockDirectory for
// the same directory passed to Open. If provided, Open will skip locking
// the directory. Closing the database will not release the lock, and it's
// the responsibility of the caller to release the lock after closing the
// database.
//
// Open will enforce that the Lock passed locks the same directory passed to
// Open. Concurrent calls to Open using the same Lock are detected and
// prohibited.
Lock *Lock
// The count of L0 files necessary to trigger an L0 compaction.
L0CompactionFileThreshold int
// The amount of L0 read-amplification necessary to trigger an L0 compaction.
L0CompactionThreshold int
// Hard limit on L0 read-amplification, computed as the number of L0
// sublevels. Writes are stopped when this threshold is reached.
L0StopWritesThreshold int
// The maximum number of bytes for LBase. The base level is the level which
// L0 is compacted into. The base level is determined dynamically based on
// the existing data in the LSM. The maximum number of bytes for other levels
// is computed dynamically based on the base level's maximum size. When the
// maximum number of bytes for a level is exceeded, compaction is requested.
LBaseMaxBytes int64
// Per-level options. Options for at least one level must be specified. The
// options for the last level are used for all subsequent levels.
Levels []LevelOptions
// Logger is used to write log messages. If LoggerAndTracer is non-nil, it
// is used instead; otherwise Logger is used and tracing is a noop.
//
// The default logger uses the Go standard library log package.
Logger Logger
// LoggerAndTracer is used for writing log messages and traces.
LoggerAndTracer LoggerAndTracer
// MaxManifestFileSize is the maximum size the MANIFEST file is allowed to
// become. When the MANIFEST exceeds this size it is rolled over and a new
// MANIFEST is created.
MaxManifestFileSize int64
// MaxOpenFiles is a soft limit on the number of open files that can be
// used by the DB.
//
// The default value is 1000.
MaxOpenFiles int
// The size of a MemTable in steady state. The actual MemTable size starts at
// min(256KB, MemTableSize) and doubles for each subsequent MemTable up to
// MemTableSize. This reduces the memory pressure caused by MemTables for
// short lived (test) DB instances. Note that more than one MemTable can be
// in existence since flushing a MemTable involves creating a new one and
// writing the contents of the old one in the
// background. MemTableStopWritesThreshold places a hard limit on the size of
// the queued MemTables.
//
// The default value is 4MB.
MemTableSize uint64
// Hard limit on the number of queued MemTables. Writes are stopped when
// the sum of the queued memtable sizes exceeds:
// MemTableStopWritesThreshold * MemTableSize.
//
// This value should be at least 2 or writes will stop whenever a MemTable is
// being flushed.
//
// The default value is 2.
MemTableStopWritesThreshold int
// Merger defines the associative merge operation to use for merging values
// written with {Batch,DB}.Merge.
//
// The default merger concatenates values.
Merger *Merger