-
Notifications
You must be signed in to change notification settings - Fork 451
/
version.go
1461 lines (1344 loc) · 52 KB
/
version.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
// Copyright 2012 The LevelDB-Go and Pebble Authors. All rights reserved. Use
// of this source code is governed by a BSD-style license that can be found in
// the LICENSE file.
package manifest
import (
"bytes"
"fmt"
"sort"
"strconv"
"strings"
"sync"
"sync/atomic"
"unicode"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/invariants"
)
// Compare exports the base.Compare type, the user-key comparison function
// used throughout this package.
type Compare = base.Compare

// InternalKey exports the base.InternalKey type, a user key augmented with a
// sequence number and kind (trailer).
type InternalKey = base.InternalKey
// TableInfo contains the common information for table related events.
type TableInfo struct {
	// FileNum is the internal DB identifier for the table.
	FileNum base.FileNum
	// Size is the size of the file in bytes.
	Size uint64
	// Smallest is the smallest internal key in the table.
	Smallest InternalKey
	// Largest is the largest internal key in the table.
	Largest InternalKey
	// SmallestSeqNum is the smallest sequence number in the table.
	SmallestSeqNum uint64
	// LargestSeqNum is the largest sequence number in the table.
	LargestSeqNum uint64
}
// TableStats contains statistics on a table used for compaction heuristics,
// and exported via Metrics.
type TableStats struct {
	// NumEntries is the total number of entries in the table.
	NumEntries uint64
	// NumDeletions is the number of point and range deletion entries in the
	// table.
	NumDeletions uint64
	// NumRangeKeySets is the total number of range key sets in the table.
	NumRangeKeySets uint64
	// PointDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's point deletions by compacting them.
	PointDeletionsBytesEstimate uint64
	// RangeDeletionsBytesEstimate is an estimate of the total disk space that
	// may be dropped by this table's range deletions by compacting them. This
	// estimate is at data-block granularity and is not updated if compactions
	// beneath the table reduce the amount of reclaimable disk space. It also
	// does not account for overlapping data in L0 and ignores L0 sublevels,
	// but the error that introduces is expected to be small.
	//
	// Tables in the bottommost level of the LSM may have a nonzero estimate
	// if snapshots or move compactions prevented the elision of their range
	// tombstones.
	RangeDeletionsBytesEstimate uint64
	// ValueBlocksSize is the total size of value blocks and the value index
	// block.
	ValueBlocksSize uint64
}
// boundType represents the type of key (point or range) present as the smallest
// and largest keys.
type boundType uint8

const (
	// NB: iota + 1 so that the zero value is distinguishable from a valid
	// bound type (an unset bound).
	boundTypePointKey boundType = iota + 1
	boundTypeRangeKey
)
// CompactionState is the compaction state of a file.
//
// The following shows the valid state transitions:
//
//	NotCompacting --> Compacting --> Compacted
//	      ^               |
//	      |               |
//	      +-------<-------+
//
// Input files to a compaction transition to Compacting when a compaction is
// picked. A file that has finished compacting typically transitions into the
// Compacted state, at which point it is effectively obsolete ("zombied") and
// will eventually be removed from the LSM. A file that has been move-compacted
// will transition from Compacting back into the NotCompacting state, signaling
// that the file may be selected for a subsequent compaction. A failed
// compaction will result in all input tables transitioning from Compacting to
// NotCompacting.
//
// This state is in-memory only. It is not persisted to the manifest.
type CompactionState uint8

// CompactionStates.
const (
	CompactionStateNotCompacting CompactionState = iota
	CompactionStateCompacting
	CompactionStateCompacted
)
// String implements fmt.Stringer, returning a human-readable name for the
// compaction state. It panics on a value outside the declared states, which
// would indicate a programming error.
func (s CompactionState) String() string {
	switch s {
	case CompactionStateCompacted:
		return "Compacted"
	case CompactionStateCompacting:
		return "Compacting"
	case CompactionStateNotCompacting:
		return "NotCompacting"
	}
	panic(fmt.Sprintf("pebble: unknown compaction state %d", s))
}
// FileMetadata is maintained for leveled-ssts, i.e., they belong to a level of
// some version. FileMetadata does not contain the actual level of the sst,
// since such leveled-ssts can move across levels in different versions, while
// sharing the same FileMetadata. There are two kinds of leveled-ssts, physical
// and virtual. Underlying both leveled-ssts is a backing-sst, for which the
// only state is BackingState. A backing-sst is level-less. It is possible for a
// backing-sst to be referred to by a physical sst in one version and by one or
// more virtual ssts in one or more versions. A backing-sst becomes obsolete
// and can be deleted once it is no longer required by any physical or virtual
// sst in any version.
//
// We maintain some invariants:
//  1. Each physical and virtual sst will have a unique FileMetadata.FileNum,
//     and there will be exactly one FileMetadata associated with the FileNum.
//  2. Within a version, a backing-sst is either only referred to by one
//     physical sst or one or more virtual ssts.
//  3. Once a backing-sst is referred to by a virtual sst in the latest version,
//     it cannot go back to being referred to by a physical sst in any future
//     version.
//
// Once a physical sst is no longer needed by any version, we will no longer
// maintain the file metadata associated with it. We will still maintain the
// BackingState associated with the physical sst if the backing sst is required
// by any virtual ssts in any version.
type FileMetadata struct {
	// Atomic contains fields which are accessed atomically. Go allocations
	// are guaranteed to be 64-bit aligned which we take advantage of by
	// placing the 64-bit fields which we access atomically at the beginning
	// of the FileMetadata struct. For more information, see
	// https://golang.org/pkg/sync/atomic/#pkg-note-BUG.
	Atomic struct {
		// AllowedSeeks is used to determine if a file should be picked for
		// a read triggered compaction. It is decremented when read sampling
		// in pebble.Iterator after every positioning operation that returns
		// a user key (eg. Next, Prev, SeekGE, SeekLT, etc).
		AllowedSeeks int64
		// statsValid is 1 if stats have been loaded for the table. The
		// TableStats structure is populated only if valid is 1.
		statsValid uint32
	}
	// BackingState is the state which backs either a physical or virtual
	// sstables.
	BackingState *BackingState
	// InitAllowedSeeks is the initial value of allowed seeks. This is used
	// to re-set allowed seeks on a file once it hits 0.
	InitAllowedSeeks int64
	// FileNum is the file number.
	//
	// INVARIANT: when !FileMetadata.Virtual, FileNum == BackingState.FileNum.
	//
	// TODO(bananabrick): Consider creating separate types for
	// FileMetadata.FileNum and BackingState.FileNum. FileNum is used both as
	// an identifier for the FileMetadata in Pebble, and also as a handle to
	// perform reads and writes. We should ensure through types that
	// FileMetadata.FileNum isn't used to perform reads, and that
	// BackingState.FileNum isn't used as an identifier for the FileMetadata.
	FileNum base.FileNum
	// Size is the size of the file, in bytes. Size is an approximate value for
	// virtual sstables.
	//
	// INVARIANT: when !FileMetadata.Virtual, Size == BackingState.Size.
	//
	// TODO(bananabrick): Size is currently used in metrics, and for many key
	// Pebble level heuristics. Make sure that the heuristics will still work
	// appropriately with an approximate value of size.
	Size uint64
	// CreationTime is the file creation time in seconds since the epoch
	// (1970-01-01 00:00:00 UTC). For ingested sstables, this corresponds to
	// the time the file was ingested. For virtual sstables, this corresponds
	// to the wall clock time when the FileMetadata for the virtual sstable was
	// first created.
	CreationTime int64
	// SmallestSeqNum and LargestSeqNum are lower and upper bounds for the
	// smallest and largest sequence numbers in the table, across both point
	// and range keys. For physical sstables, these values are precise.
	SmallestSeqNum uint64
	LargestSeqNum uint64
	// SmallestPointKey and LargestPointKey are the inclusive bounds for the
	// internal point keys stored in the table. This includes RANGEDELs, which
	// alter point keys.
	// NB: these fields should be set using ExtendPointKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestPointKey InternalKey
	LargestPointKey InternalKey
	// SmallestRangeKey and LargestRangeKey are the inclusive bounds for the
	// internal range keys stored in the table.
	// NB: these fields should be set using ExtendRangeKeyBounds. They are left
	// exported for reads as an optimization.
	SmallestRangeKey InternalKey
	LargestRangeKey InternalKey
	// Smallest and Largest are the inclusive bounds for the internal keys stored
	// in the table, across both point and range keys.
	// NB: these fields are derived from their point and range key equivalents,
	// and are updated via the MaybeExtend{Point,Range}KeyBounds methods.
	Smallest InternalKey
	Largest InternalKey
	// Stats describe table statistics. Protected by DB.mu.
	//
	// For virtual sstables, set stats upon virtual sstable creation as
	// asynchronous computation of stats is not currently supported.
	//
	// TODO(bananabrick): To support manifest replay for virtual sstables, we
	// probably need to compute virtual sstable stats asynchronously. Otherwise,
	// we'd have to write virtual sstable stats to the version edit.
	Stats TableStats

	// SubLevel, L0Index and the interval indices below are L0 sublevel
	// bookkeeping populated by L0 sublevel construction.
	SubLevel int
	L0Index int
	minIntervalIndex int
	maxIntervalIndex int

	// NB: the alignment of this struct is 8 bytes. We pack all the bools to
	// ensure an optimal packing.

	// IsIntraL0Compacting is for L0 files only. Protected by DB.mu. Used to
	// generate L0 sublevels and pick L0 compactions. Only accurate for the
	// most recent Version.
	//
	// IsIntraL0Compacting is set to True if this file is part of an intra-L0
	// compaction. When it's true, IsCompacting must also return true. If
	// Compacting is true and IsIntraL0Compacting is false for an L0 file, the
	// file must be part of a compaction to Lbase.
	IsIntraL0Compacting bool
	// CompactionState holds this file's in-memory compaction state; see the
	// CompactionState type for the valid transitions. Protected by DB.mu.
	CompactionState CompactionState
	// MarkedForCompaction is true if compaction of this file has been
	// explicitly requested. Previously, RocksDB and earlier versions of
	// Pebble allowed this flag to be set by a user table property collector.
	// Some earlier versions of Pebble respected this flag, while other more
	// recent versions ignored this flag.
	//
	// More recently this flag has been repurposed to facilitate the
	// compaction of 'atomic compaction units'. Files marked for
	// compaction are compacted in a rewrite compaction at the lowest
	// possible compaction priority.
	//
	// NB: A count of files marked for compaction is maintained on
	// Version, and compaction picking reads cached annotations
	// determined by this field.
	//
	// Protected by DB.mu.
	MarkedForCompaction bool
	// HasPointKeys tracks whether the table contains point keys (including
	// RANGEDELs). If a table contains only range deletions, HasPointKeys is
	// still true.
	HasPointKeys bool
	// HasRangeKeys tracks whether the table contains any range keys.
	HasRangeKeys bool
	// boundsSet tracks whether the overall bounds (Smallest/Largest) have been
	// set.
	boundsSet bool
	// boundTypeSmallest and boundTypeLargest provide an indication as to which
	// key type (point or range) corresponds to the smallest and largest overall
	// table bounds.
	boundTypeSmallest, boundTypeLargest boundType
	// Virtual is true if the FileMetadata belongs to a virtual sstable.
	Virtual bool
}
// PhysicalFileMeta is used by functions which want a guarantee that their input
// belongs to a physical sst and not a virtual sst.
//
// NB: This type should only be constructed by calling
// FileMetadata.PhysicalMeta.
type PhysicalFileMeta struct {
	*FileMetadata
}
// VirtualFileMeta is used by functions which want a guarantee that their input
// belongs to a virtual sst and not a physical sst.
//
// NB: This type should only be constructed by calling FileMetadata.VirtualMeta.
type VirtualFileMeta struct {
	*FileMetadata
}
// PhysicalMeta wraps the receiver in a PhysicalFileMeta, asserting (via
// panic) that the metadata describes a physical sstable. It should be the
// only source of creating the PhysicalFileMeta wrapper type.
func (m *FileMetadata) PhysicalMeta() PhysicalFileMeta {
	if m.Virtual {
		panic("pebble: file metadata does not belong to a physical sstable")
	}
	return PhysicalFileMeta{m}
}
// VirtualMeta wraps the receiver in a VirtualFileMeta, asserting (via panic)
// that the metadata describes a virtual sstable. It should be the only source
// of creating the VirtualFileMeta wrapper type.
func (m *FileMetadata) VirtualMeta() VirtualFileMeta {
	if !m.Virtual {
		panic("pebble: file metadata does not belong to a virtual sstable")
	}
	return VirtualFileMeta{m}
}
// BackingState either backs a single physical sstable, or one or more virtual
// sstables.
//
// See the comment above the FileMetadata type for sstable terminology.
type BackingState struct {
	// Atomic groups the fields accessed with sync/atomic. It is placed first
	// so the 64-bit fields are 64-bit aligned (required on 32-bit platforms).
	Atomic struct {
		// refs is the reference count for the backing file on disk:
		// incremented when a physical or virtual sstable which is backed by
		// the BackingState is added to a version and decremented when the
		// version is unreferenced. We ref count in order to determine when it
		// is safe to delete a backing sst file from disk. The backing file is
		// obsolete when the reference count falls to zero.
		refs int32
		// latestVersionRefs are the references to the BackingState in the
		// latest version. This reference can be through a single physical
		// sstable in the latest version, or one or more virtual sstables in the
		// latest version.
		//
		// INVARIANT: latestVersionRefs <= refs.
		latestVersionRefs int32
		// VirtualizedSize is set iff the backing sst is only referred to by
		// virtual ssts. VirtualizedSize is the sum of the virtual sstable sizes
		// of all of the virtual sstables in the latest version which are backed
		// by the physical sstable. When a virtual sstable is removed from the
		// latest version, we will decrement the VirtualizedSize. During
		// compaction picking, we'll compensate a virtual sstable file size by
		// (BackingState.Size - BackingState.VirtualizedSize) / latestVersionRefs.
		// The intuition is that if BackingState.Size - BackingState.VirtualizedSize
		// is high, then the space amplification due to virtual sstables is
		// high, and we should pick the virtual sstable with a higher priority.
		//
		// TODO(bananabrick): Compensate the virtual sstable file size using
		// the VirtualizedSize during compaction picking and test.
		VirtualizedSize uint64
	}
	// FileNum is the file number of the backing file on disk.
	FileNum base.FileNum
	// Size is the size of the backing file on disk, in bytes.
	Size uint64
}
// Init allocates and sets the BackingState which is required by a physical
// sstable FileMetadata. It is a no-op if a BackingState is already present.
//
// Ensure that the state required by BackingState, such as the FileNum, is
// already set on the FileMetadata before Init is called. Calling Init only
// after the relevant state has been set in the FileMetadata is not necessary
// in tests which don't rely on BackingState.
func (m *FileMetadata) Init() {
	// Virtual sstables share an existing backing; they must never allocate one.
	if m.Virtual {
		panic("pebble: virtual sstables should use a pre-existing BackingState")
	}
	if m.BackingState != nil {
		return
	}
	m.BackingState = &BackingState{
		FileNum: m.FileNum,
		Size:    m.Size,
	}
}
// ValidateVirtual should be called once the FileMetadata for a virtual sstable
// is created to verify that the fields of the virtual sstable are sound. It
// panics on any inconsistency between the virtual sstable and the sstable it
// was created from.
func (m *FileMetadata) ValidateVirtual(createdFrom *FileMetadata) {
	switch {
	case !m.Virtual:
		panic("pebble: invalid virtual sstable")
	case createdFrom.SmallestSeqNum != m.SmallestSeqNum:
		panic("pebble: invalid smallest sequence number for virtual sstable")
	case createdFrom.LargestSeqNum != m.LargestSeqNum:
		panic("pebble: invalid largest sequence number for virtual sstable")
	case createdFrom.BackingState != nil && createdFrom.BackingState != m.BackingState:
		panic("pebble: invalid physical sstable state for virtual sstable")
	}
}
// Refs returns the current reference count of the backing sstable.
func (m *FileMetadata) Refs() int32 {
	return atomic.LoadInt32(&m.BackingState.Atomic.refs)
}

// Ref increments the ref count associated with the backing sstable.
func (m *FileMetadata) Ref() {
	atomic.AddInt32(&m.BackingState.Atomic.refs, 1)
}

// Unref decrements the ref count associated with the backing sstable and
// returns the new count. A return value of zero indicates the backing file is
// obsolete.
func (m *FileMetadata) Unref() int32 {
	return atomic.AddInt32(&m.BackingState.Atomic.refs, -1)
}
// LatestRef increments the latest-version ref count associated with the
// backing sstable. For a virtual sstable, the virtual size is also added to
// the backing file's VirtualizedSize accumulator.
func (m *FileMetadata) LatestRef() {
	atomic.AddInt32(&m.BackingState.Atomic.latestVersionRefs, 1)

	if m.Virtual {
		atomic.AddUint64(&m.BackingState.Atomic.VirtualizedSize, m.Size)
	}
}

// LatestUnref decrements the latest-version ref count associated with the
// backing sstable and returns the new count.
func (m *FileMetadata) LatestUnref() int32 {
	if m.Virtual {
		// NB: adding -m.Size to a uint64 subtracts via two's-complement
		// wraparound; this is the documented way to subtract with
		// atomic.AddUint64.
		atomic.AddUint64(&m.BackingState.Atomic.VirtualizedSize, -m.Size)
	}

	return atomic.AddInt32(&m.BackingState.Atomic.latestVersionRefs, -1)
}

// LatestRefs returns the latest-version ref count associated with the backing
// sstable.
func (m *FileMetadata) LatestRefs() int32 {
	return atomic.LoadInt32(&m.BackingState.Atomic.latestVersionRefs)
}
// SetCompactionState transitions this file's compaction state to the given
// state. When invariants are enabled, an invalid transition (see the state
// diagram on CompactionState) panics. Protected by DB.mu.
func (m *FileMetadata) SetCompactionState(to CompactionState) {
	if invariants.Enabled {
		ok := false
		switch m.CompactionState {
		case CompactionStateNotCompacting:
			// The only legal move out of NotCompacting is into Compacting.
			ok = to == CompactionStateCompacting
		case CompactionStateCompacting:
			// A compaction either completes, or is abandoned/move-compacted.
			ok = to == CompactionStateCompacted || to == CompactionStateNotCompacting
		case CompactionStateCompacted:
			// Compacted is terminal.
			ok = false
		default:
			panic(fmt.Sprintf("pebble: unknown compaction state: %d", m.CompactionState))
		}
		if !ok {
			panic(errors.Newf("pebble: invalid compaction state transition: %s -> %s", m.CompactionState, to))
		}
	}
	m.CompactionState = to
}
// IsCompacting returns true if this file's compaction state is
// CompactionStateCompacting. Protected by DB.mu.
func (m *FileMetadata) IsCompacting() bool {
	return m.CompactionState == CompactionStateCompacting
}

// StatsValid returns true if the table stats have been populated. If StatsValid
// returns true, the Stats field may be read (with or without holding the
// database mutex).
func (m *FileMetadata) StatsValid() bool {
	return atomic.LoadUint32(&m.Atomic.statsValid) == 1
}

// StatsValidLocked returns true if the table stats have been populated.
// StatsValidLocked requires DB.mu is held when it's invoked, and it avoids the
// overhead of an atomic load. This is possible because table stats validity is
// only set while DB.mu is held.
func (m *FileMetadata) StatsValidLocked() bool {
	return m.Atomic.statsValid == 1
}

// StatsMarkValid marks the TableStats as valid. The caller must hold DB.mu
// while populating TableStats and calling StatsMarkValid. Once stats are
// populated, they must not be mutated.
func (m *FileMetadata) StatsMarkValid() {
	atomic.StoreUint32(&m.Atomic.statsValid, 1)
}
// ExtendPointKeyBounds attempts to extend the lower and upper point key bounds
// and overall table bounds with the given smallest and largest keys. The
// smallest and largest bounds may not be extended if the table already has a
// bound that is smaller or larger, respectively. The receiver is returned.
//
// NB: calling this method should be preferred to manually setting the bounds
// by manipulating the fields directly, to maintain certain invariants.
func (m *FileMetadata) ExtendPointKeyBounds(
	cmp Compare, smallest, largest InternalKey,
) *FileMetadata {
	if m.HasPointKeys {
		// Widen the existing point key bounds where the new keys extend them.
		if base.InternalCompare(cmp, smallest, m.SmallestPointKey) < 0 {
			m.SmallestPointKey = smallest
		}
		if base.InternalCompare(cmp, largest, m.LargestPointKey) > 0 {
			m.LargestPointKey = largest
		}
	} else {
		// First point keys seen; adopt the given bounds verbatim.
		m.HasPointKeys = true
		m.SmallestPointKey = smallest
		m.LargestPointKey = largest
	}
	// Fold the (possibly widened) point key bounds into the overall bounds.
	m.extendOverallBounds(cmp, m.SmallestPointKey, m.LargestPointKey, boundTypePointKey)
	return m
}
// ExtendRangeKeyBounds attempts to extend the lower and upper range key bounds
// and overall table bounds with the given smallest and largest keys. The
// smallest and largest bounds may not be extended if the table already has a
// bound that is smaller or larger, respectively. The receiver is returned.
//
// NB: calling this method should be preferred to manually setting the bounds
// by manipulating the fields directly, to maintain certain invariants.
func (m *FileMetadata) ExtendRangeKeyBounds(
	cmp Compare, smallest, largest InternalKey,
) *FileMetadata {
	if m.HasRangeKeys {
		// Widen the existing range key bounds where the new keys extend them.
		if base.InternalCompare(cmp, smallest, m.SmallestRangeKey) < 0 {
			m.SmallestRangeKey = smallest
		}
		if base.InternalCompare(cmp, largest, m.LargestRangeKey) > 0 {
			m.LargestRangeKey = largest
		}
	} else {
		// First range keys seen; adopt the given bounds verbatim.
		m.HasRangeKeys = true
		m.SmallestRangeKey = smallest
		m.LargestRangeKey = largest
	}
	// Fold the (possibly widened) range key bounds into the overall bounds.
	m.extendOverallBounds(cmp, m.SmallestRangeKey, m.LargestRangeKey, boundTypeRangeKey)
	return m
}
// extendOverallBounds attempts to extend the overall table lower and upper
// bounds. The given bounds may not be used if a lower or upper bound already
// exists that is smaller or larger than the given keys, respectively. The
// given boundType will be used if the bounds are updated.
func (m *FileMetadata) extendOverallBounds(
	cmp Compare, smallest, largest InternalKey, bTyp boundType,
) {
	if !m.boundsSet {
		// No bounds yet; adopt both keys and their bound type outright.
		m.boundsSet = true
		m.Smallest, m.boundTypeSmallest = smallest, bTyp
		m.Largest, m.boundTypeLargest = largest, bTyp
		return
	}
	if base.InternalCompare(cmp, smallest, m.Smallest) < 0 {
		m.Smallest, m.boundTypeSmallest = smallest, bTyp
	}
	if base.InternalCompare(cmp, largest, m.Largest) > 0 {
		m.Largest, m.boundTypeLargest = largest, bTyp
	}
}
// Overlaps returns true if the file key range overlaps with the given range
// [start, end] (or [start, end) when exclusiveEnd is true). An exclusive
// sentinel largest key is treated as excluding its user key.
func (m *FileMetadata) Overlaps(cmp Compare, start []byte, end []byte, exclusiveEnd bool) bool {
	// The file lies entirely before the range: no overlap.
	if c := cmp(m.Largest.UserKey, start); c < 0 || (c == 0 && m.Largest.IsExclusiveSentinel()) {
		return false
	}
	// Overlap unless the file lies entirely after the range.
	c := cmp(m.Smallest.UserKey, end)
	return c < 0 || (c == 0 && !exclusiveEnd)
}
// ContainsKeyType returns whether or not the file contains keys of the
// provided type. It panics on an unrecognized KeyType.
func (m *FileMetadata) ContainsKeyType(kt KeyType) bool {
	switch kt {
	case KeyTypePoint:
		return m.HasPointKeys
	case KeyTypeRange:
		return m.HasRangeKeys
	case KeyTypePointAndRange:
		// Every file contains at least one of the two key kinds.
		return true
	}
	panic("unrecognized key type")
}
// SmallestBound returns the file's smallest bound of the key type. It returns
// a false second return value if the file does not contain any keys of the
// key type. It panics on an unrecognized KeyType.
func (m *FileMetadata) SmallestBound(kt KeyType) (*InternalKey, bool) {
	switch kt {
	case KeyTypePoint:
		return &m.SmallestPointKey, m.HasPointKeys
	case KeyTypeRange:
		return &m.SmallestRangeKey, m.HasRangeKeys
	case KeyTypePointAndRange:
		// The overall smallest bound always exists.
		return &m.Smallest, true
	}
	panic("unrecognized key type")
}
// LargestBound returns the file's largest bound of the key type. It returns a
// false second return value if the file does not contain any keys of the key
// type. It panics on an unrecognized KeyType.
func (m *FileMetadata) LargestBound(kt KeyType) (*InternalKey, bool) {
	switch kt {
	case KeyTypePoint:
		return &m.LargestPointKey, m.HasPointKeys
	case KeyTypeRange:
		return &m.LargestRangeKey, m.HasRangeKeys
	case KeyTypePointAndRange:
		// The overall largest bound always exists.
		return &m.Largest, true
	}
	panic("unrecognized key type")
}
// Bit positions within the bounds marker byte; see boundsMarker.
const (
	maskContainsPointKeys = 1 << 0
	maskSmallest = 1 << 1
	maskLargest = 1 << 2
)
// boundsMarker returns a marker byte whose bits encode the following
// information (in order from least significant bit):
//   - if the table contains point keys
//   - if the table's smallest key is a point key
//   - if the table's largest key is a point key
//
// A corruption error is returned if either bound type is unset.
func (m *FileMetadata) boundsMarker() (uint8, error) {
	var marker uint8
	if m.HasPointKeys {
		marker |= maskContainsPointKeys
	}
	// Smallest bound: set the bit for a point key, leave it clear for a range
	// key, and reject anything else.
	if m.boundTypeSmallest == boundTypePointKey {
		marker |= maskSmallest
	} else if m.boundTypeSmallest != boundTypeRangeKey {
		return 0, base.CorruptionErrorf("file %s has neither point nor range key as smallest key", m.FileNum)
	}
	// Largest bound: same encoding as the smallest bound.
	if m.boundTypeLargest == boundTypePointKey {
		marker |= maskLargest
	} else if m.boundTypeLargest != boundTypeRangeKey {
		return 0, base.CorruptionErrorf("file %s has neither point nor range key as largest key", m.FileNum)
	}
	return marker, nil
}
// String implements fmt.Stringer, printing the file number and the overall
// table bounds, e.g. "000123:[a#1,SET-z#2,SET]".
func (m *FileMetadata) String() string {
	return fmt.Sprintf("%s:[%s-%s]", m.FileNum, m.Smallest, m.Largest)
}
// DebugString returns a verbose representation of FileMetadata, typically for
// use in tests and debugging, returning the file number and the point, range
// and overall bounds for the table. When verbose is false, only the overall
// bounds are included.
func (m *FileMetadata) DebugString(format base.FormatKey, verbose bool) string {
	var buf bytes.Buffer
	fmt.Fprintf(&buf, "%s:[%s-%s]",
		m.FileNum, m.Smallest.Pretty(format), m.Largest.Pretty(format))
	if verbose {
		if m.HasPointKeys {
			fmt.Fprintf(&buf, " points:[%s-%s]",
				m.SmallestPointKey.Pretty(format), m.LargestPointKey.Pretty(format))
		}
		if m.HasRangeKeys {
			fmt.Fprintf(&buf, " ranges:[%s-%s]",
				m.SmallestRangeKey.Pretty(format), m.LargestRangeKey.Pretty(format))
		}
	}
	return buf.String()
}
// ParseFileMetadataDebug parses a FileMetadata from its DebugString
// representation. It is the inverse of DebugString and is intended for use in
// tests.
func ParseFileMetadataDebug(s string) (m FileMetadata, err error) {
	// Split lines of the form:
	//   000000:[a#0,SET-z#0,SET] points:[...] ranges:[...]
	// into flat fields; the separators and whitespace are discarded, so every
	// section contributes exactly three fields: a prefix (file number,
	// "points" or "ranges") followed by the smallest and largest keys.
	fields := strings.FieldsFunc(s, func(c rune) bool {
		switch c {
		case ':', '[', '-', ']':
			return true
		default:
			return unicode.IsSpace(c) // NB: also trim whitespace padding.
		}
	})
	if len(fields)%3 != 0 {
		return m, errors.Newf("malformed input: %s", s)
	}
	// Consume the fields three at a time.
	for len(fields) > 0 {
		prefix := fields[0]
		smallest := base.ParsePrettyInternalKey(fields[1])
		largest := base.ParsePrettyInternalKey(fields[2])
		switch prefix {
		case "points":
			m.SmallestPointKey, m.LargestPointKey = smallest, largest
			m.HasPointKeys = true
		case "ranges":
			m.SmallestRangeKey, m.LargestRangeKey = smallest, largest
			m.HasRangeKeys = true
		default:
			// Any other prefix must be the numeric file number introducing the
			// overall bounds section.
			fileNum, err := strconv.ParseUint(prefix, 10, 64)
			if err != nil {
				return m, errors.Newf("malformed input: %s: %s", s, err)
			}
			m.FileNum = base.FileNum(fileNum)
			m.Smallest, m.Largest = smallest, largest
			m.boundsSet = true
		}
		fields = fields[3:]
	}
	// By default, when the parser sees just the overall bounds, we set the point
	// keys. This preserves backwards compatibility with existing test cases that
	// specify only the overall bounds.
	if !m.HasPointKeys && !m.HasRangeKeys {
		m.SmallestPointKey, m.LargestPointKey = m.Smallest, m.Largest
		m.HasPointKeys = true
	}
	m.Init()
	return
}
// Validate validates the metadata for consistency with itself, returning an
// error if inconsistent. It checks the ordering of the overall, point, and
// range key bounds, the seqnum bounds, the containment of the per-type bounds
// within the overall bounds, and that Init was called.
func (m *FileMetadata) Validate(cmp Compare, formatKey base.FormatKey) error {
	// Combined range and point key validation.
	if !m.HasPointKeys && !m.HasRangeKeys {
		return base.CorruptionErrorf("file %s has neither point nor range keys",
			errors.Safe(m.FileNum))
	}
	if base.InternalCompare(cmp, m.Smallest, m.Largest) > 0 {
		return base.CorruptionErrorf("file %s has inconsistent bounds: %s vs %s",
			errors.Safe(m.FileNum), m.Smallest.Pretty(formatKey),
			m.Largest.Pretty(formatKey))
	}
	if m.SmallestSeqNum > m.LargestSeqNum {
		return base.CorruptionErrorf("file %s has inconsistent seqnum bounds: %d vs %d",
			errors.Safe(m.FileNum), m.SmallestSeqNum, m.LargestSeqNum)
	}

	// Point key validation: the point bounds must be ordered and contained
	// within the overall bounds.
	if m.HasPointKeys {
		if base.InternalCompare(cmp, m.SmallestPointKey, m.LargestPointKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent point key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestPointKey.Pretty(formatKey),
				m.LargestPointKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestPointKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestPointKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent point key bounds relative to overall bounds: "+
					"overall = [%s-%s], point keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestPointKey.Pretty(formatKey), m.LargestPointKey.Pretty(formatKey),
			)
		}
	}

	// Range key validation: the range bounds must be ordered and contained
	// within the overall bounds.
	if m.HasRangeKeys {
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.LargestRangeKey) > 0 {
			return base.CorruptionErrorf("file %s has inconsistent range key bounds: %s vs %s",
				errors.Safe(m.FileNum), m.SmallestRangeKey.Pretty(formatKey),
				m.LargestRangeKey.Pretty(formatKey))
		}
		if base.InternalCompare(cmp, m.SmallestRangeKey, m.Smallest) < 0 ||
			base.InternalCompare(cmp, m.LargestRangeKey, m.Largest) > 0 {
			return base.CorruptionErrorf(
				"file %s has inconsistent range key bounds relative to overall bounds: "+
					"overall = [%s-%s], range keys = [%s-%s]",
				errors.Safe(m.FileNum),
				m.Smallest.Pretty(formatKey), m.Largest.Pretty(formatKey),
				m.SmallestRangeKey.Pretty(formatKey), m.LargestRangeKey.Pretty(formatKey),
			)
		}
	}

	// Ensure that FileMetadata.Init was called.
	if m.BackingState == nil {
		return base.CorruptionErrorf("file metadata BackingState not set")
	}

	return nil
}
// TableInfo returns a subset of the FileMetadata state formatted as a
// TableInfo. Only the file number, size, overall key bounds, and seqnum
// bounds are copied over.
func (m *FileMetadata) TableInfo() TableInfo {
	var info TableInfo
	info.FileNum = m.FileNum
	info.Size = m.Size
	info.Smallest = m.Smallest
	info.Largest = m.Largest
	info.SmallestSeqNum = m.SmallestSeqNum
	info.LargestSeqNum = m.LargestSeqNum
	return info
}
// cmpUint64 compares a and b, returning -1 if a < b, +1 if a > b, and 0 if
// they are equal.
func cmpUint64(a, b uint64) int {
	if a < b {
		return -1
	}
	if a > b {
		return +1
	}
	return 0
}
// cmpSeqNum orders two files by their sequence numbers: first by largest
// seqnum, then by smallest seqnum, with file number as the final tie-break.
//
// NB: This is the same ordering that RocksDB uses for L0 files.
func (m *FileMetadata) cmpSeqNum(b *FileMetadata) int {
	// Sort first by largest sequence number.
	if c := cmpUint64(m.LargestSeqNum, b.LargestSeqNum); c != 0 {
		return c
	}
	// Then by smallest sequence number.
	if c := cmpUint64(m.SmallestSeqNum, b.SmallestSeqNum); c != 0 {
		return c
	}
	// Break ties by file number.
	return cmpUint64(uint64(m.FileNum), uint64(b.FileNum))
}
// lessSeqNum reports whether m sorts before b under the cmpSeqNum ordering
// (largest seqnum, then smallest seqnum, then file number).
func (m *FileMetadata) lessSeqNum(b *FileMetadata) bool {
	return m.cmpSeqNum(b) < 0
}
// cmpSmallestKey compares the smallest internal keys of m and b using the
// supplied user-key comparison function.
func (m *FileMetadata) cmpSmallestKey(b *FileMetadata, cmp Compare) int {
	return base.InternalCompare(cmp, m.Smallest, b.Smallest)
}
// KeyRange returns the minimum smallest and maximum largest internalKey for
// all the FileMetadata in iters. If all the iterators are empty, the zero
// values of smallest and largest are returned.
func KeyRange(ucmp Compare, iters ...LevelIterator) (smallest, largest InternalKey) {
	initialized := false
	for _, iter := range iters {
		for meta := iter.First(); meta != nil; meta = iter.Next() {
			if !initialized {
				// Seed the bounds from the first file encountered.
				initialized = true
				smallest, largest = meta.Smallest, meta.Largest
				continue
			}
			// Widen the bounds to include this file's key range.
			if base.InternalCompare(ucmp, smallest, meta.Smallest) >= 0 {
				smallest = meta.Smallest
			}
			if base.InternalCompare(ucmp, largest, meta.Largest) <= 0 {
				largest = meta.Largest
			}
		}
	}
	return smallest, largest
}
// bySeqNum implements sort.Interface, ordering files by increasing sequence
// number as defined by FileMetadata.lessSeqNum.
type bySeqNum []*FileMetadata

func (b bySeqNum) Len() int { return len(b) }
func (b bySeqNum) Less(i, j int) bool {
	return b[i].lessSeqNum(b[j])
}
func (b bySeqNum) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
// SortBySeqNum sorts the specified files by increasing sequence number
// (largest seqnum first, then smallest seqnum, then file number), in place.
func SortBySeqNum(files []*FileMetadata) {
	sort.Sort(bySeqNum(files))
}
// bySmallest implements sort.Interface, ordering files by their smallest
// internal key under the configured user-key comparison function.
type bySmallest struct {
	files []*FileMetadata
	cmp   Compare
}

func (b bySmallest) Len() int { return len(b.files) }
func (b bySmallest) Less(i, j int) bool {
	return b.files[i].cmpSmallestKey(b.files[j], b.cmp) < 0
}
func (b bySmallest) Swap(i, j int) { b.files[i], b.files[j] = b.files[j], b.files[i] }
// SortBySmallest sorts the specified files by smallest key using the supplied
// comparison function to order user keys. The sort is performed in place.
func SortBySmallest(files []*FileMetadata, cmp Compare) {
	sort.Sort(bySmallest{files, cmp})
}
// overlaps returns a LevelSlice bounding the files within iter whose key
// ranges overlap the user-key range [start, end] ([start, end) when
// exclusiveEnd is true). The start and end positions are established by
// seeking two independent clones of iter; the file values returned by the
// seeks are used only for positioning and are otherwise discarded.
func overlaps(iter LevelIterator, cmp Compare, start, end []byte, exclusiveEnd bool) LevelSlice {
	startIter := iter.Clone()
	{
		startIterFile := startIter.SeekGE(cmp, start)
		// SeekGE compares user keys. The user key `start` may be equal to the
		// f.Largest because f.Largest is a range deletion sentinel, indicating
		// that the user key `start` is NOT contained within the file f. If
		// that's the case, we can narrow the overlapping bounds to exclude the
		// file with the sentinel.
		if startIterFile != nil && startIterFile.Largest.IsExclusiveSentinel() &&
			cmp(startIterFile.Largest.UserKey, start) == 0 {
			startIterFile = startIter.Next()
		}
		_ = startIterFile // Ignore unused assignment.
	}
	endIter := iter.Clone()
	{
		endIterFile := endIter.SeekGE(cmp, end)
		if !exclusiveEnd {
			// endIter is now pointing at the *first* file with a largest key >= end.
			// If there are multiple files including the user key `end`, we want all
			// of them, so move forward.
			for endIterFile != nil && cmp(endIterFile.Largest.UserKey, end) == 0 {
				endIterFile = endIter.Next()
			}
		}
		// LevelSlice uses inclusive bounds, so if we seeked to the end sentinel
		// or nexted too far because Largest.UserKey equaled `end`, go back.
		//
		// Consider !exclusiveEnd and end = 'f', with the following file bounds:
		//
		//     [b,d] [e, f] [f, f] [g, h]
		//
		// the above for loop will Next until it arrives at [g, h]. We need to
		// observe that g > f, and Prev to the file with bounds [f, f].
		if endIterFile == nil {
			// Ran off the end of the level; back up to the last file.
			endIterFile = endIter.Prev()
		} else if c := cmp(endIterFile.Smallest.UserKey, end); c > 0 || c == 0 && exclusiveEnd {
			// The current file begins past `end`; back up to the previous file.
			endIterFile = endIter.Prev()
		}
		_ = endIterFile // Ignore unused assignment.
	}
	// The returned slice is bounded inclusively by the positions of startIter
	// and endIter.
	return newBoundedLevelSlice(startIter.Clone().iter, &startIter.iter, &endIter.iter)
}
// NumLevels is the number of levels a Version contains (L0 through L6).
const NumLevels = 7
// NewVersion constructs a new Version with the provided files. It requires
// the provided files are already well-ordered. It's intended for testing.
func NewVersion(
	cmp Compare, formatKey base.FormatKey, flushSplitBytes int64, files [NumLevels][]*FileMetadata,
) *Version {
	var v Version
	for level := range files {
		// NB: We specifically insert `files` into the B-Tree in the order
		// they appear within `files`. Some tests depend on this behavior in
		// order to test consistency checking, etc. Once we've constructed the
		// initial B-Tree, we swap out the btreeCmp for the correct one.
		// TODO(jackson): Adjust or remove the tests and remove this.
		v.Levels[level].tree, _ = makeBTree(btreeCmpSpecificOrder(files[level]), files[level])
		v.Levels[level].level = level
		// L0 files are ordered by sequence number; all other levels are
		// ordered by smallest key.
		switch level {
		case 0:
			v.Levels[level].tree.cmp = btreeCmpSeqNum
		default:
			v.Levels[level].tree.cmp = btreeCmpSmallestKey(cmp)
		}
	}
	if err := v.InitL0Sublevels(cmp, formatKey, flushSplitBytes); err != nil {
		panic(err)
	}
	return &v
}
// Version is a collection of file metadata for on-disk tables at various
// levels. In-memory DBs are written to level-0 tables, and compactions
// migrate data from level N to level N+1. The tables map internal keys (which
// are a user key, a delete or set bit, and a sequence number) to user values.
//
// The tables at level 0 are sorted by largest sequence number. Due to file
// ingestion, there may be overlap in the ranges of sequence numbers contained in
// level 0 sstables. In particular, it is valid for one level 0 sstable to have
// the seqnum range [1,100] while an adjacent sstable has the seqnum range
// [50,50]. This occurs when the [50,50] table was ingested and given a global
// seqnum. The ingestion code will have ensured that the [50,50] sstable will
// not have any keys that overlap with the [1,100] in the seqnum range
// [1,49]. The range of internal keys [fileMetadata.smallest,
// fileMetadata.largest] in each level 0 table may overlap.
//
// The tables at any non-0 level are sorted by their internal key range and any
// two tables at the same non-0 level do not overlap.
//
// The internal key ranges of two tables at different levels X and Y may
// overlap, for any X != Y.
//
// Finally, for every internal key in a table at level X, there is no internal
// key in a higher level table that has both the same user key and a higher
// sequence number.