Skip to content

Commit

Permalink
kvserver,storage: Update snapshot strategy to use shared storage
Browse files Browse the repository at this point in the history
If the sender node was created with a SharedStorage, switch to
fast ingestion where we ScanInternal() the keys not in shared
levels, and just share the metadata for files in shared levels.
The sender of the snapshot specifies in the Header that it
is using this ability, and the receiver rejects the snapshot if
it cannot accept shared snapshots.

If ScanInternal() returns an `ErrInvalidSkipSharedIteration`,
we switch back to old-style snapshots where the entirety
of the range is sent over the stream as SnapshotRequests.

Future changes will add better support for detection of when
different nodes point to different blob storage buckets / shared
storage locations, and incorporate that in rebalancing.

Fixes #103028.

Release note (general change): Takes advantage of new CLI option,
`--experimental-shared-storage` to rebalance faster from node to node.
  • Loading branch information
itsbilal committed Aug 16, 2023
1 parent e160427 commit 02a4b7e
Show file tree
Hide file tree
Showing 12 changed files with 602 additions and 123 deletions.
1 change: 1 addition & 0 deletions pkg/kv/kvserver/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,7 @@ go_library(
"@com_github_cockroachdb_pebble//:pebble",
"@com_github_cockroachdb_pebble//objstorage",
"@com_github_cockroachdb_pebble//objstorage/remote",
"@com_github_cockroachdb_pebble//rangekey",
"@com_github_cockroachdb_pebble//vfs",
"@com_github_cockroachdb_redact//:redact",
"@com_github_gogo_protobuf//proto",
Expand Down
58 changes: 58 additions & 0 deletions pkg/kv/kvserver/kvserverpb/raft.proto
Original file line number Diff line number Diff line change
Expand Up @@ -249,16 +249,74 @@ message SnapshotRequest {
// from a particular sending source.
double sender_queue_priority = 11;

// If true, the snapshot could contain shared files present in a pre-configured
// or explicitly specified shared.Storage instance. Such files will have their
// metadata present in the snapshot, but not file contents.
bool shared_replicate = 12;

reserved 1, 4;
}

// SharedTable represents one shared SSTable present in shared storage.
// Intended to be the protobuf version of pebble.SharedSSTMeta.
message SharedTable {
// Internal key represents a Pebble-internal key. See pebble.InternalKey
// for details on how these keys are used.
message InternalKey {
// User key portion of the internal key.
bytes user_key = 1;
// Trailer portion of the internal key, as defined by Pebble.
uint64 trailer = 2;
}

// Used by the Pebble objstorage package to resolve a reference to a shared object.
bytes backing = 1;

// Used by the Pebble objstorage package to generate new blob storage drivers.
// Reserved for future use.
bytes locator = 2;

// Smallest internal key in the sstable.
InternalKey smallest = 3;
// Largest internal key in the sstable.
InternalKey largest = 4;
// Smallest range key in the sstable. Zero value if no range keys are
// present.
InternalKey smallest_range_key = 5;
// Largest range key in the sstable. Zero value if no range keys are
// present.
InternalKey largest_range_key = 6;
// Smallest point key in the sstable. Zero value if no point keys are
// present.
InternalKey smallest_point_key = 7;
// Largest point key in the sstable. Zero value if no point keys are
// present.
InternalKey largest_point_key = 8;

// LSM level of the original sstable. This sstable will go into the same
// level in the destination LSM.
int32 level = 9;
// Physical size of the sstable in bytes.
uint64 size = 10;
}

Header header = 1;

// A BatchRepr. Multiple kv_batches may be sent across multiple request messages.
bytes kv_batch = 2 [(gogoproto.customname) = "KVBatch"];

bool final = 4;

repeated SharedTable shared_tables = 5 [(gogoproto.nullable) = false];

// If true, signals the receiver that the sender can no longer replicate
// using shared files, even though the Header initially contained
// shared_replicate = true. All contents of this range will be streamed as
// usual beyond this point. This bool must be set to true in a request before
// the end of the snapshot (i.e. before the final = true request), and this
// flag must be set to true before any user keys are streamed.
bool transition_from_shared_to_regular_replicate = 6;

reserved 3;
}

Expand Down
13 changes: 8 additions & 5 deletions pkg/kv/kvserver/rditer/replica_data_iter.go
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,10 @@ func (ri *ReplicaMVCCDataIterator) HasPointAndRange() (bool, bool) {

// IterateReplicaKeySpans iterates over each of a range's key spans, and calls
// the given visitor with an iterator over its data. Specifically, it iterates
// over the spans returned by either makeAllKeySpans or MakeReplicatedKeySpans,
// and for each one provides first a point key iterator and then a range key
// iterator. This is the expected order for Raft snapshots.
// over the spans returned by a Select() over all spans or replicated only spans
// (with replicatedSpansFilter applied on replicated spans), and for each one
// provides first a point key iterator and then a range key iterator. This is the
// expected order for Raft snapshots.
//
// The iterator will be pre-seeked to the span, and is provided along with the
// key span and key type (point or range). Iterators that have no data are
Expand All @@ -328,9 +329,11 @@ func IterateReplicaKeySpans(
var spans []roachpb.Span
if replicatedOnly {
spans = Select(desc.RangeID, SelectOpts{
ReplicatedSpansFilter: replicatedSpansFilter,
ReplicatedBySpan: desc.RSpan(),
ReplicatedByRangeID: true,
ReplicatedSpansFilter: replicatedSpansFilter,
// NB: We exclude ReplicatedByRangeID if replicatedSpansFilter is
// ReplicatedSpansUserOnly.
ReplicatedByRangeID: replicatedSpansFilter != ReplicatedSpansUserOnly,
})
} else {
spans = Select(desc.RangeID, SelectOpts{
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
echo
----
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
| SPAN | KEY HEX | ENDKEY HEX | VERSION HEX | PRETTY |
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
| /Local/RangeID/1/{r""-s""} | 016989726162632d120ce61c175eb445878c36dcf4062ada4c0001 | | | /Local/RangeID/1/r/AbortSpan/"0ce61c17-5eb4-4587-8c36-dcf4062ada4c" |
| /Local/RangeID/1/{r""-s""} | 016989726162632d129855a1ef8eb94c06a106cab1dda78a2b0001 | | | /Local/RangeID/1/r/AbortSpan/"9855a1ef-8eb9-4c06-a106-cab1dda78a2b" |
| /Local/RangeID/1/{r""-s""} | 016989726c67632d | | | /Local/RangeID/1/r/RangeGCThreshold |
| /Local/RangeID/1/{r""-s""} | 016989727261736b | | | /Local/RangeID/1/r/RangeAppliedState |
| /Local/RangeID/1/{r""-s""} | 01698972726c6c2d | | | /Local/RangeID/1/r/RangeLease |
| /Local/RangeID/1/{r""-s""} | 016989723a61 | 016989723a78 | 000000000000000109 | /Local/RangeID/1/r":{a"-x"}/0.000000001,0 |
| {a-b} | 61 | | 0000000000000001 | "a"/0.000000001,0 |
| {a-b} | 61ffffffff | | 0000000000000001 | "a\xff\xff\xff\xff"/0.000000001,0 |
| {a-b} | 61 | 62 | 000000000000000109 | {a-b}/0.000000001,0 |
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
+-------+------------+------------+--------------------+-----------------------------------+
| SPAN | KEY HEX | ENDKEY HEX | VERSION HEX | PRETTY |
+-------+------------+------------+--------------------+-----------------------------------+
| {a-b} | 61 | | 0000000000000001 | "a"/0.000000001,0 |
| {a-b} | 61ffffffff | | 0000000000000001 | "a\xff\xff\xff\xff"/0.000000001,0 |
| {a-b} | 61 | 62 | 000000000000000109 | {a-b}/0.000000001,0 |
+-------+------------+------------+--------------------+-----------------------------------+
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
echo
----
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
| SPAN | KEY HEX | ENDKEY HEX | VERSION HEX | PRETTY |
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
| /Local/RangeID/2/{r""-s""} | 01698a726162632d120ce61c175eb445878c36dcf4062ada4c0001 | | | /Local/RangeID/2/r/AbortSpan/"0ce61c17-5eb4-4587-8c36-dcf4062ada4c" |
| /Local/RangeID/2/{r""-s""} | 01698a726162632d129855a1ef8eb94c06a106cab1dda78a2b0001 | | | /Local/RangeID/2/r/AbortSpan/"9855a1ef-8eb9-4c06-a106-cab1dda78a2b" |
| /Local/RangeID/2/{r""-s""} | 01698a726c67632d | | | /Local/RangeID/2/r/RangeGCThreshold |
| /Local/RangeID/2/{r""-s""} | 01698a727261736b | | | /Local/RangeID/2/r/RangeAppliedState |
| /Local/RangeID/2/{r""-s""} | 01698a72726c6c2d | | | /Local/RangeID/2/r/RangeLease |
| /Local/RangeID/2/{r""-s""} | 01698a723a61 | 01698a723a78 | 000000000000000109 | /Local/RangeID/2/r":{a"-x"}/0.000000001,0 |
| {b-c} | 62 | | 0000000000000001 | "b"/0.000000001,0 |
| {b-c} | 62ffffffff | | 0000000000000001 | "b\xff\xff\xff\xff"/0.000000001,0 |
| {b-c} | 62 | 63 | 000000000000000109 | {b-c}/0.000000001,0 |
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
+-------+------------+------------+--------------------+-----------------------------------+
| SPAN | KEY HEX | ENDKEY HEX | VERSION HEX | PRETTY |
+-------+------------+------------+--------------------+-----------------------------------+
| {b-c} | 62 | | 0000000000000001 | "b"/0.000000001,0 |
| {b-c} | 62ffffffff | | 0000000000000001 | "b\xff\xff\xff\xff"/0.000000001,0 |
| {b-c} | 62 | 63 | 000000000000000109 | {b-c}/0.000000001,0 |
+-------+------------+------------+--------------------+-----------------------------------+
Original file line number Diff line number Diff line change
@@ -1,15 +1,9 @@
echo
----
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
| SPAN | KEY HEX | ENDKEY HEX | VERSION HEX | PRETTY |
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
| /Local/RangeID/3/{r""-s""} | 01698b726162632d120ce61c175eb445878c36dcf4062ada4c0001 | | | /Local/RangeID/3/r/AbortSpan/"0ce61c17-5eb4-4587-8c36-dcf4062ada4c" |
| /Local/RangeID/3/{r""-s""} | 01698b726162632d129855a1ef8eb94c06a106cab1dda78a2b0001 | | | /Local/RangeID/3/r/AbortSpan/"9855a1ef-8eb9-4c06-a106-cab1dda78a2b" |
| /Local/RangeID/3/{r""-s""} | 01698b726c67632d | | | /Local/RangeID/3/r/RangeGCThreshold |
| /Local/RangeID/3/{r""-s""} | 01698b727261736b | | | /Local/RangeID/3/r/RangeAppliedState |
| /Local/RangeID/3/{r""-s""} | 01698b72726c6c2d | | | /Local/RangeID/3/r/RangeLease |
| /Local/RangeID/3/{r""-s""} | 01698b723a61 | 01698b723a78 | 000000000000000109 | /Local/RangeID/3/r":{a"-x"}/0.000000001,0 |
| {c-d} | 63 | | 0000000000000001 | "c"/0.000000001,0 |
| {c-d} | 63ffffffff | | 0000000000000001 | "c\xff\xff\xff\xff"/0.000000001,0 |
| {c-d} | 63 | 64 | 000000000000000109 | {c-d}/0.000000001,0 |
+----------------------------+--------------------------------------------------------+--------------+--------------------+---------------------------------------------------------------------+
+-------+------------+------------+--------------------+-----------------------------------+
| SPAN | KEY HEX | ENDKEY HEX | VERSION HEX | PRETTY |
+-------+------------+------------+--------------------+-----------------------------------+
| {c-d} | 63 | | 0000000000000001 | "c"/0.000000001,0 |
| {c-d} | 63ffffffff | | 0000000000000001 | "c\xff\xff\xff\xff"/0.000000001,0 |
| {c-d} | 63 | 64 | 000000000000000109 | {c-d}/0.000000001,0 |
+-------+------------+------------+--------------------+-----------------------------------+
7 changes: 7 additions & 0 deletions pkg/kv/kvserver/replica_command.go
Original file line number Diff line number Diff line change
Expand Up @@ -3138,6 +3138,12 @@ func (r *Replica) followerSendSnapshot(
// explicitly for snapshots going out to followers.
snap.State.DeprecatedUsingAppliedStateKey = true

// Use shared replication if shared storage is enabled and we're sending
// a snapshot for a non-system range. This allows us to send metadata of
// sstables in shared storage as opposed to streaming their contents. Keys
// in higher levels of the LSM are still streamed in the snapshot.
sharedReplicate := r.store.cfg.SharedStorageEnabled && snap.State.Desc.StartKey.AsRawKey().Compare(keys.TableDataMin) >= 0

// Create new snapshot request header using the delegate snapshot request.
header := kvserverpb.SnapshotRequest_Header{
State: snap.State,
Expand All @@ -3160,6 +3166,7 @@ func (r *Replica) followerSendSnapshot(
SenderQueuePriority: req.SenderQueuePriority,
Strategy: kvserverpb.SnapshotRequest_KV_BATCH,
Type: req.Type,
SharedReplicate: sharedReplicate,
}
newBatchFn := func() storage.WriteBatch {
return r.store.TODOEngine().NewWriteBatch()
Expand Down
40 changes: 31 additions & 9 deletions pkg/kv/kvserver/replica_raftstorage.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/uuid"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble"
"github.com/cockroachdb/pebble/objstorage"
"github.com/cockroachdb/redact"
"go.etcd.io/raft/v3"
"go.etcd.io/raft/v3/raftpb"
Expand Down Expand Up @@ -279,9 +280,10 @@ type OutgoingSnapshot struct {
// The Pebble snapshot that will be streamed from.
EngineSnap storage.Reader
// The replica state within the snapshot.
State kvserverpb.ReplicaState
snapType kvserverpb.SnapshotRequest_Type
onClose func()
State kvserverpb.ReplicaState
snapType kvserverpb.SnapshotRequest_Type
sharedBackings []objstorage.RemoteObjectBackingHandle
onClose func()
}

func (s OutgoingSnapshot) String() string {
Expand All @@ -297,6 +299,9 @@ func (s OutgoingSnapshot) SafeFormat(w redact.SafePrinter, _ rune) {
// Close releases the resources associated with the snapshot.
func (s *OutgoingSnapshot) Close() {
s.EngineSnap.Close()
for i := range s.sharedBackings {
s.sharedBackings[i].Close()
}
if s.onClose != nil {
s.onClose()
}
Expand All @@ -311,10 +316,13 @@ type IncomingSnapshot struct {
// The descriptor in the snapshot, never nil.
Desc *roachpb.RangeDescriptor
DataSize int64
SharedSize int64
snapType kvserverpb.SnapshotRequest_Type
placeholder *ReplicaPlaceholder
raftAppliedIndex kvpb.RaftIndex // logging only
msgAppRespCh chan raftpb.Message // receives MsgAppResp if/when snap is applied
sharedSSTs []pebble.SharedSSTMeta
doExcise bool
}

func (s IncomingSnapshot) String() string {
Expand Down Expand Up @@ -510,6 +518,12 @@ func (r *Replica) applySnapshot(
logDetails.Printf(" subsumedReplicas=%d@%0.0fms",
len(subsumedRepls), stats.subsumedReplicas.Sub(start).Seconds()*1000)
}
if len(inSnap.sharedSSTs) > 0 {
logDetails.Printf(" shared=%d sharedSize=%s", len(inSnap.sharedSSTs), humanizeutil.IBytes(inSnap.SharedSize))
}
if inSnap.doExcise {
logDetails.Printf(" excise=true")
}
logDetails.Printf(" ingestion=%d@%0.0fms", len(inSnap.SSTStorageScratch.SSTs()),
stats.ingestion.Sub(stats.subsumedReplicas).Seconds()*1000)
log.Infof(ctx, "applied %s (%s)", inSnap, logDetails)
Expand Down Expand Up @@ -574,12 +588,20 @@ func (r *Replica) applySnapshot(
}
}
var ingestStats pebble.IngestOperationStats
if ingestStats, err =
// TODO: separate ingestions for log and statemachine engine. See:
//
// https://github.com/cockroachdb/cockroach/issues/93251
r.store.TODOEngine().IngestLocalFilesWithStats(ctx, inSnap.SSTStorageScratch.SSTs()); err != nil {
return errors.Wrapf(err, "while ingesting %s", inSnap.SSTStorageScratch.SSTs())
// TODO: separate ingestions for log and statemachine engine. See:
//
// https://github.com/cockroachdb/cockroach/issues/93251
if inSnap.doExcise {
exciseSpan := desc.KeySpan().AsRawSpanWithNoLocals()
if ingestStats, err =
r.store.TODOEngine().IngestAndExciseFiles(ctx, inSnap.SSTStorageScratch.SSTs(), inSnap.sharedSSTs, exciseSpan); err != nil {
return errors.Wrapf(err, "while ingesting %s and excising %s-%s", inSnap.SSTStorageScratch.SSTs(), exciseSpan.Key, exciseSpan.EndKey)
}
} else {
if ingestStats, err =
r.store.TODOEngine().IngestLocalFilesWithStats(ctx, inSnap.SSTStorageScratch.SSTs()); err != nil {
return errors.Wrapf(err, "while ingesting %s", inSnap.SSTStorageScratch.SSTs())
}
}
if r.store.cfg.KVAdmissionController != nil {
r.store.cfg.KVAdmissionController.SnapshotIngested(r.store.StoreID(), ingestStats)
Expand Down
Loading

0 comments on commit 02a4b7e

Please sign in to comment.