diff --git a/pkg/toystore/destroyguard/guard.go b/pkg/toystore/destroyguard/guard.go new file mode 100644 index 000000000000..139c238a9fec --- /dev/null +++ b/pkg/toystore/destroyguard/guard.go @@ -0,0 +1,136 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package destroyguard + +import ( + "sync" + "sync/atomic" + + "github.com/cockroachdb/cockroach/pkg/util" +) + +// A Guard tracks a reference count (usually associated to some object to be +// guarded) while offering an API that allows tearing down the guard and +// instructing all reference holders to relinquish their references in a timely +// manner. +// +// A Guard is in either of three states, through which it moves in the following +// order: +// - active: Teardown has not been called; Acquire() returns success. +// - tearing down: Teardown has been called, but the refcount hasn't dropped to +// zero; Acquire returns the error passed to Teardown. +// - deadCh: Teardown has been called and the refcount is zero; the Dead() channel +// is closed and Acquire continues to return the same error. +// +// The zero Guard is ready to use. A Guard must not be copied by value. All +// methods on Guard are safe to call concurrently. +type Guard struct { + // TODO(tbg): use int32 here to assert Release() without Acquire(). + ref uint32 // updated atomically by Acquire() and Release() + + initOnce sync.Once // sets up channels when needed + + done uint32 // for idempotency in Teardown + doneCh chan struct{} // Done(), closed on Teardown + err error // Err(), set on Teardown + + dead uint32 // atomically set by Teardown(), cleared by maybeKill() + deadCh chan struct{} // Dead(); closed when ref hits zero + + util.NoCopy +} + +// flagDestroyed is set on the ref count to indicate that Teardown has been +// called. +const flagDestroyed = uint32(1) << 31 + +func (d *Guard) maybeInit() { + d.initOnce.Do(func() { + d.doneCh = make(chan struct{}) + d.deadCh = make(chan struct{}) + }) +} + +// Acquire a reference on the Guard. If Teardown has been called, the acquisition +// will return with an error (and without having acquired a reference). If no +// error is returned, the caller is free to proceed and use the reference, though +// Done() should be consulted regularly (usually when carrying out a blocking +// operation, such as sending on or receiving from a channel). References can +// in principle be long-lived, +func (d *Guard) Acquire() error { + n := atomic.AddUint32(&d.ref, 1) + + if (n & flagDestroyed) != 0 { + d.Release() + return d.err + } + + return nil +} + +func (d *Guard) maybeKill(ref uint32) { + // If the Guard is tearing down and we just released the last reference, + // kill the guard. + if ref == flagDestroyed { + if atomic.CompareAndSwapUint32(&d.dead, 0, 1) { + close(d.deadCh) + } + } +} + +// Release a reference. This must match a previous successful call to Acquire. +func (d *Guard) Release() { + d.maybeKill(atomic.AddUint32(&d.ref, ^uint32(0) /* decrement by one */)) +} + +// Teardown instructs the Guard to refuse new references and to signal the +// Done() channel. The supplied error will be returned from Done(). Teardown +// does not block; the caller can consult Dead() to block until all references +// have been relinquished. In particular, it's perfectly fine to call Teardown +// from a goroutine that also holds a reference. +// +// Multiple calls to Teardown are allowed, but only the error from the first +// call will be used. +func (d *Guard) Teardown(err error) { + if atomic.AddUint32(&d.done, 1) == 1 { + d.maybeInit() + d.err = err + atomic.AddUint32(&d.ref, flagDestroyed) + close(d.doneCh) + d.maybeKill(atomic.LoadUint32(&d.ref)) + } +} + +// Dead returns a channel that is closed when the reference count has dropped to +// zero after a call to Teardown. Consequently it must not be blocked on from a +// goroutine that is holding a reference, for that would constitute a deadlock. +// Additionally, blocking on this channel makes little sense unless it is known +// that Teardown() has been called. +func (d *Guard) Dead() <-chan struct{} { + d.maybeInit() + return d.deadCh +} + +// Done is closed when Teardown is called. +func (d *Guard) Done() <-chan struct{} { + d.maybeInit() + return d.doneCh +} + +// Err returns the error passed to Teardown. A call to Err() must be preceded +// by a read of the Done() channel. +func (d *Guard) Err() error { + return d.err +} diff --git a/pkg/toystore/destroyguard/guard_test.go b/pkg/toystore/destroyguard/guard_test.go new file mode 100644 index 000000000000..70e3694291ec --- /dev/null +++ b/pkg/toystore/destroyguard/guard_test.go @@ -0,0 +1,121 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package destroyguard + +import ( + "errors" + "runtime" + "sync" + "testing" + + "github.com/stretchr/testify/assert" +) + +func notDone(t *testing.T, g *Guard) { + t.Helper() + select { + case <-g.Done(): + t.Fatal("unexpectedly done") + + default: + } +} + +func notDead(t *testing.T, g *Guard) { + t.Helper() + select { + case <-g.Dead(): + t.Fatal("unexpectedly dead") + default: + } +} + +func notDoneOrDead(t *testing.T, g *Guard) { + t.Helper() + notDone(t, g) + notDead(t, g) +} + +func TestGuard(t *testing.T) { + err := errors.New("boom") + + t.Run("instant-teardown", func(t *testing.T) { + var g Guard + notDoneOrDead(t, &g) + g.Teardown(err) + <-g.Done() + <-g.Dead() + g.Teardown(err) + <-g.Done() + <-g.Dead() + assert.Equal(t, err, g.Err()) + }) + + t.Run("acquire-teardown-release", func(t *testing.T) { + var g Guard + assert.NoError(t, g.Acquire()) + notDoneOrDead(t, &g) + g.Teardown(err) + notDead(t, &g) + + // Teardown bit should be set. + assert.Equal(t, g.ref, (uint32(1)<<31)+1) + + assert.Equal(t, err, g.Err()) + assert.Equal(t, err, g.Acquire()) + + g.Release() + <-g.Done() + <-g.Dead() + + assert.Equal(t, err, g.Err()) + assert.Equal(t, err, g.Acquire()) + }) + + t.Run("concurrent", func(t *testing.T) { + var g Guard + + var wg sync.WaitGroup + + const n = 20 + for i := 0; i < n; i++ { + wg.Add(1) + go func(i int) { + defer wg.Done() + if i >= n/2 && (i%3) == 0 { + g.Teardown(err) + runtime.Gosched() + } + if err := g.Acquire(); err != nil { + return + } + if i%2 == 0 { + runtime.Gosched() + } + notDead(t, &g) + if i%3 == 0 { + <-g.Done() + } + g.Release() + }(i) + } + + wg.Wait() + <-g.Done() + <-g.Dead() + assert.Equal(t, g.ref, uint32(1)<<31) + assert.Error(t, g.Err()) + }) +} diff --git a/pkg/toystore/doc.go b/pkg/toystore/doc.go new file mode 100644 index 000000000000..96a0a438b13f --- /dev/null +++ b/pkg/toystore/doc.go @@ -0,0 +1,19 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +/* +Package toystore is a place to experiment with storage refactors without being +constrained by the clutter in the production version of the code. +*/ +package toystore diff --git a/pkg/toystore/replica.go b/pkg/toystore/replica.go new file mode 100644 index 000000000000..8947875b77e3 --- /dev/null +++ b/pkg/toystore/replica.go @@ -0,0 +1,149 @@ +// Copyright 2019 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +package toystore + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/toystore/destroyguard" +) + +// GuardedReplica is the external interface through which callers interact with +// a Replica. By acquiring references to the underlying Replica object while it +// is being used, it allows safe removal of Replica objects and/or their under- +// lying data. This is an important fundamental for five reasons: +// +// # Confidence +// Today, we use no such mechanism, though this is largely due to failing in the +// past to establish one[1]. The status quo "works" by maintaining a +// destroyStatus inside of Replica.mu, which is checked in a few places such as +// before proposing a command or serving a read. Since these checks are only +// "point in time", and nothing prevents the write to the status from occurring +// just a moment after the check, there is a high cognitive overhead to +// reasoning about the possible outcomes. In fact, in the case in which things +// could go bad most spectacularly, namely removing a replica including data, we +// hold essentially all of the locks available to us and rely on the +// readOnlyCmdMu (which we would rather get rid off). This then is the first +// reason for proposing this change: make the replica lifetime easier to reason +// about and establish confidence that the Replica can't just disappear out from +// under us. +// +// # Simplified lifetime state transitions +// The second motivator are ReplicaID transitions, which can be extremely +// complicated. For example, a Replica may +// +// 1. start off as an uninitialized Replica with ReplicaID 12 (i.e. no data) +// 2. receive a preemptive snapshot, which confusingly results in an initialized +// Replica with ReplicaID 12 (though preemptive snapshots nominally should +// result in a preemptive replica -- one with ReplicaID zero). +// 3. update its ReplicaID to 18 (i.e. being added back to the Raft group). +// 4. get ReplicaGC'ed because +// 5. it blocks a preemptive snapshot, which now recreates it. +// +// In my point of view, changing the ReplicaID for a live Replica is a bad idea +// and incurs too much complexity. An architecture in which Replica objects have +// a single ReplicaID throughout their lifetime is conceptually much simpler, +// but it is also much more straightforward to maintain since it does away with +// a whole class of concurrency that needs to be tamed in today's code, and which +// may have performance repercussions. On the other hand, replicaID changes are +// not frequent, and only need to be moderately fast. +// +// The alternative is to instantiate a new incarnation of the Replica whenever +// the ReplicaID changes. The difficult part about this is destroying the old +// Replica; since Replica provides proper serialization, we mustn't have commands +// in-flight in two instances for the same data (and generally we want to avoid +// even having to think about concurrent use of old incarnations). This is +// explored here. The above history would read something like this: +// +// 1. start off as an uninitialized Replica R with Replica ID 12 (no data) +// 2. preemptive snapshot is received: tear down R, instantiate R' +// 3. ReplicaID is updated: tear down R', instantiate R'' +// 4. R'' is marked for ReplicaGC: replace with placeholder R''' in Store, tear +// down R'', wait for references to drain, remove the data, remove R'''. +// 5. instantiate R''' (no change from before). +// +// # Enable separate Replica types +// A third upshot is almost visible in the above description. Once we can re- +// instantiate cleanly on ReplicaID-based state changes, we might as well go +// ahead and pull apart various types of Replica: +// +// - preemptive snapshots (though we may replace those by learner replicas in +// the future[2]) +// - uninitialized Replicas (has replicaID, but no data) +// - initialized Replicas (has replicaID and data) +// - placeholders (have no replicaID and no data) +// +// To simplify replicaGC and to remove the preemptive snapshot state even before +// we stop using preemptive snapshots, we may allow placeholders to hold data +// (but with no means to run client requests against it). +// +// # Reduce footprint for idle Replicas +// +// Once there, reducing the memory footprint for "idle" replicas by only having +// a "shim" in memory (replacing the "full" version) until the next request +// comes in becomes a natural proposition by introducing a new replica state +// that upon access gets promoted to a full replica (which loads the full +// in-memory state from disk), pickup up past attempts at this which failed due +// to the technical debt present at the moment[3]. +// +// [1]: https://github.com/cockroachdb/cockroach/issues/8630 +// [2]: https://github.com/cockroachdb/cockroach/issues/34058 +// [3]: https://github.com/cockroachdb/cockroach/pull/31663 +type GuardedReplica struct { + guard destroyguard.Guard + + // The remainder of Replica goes here. +} + +// Replica is struct-identical to GuardedReplica, but exposes internal methods +// only (for which the assumption is that the caller transitively already holds +// a reference). +type Replica GuardedReplica + +func (r *GuardedReplica) Send(ctx context.Context, req Request) (Response, error) { + if err := r.guard.Acquire(); err != nil { + return Response{}, err + } + defer r.guard.Release() + return (*Replica)(r).Execute(ctx, req) +} + +func (r *Replica) Execute(ctx context.Context, req Request) (Response, error) { + // Placeholder for command execution. + // Run checks and preprocess requests. + // Check lease, perhaps trigger new lease and wait. + // Enter spanlatch manager and wait. + // Query timestamp cache. + // Evaluate command (basically last step on read). + // Propose. + + // Wait for result of proposal. Here, simulate a proposal that's stuck + // (for example because the replica is no longer part of the range). + ch := make(chan struct{}) + select { + case <-ctx.Done(): + return Response{}, ctx.Err() + case <-r.guard.Done(): + // The Replica is being torn down, so return up the stack to relinquish + // any references held to it. + return Response{}, r.guard.Err() + case <-ch: + return Response{}, nil + } + + // Handle side effects. + // Update timestamp cache. + // Leave spanlatch manager. +} diff --git a/pkg/toystore/store.go b/pkg/toystore/store.go new file mode 100644 index 000000000000..c018236e3c02 --- /dev/null +++ b/pkg/toystore/store.go @@ -0,0 +1,35 @@ +package toystore + +import ( + "context" + + "github.com/cockroachdb/cockroach/pkg/util/syncutil" +) + +type Store struct { + // Extremely simplified. + mu struct { + syncutil.RWMutex + // NB: if we had separate "states" for preemptive snapshots, etc, we + // would store a (simulated) enum in this struct. + replicas map[int64]*GuardedReplica + } +} + +type Request struct { + RangeID int64 + + // Remainder omitted. +} + +type Response struct{} // contents omitted + +func (s *Store) Send(ctx context.Context, req Request) (Response, error) { + // Lots of detail omitted. + + s.mu.RLock() + guard := s.mu.replicas[req.RangeID] + s.mu.RUnlock() + + return guard.Send(ctx, req) +}