*: allow fully concurrent large read #9384

Closed. Wants to merge 1 commit.

2 changes: 0 additions & 2 deletions etcdserver/v3_server.go
@@ -86,8 +86,6 @@ type Authenticator interface {
}

func (s *EtcdServer) Range(ctx context.Context, r *pb.RangeRequest) (*pb.RangeResponse, error) {
-defer warnOfExpensiveReadOnlyRangeRequest(s.getLogger(), time.Now(), r)
-
if !r.Serializable {
err := s.linearizableReadNotify(ctx)
if err != nil {
34 changes: 34 additions & 0 deletions mvcc/backend/backend.go
@@ -49,6 +49,11 @@ var (
)

type Backend interface {
// CommittedReadTx returns a non-blocking read tx that is suitable for large reads.
// The CommittedReadTx call itself will not return until the current BatchTx
// has been committed, to ensure consistency.
CommittedReadTx() ReadTx

ReadTx() ReadTx
BatchTx() BatchTx

@@ -97,6 +102,8 @@ type backend struct {

readTx *readTx

concurrentReadTxCh chan chan ReadTx

Contributor: May want to avoid O(n) broadcast here; can do O(1) by returning a <-chan struct{} that closes on the next commit: wait on the channel close, fetch the latest committed tx, then acquire a semaphore to limit concurrency. Could return a closed channel if the backend is already fully committed, so there is no need to wait on the commit timer.

stopc chan struct{}
donec chan struct{}
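
A minimal sketch of the O(1) notification scheme the comment above describes, assuming hypothetical names (commitNotifier, commitc, readSem) that are not part of this PR: closing a channel wakes every waiter in a single operation, and a buffered channel bounds the number of outstanding read txs.

```go
package backend

import "sync"

// commitNotifier is a hypothetical sketch of the O(1) broadcast idea.
type commitNotifier struct {
	mu      sync.Mutex
	commitc chan struct{} // closed on the next commit
	readSem chan struct{} // counting semaphore bounding concurrent reads
}

// notify returns a channel that is closed once the next commit happens.
// A fully committed backend could return an already-closed channel so
// callers do not wait on the commit timer.
func (n *commitNotifier) notify() <-chan struct{} {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.commitc
}

// onCommit runs in the backend's run loop after each commit; closing the
// channel broadcasts to all waiters in O(1).
func (n *commitNotifier) onCommit() {
	n.mu.Lock()
	close(n.commitc)
	n.commitc = make(chan struct{})
	n.mu.Unlock()
}

// waitCommitted blocks until the next commit, then acquires the semaphore;
// the returned func releases it when the read tx is done.
func (n *commitNotifier) waitCommitted() (release func()) {
	<-n.notify()
	n.readSem <- struct{}{}
	return func() { <-n.readSem }
}
```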

@@ -165,6 +172,8 @@ func newBackend(bcfg BackendConfig) *backend {
buckets: make(map[string]*bolt.Bucket),
},

concurrentReadTxCh: make(chan chan ReadTx),

stopc: make(chan struct{}),
donec: make(chan struct{}),

@@ -184,6 +193,12 @@ func (b *backend) BatchTx() BatchTx {

func (b *backend) ReadTx() ReadTx { return b.readTx }

func (b *backend) CommittedReadTx() ReadTx {
rch := make(chan ReadTx)
b.concurrentReadTxCh <- rch
return <-rch
}

// ForceCommit forces the current batching tx to commit.
func (b *backend) ForceCommit() {
b.batchTx.Commit()
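
Taken with the run loop in the next hunk, CommittedReadTx is a rendezvous: the caller parks a reply channel on concurrentReadTxCh and blocks until the commit loop serves it with a fresh bolt read tx. A hedged caller-side sketch (the bucket and key names are illustrative, not from this PR):

```go
// Caller-side sketch; the bucket and keys are illustrative only.
func expensiveRead(b Backend) [][]byte {
	rtx := b.CommittedReadTx() // returns only after the pending batch tx commits
	rtx.Lock()                 // no-op for a concurrentReadTx
	defer rtx.Unlock()         // rolls back the underlying bolt read tx
	ks, _ := rtx.UnsafeRange([]byte("key"), []byte("0000"), []byte("9999"), 0)
	return ks
}
```
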
@@ -301,6 +316,25 @@ func (b *backend) run() {
b.batchTx.Commit()
}
t.Reset(b.batchInterval)
b.createConcurrentReadTxs()
}
}

func (b *backend) createConcurrentReadTxs() {
// do not allow too many concurrent read txs.
// TODO: improve this by having a global pending counter?
for i := 0; i < 100; i++ {
select {
case rch := <-b.concurrentReadTxCh:
rtx, err := b.db.Begin(false)

Contributor: lazily create tx following each commit?

if err != nil {
plog.Fatalf("cannot begin read tx (%s)", err)
}
rch <- &concurrentReadTx{tx: rtx}
default:
// no more to create.
return
}
}
}
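
One way to realize the TODO's global pending counter, as a hedged sketch (maxConcurrentReadTxs and beginBoundedReadTx are assumptions, not code from this PR): a counting semaphore bounds outstanding read txs across commit cycles, instead of capping creations per cycle at 100.

```go
// Hypothetical global bound on outstanding read txs.
const maxConcurrentReadTxs = 100

var readTxSem = make(chan struct{}, maxConcurrentReadTxs)

func (b *backend) beginBoundedReadTx() ReadTx {
	readTxSem <- struct{}{} // blocks while maxConcurrentReadTxs are open
	rtx, err := b.db.Begin(false)
	if err != nil {
		plog.Fatalf("cannot begin read tx (%s)", err)
	}
	// the matching release (<-readTxSem) would belong next to the
	// Rollback in concurrentReadTx.Unlock.
	return &concurrentReadTx{tx: rtx}
}
```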

75 changes: 75 additions & 0 deletions mvcc/backend/backend_test.go
@@ -300,6 +300,81 @@ func TestBackendWritebackForEach(t *testing.T) {
}
}

// TestBackendConcurrentReadTx checks if the concurrent tx is created correctly.
func TestBackendConcurrentReadTx(t *testing.T) {
b, tmpPath := NewTmpBackend(2*time.Second, 10000)
defer cleanup(b, tmpPath)

var rtx0 ReadTx
done := make(chan struct{})
go func() {
rtx0 = b.CommittedReadTx()
close(done)
}()

tx := b.BatchTx()
tx.Lock()
tx.UnsafeCreateBucket([]byte("key"))
for i := 0; i < 5; i++ {
k := []byte(fmt.Sprintf("%04d", i))
tx.UnsafePut([]byte("key"), k, []byte("bar"))
}
tx.Unlock()

select {
case <-done:
t.Fatal("concurrent read tx should block on the last batch tx!")
case <-time.After(time.Second):
}

select {
case <-done:
case <-time.After(4 * time.Second):
t.Fatal("commit the last batched tx should unblock concurrent tx!")
}

rtx0.Lock()
defer rtx0.Unlock()
ks, _ := rtx0.UnsafeRange([]byte("key"), []byte(fmt.Sprintf("%04d", 0)), []byte(fmt.Sprintf("%04d", 5)), 0)
if len(ks) != 5 {
t.Errorf("got %d keys, expect %d", len(ks), 5)
}

// test if we can create concurrent read while the previous read tx is still open
var rtx1 ReadTx
done = make(chan struct{})
go func() {
rtx1 = b.CommittedReadTx()
rtx1.Lock()
rtx1.UnsafeForEach([]byte(""), nil)
rtx1.Unlock()
close(done)
}()
select {
case <-done:
case <-time.After(4 * time.Second):
t.Fatal("cannot create concurrent read")
}

done = make(chan struct{})
// test if we can create concurrent write while the previous read tx is still open
go func() {
tx := b.BatchTx()
tx.Lock()
for i := 0; i < 5; i++ {
k := []byte(fmt.Sprintf("%04d", i))
tx.UnsafePut([]byte("key"), k, []byte("bar"))
}
tx.Unlock()
close(done)
}()
select {
case <-done:
case <-time.After(4 * time.Second):
t.Fatal("cannot create concurrent write")
}
}

func cleanup(b Backend, path string) {
b.Close()
os.Remove(path)
38 changes: 38 additions & 0 deletions mvcc/backend/concurrent_read_tx.go
@@ -0,0 +1,38 @@
// Copyright 2018 The etcd Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package backend

import (
bolt "github.com/coreos/bbolt"
)

type concurrentReadTx struct {
tx *bolt.Tx
}

func (rt *concurrentReadTx) Lock() {}
func (rt *concurrentReadTx) Unlock() { rt.tx.Rollback() }

func (rt *concurrentReadTx) UnsafeRange(bucketName, key, endKey []byte, limit int64) ([][]byte, [][]byte) {
bucket := rt.tx.Bucket(bucketName)
if bucket == nil {
plog.Fatalf("bucket %s does not exist", bucketName)
}
return unsafeRange(bucket.Cursor(), key, endKey, limit)
}

func (rt *concurrentReadTx) UnsafeForEach(bucketName []byte, visitor func(k, v []byte) error) error {
return unsafeForEach(rt.tx, bucketName, visitor)
}
7 changes: 4 additions & 3 deletions mvcc/index.go
@@ -91,10 +91,11 @@ func (ti *treeIndex) keyIndex(keyi *keyIndex) *keyIndex {
func (ti *treeIndex) visit(key, end []byte, f func(ki *keyIndex)) {
keyi, endi := &keyIndex{key: key}, &keyIndex{key: end}

-ti.RLock()
-defer ti.RUnlock()
+ti.Lock()

Contributor (Author): @jingyih i think this improvement is still needed? can you make a separate PR for this change?

jingyih (Contributor), Mar 6, 2019: Sounds good.

+clone := ti.tree.Clone()
+ti.Unlock()

-ti.tree.AscendGreaterOrEqual(keyi, func(item btree.Item) bool {
+clone.AscendGreaterOrEqual(keyi, func(item btree.Item) bool {
if len(endi.key) > 0 && !item.Less(endi) {
return false
}
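
The switch from holding an RLock across the iteration to iterating a Clone() relies on the copy-on-write snapshot semantics of github.com/google/btree: the clone is cheap to take under the write lock, and later writes to the original copy nodes lazily without disturbing the snapshot. A small self-contained illustration (intItem is ours, not etcd's):

```go
package main

import (
	"fmt"

	"github.com/google/btree"
)

type intItem int

func (a intItem) Less(b btree.Item) bool { return a < b.(intItem) }

func main() {
	tr := btree.New(32)
	for i := 0; i < 5; i++ {
		tr.ReplaceOrInsert(intItem(i))
	}
	clone := tr.Clone()             // cheap copy-on-write snapshot
	tr.ReplaceOrInsert(intItem(99)) // does not show up in the clone
	clone.AscendGreaterOrEqual(intItem(2), func(it btree.Item) bool {
		fmt.Println(it) // prints 2, 3, 4
		return true
	})
}
```
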
3 changes: 2 additions & 1 deletion mvcc/kvstore_test.go
@@ -640,7 +640,7 @@ func TestTxnBlockBackendForceCommit(t *testing.T) {
s := NewStore(zap.NewExample(), b, &lease.FakeLessor{}, nil)
defer os.Remove(tmpPath)

-txn := s.Read()
+txn := s.Write()

done := make(chan struct{})
go func() {
@@ -742,6 +742,7 @@ type fakeBackend struct {

func (b *fakeBackend) BatchTx() backend.BatchTx { return b.tx }
func (b *fakeBackend) ReadTx() backend.ReadTx { return b.tx }
+func (b *fakeBackend) CommittedReadTx() backend.ReadTx { return b.tx }
func (b *fakeBackend) Hash(ignores map[backend.IgnoreKey]struct{}) (uint32, error) { return 0, nil }
func (b *fakeBackend) Size() int64 { return 0 }
func (b *fakeBackend) SizeInUse() int64 { return 0 }
51 changes: 42 additions & 9 deletions mvcc/kvstore_txn.go
@@ -21,9 +21,21 @@ import (
"go.uber.org/zap"
)

const (
expensiveReadLimit = 1000
readonly = true
readwrite = false
)

type storeTxnRead struct {
s *store
tx backend.ReadTx
+txlocked bool

// for creating concurrent read tx when the read is expensive.
b backend.Backend

Contributor: it looks like there should be something like storeTxnReadLimited that wraps storeTxnRead

// is the transaction readonly?
ro bool

firstRev int64
rev int64
@@ -33,10 +45,15 @@ func (s *store) Read() TxnRead {
s.mu.RLock()
tx := s.b.ReadTx()
s.revMu.RLock()
-tx.Lock()
firstRev, rev := s.compactMainRev, s.currentRev
s.revMu.RUnlock()
-return newMetricsTxnRead(&storeTxnRead{s, tx, firstRev, rev})
+return newMetricsTxnRead(&storeTxnRead{
+	s: s,
+	tx: tx,
+	b: s.b,
+	ro: readonly,
+	firstRev: firstRev,
+	rev: rev})
}

func (tr *storeTxnRead) FirstRev() int64 { return tr.firstRev }
@@ -47,7 +64,9 @@ func (tr *storeTxnRead) Range(key, end []byte, ro RangeOptions) (r *RangeResult,
}

func (tr *storeTxnRead) End() {
-tr.tx.Unlock()
+if tr.txlocked {
+	tr.tx.Unlock()

Contributor: tr.txlocked = false?

Contributor: Can this be better encapsulated? Could tr.tx.Unlock() and tr.txlocked = false be coordinated by a function that tr provides, so we have to remember both correctly everywhere? Same for the tr.tx.Lock() logic below.

+}
tr.s.mu.RUnlock()
}
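
A possible shape for the encapsulation both comments ask for, as a hedged sketch (lockTx and unlockTx are hypothetical helpers, not code from this PR):

```go
// Hypothetical helpers keeping tx and txlocked in sync.
func (tr *storeTxnRead) lockTx() {
	if !tr.txlocked {
		tr.tx.Lock()
		tr.txlocked = true
	}
}

func (tr *storeTxnRead) unlockTx() {
	if tr.txlocked {
		tr.tx.Unlock()
		tr.txlocked = false
	}
}
```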

@@ -64,10 +83,15 @@ func (s *store) Write() TxnWrite {
tx := s.b.BatchTx()
tx.Lock()
tw := &storeTxnWrite{
-storeTxnRead: storeTxnRead{s, tx, 0, 0},
+storeTxnRead: storeTxnRead{
+	s: s,
+	txlocked: true,
+	tx: tx,
+	ro: readwrite,
+},
tx: tx,
beginRev: s.currentRev,
changes: make([]mvccpb.KeyValue, 0, 4),
}
return newMetricsTxnWrite(tw)
}
@@ -134,6 +158,15 @@ func (tr *storeTxnRead) rangeKeys(key, end []byte, curRev int64, ro RangeOptions
limit = len(revpairs)
}

if limit > expensiveReadLimit && !tr.txlocked && tr.ro { // first expensive read in a read-only transaction

Contributor: e.g., storeTxnReadLimited could impose a limit+1 on kvindex.Revisions(). If the result is >limit, load by chunk from the committed tx; it would split up the rlock hold times and spend less memory on revpairs.

// too many keys to range. upgrade the read transaction to a concurrent read tx.

Contributor: As an alternative to upgrading the tr.tx, would it make sense to have both a tr.tx and a tr.committedTx? This would limit what is put in the committed read transaction to only expensive reads.

Contributor: nvm, once we perform a CommittedReadTx there is no point having two transactions.

tr.tx = tr.b.CommittedReadTx()

Contributor: When tr.tx is reassigned, does this leak a bolt.Tx? If it is rolled back, I was unable to figure out where.

Contributor: Traced through the code. The original tx is managed by the backend and is rolled back as usual when it finishes the current batch iteration. This looks like it works correctly.

}
if !tr.txlocked {
tr.tx.Lock()
tr.txlocked = true
}

kvs := make([]mvccpb.KeyValue, limit)
revBytes := newRevBytes()
for i, revpair := range revpairs[:len(kvs)] {
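
A hedged sketch of the chunked loading suggested in the storeTxnReadLimited comments above (chunkSize, rangeByChunks, and the signature are assumptions, not code from this PR): detect an expensive read by asking the index for limit+1 revisions, then range from the committed tx in fixed-size chunks so no single lock hold spans the whole result.

```go
// Hypothetical chunked read from a committed read tx.
const chunkSize = 1000

func rangeByChunks(rtx backend.ReadTx, bucket []byte, revBytes [][]byte) [][]byte {
	vals := make([][]byte, 0, len(revBytes))
	for start := 0; start < len(revBytes); start += chunkSize {
		end := start + chunkSize
		if end > len(revBytes) {
			end = len(revBytes)
		}
		rtx.Lock() // a short hold per chunk instead of one long hold
		for _, rb := range revBytes[start:end] {
			_, vs := rtx.UnsafeRange(bucket, rb, nil, 0)
			vals = append(vals, vs...)
		}
		rtx.Unlock()
	}
	return vals
}
```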