vitessio · guoliang100 · Jan 14, 2016 · Jan 12, 2016 · Jan 14, 2016 · Jan 14, 2016
diff --git a/go/vt/vtgate/resilient_srv_topo_server.go b/go/vt/vtgate/resilient_srv_topo_server.go
@@ -48,7 +48,7 @@ type ResilientSrvTopoServer struct {
 
  // mutex protects the cache map itself, not the individual
  // values in the cache.
- mutex sync.Mutex
+ mutex sync.RWMutex
  srvKeyspaceNamesCache map[string]*srvKeyspaceNamesEntry
  srvKeyspaceCache map[string]*srvKeyspaceEntry
  srvShardCache map[string]*srvShardEntry
@@ -108,7 +108,7 @@ type srvKeyspaceEntry struct {
  keyspace string
 
  // the mutex protects any access to this structure (read or write)
- mutex sync.Mutex
+ mutex sync.RWMutex
 
  insertionTime time.Time
  value *topodatapb.SrvKeyspace
@@ -257,14 +257,19 @@ func (server *ResilientSrvTopoServer) GetSrvKeyspaceNames(ctx context.Context, c
  return result, err
 }
 
-// GetSrvKeyspace returns SrvKeyspace object for the given cell and keyspace.
-func (server *ResilientSrvTopoServer) GetSrvKeyspace(ctx context.Context, cell, keyspace string) (*topodatapb.SrvKeyspace, error) {
- server.counts.Add(queryCategory, 1)
-
+func (server *ResilientSrvTopoServer) getSrvKeyspaceEntry(cell, keyspace string) *srvKeyspaceEntry {
  // find the entry in the cache, add it if not there
  key := cell + "." + keyspace
- server.mutex.Lock()
+ server.mutex.RLock()
  entry, ok := server.srvKeyspaceCache[key]
+ if ok {
+ server.mutex.RUnlock()
+ return entry
+ }
+ server.mutex.RUnlock()
+
+ server.mutex.Lock()
+ entry, ok = server.srvKeyspaceCache[key]
  if !ok {
  entry = &srvKeyspaceEntry{
  cell: cell,
@@ -273,40 +278,88 @@ func (server *ResilientSrvTopoServer) GetSrvKeyspace(ctx context.Context, cell,
  server.srvKeyspaceCache[key] = entry
  }
  server.mutex.Unlock()
+ return entry
+}
+
+// GetSrvKeyspace returns the SrvKeyspace object for the given cell and keyspace, starting a topo watch on first use and serving later calls from the cached entry.
+func (server *ResilientSrvTopoServer) GetSrvKeyspace(ctx context.Context, cell, keyspace string) (*topodatapb.SrvKeyspace, error) {
+ server.counts.Add(queryCategory, 1)
+
+ entry := server.getSrvKeyspaceEntry(cell, keyspace)
+
+ // Fast path: if a watch already populated the entry (non-zero insertionTime), return it under the read lock
+ entry.mutex.RLock()
+ if !entry.insertionTime.IsZero() {
+ v, e := entry.value, entry.lastError
+ entry.mutex.RUnlock()
+ return v, e
+ }
+ entry.mutex.RUnlock()
 
  // Lock the entry, and do everything holding the lock. This
  // means two concurrent requests will only issue one
  // underlying query.
  entry.mutex.Lock()
  defer entry.mutex.Unlock()
 
- // If the entry is fresh enough, return it
- if time.Now().Sub(entry.insertionTime) < server.cacheTTL {
+ // Re-check under the write lock: another caller may have populated the entry in the meantime
+ if !entry.insertionTime.IsZero() {
  return entry.value, entry.lastError
  }
 
- // not in cache or too old, get the real value
+ // not in cache, get the real value; NOTE(review): the channel receive below does not select on newCtx.Done(), so the timeout may not bound it - confirm
  newCtx, cancel := context.WithTimeout(context.Background(), *srvTopoTimeout)
  defer cancel()
 
- result, err := server.topoServer.GetSrvKeyspace(newCtx, cell, keyspace)
+ // start watching; NOTE(review): newCtx is cancelled when this call returns and the stop channel is discarded - confirm the watch outlives the call and can be stopped
+ notifications, _, err := server.topoServer.WatchSrvKeyspace(newCtx, cell, keyspace)
  if err != nil {
+ // record the error; insertionTime is always zero here (a non-zero value returned above under the same lock)
  if entry.insertionTime.IsZero() {
- server.counts.Add(errorCategory, 1)
- log.Errorf("GetSrvKeyspace(%v, %v, %v) failed: %v (no cached value, caching and returning error)", newCtx, cell, keyspace, err)
- } else {
- server.counts.Add(cachedCategory, 1)
- log.Warningf("GetSrvKeyspace(%v, %v, %v) failed: %v (returning cached value: %v %v)", newCtx, cell, keyspace, err, entry.value, entry.lastError)
- return entry.value, entry.lastError
+ entry.lastError = err
+ entry.lastErrorCtx = newCtx
+ }
+ // return the error, plus any stale value left by an earlier watch (may be nil)
+ log.Errorf("WatchSrvKeyspace failed for %v/%v: %v, returning cached value: %+v, %v", cell, keyspace, err, entry.value, entry.lastError)
+ return entry.value, entry.lastError
+ }
+ sk, ok := <-notifications
+ if !ok {
+ // record the error; insertionTime is always zero here (a non-zero value returned above under the same lock)
+ if entry.insertionTime.IsZero() {
+ entry.lastError = fmt.Errorf("failed to receive from channel: %v %v", sk, ok)
+ entry.lastErrorCtx = newCtx
  }
+ // return the error, plus any stale value left by an earlier watch (may be nil)
+ log.Errorf("WatchSrvKeyspace first result failed for %v/%v: %v %v, returning cached value: %+v, %v", cell, keyspace, sk, ok, entry.value, entry.lastError)
+ return entry.value, entry.lastError
  }
 
- // save the value we got and the current time in the cache
+ // cache the first notification; a non-zero insertionTime marks the entry as populated
  entry.insertionTime = time.Now()
- entry.value = result
- entry.lastError = err
+ entry.value = sk
+ entry.lastError = nil
  entry.lastErrorCtx = newCtx
- return result, err
+
+ go func() {
+ for {
+ sk, ok := <-notifications
+ entry.mutex.Lock()
+ if !ok {
+ log.Errorf("failed to receive from channel: %v %v", sk, ok)
+ // channel closed: zero insertionTime so the next call starts a new watch (entry.value is left as is)
+ entry.insertionTime = time.Time{}
+ entry.mutex.Unlock()
+ break
+ }
+ entry.insertionTime = time.Now()
+ entry.value = sk
+ entry.lastError = nil
+ entry.mutex.Unlock()
+ }
+ }()
+
+ return entry.value, entry.lastError
 }
 
 // GetSrvShard returns SrvShard object for the given cell, keyspace, and shard.
@@ -705,15 +758,15 @@ func (server *ResilientSrvTopoServer) CacheStatus() *ResilientSrvTopoServerCache
  }
 
  for _, entry := range server.srvKeyspaceCache {
- entry.mutex.Lock()
+ entry.mutex.RLock()
  result.SrvKeyspaces = append(result.SrvKeyspaces, &SrvKeyspaceCacheStatus{
  Cell: entry.cell,
  Keyspace: entry.keyspace,
  Value: entry.value,
  LastError: entry.lastError,
  LastErrorCtx: entry.lastErrorCtx,
  })
- entry.mutex.Unlock()
+ entry.mutex.RUnlock()
  }
 
  for _, entry := range server.srvShardCache {

diff --git a/go/vt/vtgate/resilient_srv_topo_server_test.go b/go/vt/vtgate/resilient_srv_topo_server_test.go
@@ -193,20 +193,35 @@ func TestFilterUnhealthy(t *testing.T) {
 // returns errors for everything, except the one keyspace.
 type fakeTopo struct {
  faketopo.FakeTopo
- keyspace string
- callCount int
+ keyspace string
+ callCount int
+ notifications chan *topodatapb.SrvKeyspace
+ stopWatching chan struct{}
 }
 
 func (ft *fakeTopo) GetSrvKeyspaceNames(ctx context.Context, cell string) ([]string, error) {
  return []string{ft.keyspace}, nil
 }
 
-func (ft *fakeTopo) GetSrvKeyspace(ctx context.Context, cell, keyspace string) (*topodatapb.SrvKeyspace, error) {
+func (ft *fakeTopo) WatchSrvKeyspace(ctx context.Context, cell, keyspace string) (<-chan *topodatapb.SrvKeyspace, chan<- struct{}, error) {
  ft.callCount++
  if keyspace == ft.keyspace {
- return &topodatapb.SrvKeyspace{}, nil
+ ft.notifications = make(chan *topodatapb.SrvKeyspace, 10)
+ ft.stopWatching = make(chan struct{})
+ ft.notifications <- &topodatapb.SrvKeyspace{}
+ return ft.notifications, ft.stopWatching, nil
  }
- return nil, fmt.Errorf("Unknown keyspace")
+ return nil, nil, fmt.Errorf("Unknown keyspace")
+}
+
+func (ft *fakeTopo) GetSrvShard(ctx context.Context, cell, keyspace, shard string) (*topodatapb.SrvShard, error) {
+ ft.callCount++
+ if keyspace != ft.keyspace {
+ return nil, fmt.Errorf("Unknown keyspace")
+ }
+ return &topodatapb.SrvShard{
+ Name: shard,
+ }, nil
 }
 
 func (ft *fakeTopo) GetEndPoints(ctx context.Context, cell, keyspace, shard string, tabletType topodatapb.TabletType) (*topodatapb.EndPoints, int64, error) {
@@ -298,21 +313,41 @@ func TestCacheWithErrors(t *testing.T) {
  rsts := NewResilientSrvTopoServer(topo.Server{Impl: ft}, "TestCacheWithErrors")
 
  // ask for the known keyspace, that populates the cache
- _, err := rsts.GetSrvKeyspace(context.Background(), "", "test_ks")
+ _, err := rsts.GetSrvShard(context.Background(), "", "test_ks", "shard_0")
  if err != nil {
- t.Fatalf("GetSrvKeyspace got unexpected error: %v", err)
+ t.Fatalf("GetSrvShard got unexpected error: %v", err)
  }
 
  // now make the topo server fail, and ask again, should get cached
  // value, not even ask underlying guy
  ft.keyspace = "another_test_ks"
- _, err = rsts.GetSrvKeyspace(context.Background(), "", "test_ks")
+ _, err = rsts.GetSrvShard(context.Background(), "", "test_ks", "shard_0")
  if err != nil {
- t.Fatalf("GetSrvKeyspace got unexpected error: %v", err)
+ t.Fatalf("GetSrvShard got unexpected error: %v", err)
  }
 
  // now reduce TTL to nothing, so we won't use cache, and ask again
  rsts.cacheTTL = 0
+ _, err = rsts.GetSrvShard(context.Background(), "", "test_ks", "shard_0")
+ if err != nil {
+ t.Fatalf("GetSrvShard got unexpected error: %v", err)
+ }
+}
+
+// TestSrvKeyspaceCacheWithErrors will test that GetSrvKeyspace still succeeds after the watch notification channel is closed.
+func TestSrvKeyspaceCacheWithErrors(t *testing.T) {
+ ft := &fakeTopo{keyspace: "test_ks"}
+ rsts := NewResilientSrvTopoServer(topo.Server{Impl: ft}, "TestSrvKeyspaceCacheWithErrors")
+
+ // ask for the known keyspace, that populates the cache
+ _, err := rsts.GetSrvKeyspace(context.Background(), "", "test_ks")
+ if err != nil {
+ t.Fatalf("GetSrvKeyspace got unexpected error: %v", err)
+ }
+
+ // now close the watch channel, and ask again: should still succeed,
+ // either from the cached value or by starting a new watch
+ close(ft.notifications)
  _, err = rsts.GetSrvKeyspace(context.Background(), "", "test_ks")
  if err != nil {
  t.Fatalf("GetSrvKeyspace got unexpected error: %v", err)
@@ -325,28 +360,52 @@ func TestCachedErrors(t *testing.T) {
  rsts := NewResilientSrvTopoServer(topo.Server{Impl: ft}, "TestCachedErrors")
 
  // ask for an unknown keyspace, should get an error
- _, err := rsts.GetSrvKeyspace(context.Background(), "", "unknown_ks")
+ _, err := rsts.GetSrvShard(context.Background(), "", "unknown_ks", "shard_0")
  if err == nil {
- t.Fatalf("First GetSrvKeyspace didn't return an error")
+ t.Fatalf("First GetSrvShard didn't return an error")
  }
  if ft.callCount != 1 {
- t.Fatalf("GetSrvKeyspace didn't get called 1 but %v times", ft.callCount)
+ t.Fatalf("GetSrvShard didn't get called 1 but %v times", ft.callCount)
  }
 
  // ask again, should get an error and use cache
- _, err = rsts.GetSrvKeyspace(context.Background(), "", "unknown_ks")
+ _, err = rsts.GetSrvShard(context.Background(), "", "unknown_ks", "shard_0")
  if err == nil {
- t.Fatalf("Second GetSrvKeyspace didn't return an error")
+ t.Fatalf("Second GetSrvShard didn't return an error")
  }
  if ft.callCount != 1 {
- t.Fatalf("GetSrvKeyspace was called again: %v times", ft.callCount)
+ t.Fatalf("GetSrvShard was called again: %v times", ft.callCount)
  }
 
  // ask again after expired cache, should get an error
  rsts.cacheTTL = 0
+ _, err = rsts.GetSrvShard(context.Background(), "", "unknown_ks", "shard_0")
+ if err == nil {
+ t.Fatalf("Third GetSrvShard didn't return an error")
+ }
+ if ft.callCount != 2 {
+ t.Fatalf("GetSrvShard was not called again: %v times", ft.callCount)
+ }
+}
+
+// TestSrvKeyspaceCachedErrors will test that GetSrvKeyspace keeps returning an error for an unknown keyspace, asking the topo server again on each call.
+func TestSrvKeyspaceCachedErrors(t *testing.T) {
+ ft := &fakeTopo{keyspace: "test_ks"}
+ rsts := NewResilientSrvTopoServer(topo.Server{Impl: ft}, "TestSrvKeyspaceCachedErrors")
+
+ // ask for an unknown keyspace, should get an error
+ _, err := rsts.GetSrvKeyspace(context.Background(), "", "unknown_ks")
+ if err == nil {
+ t.Fatalf("First GetSrvKeyspace didn't return an error")
+ }
+ if ft.callCount != 1 {
+ t.Fatalf("GetSrvKeyspace didn't get called 1 but %v times", ft.callCount)
+ }
+
+ // ask again, should get an error and call the underlying topo again (watch errors are not cached)
  _, err = rsts.GetSrvKeyspace(context.Background(), "", "unknown_ks")
  if err == nil {
- t.Fatalf("Third GetSrvKeyspace didn't return an error")
+ t.Fatalf("Second GetSrvKeyspace didn't return an error")
  }
  if ft.callCount != 2 {
  t.Fatalf("GetSrvKeyspace was not called again: %v times", ft.callCount)