Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prevent Race Conditions Between Tablet Deletes and Updates #9237

Merged
merged 5 commits into from
Nov 21, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion go/test/endtoend/tabletgateway/vtgate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -204,7 +204,7 @@ func TestReplicaTransactions(t *testing.T) {
require.Nil(t, err)
serving := replicaTablet.VttabletProcess.WaitForStatus("SERVING", time.Duration(60*time.Second))
assert.Equal(t, serving, true, "Tablet did not become ready within a reasonable time")
exec(t, readConn, fetchAllCustomers, "is either down or nonexistent")
exec(t, readConn, fetchAllCustomers, "not found")

// create a new connection, should be able to query again
readConn, err = mysql.Connect(ctx, &vtParams)
Expand Down
37 changes: 29 additions & 8 deletions go/vt/discovery/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -381,7 +381,6 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) {
hc.mu.Lock()
defer hc.mu.Unlock()

key := hc.keyFromTablet(tablet)
tabletAlias := tabletAliasString(topoproto.TabletAliasString(tablet.Alias))
// delete from authoritative map
th, ok := hc.healthByAlias[tabletAlias]
Expand All @@ -393,14 +392,18 @@ func (hc *HealthCheckImpl) deleteTablet(tablet *topodata.Tablet) {
// which will call finalizeConn, which will close the connection.
th.cancelFunc()
delete(hc.healthByAlias, tabletAlias)
// delete from map by keyspace.shard.tabletType
ths, ok := hc.healthData[key]
if !ok {
log.Warningf("We have no health data for target: %v", key)
return

// the tablet has been deleted from the authoritative healthByAlias map so let's ensure it's deleted
// from the healthData map as well, which means we need to delete any existing combinations of
// keyspace.shard.tabletType we may have had for the tablet
for _, key := range hc.keysFromTablet(tablet) {
if ths, ok := hc.healthData[key]; ok {
delete(ths, tabletAlias)
}
}
delete(ths, tabletAlias)
// delete from healthy list

// We only need to recompute the healthy record for the current state of the tablet
key := hc.keyFromTablet(tablet)
healthy, ok := hc.healthy[key]
if ok && len(healthy) > 0 {
hc.recomputeHealthy(key)
Expand All @@ -413,6 +416,15 @@ func (hc *HealthCheckImpl) updateHealth(th *TabletHealth, prevTarget *query.Targ
defer hc.mu.Unlock()

tabletAlias := tabletAliasString(topoproto.TabletAliasString(th.Tablet.Alias))
// let's be sure that this tablet hasn't been deleted from the authortative map
mattlord marked this conversation as resolved.
Show resolved Hide resolved
// so that we're not racing to update it and in effect re-adding a copy of the
// tablet record that was deleted
_, ok := hc.healthByAlias[tabletAlias]
if !ok {
log.Infof("Tablet %s has been deleted, skipping health update", tabletAlias)
mattlord marked this conversation as resolved.
Show resolved Hide resolved
return
}

targetKey := hc.keyFromTarget(th.Target)
targetChanged := prevTarget.TabletType != th.Target.TabletType || prevTarget.Keyspace != th.Target.Keyspace || prevTarget.Shard != th.Target.Shard
if targetChanged {
Expand Down Expand Up @@ -723,6 +735,15 @@ func (hc *HealthCheckImpl) keyFromTablet(tablet *topodata.Tablet) keyspaceShardT
return keyspaceShardTabletType(fmt.Sprintf("%s.%s.%s", tablet.Keyspace, tablet.Shard, topoproto.TabletTypeLString(tablet.Type)))
}

// keysFromTablet returns a slice of potential type based keys for a tablet
func (hc *HealthCheckImpl) keysFromTablet(tablet *topodata.Tablet) []keyspaceShardTabletType {
tabletTypeKeys := make([]keyspaceShardTabletType, len(topoproto.AllTabletTypes))
for i, tabletType := range topoproto.AllTabletTypes {
tabletTypeKeys[i] = keyspaceShardTabletType(fmt.Sprintf("%s.%s.%s", tablet.Keyspace, tablet.Shard, topoproto.TabletTypeLString(tabletType)))
}
return tabletTypeKeys
}

// getAliasByCell should only be called while holding hc.mu
func (hc *HealthCheckImpl) getAliasByCell(cell string) string {
if alias, ok := hc.cellAliases[cell]; ok {
Expand Down
8 changes: 3 additions & 5 deletions go/vt/discovery/tablet_health_check.go
Original file line number Diff line number Diff line change
Expand Up @@ -290,11 +290,9 @@ func (thc *tabletHealthCheck) checkConn(hc *HealthCheckImpl) {
if err != nil {
hcErrorCounters.Add([]string{thc.Target.Keyspace, thc.Target.Shard, topoproto.TabletTypeLString(thc.Target.TabletType)}, 1)
// We have reason to suspect the tablet healthcheck record is corrupted or invalid so let's remove the tablet's record
// from the healthcheck cache and it will get re-added again if the tablet is reachable
if strings.Contains(err.Error(), "health stats mismatch") ||
strings.HasSuffix(err.Error(), context.Canceled.Error()) ||
strings.Contains(err.Error(), `"error reading from server: EOF", received prior goaway`) {
log.Warningf("tablet %s had a suspect healthcheck error: %s -- clearing cache record", thc.Tablet.Alias, err.Error())
// from the healthcheck cache and the corrected tablet record will be fetched from the topology and added to the
// healthcheck cache again via the topology watcher.
mattlord marked this conversation as resolved.
Show resolved Hide resolved
if strings.Contains(err.Error(), "health stats mismatch") {
hc.deleteTablet(thc.Tablet)
mattlord marked this conversation as resolved.
Show resolved Hide resolved
return
}
Expand Down