Skip to content
This repository has been archived by the owner on Dec 16, 2022. It is now read-only.

Port ERS bug fixes into 8.0 #197

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions go/vt/concurrency/error_group.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package concurrency

import "context"

// ErrorGroup provides a function for waiting for N goroutines to complete with
// at least X successes and no more than Y failures, and cancelling the rest.
//
// It should be used as follows:
//
//	errCh := make(chan error)
//	errgroupCtx, errgroupCancel := context.WithCancel(ctx)
//
//	for _, arg := range args {
//		arg := arg
//
//		go func() {
//			err := doWork(errgroupCtx, arg)
//			errCh <- err
//		}()
//	}
//
//	errgroup := concurrency.ErrorGroup{
//		NumGoroutines:        len(args),
//		NumRequiredSuccesses: 5, // need at least 5 to respond with nil error before cancelling the rest
//		NumAllowedErrors:     1, // if more than 1 responds with non-nil error, cancel the rest
//	}
//	errRec := errgroup.Wait(errgroupCancel, errCh)
//
//	if errRec.HasErrors() {
//		// ...
//	}
type ErrorGroup struct {
	// NumGoroutines is the number of results Wait consumes from the error
	// channel before it returns.
	NumGoroutines int
	// NumRequiredSuccesses is the number of nil results after which Wait
	// cancels the remaining goroutines.
	NumRequiredSuccesses int
	// NumAllowedErrors is the number of non-nil results tolerated; once it is
	// exceeded, Wait cancels the remaining goroutines.
	NumAllowedErrors int
}

// Wait waits for a group of goroutines that are sending errors to the given
// error channel, and are cancellable by the given cancel function.
//
// Wait will cancel any outstanding goroutines under the following conditions:
//
// (1) More than NumAllowedErrors non-nil results have been consumed on the
// error channel.
//
// (2) At least NumRequiredSuccesses nil results have been consumed on the error
// channel.
//
// After the cancellation condition is triggered, Wait will continue to consume
// results off the error channel so as to not permanently block any of those
// cancelled goroutines.
//
// When finished consuming results from all goroutines, cancelled or otherwise,
// Wait returns an AllErrorRecorder that contains all errors returned by any of
// those goroutines. It does not close the error channel.
//
// If NumGoroutines is less than 1 there is nothing to wait for: Wait returns
// an empty recorder immediately, without reading from the channel and without
// calling cancel.
func (eg ErrorGroup) Wait(cancel context.CancelFunc, errors chan error) *AllErrorRecorder {
	rec := &AllErrorRecorder{}

	// With no goroutines nothing will ever be sent on the channel, and Wait
	// does not close it, so ranging over it below would block forever.
	if eg.NumGoroutines < 1 {
		return rec
	}

	errCounter := 0
	successCounter := 0
	responseCounter := 0

	for err := range errors {
		responseCounter++

		switch err {
		case nil:
			successCounter++
		default:
			errCounter++
			rec.RecordError(err)
		}

		// Even though we cancel in the next conditional, we need to keep
		// consuming off the channel, or those goroutines will get stuck
		// forever. Only stop once every goroutine has reported.
		if responseCounter == eg.NumGoroutines {
			break
		}

		// cancel may be invoked on several iterations; a context.CancelFunc
		// is a no-op after its first call.
		if errCounter > eg.NumAllowedErrors || successCounter >= eg.NumRequiredSuccesses {
			cancel()
		}
	}

	return rec
}
124 changes: 124 additions & 0 deletions go/vt/topotools/position_searcher.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
/*
Copyright 2021 The Vitess Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package topotools

import (
"context"
"sync"
"time"

"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/vt/logutil"
"vitess.io/vitess/go/vt/topo/topoproto"
"vitess.io/vitess/go/vt/vttablet/tmclient"

topodatapb "vitess.io/vitess/go/vt/proto/topodata"
)

// MaxReplicationPositionSearcher provides a threadsafe way to find a tablet
// with the most advanced replication position.
//
// A typical usage will look like:
//
//	var (
//		searcher = NewMaxReplicationPositionSearcher(tmc, logger, waitTimeout)
//		wg sync.WaitGroup
//	)
//	for _, tablet := range tablets {
//		wg.Add(1)
//		go func(t *topodatapb.Tablet) {
//			defer wg.Done()
//			searcher.ProcessTablet(ctx, t)
//		}(tablet)
//	}
//	wg.Wait()
//	maxPosTablet := searcher.MaxPositionTablet()
type MaxReplicationPositionSearcher struct {
	// tmc is the client used to request each tablet's replication status.
	tmc tmclient.TabletManagerClient
	// logger receives the informational and warning messages emitted while
	// processing tablets.
	logger logutil.Logger
	// waitTimeout bounds each individual replication status request.
	waitTimeout time.Duration
	// m guards maxPos and maxPosTablet.
	m sync.Mutex

	// maxPos is the position of maxPosTablet; it is only meaningful while
	// maxPosTablet is non-nil.
	maxPos mysql.Position
	// maxPosTablet is the tablet recorded as the current maximum, or nil if
	// no tablet has been recorded yet.
	maxPosTablet *topodatapb.Tablet
}

// NewMaxReplicationPositionSearcher builds a MaxReplicationPositionSearcher
// that uses tmc to query tablets, logs through logger, and bounds each query
// by waitTimeout. The returned searcher has no recorded tablet and can start
// processing tablets immediately. Call Reset() before reusing an instance for
// another search.
func NewMaxReplicationPositionSearcher(tmc tmclient.TabletManagerClient, logger logutil.Logger, waitTimeout time.Duration) *MaxReplicationPositionSearcher {
	// The mutex, max position and max-position tablet are left at their zero
	// values, which is the "nothing seen yet" state.
	searcher := new(MaxReplicationPositionSearcher)
	searcher.tmc = tmc
	searcher.logger = logger
	searcher.waitTimeout = waitTimeout

	return searcher
}

// ProcessTablet requests the replication status of the given tablet, decodes
// its position, and records the tablet as the current maximum when no tablet
// has been recorded yet or when the recorded position is not at least the new
// one. The status request is bounded by the searcher's wait timeout. A tablet
// whose status cannot be fetched or whose position cannot be decoded is
// logged with a warning and otherwise ignored.
//
// It is safe to call from multiple goroutines.
func (searcher *MaxReplicationPositionSearcher) ProcessTablet(ctx context.Context, tablet *topodatapb.Tablet) {
	alias := topoproto.TabletAliasString(tablet.Alias)
	searcher.logger.Infof("getting replication position from %v", alias)

	timeoutCtx, cancel := context.WithTimeout(ctx, searcher.waitTimeout)
	defer cancel()

	replStatus, err := searcher.tmc.ReplicationStatus(timeoutCtx, tablet)
	if err != nil {
		searcher.logger.Warningf("failed to get replication status from %v, ignoring tablet: %v", alias, err)
		return
	}

	replPos, err := mysql.DecodePosition(replStatus.Position)
	if err != nil {
		searcher.logger.Warningf("cannot decode replica position %v for tablet %v, ignoring tablet: %v", replStatus.Position, alias, err)
		return
	}

	// Only the comparison and update of the shared state need the lock; the
	// remote call above is made without holding it.
	searcher.m.Lock()
	defer searcher.m.Unlock()

	// Keep the recorded tablet when its position already covers this one.
	if searcher.maxPosTablet != nil && searcher.maxPos.AtLeast(replPos) {
		return
	}

	searcher.maxPos = replPos
	searcher.maxPosTablet = tablet
}

// MaxPositionTablet reports the tablet currently recorded as having the most
// advanced position, or nil if no tablet has been recorded since construction
// or the last Reset.
func (searcher *MaxReplicationPositionSearcher) MaxPositionTablet() *topodatapb.Tablet {
	// Take a snapshot of the pointer while holding the lock.
	searcher.m.Lock()
	tablet := searcher.maxPosTablet
	searcher.m.Unlock()

	return tablet
}

// Reset discards the recorded maximum position and its tablet, returning the
// searcher to the state of a freshly constructed instance so that it can be
// used for a new search.
func (searcher *MaxReplicationPositionSearcher) Reset() {
	var zeroPos mysql.Position

	searcher.m.Lock()
	searcher.maxPos, searcher.maxPosTablet = zeroPos, nil
	searcher.m.Unlock()
}
Loading