Skip to content

Commit

Permalink
Merge pull request #8869 from sonne5/ywu/vtgr_log
Browse files Browse the repository at this point in the history
Prefix logger with keyspace/shard
  • Loading branch information
deepthi authored Oct 7, 2021
2 parents 4d0436a + 494f083 commit 93c811b
Show file tree
Hide file tree
Showing 7 changed files with 190 additions and 115 deletions.
35 changes: 17 additions & 18 deletions go/vt/vtgr/controller/diagnose.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@ import (

"vitess.io/vitess/go/mysql"
"vitess.io/vitess/go/vt/concurrency"
"vitess.io/vitess/go/vt/log"
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
"vitess.io/vitess/go/vt/topo"
"vitess.io/vitess/go/vt/vterrors"
Expand Down Expand Up @@ -92,17 +91,17 @@ const (
func (shard *GRShard) ScanAndRepairShard(ctx context.Context) {
status, err := shard.Diagnose(ctx)
if err != nil {
log.Errorf("fail to scanAndRepairShard %v/%v because of Diagnose error: %v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard, err)
shard.logger.Errorf("fail to scanAndRepairShard %v/%v because of Diagnose error: %v", shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard, err)
return
}
// We are able to get Diagnose without error
//
// Note: all the recovery function should first try to grab a shard level lock
// and check the trigger conditions before doing anything. This is to avoid
// other VTGR instance try to do the same thing
log.Infof("%v status is %v", formatKeyspaceShard(shard.KeyspaceShard), status)
shard.logger.Infof("%v status is %v", formatKeyspaceShard(shard.KeyspaceShard), status)
if _, err := shard.Repair(ctx, status); err != nil {
log.Errorf("failed to repair %v: %v", status, err)
shard.logger.Errorf("failed to repair %v: %v", status, err)
}
}

Expand All @@ -123,7 +122,7 @@ func (shard *GRShard) Diagnose(ctx context.Context) (DiagnoseType, error) {
shard.shardStatusCollector.recordDiagnoseResult(diagnoseResult)
shard.populateVTGRStatusLocked()
if diagnoseResult != DiagnoseTypeHealthy {
log.Warningf(`VTGR diagnose shard as unhealthy for %s/%s: result=%v | last_result=%v | instances=%v | primary=%v | primary_tablet=%v | problematics=%v | unreachables=%v | SQL group=%v`,
shard.logger.Warningf(`VTGR diagnose shard as unhealthy for %s/%s: result=%v | last_result=%v | instances=%v | primary=%v | primary_tablet=%v | problematics=%v | unreachables=%v | SQL group=%v`,
shard.KeyspaceShard.Keyspace, shard.KeyspaceShard.Shard,
shard.shardStatusCollector.status.DiagnoseResult,
shard.lastDiagnoseResult,
Expand Down Expand Up @@ -154,7 +153,7 @@ func (shard *GRShard) diagnoseLocked(ctx context.Context) (DiagnoseType, error)
// later VTGR needs to find group name, primary etc from
// SQLGroup for repairing instead of getting nil
shard.sqlGroup.overrideView([]*db.GroupView{localView})
log.Infof("Diagnose %v from fast path", fastDiagnose)
shard.logger.Infof("Diagnose %v from fast path", fastDiagnose)
return fastDiagnose, nil
}
}
Expand All @@ -181,7 +180,7 @@ func (shard *GRShard) diagnoseLocked(ctx context.Context) (DiagnoseType, error)
// In this situation, instead of bootstrap a group, we should re-build the
// old group for the shard
if shard.isAllOfflineOrError() {
log.Info("Found all members are OFFLINE or ERROR")
shard.logger.Info("Found all members are OFFLINE or ERROR")
return DiagnoseTypeShardHasInactiveGroup, nil
}

Expand All @@ -196,7 +195,7 @@ func (shard *GRShard) diagnoseLocked(ctx context.Context) (DiagnoseType, error)
// errMissingGroup means we cannot find a mysql group for the shard
// we are in DiagnoseTypeShardHasNoGroup state
if err == errMissingGroup {
log.Warning("Missing mysql group")
shard.logger.Warning("Missing mysql group")
return DiagnoseTypeShardHasNoGroup, nil
}
// errMissingPrimaryTablet means we cannot find a tablet based on mysql primary
Expand Down Expand Up @@ -268,7 +267,7 @@ func (shard *GRShard) getLocalView() *db.GroupView {
// We still have the fallback logic if this failed, therefore we don't raise error
// but try to get local view with best effort
if err != nil {
log.Errorf("failed to fetch local group view: %v", err)
shard.logger.Errorf("failed to fetch local group view: %v", err)
}
return view
}
Expand Down Expand Up @@ -319,7 +318,7 @@ func (shard *GRShard) hasWrongPrimaryTablet(ctx context.Context) (bool, error) {
// in case the primary is unreachable
host, port, _ := shard.sqlGroup.GetPrimary()
if !isHostPortValid(host, port) {
log.Warningf("Invalid address for primary %v:%v", host, port)
shard.logger.Warningf("Invalid address for primary %v:%v", host, port)
return false, errMissingGroup
}
// Make sure we have a tablet available
Expand All @@ -329,7 +328,7 @@ func (shard *GRShard) hasWrongPrimaryTablet(ctx context.Context) (bool, error) {
// we retrun errMissingPrimaryTablet so that VTGR will trigger a failover
tablet := shard.findTabletByHostAndPort(host, port)
if tablet == nil || !shard.instanceReachable(ctx, tablet) {
log.Errorf("Failed to find tablet that is running with mysql on %v:%v", host, port)
shard.logger.Errorf("Failed to find tablet that is running with mysql on %v:%v", host, port)
return false, errMissingPrimaryTablet
}
// Now we know we have a valid mysql primary in the group
Expand All @@ -338,7 +337,7 @@ func (shard *GRShard) hasWrongPrimaryTablet(ctx context.Context) (bool, error) {
// If we failed to find primary for shard, it mostly means we are initializing the shard
// return true directly so that VTGR will set primary tablet according to MySQL group
if primary == nil {
log.Infof("unable to find primary tablet for %v", formatKeyspaceShard(shard.KeyspaceShard))
shard.logger.Infof("unable to find primary tablet for %v", formatKeyspaceShard(shard.KeyspaceShard))
return true, nil
}
return (host != primary.instanceKey.Hostname) || (port != primary.instanceKey.Port), nil
Expand All @@ -363,11 +362,11 @@ func (shard *GRShard) instanceReachable(ctx context.Context, instance *grInstanc
go func() { c <- shard.tmc.Ping(pingCtx, instance.tablet) }()
select {
case <-pingCtx.Done():
log.Errorf("Ping abort timeout %v", *pingTabletTimeout)
shard.logger.Errorf("Ping abort timeout %v", *pingTabletTimeout)
return false
case err := <-c:
if err != nil {
log.Errorf("Ping error host=%v: %v", instance.instanceKey.Hostname, err)
shard.logger.Errorf("Ping error host=%v: %v", instance.instanceKey.Hostname, err)
}
return err == nil
}
Expand Down Expand Up @@ -428,7 +427,7 @@ func (shard *GRShard) disconnectedInstance() (*grInstance, error) {
// Skip instance without hostname because they are not up and running
// also skip instances that raised unrecoverable errors
if shard.shardStatusCollector.isUnreachable(instance) {
log.Infof("Skip %v to check disconnectedInstance because it is unhealthy", instance.alias)
shard.logger.Infof("Skip %v to check disconnectedInstance because it is unhealthy", instance.alias)
continue
}
isUnconnected := shard.sqlGroup.IsUnconnectedReplica(instance.instanceKey)
Expand Down Expand Up @@ -530,7 +529,7 @@ func (shard *GRShard) forAllInstances(task func(instance *grInstance, wg *sync.W
}
wg.Wait()
if len(errorRecord.Errors) > 0 {
log.Errorf("get errors in forAllInstances call: %v", errorRecord.Error())
shard.logger.Errorf("get errors in forAllInstances call: %v", errorRecord.Error())
}
return &errorRecord
}
Expand Down Expand Up @@ -571,15 +570,15 @@ func (shard *GRShard) refreshSQLGroup() error {
if unreachableError(err) {
shard.shardStatusCollector.recordUnreachables(instance)
}
log.Errorf("%v get error while fetch group info: %v", instance.alias, err)
shard.logger.Errorf("%v get error while fetch group info: %v", instance.alias, err)
return
}
shard.sqlGroup.recordView(view)
})
// Only raise error if we failed to get any data from mysql
// otherwise, we will use what we get from mysql directly
if len(er.Errors) == len(shard.instances) {
log.Errorf("fail to fetch any data for mysql")
shard.logger.Errorf("fail to fetch any data for mysql")
return db.ErrGroupBackoffError
}
return shard.sqlGroup.Resolve()
Expand Down
30 changes: 16 additions & 14 deletions go/vt/vtgr/controller/group.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ import (
"sync"

"vitess.io/vitess/go/stats"
"vitess.io/vitess/go/vt/log"
"vitess.io/vitess/go/vt/orchestrator/inst"
"vitess.io/vitess/go/vt/vtgr/db"
"vitess.io/vitess/go/vt/vtgr/log"
)

var (
Expand All @@ -37,6 +37,7 @@ var (
type SQLGroup struct {
views []*db.GroupView
resolvedView *ResolvedView
logger *log.Logger
size int
singlePrimary bool
statsTags []string
Expand All @@ -45,13 +46,14 @@ type SQLGroup struct {

// NewSQLGroup creates a new SQLGroup
func NewSQLGroup(size int, singlePrimary bool, keyspace, shard string) *SQLGroup {
return &SQLGroup{size: size, singlePrimary: singlePrimary, statsTags: []string{keyspace, shard}}
return &SQLGroup{size: size, singlePrimary: singlePrimary, statsTags: []string{keyspace, shard}, logger: log.NewVTGRLogger(keyspace, shard)}
}

// ResolvedView is the resolved view
type ResolvedView struct {
groupName string
view map[inst.InstanceKey]db.GroupMember
logger *log.Logger
}

// recordView adds a view to the group
Expand Down Expand Up @@ -164,13 +166,13 @@ func (group *SQLGroup) IsSafeToBootstrap() bool {
// for bootstrap we require group at least has quorum number of views
// this is to make sure we don't bootstrap a group improperly
if len(group.views) < group.size {
log.Errorf("[sql_group] cannot bootstrap because we only have %v views | expected %v", len(group.views), group.size)
group.logger.Errorf("[sql_group] cannot bootstrap because we only have %v views | expected %v", len(group.views), group.size)
return false
}
// we think it is safe to bootstrap a group if all the views don't have a primary host
host, port, _ := group.getPrimaryLocked()
if host != "" || port != 0 {
log.Warningf("not safe to bootstrap sql group because %v/%v might already be primary", host, port)
group.logger.Warningf("not safe to bootstrap sql group because %v/%v might already be primary", host, port)
}
return host == "" && port == 0
}
Expand Down Expand Up @@ -201,15 +203,15 @@ func (group *SQLGroup) Resolve() error {
return group.resolveLocked()
}
func (group *SQLGroup) resolveLocked() error {
rv := &ResolvedView{}
rv := &ResolvedView{logger: group.logger}
group.resolvedView = rv
m := make(map[inst.InstanceKey]db.GroupMember)
for _, view := range group.views {
if rv.groupName == "" && view.GroupName != "" {
rv.groupName = view.GroupName
}
if view.GroupName != "" && rv.groupName != view.GroupName {
log.Errorf("previous group name %v found %v", rv.groupName, view.GroupName)
group.logger.Errorf("previous group name %v found %v", rv.groupName, view.GroupName)
return db.ErrGroupSplitBrain
}
for _, member := range view.UnresolvedMembers {
Expand Down Expand Up @@ -246,7 +248,7 @@ func (group *SQLGroup) resolveLocked() error {

func (rv *ResolvedView) validate(singlePrimary bool, statsTags []string) error {
if !rv.hasGroup() {
log.Info("Resolved view does not have a group")
rv.logger.Info("Resolved view does not have a group")
return nil
}
hasPrimary := false
Expand All @@ -255,13 +257,13 @@ func (rv *ResolvedView) validate(singlePrimary bool, statsTags []string) error {
for _, status := range rv.view {
if status.Role == db.PRIMARY {
if singlePrimary && hasPrimary {
log.Errorf("Found more than one primary in the group")
rv.logger.Errorf("Found more than one primary in the group")
return db.ErrGroupSplitBrain
}
hasPrimary = true
primaryState = status.State
if status.State != db.ONLINE {
log.Warningf("Found a PRIMARY not ONLINE (%v)", status.State)
rv.logger.Warningf("Found a PRIMARY not ONLINE (%v)", status.State)
}
}
switch status.State {
Expand All @@ -279,10 +281,10 @@ func (rv *ResolvedView) validate(singlePrimary bool, statsTags []string) error {
}
groupOnlineSize.Set(statsTags, int64(onlineCount))
if unreachableCount > 0 || errorCount > 0 || offlineCount > 0 {
log.Warningf("Some of nodes are unconnected in the group. hasPrimary=%v (%v), online_count=%v, recovering_count=%v, unreachable_count=%v, offline_count=%v, error_count=%v", hasPrimary, primaryState, onlineCount, recoveringCount, unreachableCount, offlineCount, errorCount)
rv.logger.Warningf("Some of nodes are unconnected in the group. hasPrimary=%v (%v), online_count=%v, recovering_count=%v, unreachable_count=%v, offline_count=%v, error_count=%v", hasPrimary, primaryState, onlineCount, recoveringCount, unreachableCount, offlineCount, errorCount)
}
if unreachableCount >= len(rv.view)/2+1 {
log.Errorf("Backoff error by quorum unreachable: found %v number of UNREACHABLE nodes while quorum is %v", unreachableCount, len(rv.view)/2+1)
rv.logger.Errorf("Backoff error by quorum unreachable: found %v number of UNREACHABLE nodes while quorum is %v", unreachableCount, len(rv.view)/2+1)
isLostQuorum.Set(statsTags, 1)
} else {
isLostQuorum.Set(statsTags, 0)
Expand All @@ -300,19 +302,19 @@ func (rv *ResolvedView) validate(singlePrimary bool, statsTags []string) error {
}
// Ongoing bootstrap, we should backoff and wait
if recoveringCount == 1 && (offlineCount+recoveringCount == len(rv.view)) {
log.Warningf("Group has one recovery node with all others in offline mode")
rv.logger.Warningf("Group has one recovery node with all others in offline mode")
return db.ErrGroupOngoingBootstrap
}
// We don't have quorum number of unreachable, but the primary is not online
// This most likely means there is a failover in the group we should back off and wait
if hasPrimary && primaryState != db.ONLINE {
log.Warningf("Found a PRIMARY that is not ONLINE (%v)", primaryState)
rv.logger.Warningf("Found a PRIMARY that is not ONLINE (%v)", primaryState)
return db.ErrGroupBackoffError
}
// If all the node in view are OFFLINE or ERROR, it is an inactive group
// It is expected to have no primary in this case
if !hasPrimary && (offlineCount+errorCount != len(rv.view)) {
log.Warningf("Group is NOT all offline or error without a primary node")
rv.logger.Warningf("Group is NOT all offline or error without a primary node")
return db.ErrGroupBackoffError
}
return nil
Expand Down
15 changes: 9 additions & 6 deletions go/vt/vtgr/controller/group_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ package controller
import (
"testing"

"vitess.io/vitess/go/vt/vtgr/log"

"vitess.io/vitess/go/vt/orchestrator/inst"
"vitess.io/vitess/go/vt/vtgr/db"

Expand Down Expand Up @@ -249,7 +251,7 @@ func TestResolve(t *testing.T) {
{Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false},
{Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true},
{Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true},
}}, ""},
}, nil}, ""},
{"test readonly with unreachable primary", []*db.GroupView{ // host1 is unreachable
{MySQLHost: "host2", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{
{HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false},
Expand All @@ -265,7 +267,7 @@ func TestResolve(t *testing.T) {
{Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false},
{Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true},
{Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true},
}}, ""},
}, nil}, ""},
{"test split brain by group name", []*db.GroupView{
{MySQLHost: "host1", MySQLPort: 10, GroupName: "group", UnresolvedMembers: healthyView},
{MySQLHost: "host2", MySQLPort: 10, GroupName: "group1", UnresolvedMembers: healthyView},
Expand All @@ -285,7 +287,7 @@ func TestResolve(t *testing.T) {
{Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true},
{Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true},
{Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.UNKNOWNROLE, State: db.OFFLINE, ReadOnly: true},
}}, ""},
}, nil}, ""},
{"test network partition by majority unreachable", []*db.GroupView{
{MySQLHost: "host1", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{
{HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.UNREACHABLE, ReadOnly: false},
Expand All @@ -308,7 +310,7 @@ func TestResolve(t *testing.T) {
{Hostname: "host1", Port: 10}: {HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.ONLINE, ReadOnly: false},
{Hostname: "host2", Port: 10}: {HostName: "host2", Port: 10, Role: db.SECONDARY, State: db.ONLINE, ReadOnly: true},
{Hostname: "host3", Port: 10}: {HostName: "host3", Port: 10, Role: db.SECONDARY, State: db.UNREACHABLE, ReadOnly: false},
}}, "group backoff error"},
}, nil}, "group backoff error"},
{"test network partition by unreachable primary", []*db.GroupView{
{MySQLHost: "host2", MySQLPort: 10, GroupName: "group", UnresolvedMembers: []*db.GroupMember{
{HostName: "host1", Port: 10, Role: db.PRIMARY, State: db.UNREACHABLE},
Expand All @@ -331,7 +333,7 @@ func TestResolve(t *testing.T) {
}
for _, testCase := range testCases {
t.Run(testCase.testName, func(t *testing.T) {
group := SQLGroup{views: testCase.views, statsTags: []string{"ks", "0"}}
group := SQLGroup{views: testCase.views, statsTags: []string{"ks", "0"}, logger: log.NewVTGRLogger("ks", "0")}
err := group.Resolve()
if testCase.errorMsg != "" {
assert.EqualError(t, err, testCase.errorMsg)
Expand All @@ -341,7 +343,8 @@ func TestResolve(t *testing.T) {
if testCase.expected != nil {
rv := group.resolvedView
expected := testCase.expected
assert.Equal(t, expected, rv)
assert.Equal(t, expected.view, rv.view)
assert.Equal(t, expected.groupName, rv.groupName)
}
})
}
Expand Down
Loading

0 comments on commit 93c811b

Please sign in to comment.