Skip to content

Commit

Permalink
testcluster: improve AddReplicas check
Browse files Browse the repository at this point in the history
AddReplicas was verifying that a replica had indeed been added, but
there's no guarantee that the replicate queue wouldn't have removed
it in the meantime. Attempt to work around this somewhat. The real
solution is not to provide that guarantee, but some tests likely
rely on it (and the failure is extremely rare, i.e. the new for
loop basically never runs).

Observed in #28368.

Release note: None
  • Loading branch information
tbg committed Sep 20, 2018
1 parent b903a24 commit 4b20a07
Showing 1 changed file with 33 additions and 27 deletions.
60 changes: 33 additions & 27 deletions pkg/testutils/testcluster/testcluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -348,37 +348,43 @@ func (tc *TestCluster) AddReplicas(
startKey roachpb.Key, targets ...roachpb.ReplicationTarget,
) (roachpb.RangeDescriptor, error) {
rKey := keys.MustAddr(startKey)
rangeDesc, err := tc.changeReplicas(
roachpb.ADD_REPLICA, rKey, targets...,
)
if err != nil {
return roachpb.RangeDescriptor{}, err
}
errRetry := errors.Errorf("target not found")
for {
rangeDesc, err := tc.changeReplicas(
roachpb.ADD_REPLICA, rKey, targets...,
)
if err != nil {
return roachpb.RangeDescriptor{}, err
}

// Wait for the replication to complete on all destination nodes.
if err := retry.ForDuration(time.Second*5, func() error {
for _, target := range targets {
// Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the
// snapshot has been transferred and the descriptor initialized.
store, err := tc.findMemberStore(target.StoreID)
if err != nil {
log.Errorf(context.TODO(), "unexpected error: %s", err)
return err
}
repl := store.LookupReplica(rKey)
if repl == nil {
return errors.Errorf("range not found on store %d", target)
}
desc := repl.Desc()
if _, ok := desc.GetReplicaDescriptor(target.StoreID); !ok {
return errors.Errorf("target store %d not yet in range descriptor %v", target.StoreID, desc)
// Wait for the replication to complete on all destination nodes.
if err := retry.ForDuration(time.Second*25, func() error {
for _, target := range targets {
// Use LookupReplica(keys) instead of GetRange(rangeID) to ensure that the
// snapshot has been transferred and the descriptor initialized.
store, err := tc.findMemberStore(target.StoreID)
if err != nil {
log.Errorf(context.TODO(), "unexpected error: %s", err)
return err
}
repl := store.LookupReplica(rKey)
if repl == nil {
return errors.Wrapf(errRetry, "for target %s", target)
}
desc := repl.Desc()
if _, ok := desc.GetReplicaDescriptor(target.StoreID); !ok {
return errors.Errorf("target store %d not yet in range descriptor %v", target.StoreID, desc)
}
}
return nil
}); errors.Cause(err) == errRetry {
log.Warningf(context.Background(), "target was likely downreplicated again; retrying after %s", err)
continue
} else if err != nil {
return roachpb.RangeDescriptor{}, err
}
return nil
}); err != nil {
return roachpb.RangeDescriptor{}, err
return rangeDesc, nil
}
return rangeDesc, nil
}

// RemoveReplicas is part of the TestServerInterface.
Expand Down

0 comments on commit 4b20a07

Please sign in to comment.