Skip to content

Commit fae7b25

Browse files
author
Renato Costa
committed
roachtest: update mixed-version follower read tests for shared-process
As usual, a few changes were necessary because the test changed cluster settings only visible to the system tenant. In addition, this test performs HTTP calls to crdb endpoints. We use newly introduced `VirtualCluster` option in the HTTP client in order to view system statistics during the test. Informs: #127378 Release note: None
1 parent 5cd0af6 commit fae7b25

File tree

1 file changed

+100
-53
lines changed

1 file changed

+100
-53
lines changed

pkg/cmd/roachtest/tests/follower_reads.go

Lines changed: 100 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,9 @@ import (
3535
"github.com/cockroachdb/cockroach/pkg/testutils"
3636
"github.com/cockroachdb/cockroach/pkg/testutils/sqlutils"
3737
"github.com/cockroachdb/cockroach/pkg/ts/tspb"
38+
"github.com/cockroachdb/cockroach/pkg/util/randutil"
3839
"github.com/cockroachdb/cockroach/pkg/util/retry"
40+
"github.com/cockroachdb/cockroach/pkg/util/syncutil"
3941
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
4042
"github.com/cockroachdb/errors"
4143
"github.com/jackc/pgtype"
@@ -76,8 +78,33 @@ func registerFollowerReads(r registry.Registry) {
7678
survival: survival,
7779
deadPrimaryRegion: insufficientQuorum,
7880
}
79-
data := initFollowerReadsDB(ctx, t, c, topology)
80-
runFollowerReadsTest(ctx, t, c, topology, rc, data)
81+
conns := struct {
82+
mu syncutil.Mutex
83+
mapping map[int]*gosql.DB
84+
}{
85+
mapping: make(map[int]*gosql.DB),
86+
}
87+
connFunc := func(node int) *gosql.DB {
88+
conns.mu.Lock()
89+
defer conns.mu.Unlock()
90+
91+
if _, ok := conns.mapping[node]; !ok {
92+
conn := c.Conn(ctx, t.L(), node)
93+
conns.mapping[node] = conn
94+
}
95+
96+
return conns.mapping[node]
97+
}
98+
99+
defer func() {
100+
for _, c := range conns.mapping {
101+
c.Close()
102+
}
103+
}()
104+
105+
rng, _ := randutil.NewPseudoRand()
106+
data := initFollowerReadsDB(ctx, t, t.L(), c, connFunc, connFunc, rng, topology)
107+
runFollowerReadsTest(ctx, t, t.L(), c, rng, topology, rc, data)
81108
},
82109
})
83110
}
@@ -185,7 +212,9 @@ type topologySpec struct {
185212
func runFollowerReadsTest(
186213
ctx context.Context,
187214
t test.Test,
215+
l *logger.Logger,
188216
c cluster.Cluster,
217+
rng *rand.Rand,
189218
topology topologySpec,
190219
rc readConsistency,
191220
data map[int]int64,
@@ -196,7 +225,7 @@ func runFollowerReadsTest(
196225
// levels in the mixed-version variant of this test than they are on master.
197226
isoLevels := []string{"read committed", "snapshot", "serializable"}
198227
require.NoError(t, func() error {
199-
db := c.Conn(ctx, t.L(), 1)
228+
db := c.Conn(ctx, l, 1)
200229
defer db.Close()
201230
err := enableIsolationLevels(ctx, t, db)
202231
if err != nil && strings.Contains(err.Error(), "unknown cluster setting") {
@@ -209,9 +238,8 @@ func runFollowerReadsTest(
209238

210239
var conns []*gosql.DB
211240
for i := 0; i < c.Spec().NodeCount; i++ {
212-
isoLevel := isoLevels[rand.Intn(len(isoLevels))]
213-
isoLevelOpt := option.ConnectionOption("default_transaction_isolation", isoLevel)
214-
conn := c.Conn(ctx, t.L(), i+1, isoLevelOpt)
241+
isoLevel := isoLevels[rng.Intn(len(isoLevels))]
242+
conn := c.Conn(ctx, l, i+1, option.ConnectionOption("default_transaction_isolation", isoLevel))
215243
defer conn.Close()
216244
conns = append(conns, conn)
217245
}
@@ -299,10 +327,7 @@ func runFollowerReadsTest(
299327
),
300328
)
301329
if err != nil {
302-
// 20.2 doesn't have this setting.
303-
if !strings.Contains(err.Error(), "unknown cluster setting") {
304-
t.Fatal(err)
305-
}
330+
t.Fatal(err)
306331
}
307332

308333
// Read the follower read counts before issuing the follower reads to observe
@@ -317,7 +342,7 @@ func runFollowerReadsTest(
317342
// Perform reads on each node and ensure we get the expected value. Do so for
318343
// 15 seconds to give closed timestamps a chance to propagate and caches time
319344
// to warm up.
320-
t.L().Printf("warming up reads")
345+
l.Printf("warming up reads")
321346
g, gCtx := errgroup.WithContext(ctx)
322347
k, v := chooseKV()
323348
until := timeutil.Now().Add(15 * time.Second)
@@ -361,19 +386,19 @@ func runFollowerReadsTest(
361386
if topology.deadPrimaryRegion && i <= 3 {
362387
stopOpts := option.DefaultStopOpts()
363388
stopOpts.RoachprodOpts.Sig = 9
364-
c.Stop(ctx, t.L(), stopOpts, c.Node(i))
389+
c.Stop(ctx, l, stopOpts, c.Node(i))
365390
deadNodes[i] = struct{}{}
366391
} else {
367392
liveNodes[i] = struct{}{}
368393
}
369394
}
370395

371-
t.L().Printf("starting read load")
396+
l.Printf("starting read load")
372397
const loadDuration = 4 * time.Minute
373398
timeoutCtx, cancel := context.WithCancel(ctx)
374399
defer cancel()
375400
time.AfterFunc(loadDuration, func() {
376-
t.L().Printf("stopping load")
401+
l.Printf("stopping load")
377402
cancel()
378403
})
379404
g, gCtx = errgroup.WithContext(timeoutCtx)
@@ -392,7 +417,7 @@ func runFollowerReadsTest(
392417
t.Fatalf("error reading data: %v", err)
393418
}
394419
end := timeutil.Now()
395-
t.L().Printf("load stopped")
420+
l.Printf("load stopped")
396421

397422
// Depending on the test's topology, we expect a different set of nodes to
398423
// perform follower reads.
@@ -422,39 +447,52 @@ func runFollowerReadsTest(
422447
expectedLowRatioNodes = 1
423448
}
424449
}
425-
verifyHighFollowerReadRatios(ctx, t, c, liveNodes, start, end, expectedLowRatioNodes)
450+
verifyHighFollowerReadRatios(ctx, t, l, c, liveNodes, start, end, expectedLowRatioNodes)
426451

427452
if topology.multiRegion {
428453
// Perform a ts query to verify that the SQL latencies were well below the
429454
// WAN latencies which should be at least 50ms.
430455
//
431456
// We don't do this for singleRegion since, in a single region, there's no
432457
// low latency and high-latency regimes.
433-
verifySQLLatency(ctx, t, c, liveNodes, start, end, maxLatencyThreshold)
458+
verifySQLLatency(ctx, t, l, c, liveNodes, start, end, maxLatencyThreshold)
434459
}
435460

436461
// Restart dead nodes, if necessary.
437462
for i := range deadNodes {
438-
c.Start(ctx, t.L(), option.DefaultStartOpts(), install.MakeClusterSettings(), c.Node(i))
463+
c.Start(ctx, l, option.DefaultStartOpts(), install.MakeClusterSettings(), c.Node(i))
439464
}
440465
}
441466

442467
// initFollowerReadsDB initializes a database for the follower reads test.
443468
// Returns the data inserted into the test table.
469+
//
470+
// The `connFunc` (and `systemConnFunc`) parameters capture how to
471+
// connect to the database in a test run. This allows us to use the
472+
// mixedersion helpers in mixed-version tests. We need to connect to
473+
// the system tenant to change relevant SystemOnly cluster settings,
474+
// and some mixed-version tests run in a multi-tenant deployment.
444475
func initFollowerReadsDB(
445-
ctx context.Context, t test.Test, c cluster.Cluster, topology topologySpec,
476+
ctx context.Context,
477+
t test.Test,
478+
l *logger.Logger,
479+
c cluster.Cluster,
480+
connectFunc, systemConnectFunc func(int) *gosql.DB,
481+
rng *rand.Rand,
482+
topology topologySpec,
446483
) (data map[int]int64) {
447-
db := c.Conn(ctx, t.L(), 1)
448-
defer db.Close()
484+
systemDB := systemConnectFunc(1)
485+
db := connectFunc(1)
486+
449487
// Disable load based splitting and range merging because splits and merges
450488
// interfere with follower reads. This test's workload regularly triggers load
451489
// based splitting in the first phase creating small ranges which later
452490
// in the test are merged. The merging tends to coincide with the final phase
453491
// of the test which attempts to observe low latency reads leading to
454492
// flakiness.
455-
_, err := db.ExecContext(ctx, "SET CLUSTER SETTING kv.range_split.by_load_enabled = 'false'")
493+
_, err := systemDB.ExecContext(ctx, "SET CLUSTER SETTING kv.range_split.by_load_enabled = 'false'")
456494
require.NoError(t, err)
457-
_, err = db.ExecContext(ctx, "SET CLUSTER SETTING kv.range_merge.queue_enabled = 'false'")
495+
_, err = systemDB.ExecContext(ctx, "SET CLUSTER SETTING kv.range_merge.queue_enabled = 'false'")
458496
require.NoError(t, err)
459497

460498
// Check the cluster regions.
@@ -502,7 +540,7 @@ func initFollowerReadsDB(
502540
}
503541

504542
// Wait until the table has completed up-replication.
505-
t.L().Printf("waiting for up-replication...")
543+
l.Printf("waiting for up-replication...")
506544
retryOpts := retry.Options{MaxBackoff: 15 * time.Second}
507545
for r := retry.StartWithCtx(ctx, retryOpts); r.Next(); {
508546
// Check that the table has the expected number and location of voting and
@@ -550,7 +588,7 @@ func initFollowerReadsDB(
550588
ctx, q1, votersCount, nonVotersCount, pq.Array(votersSet), pq.Array(nonVotersSet),
551589
).Scan(&ok, &voters, &nonVoters)
552590
if errors.Is(err, gosql.ErrNoRows) {
553-
t.L().Printf("up-replication not complete, missing range")
591+
l.Printf("up-replication not complete, missing range")
554592
continue
555593
}
556594
require.NoError(t, err)
@@ -559,9 +597,9 @@ func initFollowerReadsDB(
559597
break
560598
}
561599

562-
t.L().Printf("up-replication not complete, "+
563-
"found voters = %v (want %d in set %v) and non_voters = %s (want %d in set %v)",
564-
voters, votersCount, votersSet, nonVoters, votersCount, votersSet)
600+
l.Printf("up-replication not complete, "+
601+
"found voters = %v (want %d in set %v) and non_voters = %v (want %d in set %v)",
602+
voters, votersCount, votersSet, nonVoters, nonVotersCount, nonVotersSet)
565603
}
566604

567605
if topology.multiRegion {
@@ -618,7 +656,7 @@ func initFollowerReadsDB(
618656
break
619657
}
620658

621-
t.L().Printf("rebalancing not complete, expected %d at risk ranges, "+
659+
l.Printf("rebalancing not complete, expected %d at risk ranges, "+
622660
"found %d", expAtRisk, atRisk)
623661
}
624662
}
@@ -629,7 +667,7 @@ func initFollowerReadsDB(
629667
sem := make(chan struct{}, concurrency)
630668
data = make(map[int]int64)
631669
insert := func(ctx context.Context, k int) func() error {
632-
v := rand.Int63()
670+
v := rng.Int63()
633671
data[k] = v
634672
return func() error {
635673
sem <- struct{}{}
@@ -671,6 +709,7 @@ func computeFollowerReadDuration(ctx context.Context, db *gosql.DB) (time.Durati
671709
func verifySQLLatency(
672710
ctx context.Context,
673711
t test.Test,
712+
l *logger.Logger,
674713
c cluster.Cluster,
675714
liveNodes map[int]struct{},
676715
start, end time.Time,
@@ -682,13 +721,11 @@ func verifySQLLatency(
682721
adminNode = i
683722
break
684723
}
685-
adminURLs, err := c.ExternalAdminUIAddr(ctx, t.L(), c.Node(adminNode))
724+
adminURLs, err := c.ExternalAdminUIAddr(ctx, l, c.Node(adminNode))
686725
if err != nil {
687726
t.Fatal(err)
688727
}
689-
// follower-reads/mixed-version runs on insecure mode, so we need http.
690-
// Tests that do run on secure mode will redirect to https.
691-
url := "http://" + adminURLs[0] + "/ts/query"
728+
url := "https://" + adminURLs[0] + "/ts/query"
692729
var sources []string
693730
for i := range liveNodes {
694731
sources = append(sources, strconv.Itoa(i))
@@ -704,7 +741,7 @@ func verifySQLLatency(
704741
SourceAggregator: tspb.TimeSeriesQueryAggregator_MAX.Enum(),
705742
}},
706743
}
707-
client := roachtestutil.DefaultHTTPClient(c, t.L())
744+
client := roachtestutil.DefaultHTTPClient(c, l)
708745
var response tspb.TimeSeriesQueryResponse
709746
if err := client.PostProtobuf(ctx, url, &request, &response); err != nil {
710747
t.Fatal(err)
@@ -736,6 +773,7 @@ func verifySQLLatency(
736773
func verifyHighFollowerReadRatios(
737774
ctx context.Context,
738775
t test.Test,
776+
l *logger.Logger,
739777
c cluster.Cluster,
740778
liveNodes map[int]struct{},
741779
start, end time.Time,
@@ -753,11 +791,10 @@ func verifyHighFollowerReadRatios(
753791
adminNode = i
754792
break
755793
}
756-
adminURLs, err := c.ExternalAdminUIAddr(ctx, t.L(), c.Node(adminNode))
794+
adminURLs, err := c.ExternalAdminUIAddr(ctx, l, c.Node(adminNode))
757795
require.NoError(t, err)
758-
// follower-reads/mixed-version runs on insecure mode, so we need http.
759-
// Tests that do run on secure mode will redirect to https.
760-
url := "http://" + adminURLs[0] + "/ts/query"
796+
797+
url := "https://" + adminURLs[0] + "/ts/query"
761798
request := tspb.TimeSeriesQueryRequest{
762799
StartNanos: start.UnixNano(),
763800
EndNanos: end.UnixNano(),
@@ -777,7 +814,11 @@ func verifyHighFollowerReadRatios(
777814
Derivative: tspb.TimeSeriesQueryDerivative_NON_NEGATIVE_DERIVATIVE.Enum(),
778815
})
779816
}
780-
client := roachtestutil.DefaultHTTPClient(c, t.L())
817+
// Make sure to connect to the system tenant in case this test
818+
// is running on a multitenant deployment.
819+
client := roachtestutil.DefaultHTTPClient(
820+
c, l, roachtestutil.VirtualCluster(install.SystemInterfaceName),
821+
)
781822
var response tspb.TimeSeriesQueryResponse
782823
if err := client.PostProtobuf(ctx, url, &request, &response); err != nil {
783824
t.Fatal(err)
@@ -810,7 +851,7 @@ func verifyHighFollowerReadRatios(
810851
}
811852
}
812853

813-
t.L().Printf("interval stats: %s", intervalsToString(stats))
854+
l.Printf("interval stats: %s", intervalsToString(stats))
814855

815856
// Now count how many intervals have more than the tolerated number of nodes
816857
// with low follower read ratios.
@@ -865,10 +906,12 @@ func getFollowerReadCounts(ctx context.Context, t test.Test, c cluster.Cluster)
865906
if err != nil {
866907
return err
867908
}
868-
// follower-reads/mixed-version runs on insecure mode, so we need http.
869-
// Tests that do run on secure mode will redirect to https.
870-
url := "http://" + adminUIAddrs[0] + "/_status/vars"
871-
client := roachtestutil.DefaultHTTPClient(c, t.L())
909+
url := "https://" + adminUIAddrs[0] + "/_status/vars"
910+
// Make sure to connect to the system tenant in case this test
911+
// is running on a multitenant deployment.
912+
client := roachtestutil.DefaultHTTPClient(
913+
c, t.L(), roachtestutil.VirtualCluster(install.SystemInterfaceName),
914+
)
872915
resp, err := client.Get(ctx, url)
873916
if err != nil {
874917
return err
@@ -970,21 +1013,25 @@ func runFollowerReadsMixedVersionTest(
9701013
rc readConsistency,
9711014
opts ...mixedversion.CustomOption,
9721015
) {
973-
mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.All(),
974-
append([]mixedversion.CustomOption{
975-
// Multi-tenant deployments are currently unsupported. See #127378.
976-
mixedversion.EnabledDeploymentModes(mixedversion.SystemOnlyDeployment),
977-
}, opts...)...,
978-
)
1016+
mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.All(), opts...)
9791017

9801018
var data map[int]int64
9811019
runInit := func(ctx context.Context, l *logger.Logger, r *rand.Rand, h *mixedversion.Helper) error {
982-
data = initFollowerReadsDB(ctx, t, c, topology)
1020+
if topology.multiRegion &&
1021+
h.IsMultitenant() &&
1022+
!h.Context().FromVersion.AtLeast(mixedversion.TenantsAndSystemAlignedSettingsVersion) {
1023+
const setting = "sql.multi_region.allow_abstractions_for_secondary_tenants.enabled"
1024+
if err := setTenantSetting(l, r, h, setting, true); err != nil {
1025+
return errors.Wrapf(err, "setting %s", setting)
1026+
}
1027+
}
1028+
1029+
data = initFollowerReadsDB(ctx, t, l, c, h.Connect, h.System.Connect, r, topology)
9831030
return nil
9841031
}
9851032

9861033
runFollowerReads := func(ctx context.Context, l *logger.Logger, r *rand.Rand, h *mixedversion.Helper) error {
987-
runFollowerReadsTest(ctx, t, c, topology, rc, data)
1034+
runFollowerReadsTest(ctx, t, l, c, r, topology, rc, data)
9881035
return nil
9891036
}
9901037

0 commit comments

Comments
 (0)