134014: roachtest: add mixed version test for RAC r=kvoli,andrewbaptist,sumeerbhola a=pav-kv

The test exercises RACv1 and v2 behaviours in a mixed-version cluster, and after the upgrade is finalized.

Part of #132778

134346: sql: skip TestIndexBackfillMergeRetry under duress r=Dedej-Bergin a=Dedej-Bergin

This test fails under duress, so we are skipping it.

Fixes: #134033
Release note: None

Co-authored-by: Pavel Kalinnikov <pavel@cockroachlabs.com>
Co-authored-by: Bergin Dedej <bergin.dedej@cockroachlabs.com>
3 people committed Nov 6, 2024
3 parents 4dce850 + fb08418 + 49cdea4 commit 72fd7e2
Showing 5 changed files with 146 additions and 6 deletions.
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/BUILD.bazel
@@ -14,6 +14,7 @@ go_library(
         "admission_control_elastic_backup.go",
         "admission_control_elastic_cdc.go",
         "admission_control_elastic_io.go",
+        "admission_control_elastic_mixed_version.go",
         "admission_control_follower_overload.go",
         "admission_control_index_backfill.go",
         "admission_control_index_overload.go",
1 change: 1 addition & 0 deletions pkg/cmd/roachtest/tests/admission_control.go
@@ -39,5 +39,6 @@ func registerAdmission(r registry.Registry) {
     registerIntentResolutionOverload(r)
     registerElasticIO(r)
     registerDiskBandwidthOverload(r)
+    registerElasticWorkloadMixedVersion(r)
     registerLatencyTests(r)
 }
136 changes: 136 additions & 0 deletions pkg/cmd/roachtest/tests/admission_control_elastic_mixed_version.go
@@ -0,0 +1,136 @@
// Copyright 2024 The Cockroach Authors.
//
// Use of this software is governed by the CockroachDB Software License
// included in the /LICENSE file.

package tests

import (
    "context"
    "fmt"
    "math/rand"
    "time"

    "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/cluster"
    "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
    "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/registry"
    "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
    "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
    "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/mixedversion"
    "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
    "github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
    "github.com/cockroachdb/cockroach/pkg/roachprod/install"
    "github.com/cockroachdb/cockroach/pkg/roachprod/logger"
    "github.com/stretchr/testify/require"
)

// This test exercises Replication Admission Control v1 and v2 with respect to
// regular and elastic workloads in a mixed-version setup.
//
// It runs 2 workloads: kv consisting of "regular" priority writes and kv
// consisting of "background" (elastic) priority writes. The goal is to show
// that even with a demanding "background" workload that is able to push the
// used bandwidth much higher than the provisioned one, AC paces the traffic at
// the set bandwidth limit, and favours regular writes. This behaviour does not
// regress after the cluster is upgraded to v24.3.
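//
// For context: the kv workload's --txn-qos flag used below corresponds to the
// session setting `default_transaction_quality_of_service`, i.e. roughly:
//
//	SET default_transaction_quality_of_service = 'background';
//
// where 'background' maps to elastic admission priority. Treat the exact
// flag-to-setting correspondence as an assumption of this note.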
func registerElasticWorkloadMixedVersion(r registry.Registry) {
    r.Add(registry.TestSpec{
        Name:             "admission-control/elastic-workload/mixed-version",
        Owner:            registry.OwnerKV,
        Timeout:          1 * time.Hour,
        Benchmark:        true,
        CompatibleClouds: registry.OnlyGCE,
        Suites:           registry.Suites(registry.Nightly),
        Cluster: r.MakeClusterSpec(4, spec.CPU(8),
            spec.WorkloadNode(), spec.ReuseNone()),
        Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
            require.Equal(t, 4, c.Spec().NodeCount)

            settings := install.MakeClusterSettings()
            mvt := mixedversion.NewTest(ctx, t, t.L(), c, c.CRDBNodes(),
                mixedversion.NeverUseFixtures,
                mixedversion.ClusterSettingOption(
                    install.ClusterSettingsOption(settings.ClusterSettings),
                ),
                mixedversion.EnabledDeploymentModes(
                    mixedversion.SystemOnlyDeployment,
                    mixedversion.SharedProcessDeployment,
                ),
                mixedversion.AlwaysUseLatestPredecessors,
                // Don't go back too far: we are mostly interested in upgrading
                // to v24.3, where RACv2 was introduced.
                mixedversion.MaxUpgrades(2),
                mixedversion.MinimumSupportedVersion("v24.1.0"),
            )
            // Limit the disk throughput to 128 MiB/s, to make it easier to
            // push elastic traffic into waiting.
            const diskBand = 128 << 20 // 128 MiB
            setDiskBandwidth := func() {
                t.Status(fmt.Sprintf("limiting disk bandwidth to %d bytes/s", diskBand))
                staller := roachtestutil.MakeCgroupDiskStaller(t, c,
                    false /* readsToo */, false /* logsToo */)
                staller.Setup(ctx)
                staller.Slow(ctx, c.CRDBNodes(), diskBand)
            }

            // Initialize the kv workload with a bunch of pre-split ranges and
            // pre-inserted rows. The block sizes are the same as those of the
            // "foreground" workload below.
            initKV := func(ctx context.Context, version *clusterupgrade.Version) error {
                binary := uploadCockroach(ctx, t, c, c.WorkloadNode(), version)
                return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), fmt.Sprintf(
                    "%s workload init kv --drop --splits=1000 --insert-count=3000 "+
                        "--min-block-bytes=512 --max-block-bytes=1024 {pgurl%s}",
                    binary, c.Node(1)))
            }
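            // Note: {pgurl%s} is a roachprod command template that expands to
            // the connection URL(s) of the given nodes; with c.Node(1) it
            // renders as {pgurl:1}, pointing the workload at node 1. (A rough
            // description of the templating, not a spec of it.)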
            // The workloads are tuned to keep the cluster at 30-40% CPU, with
            // the IO overload metric approaching 20-30%, which causes elastic
            // traffic to be de-prioritized and wait.
            runForeground := func(ctx context.Context, duration time.Duration) error {
                cmd := roachtestutil.NewCommand("./cockroach workload run kv "+
                    "--histograms=perf/stats.json --concurrency=500 "+
                    "--max-rate=5000 --read-percent=5 "+
                    "--min-block-bytes=512 --max-block-bytes=1024 "+
                    "--txn-qos='regular' "+
                    "--duration=%v {pgurl%s}", duration, c.CRDBNodes())
                return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd.String())
            }
            runBackground := func(ctx context.Context, duration time.Duration) error {
                cmd := roachtestutil.NewCommand("./cockroach workload run kv "+
                    "--histograms=perf/stats.json --concurrency=500 "+
                    "--max-rate=10000 --read-percent=0 "+
                    "--min-block-bytes=2048 --max-block-bytes=4096 "+
                    "--txn-qos='background' "+
                    "--duration=%v {pgurl%s}", duration, c.CRDBNodes())
                return c.RunE(ctx, option.WithNodes(c.WorkloadNode()), cmd.String())
            }
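            // Rough sizing, for intuition (assumes 3x replication and typical
            // LSM write amplification; the figures are estimates): the
            // background workload alone can write up to 10,000 ops/s at ~3 KiB
            // per block, i.e. ~30 MiB/s of logical data, which after 3x
            // replication across the 3 CRDB nodes is ~30 MiB/s per node.
            // Write amplification can multiply that well past the 128 MiB/s
            // disk cap, so elastic work must queue for flow tokens.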
            runWorkloads := func(ctx context.Context) error {
                const duration = 5 * time.Minute
                m := c.NewMonitor(ctx, c.CRDBNodes())
                m.Go(func(ctx context.Context) error { return runForeground(ctx, duration) })
                m.Go(func(ctx context.Context) error { return runBackground(ctx, duration) })
                return m.WaitE()
            }

            mvt.OnStartup("initializing kv dataset",
                func(ctx context.Context, _ *logger.Logger, _ *rand.Rand, h *mixedversion.Helper) error {
                    return initKV(ctx, h.System.FromVersion)
                })
            mvt.InMixedVersion("running kv workloads in mixed version",
                func(ctx context.Context, _ *logger.Logger, _ *rand.Rand, _ *mixedversion.Helper) error {
                    setDiskBandwidth()
                    return runWorkloads(ctx)
                })
            mvt.AfterUpgradeFinalized("running kv workloads after upgrade",
                func(ctx context.Context, _ *logger.Logger, _ *rand.Rand, _ *mixedversion.Helper) error {
                    return runWorkloads(ctx)
                })

            mvt.Run()
            // TODO(pav-kv): also validate that the write throughput was kept
            // under control, and that the foreground traffic was not starved.
            validateTokensReturned(ctx, t, c, c.CRDBNodes())
        },
    })
}
12 changes: 7 additions & 5 deletions pkg/cmd/roachtest/tests/admission_control_latency.go
@@ -948,19 +948,21 @@ func (v variations) runTest(ctx context.Context, t test.Test, c cluster.Cluster)
     t.L().Printf("validating stats after the perturbation")
     failures = append(failures, isAcceptableChange(t.L(), baselineStats, afterStats, v.acceptableChange)...)
     require.True(t, len(failures) == 0, strings.Join(failures, "\n"))
-    v.validateTokensReturned(ctx, t)
+    validateTokensReturned(ctx, t, v, v.stableNodes())
 }
 
-// validateTokensReturned ensures that all RAC tokens are returned to the pool
+// validateTokensReturned ensures that all RACv2 tokens are returned to the pool
 // at the end of the test.
-func (v variations) validateTokensReturned(ctx context.Context, t test.Test) {
+func validateTokensReturned(
+    ctx context.Context, t test.Test, c cluster.Cluster, nodes option.NodeListOption,
+) {
     t.L().Printf("validating all tokens returned")
-    for _, node := range v.stableNodes() {
+    for _, node := range nodes {
         // Wait for the tokens to be returned to the pool. Normally this will
         // pass immediately; however, it is possible that there is still some
         // recovery going on, so loop a few times.
         testutils.SucceedsWithin(t, func() error {
-            db := v.Conn(ctx, t.L(), node)
+            db := c.Conn(ctx, t.L(), node)
             defer db.Close()
             for _, sType := range []string{"send", "eval"} {
                 for _, tType := range []string{"elastic", "regular"} {
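The hunk is truncated before the body of the token check. For orientation, a minimal sketch of how the inner loop over sType/tType might continue; the metric naming scheme (kvflowcontrol.tokens.<send|eval>.<elastic|regular>.available queried via crdb_internal.node_metrics) and the comparison against a configured limit are assumptions of this sketch, not the verbatim upstream code:

    // Continues the loop body above; needs "fmt" and the
    // github.com/cockroachdb/errors package in scope. expectedLimit is a
    // hypothetical per-type token limit.
    query := fmt.Sprintf(
        "SELECT value FROM crdb_internal.node_metrics "+
            "WHERE name = 'kvflowcontrol.tokens.%s.%s.available'",
        sType, tType)
    var available float64
    if err := db.QueryRowContext(ctx, query).Scan(&available); err != nil {
        return err
    }
    if available < expectedLimit {
        return errors.Errorf("n%d: %s/%s tokens not fully returned (%.0f available)",
            node, sType, tType, available)
    }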
2 changes: 1 addition & 1 deletion pkg/sql/mvcc_backfiller_test.go
@@ -57,7 +57,7 @@ func TestIndexBackfillMergeRetry(t *testing.T) {
     defer leaktest.AfterTest(t)()
     defer log.Scope(t).Close(t)
 
-    skip.UnderRace(t, "TODO(ssd) test times out under race")
+    skip.UnderDuress(t, "this test fails under duress")
 
     params, _ := createTestServerParams()
 
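For reference on what "duress" covers: a rough sketch of the intent behind skip.UnderDuress; every name below is a hypothetical stand-in, not CockroachDB's actual implementation:

    // underDuress sketches the idea: skip tests in builds or environments
    // that run them much slower than a standard build would.
    func underDuress() bool {
        // raceEnabled, deadlockEnabled, stressEnabled: hypothetical flags for
        // race-detector builds, deadlock-detector builds, and stress runs.
        return raceEnabled || deadlockEnabled || stressEnabled
    }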
