From dee8143bc2d15f184be8bdd7cb59f71b2fdc2d48 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Mon, 30 Sep 2024 20:52:13 +0000 Subject: [PATCH] roachtest: add `failover` variants with leader leases MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Part of #132762. Leader leases have different availability properties than epoch leases under most failure modes. This patch adds failover test variants that use leader leases where possible. Initial test results: | test | lease=epoch (ms) | lease=expiration (ms) | lease=leader (ms) | parity with expiration | |:---------------------------------------------|-----------------:|----------------------:|------------------:|:----------------------:| | failover/chaos/read-only | 60,129 | 18,253 | 60,129 | ✔ | | failover/chaos/read-write | 60,129 | 20,401 | 60,129 | XXX | | failover/liveness/blackhole | 9,663 | 369 | 335 | ✔ | | failover/liveness/blackhole-recv | 11,274 | 402 | 369 | ✔ | | failover/liveness/blackhole-send | 9,663 | 385 | 469 | ✔ | | failover/liveness/crash | 8,053 | 352 | 318 | ✔ | | failover/liveness/deadlock | 24,696 | 385 | 369 | ✔ | | failover/liveness/disk-stall | 26,843 | 369 | 419 | ✔ | | failover/liveness/pause | 10,200 | 385 | 436 | ✔ | | failover/non-system/blackhole | 7,247 | 7,516 | 15,032 | XX | | failover/non-system/blackhole-recv | 12,348 | 10,737 | 18,253 | XX | | failover/non-system/blackhole-send | 6,979 | 6,979 | 8,053 | X | | failover/non-system/crash | 7,247 | 6,979 | 9,126 | X | | failover/non-system/deadlock | 60,129 | 60,129 | 60,129 | ✔ | | failover/non-system/disk-stall | 22,548 | 22,548 | 25,769 | X | | failover/non-system/pause | 7,247 | 7,247 | 9,126 | X | | failover/partial/lease-gateway | 8,589 | 19,327 [^1] | 60,129 | XXX | | failover/partial/lease-leader | 60,129 | 22,549 [^2] | 31,139 [^2] | XX | | failover/partial/lease-liveness | 8,589 | 301 | 318 | ✔ | | failover/system-non-liveness/blackhole | 369 | 402 | 352 | ✔ | | failover/system-non-liveness/blackhole-recv | 335 | 285 | 318 | ✔ | | failover/system-non-liveness/blackhole-send | 402 | 419 | 335 | ✔ | | failover/system-non-liveness/crash | 419 | 301 | 453 | ✔ | | failover/system-non-liveness/deadlock | 369 | 352 | 402 | ✔ | | failover/system-non-liveness/disk-stall | 402 | 318 | 453 | ✔ | | failover/system-non-liveness/pause | 369 | 385 | 335 | ✔ | _note: because of the way the test measures pMax, anything under 1,000ms is essentially "no impact"_ **Key _(comparing leader vs. expiration)_**: ✔ = parity X = minor regression XX = major regression XXX = unavailability [^1]: I don't understand why expiration-based lease perform worse than epoch-based leases on this test. [^2]: With #133214. Epic: none Release note: None --- pkg/cmd/roachtest/registry/test_spec.go | 7 ++++++- pkg/cmd/roachtest/test_runner.go | 4 ++++ pkg/cmd/roachtest/tests/failover.go | 14 ++++++++++++-- pkg/cmd/roachtest/tests/tpcc.go | 3 ++- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/pkg/cmd/roachtest/registry/test_spec.go b/pkg/cmd/roachtest/registry/test_spec.go index a06ef44f92f1..98feded22697 100644 --- a/pkg/cmd/roachtest/registry/test_spec.go +++ b/pkg/cmd/roachtest/registry/test_spec.go @@ -224,6 +224,8 @@ func (l LeaseType) String() string { return "epoch" case ExpirationLeases: return "expiration" + case LeaderLeases: + return "leader" case MetamorphicLeases: return "metamorphic" default: @@ -238,8 +240,11 @@ const ( EpochLeases // ExpirationLeases uses expiration leases for all ranges. ExpirationLeases + // LeaderLeases uses leader leases where possible. + LeaderLeases // MetamorphicLeases randomly chooses epoch or expiration - // leases (across the entire cluster) + // leases (across the entire cluster). + // TODO(nvanbenschoten): add leader leases to this mix. MetamorphicLeases ) diff --git a/pkg/cmd/roachtest/test_runner.go b/pkg/cmd/roachtest/test_runner.go index 8d26b3ed4dd9..1104882e3b56 100644 --- a/pkg/cmd/roachtest/test_runner.go +++ b/pkg/cmd/roachtest/test_runner.go @@ -903,8 +903,12 @@ func (r *testRunner) runWorker( case registry.DefaultLeases: case registry.EpochLeases: c.clusterSettings["kv.expiration_leases_only.enabled"] = "false" + c.clusterSettings["kv.raft.leader_fortification.fraction_enabled"] = "0.0" case registry.ExpirationLeases: c.clusterSettings["kv.expiration_leases_only.enabled"] = "true" + case registry.LeaderLeases: + c.clusterSettings["kv.expiration_leases_only.enabled"] = "false" + c.clusterSettings["kv.raft.leader_fortification.fraction_enabled"] = "1.0" case registry.MetamorphicLeases: enabled := prng.Float64() < 0.5 c.status(fmt.Sprintf("metamorphically setting kv.expiration_leases_only.enabled = %t", diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index f4543aa6a54c..84927edb02ed 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -59,10 +59,20 @@ var rangeLeaseRenewalDuration = func() time.Duration { // requests are successful with nominal latencies. See also: // https://github.com/cockroachdb/cockroach/issues/103654 func registerFailover(r registry.Registry) { - for _, leases := range []registry.LeaseType{registry.EpochLeases, registry.ExpirationLeases} { + leaseTypes := []registry.LeaseType{registry.EpochLeases, registry.ExpirationLeases, registry.LeaderLeases} + for _, leases := range leaseTypes { var leasesStr string - if leases == registry.ExpirationLeases { + switch leases { + case registry.EpochLeases: + // TODO(nvanbenschoten): when leader leases become the default, we should + // change this to "/lease=epoch" and change leader leases to "". + leasesStr = "" + case registry.ExpirationLeases: leasesStr = "/lease=expiration" + case registry.LeaderLeases: + leasesStr = "/lease=leader" + default: + panic(errors.AssertionFailedf("unknown lease type: %v", leases)) } for _, readOnly := range []bool{false, true} { diff --git a/pkg/cmd/roachtest/tests/tpcc.go b/pkg/cmd/roachtest/tests/tpcc.go index 6361501e38f3..5705321cd756 100644 --- a/pkg/cmd/roachtest/tests/tpcc.go +++ b/pkg/cmd/roachtest/tests/tpcc.go @@ -1280,7 +1280,8 @@ type tpccBenchSpec struct { // Encryption-At-Rest / EAR). EncryptionEnabled bool // ExpirationLeases enables use of expiration-based leases. - ExpirationLeases bool + ExpirationLeases bool + // TODO(nvanbenschoten): add a leader lease variant. EnableDefaultScheduledBackup bool // SharedProcessMT, if true, indicates that the cluster should run in // shared-process mode of multi-tenancy.