Skip to content

Commit

Permalink
roachtest: add failover variants with leader leases
Browse files Browse the repository at this point in the history
Part of #132762.

Leader leases have different availability properties than epoch leases under
most failure modes. This patch adds failover test variants that use leader
leases where possible.

Initial test results:

| test                                         | lease=epoch (ms) | lease=expiration (ms) | lease=leader (ms) | parity with expiration |
|:---------------------------------------------|-----------------:|----------------------:|------------------:|:----------------------:|
| failover/chaos/read-only                     | 60,129           | 18,253                | 60,129            | ✔                      |
| failover/chaos/read-write                    | 60,129           | 20,401                | 60,129            | XXX                    |
| failover/liveness/blackhole                  | 9,663            | 369                   | 335               | ✔                      |
| failover/liveness/blackhole-recv             | 11,274           | 402                   | 369               | ✔                      |
| failover/liveness/blackhole-send             | 9,663            | 385                   | 469               | ✔                      |
| failover/liveness/crash                      | 8,053            | 352                   | 318               | ✔                      |
| failover/liveness/deadlock                   | 24,696           | 385                   | 369               | ✔                      |
| failover/liveness/disk-stall                 | 26,843           | 369                   | 419               | ✔                      |
| failover/liveness/pause                      | 10,200           | 385                   | 436               | ✔                      |
| failover/non-system/blackhole                | 7,247            | 7,516                 | 15,032            | XX                     |
| failover/non-system/blackhole-recv           | 12,348           | 10,737                | 18,253            | XX                     |
| failover/non-system/blackhole-send           | 6,979            | 6,979                 | 8,053             | X                      |
| failover/non-system/crash                    | 7,247            | 6,979                 | 9,126             | X                      |
| failover/non-system/deadlock                 | 60,129           | 60,129                | 60,129            | ✔                      |
| failover/non-system/disk-stall               | 22,548           | 22,548                | 25,769            | X                      |
| failover/non-system/pause                    | 7,247            | 7,247                 | 9,126             | X                      |
| failover/partial/lease-gateway               | 8,589            | 19,327 [^1]           | 60,129            | XXX                    |
| failover/partial/lease-leader                | 60,129           | 22,549 [^2]           | 31,139 [^2]       | XX                     |
| failover/partial/lease-liveness              | 8,589            | 301                   | 318               | ✔                      |
| failover/system-non-liveness/blackhole       | 369              | 402                   | 352               | ✔                      |
| failover/system-non-liveness/blackhole-recv  | 335              | 285                   | 318               | ✔                      |
| failover/system-non-liveness/blackhole-send  | 402              | 419                   | 335               | ✔                      |
| failover/system-non-liveness/crash           | 419              | 301                   | 453               | ✔                      |
| failover/system-non-liveness/deadlock        | 369              | 352                   | 402               | ✔                      |
| failover/system-non-liveness/disk-stall      | 402              | 318                   | 453               | ✔                      |
| failover/system-non-liveness/pause           | 369              | 385                   | 335               | ✔                      |

_note: because of the way the test measures pMax, anything under 1,000ms is essentially "no impact"_

**Key _(comparing leader vs. expiration)_**:
✔ = parity
X = minor regression
XX = major regression
XXX = unavailability

[^1]: I don't understand why expiration-based lease perform worse than epoch-based leases on this test.
[^2]: With #133214.

Epic: none
Release note: None
  • Loading branch information
nvanbenschoten committed Oct 23, 2024
1 parent 2a97d1b commit 961c233
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 4 deletions.
7 changes: 6 additions & 1 deletion pkg/cmd/roachtest/registry/test_spec.go
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,8 @@ func (l LeaseType) String() string {
return "epoch"
case ExpirationLeases:
return "expiration"
case LeaderLeases:
return "leader"
case MetamorphicLeases:
return "metamorphic"
default:
Expand All @@ -238,8 +240,11 @@ const (
EpochLeases
// ExpirationLeases uses expiration leases for all ranges.
ExpirationLeases
// LeaderLeases uses leader leases where possible.
LeaderLeases
// MetamorphicLeases randomly chooses epoch or expiration
// leases (across the entire cluster)
// leases (across the entire cluster).
// TODO(nvanbenschoten): add leader leases to this mix.
MetamorphicLeases
)

Expand Down
4 changes: 4 additions & 0 deletions pkg/cmd/roachtest/test_runner.go
Original file line number Diff line number Diff line change
Expand Up @@ -903,8 +903,12 @@ func (r *testRunner) runWorker(
case registry.DefaultLeases:
case registry.EpochLeases:
c.clusterSettings["kv.expiration_leases_only.enabled"] = "false"
c.clusterSettings["kv.raft.leader_fortification.fraction_enabled"] = "0.0"
case registry.ExpirationLeases:
c.clusterSettings["kv.expiration_leases_only.enabled"] = "true"
case registry.LeaderLeases:
c.clusterSettings["kv.expiration_leases_only.enabled"] = "false"
c.clusterSettings["kv.raft.leader_fortification.fraction_enabled"] = "1.0"
case registry.MetamorphicLeases:
enabled := prng.Float64() < 0.5
c.status(fmt.Sprintf("metamorphically setting kv.expiration_leases_only.enabled = %t",
Expand Down
14 changes: 12 additions & 2 deletions pkg/cmd/roachtest/tests/failover.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,10 +59,20 @@ var rangeLeaseRenewalDuration = func() time.Duration {
// requests are successful with nominal latencies. See also:
// https://github.com/cockroachdb/cockroach/issues/103654
func registerFailover(r registry.Registry) {
for _, leases := range []registry.LeaseType{registry.EpochLeases, registry.ExpirationLeases} {
leaseTypes := []registry.LeaseType{registry.EpochLeases, registry.ExpirationLeases, registry.LeaderLeases}
for _, leases := range leaseTypes {
var leasesStr string
if leases == registry.ExpirationLeases {
switch leases {
case registry.EpochLeases:
// TODO(nvanbenschoten): when leader leases become the default, we should
// change this to "/lease=epoch" and change leader leases to "".
leasesStr = ""
case registry.ExpirationLeases:
leasesStr = "/lease=expiration"
case registry.LeaderLeases:
leasesStr = "/lease=leader"
default:
panic(errors.AssertionFailedf("unknown lease type: %v", leases))
}

for _, readOnly := range []bool{false, true} {
Expand Down
3 changes: 2 additions & 1 deletion pkg/cmd/roachtest/tests/tpcc.go
Original file line number Diff line number Diff line change
Expand Up @@ -1280,7 +1280,8 @@ type tpccBenchSpec struct {
// Encryption-At-Rest / EAR).
EncryptionEnabled bool
// ExpirationLeases enables use of expiration-based leases.
ExpirationLeases bool
ExpirationLeases bool
// TODO(nvanbenschoten): add a leader lease variant.
EnableDefaultScheduledBackup bool
// SharedProcessMT, if true, indicates that the cluster should run in
// shared-process mode of multi-tenancy.
Expand Down

0 comments on commit 961c233

Please sign in to comment.