From 9b08a1e62329e526b6d5d9a54bfa817a260c2da0 Mon Sep 17 00:00:00 2001 From: Nathan VanBenschoten Date: Tue, 22 Oct 2024 22:07:08 -0400 Subject: [PATCH] roachtest: enable DistSender circuit breakers in failover/partial/lease-leader DistSender circuit breakers are useful in this test to avoid artificially inflated latencies due to the way the test measures failover time (pMax, no timeouts). Without circuit breakers, a request stuck on the partitioned leaseholder will get blocked indefinitely, despite the range recovering on the other side of the partition and becoming available to all new traffic. As a result, the test won't differentiate between temporary and permanent range unavailability. We have other tests which demonstrate the benefit of DistSender circuit breakers (especially when applications do not use statement timeouts), so we don't need to test them here. With this change, the test's failover time drops from: | lease=epoch (ms) | lease=expiration (ms) | lease=leader (ms) | |-----------------:|----------------------:|------------------:| | 60,129 | 60,129 | 60,129 | down to: | lease=epoch (ms) | lease=expiration (ms) | lease=leader (ms) | |-----------------:|----------------------:|------------------:| | 60,129 | 22,549 | 31,139 | Notice that expiration and leader leases now recover, while epoch leases remain unavailable indefinitely. Epic: None Release note: None --- pkg/cmd/roachtest/tests/failover.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pkg/cmd/roachtest/tests/failover.go b/pkg/cmd/roachtest/tests/failover.go index f4543aa6a54c..91cc4d74d1dd 100644 --- a/pkg/cmd/roachtest/tests/failover.go +++ b/pkg/cmd/roachtest/tests/failover.go @@ -539,6 +539,18 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C settings := install.MakeClusterSettings() settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication + // DistSender circuit breakers are useful in this test to avoid artificially + // inflated latencies due to the way the test measures failover time. Without + // circuit breakers, a request stuck on the partitioned leaseholder will get + // blocked indefinitely, despite the range recovering on the other side of the + // partition. As a result, the test won't differentiate between temporary and + // permanent range unavailability. We have other tests which demonstrate the + // benefit of DistSender circuit breakers (especially when applications do not + // use statement timeouts), so we don't need to test them here. + // TODO(arul): this can be removed if/when we turn on DistSender circuit + // breakers for all ranges by default. + settings.ClusterSettings["kv.dist_sender.circuit_breakers.mode"] = "all ranges" + m := c.NewMonitor(ctx, c.CRDBNodes()) failer := makeFailer(t, c, m, failureModeBlackhole, settings, rng).(PartialFailer)