diff --git a/docs/root/intro/arch_overview/upstream/load_balancing/load_balancers.rst b/docs/root/intro/arch_overview/upstream/load_balancing/load_balancers.rst
index df762656fea6..5b6c4bb5c40f 100644
--- a/docs/root/intro/arch_overview/upstream/load_balancing/load_balancers.rst
+++ b/docs/root/intro/arch_overview/upstream/load_balancing/load_balancers.rst
@@ -28,10 +28,10 @@ effective weighting.
 Weighted least request
 ^^^^^^^^^^^^^^^^^^^^^^
 
-The least request load balancer uses different algorithms depending on whether any of the hosts have
-weight greater than 1.
+The least request load balancer uses different algorithms depending on whether hosts have the
+same or different weights.
 
-* *all weights 1*: An O(1) algorithm which selects N random available hosts as specified in the
+* *all weights equal*: An O(1) algorithm which selects N random available hosts as specified in the
   :ref:`configuration <envoy_api_field_Cluster.least_request_lb_config>` (2 by default) and picks the
   host which has the fewest active requests (`Research
   <http://www.eecs.harvard.edu/~michaelm/postscripts/mythesis.pdf>`_ has shown that this
@@ -39,17 +39,13 @@ weight greater than 1.
   choices). The P2C load balancer has the property that a host with the highest number of active
   requests in the cluster will never receive new requests. It will be allowed to drain until it is
   less than or equal to all of the other hosts.
-* *not all weights 1*: If any host in the cluster has a load balancing weight greater than 1, the
-  load balancer shifts into a mode where it uses a weighted round robin schedule in which weights
-  are dynamically adjusted based on the host's request load at the time of selection (weight is
-  divided by the current active request count. For example, a host with weight 2 and an active
-  request count of 4 will have a synthetic weight of 2 / 4 = 0.5). This algorithm provides good
-  balance at steady state but may not adapt to load imbalance as quickly. Additionally, unlike P2C,
-  a host will never truly drain, though it will receive fewer requests over time.
-
-  .. note::
-    If all weights are not 1, but are the same (e.g., 42), Envoy will still use the weighted round
-    robin schedule instead of P2C.
+* *not all weights equal*: If two or more hosts in the cluster have different load balancing
+  weights, the load balancer shifts into a mode where it uses a weighted round robin schedule in
+  which weights are dynamically adjusted based on the host's request load at the time of selection
+  (weight is divided by the current active request count. For example, a host with weight 2 and an
+  active request count of 4 will have a synthetic weight of 2 / 4 = 0.5). This algorithm provides
+  good balance at steady state but may not adapt to load imbalance as quickly. Additionally, unlike
+  P2C, a host will never truly drain, though it will receive fewer requests over time.
 
 .. _arch_overview_load_balancing_types_ring_hash:
 
diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst
index e9972311ec75..94ea3e38cc12 100644
--- a/docs/root/intro/version_history.rst
+++ b/docs/root/intro/version_history.rst
@@ -24,6 +24,7 @@ Version history
 * tls: added verification of IP address SAN fields in certificates against configured SANs in the
   certificate validation context.
 * upstream: added network filter chains to upstream connections, see :ref:`filters`.
+* upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1.
 * zookeeper: parse responses and emit latency stats.
 
 1.11.0 (July 11, 2019)
diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc
index e4b653d928dc..79bf87b79c44 100644
--- a/source/common/upstream/load_balancer_impl.cc
+++ b/source/common/upstream/load_balancer_impl.cc
@@ -50,6 +50,20 @@ std::pair distributeLoad(PriorityLoad& per_priority_load,
   return {first_available_priority, total_load};
 }
 
+// Returns true if the weights of all the hosts in the HostVector are equal.
+bool hostWeightsAreEqual(const HostVector& hosts) {
+  if (hosts.size() <= 1) {
+    return true;
+  }
+  const uint32_t weight = hosts[0]->weight();
+  for (size_t i = 1; i < hosts.size(); ++i) {
+    if (hosts[i]->weight() != weight) {
+      return false;
+    }
+  }
+  return true;
+}
+
 } // namespace
 
 std::pair
@@ -629,6 +643,16 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) {
     auto& scheduler = scheduler_[source] = Scheduler{};
     refreshHostSource(source);
 
+    // Check if the original host weights are equal and skip EDF creation if they are. When all
+    // original weights are equal we can rely on unweighted host pick to do optimal round robin and
+    // least-loaded host selection with lower memory and CPU overhead.
+    if (hostWeightsAreEqual(hosts)) {
+      // Skip edf creation.
+      return;
+    }
+
+    scheduler.edf_ = std::make_unique<EdfScheduler<const Host>>();
+
     // Populate scheduler with host list.
     // TODO(mattklein123): We must build the EDF schedule even if all of the hosts are currently
     // weighted 1. This is because currently we don't refresh host sets if only weights change.
@@ -639,7 +663,7 @@
       // notification, this will only be stale until this host is next picked,
       // at which point it is reinserted into the EdfScheduler with its new
       // weight in chooseHost().
-      scheduler.edf_.add(hostWeight(*host), host);
+      scheduler.edf_->add(hostWeight(*host), host);
     }
 
     // Cycle through hosts to achieve the intended offset behavior.
@@ -647,8 +671,8 @@
     // refreshes for the weighted case.
     if (!hosts.empty()) {
      for (uint32_t i = 0; i < seed_ % hosts.size(); ++i) {
-        auto host = scheduler.edf_.pick();
-        scheduler.edf_.add(hostWeight(*host), host);
+        auto host = scheduler.edf_->pick();
+        scheduler.edf_->add(hostWeight(*host), host);
       }
     }
   };
@@ -684,15 +708,12 @@ HostConstSharedPtr EdfLoadBalancerBase::chooseHostOnce(LoadBalancerContext* cont
 
   // As has been commented in both EdfLoadBalancerBase::refresh and
   // BaseDynamicClusterImpl::updateDynamicHostList, we must do a runtime pivot here to determine
-  // whether to use EDF or do unweighted (fast) selection.
-  // TODO(mattklein123): As commented elsewhere, this is wasteful, and we should just refresh the
-  // host set if any weights change. Additionally, it has the property that if all weights are
-  // the same but not 1 (like 42), we will use the EDF schedule not the unweighted pick. This is
-  // not optimal. If this is fixed, remove the note in the arch overview docs for the LR LB.
-  if (stats_.max_host_weight_.value() != 1) {
-    auto host = scheduler.edf_.pick();
+  // whether to use EDF or do unweighted (fast) selection. EDF is non-null iff the original weights
+  // of 2 or more hosts differ.
+  if (scheduler.edf_ != nullptr) {
+    auto host = scheduler.edf_->pick();
     if (host != nullptr) {
-      scheduler.edf_.add(hostWeight(*host), host);
+      scheduler.edf_->add(hostWeight(*host), host);
     }
     return host;
   } else {
diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h
index 518ea5071121..98fd0c75d7d1 100644
--- a/source/common/upstream/load_balancer_impl.h
+++ b/source/common/upstream/load_balancer_impl.h
@@ -346,8 +346,10 @@ class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase {
 
 protected:
   struct Scheduler {
-    // EdfScheduler for weighted LB.
-    EdfScheduler<const Host> edf_;
+    // EdfScheduler for weighted LB. The edf_ is only created when the original
+    // host weights of 2 or more hosts differ. When not present, the
+    // implementation of chooseHostOnce falls back to unweightedHostPick.
+    std::unique_ptr<EdfScheduler<const Host>> edf_;
   };
 
   void initialize();
@@ -408,12 +410,12 @@ class RoundRobinLoadBalancer : public EdfLoadBalancerBase {
 /**
  * Weighted Least Request load balancer.
  *
- * In a normal setup when all hosts have the same weight of 1 it randomly picks up N healthy hosts
+ * In a normal setup when all hosts have the same weight it randomly picks up N healthy hosts
  * (where N is specified in the LB configuration) and compares number of active requests. Technique
  * is based on http://www.eecs.harvard.edu/~michaelm/postscripts/mythesis.pdf and is known as P2C
  * (power of two choices).
 *
- * When any hosts have a weight that is not 1, an RR EDF schedule is used. Host weight is scaled
+ * When hosts have different weights, an RR EDF schedule is used. Host weight is scaled
  * by the number of active requests at pick/insert time. Thus, hosts will never fully drain as
  * they would in normal P2C, though they will get picked less and less often. In the future, we
  * can consider two alternate algorithms:
@@ -442,11 +444,9 @@ class LeastRequestLoadBalancer : public EdfLoadBalancerBase {
   void refreshHostSource(const HostsSource&) override {}
   double hostWeight(const Host& host) override {
     // Here we scale host weight by the number of active requests at the time we do the pick. We
-    // always add 1 to avoid division by 0. Note that if all weights are 1, the EDF schedule is
-    // unlikely to yield the same result as P2C given the lack of randomness as well as the fact
-    // that hosts are always picked, regardless of their current request load at the time of pick.
-    // It might be possible to do better by picking two hosts off of the schedule, and selecting
-    // the one with fewer active requests at the time of selection.
+    // always add 1 to avoid division by 0. It might be possible to do better by picking two hosts
+    // off of the schedule, and selecting the one with fewer active requests at the time of
+    // selection.
     // TODO(mattklein123): @htuch brings up the point that how we are scaling weight here might not
     // be the only/best way of doing this. Essentially, it makes weight and active requests equally
     // important. Are they equally important in practice? There is no right answer here and we might
diff --git a/test/common/upstream/BUILD b/test/common/upstream/BUILD
index 566e06cf6012..4e5d8b4679ba 100644
--- a/test/common/upstream/BUILD
+++ b/test/common/upstream/BUILD
@@ -333,6 +333,7 @@ envoy_cc_binary(
         "benchmark",
     ],
     deps = [
+        "//source/common/memory:stats_lib",
         "//source/common/upstream:maglev_lb_lib",
         "//source/common/upstream:ring_hash_lb_lib",
         "//source/common/upstream:upstream_lib",
diff --git a/test/common/upstream/load_balancer_benchmark.cc b/test/common/upstream/load_balancer_benchmark.cc
index 9bd79569b52f..b1f736873346 100644
--- a/test/common/upstream/load_balancer_benchmark.cc
+++ b/test/common/upstream/load_balancer_benchmark.cc
@@ -2,6 +2,7 @@
 
 #include
 
+#include "common/memory/stats.h"
 #include "common/runtime/runtime_impl.h"
 #include "common/upstream/maglev_lb.h"
 #include "common/upstream/ring_hash_lb.h"
@@ -27,15 +28,18 @@ class BaseTester {
       hosts.push_back(makeTestHost(info_, fmt::format("tcp://10.0.{}.{}:6379", i / 256, i % 256),
                                    should_weight ? weight : 1));
     }
-    HostVectorConstSharedPtr updated_hosts{new HostVector(hosts)};
-    priority_set_.updateHosts(
-        0,
-        updateHostsParams(updated_hosts, nullptr,
-                          std::make_shared<const HealthyHostVector>(*updated_hosts), nullptr),
-        {}, hosts, {}, absl::nullopt);
+
+    HostVectorConstSharedPtr updated_hosts = std::make_shared<HostVector>(hosts);
+    HostsPerLocalityConstSharedPtr hosts_per_locality = makeHostsPerLocality({hosts});
+    priority_set_.updateHosts(0, HostSetImpl::partitionHosts(updated_hosts, hosts_per_locality), {},
+                              hosts, {}, absl::nullopt);
+    local_priority_set_.updateHosts(0,
+                                    HostSetImpl::partitionHosts(updated_hosts, hosts_per_locality),
+                                    {}, hosts, {}, absl::nullopt);
   }
 
   PrioritySetImpl priority_set_;
+  PrioritySetImpl local_priority_set_;
   Stats::IsolatedStoreImpl stats_store_;
   ClusterStats stats_{ClusterInfoImpl::generateStats(stats_store_)};
   NiceMock<Runtime::MockLoader> runtime_;
@@ -44,6 +48,58 @@ class BaseTester {
   std::shared_ptr<MockClusterInfo> info_{new NiceMock<MockClusterInfo>()};
 };
 
+class RoundRobinTester : public BaseTester {
+public:
+  RoundRobinTester(uint64_t num_hosts, uint32_t weighted_subset_percent = 0, uint32_t weight = 0)
+      : BaseTester(num_hosts, weighted_subset_percent, weight) {}
+
+  void initialize() {
+    lb_ = std::make_unique<RoundRobinLoadBalancer>(priority_set_, &local_priority_set_, stats_,
+                                                   runtime_, random_, common_config_);
+  }
+
+  std::unique_ptr<RoundRobinLoadBalancer> lb_;
+};
+
+void BM_RoundRobinLoadBalancerBuild(benchmark::State& state) {
+  for (auto _ : state) {
+    state.PauseTiming();
+    const uint64_t num_hosts = state.range(0);
+    const uint64_t weighted_subset_percent = state.range(1);
+    const uint64_t weight = state.range(2);
+
+    RoundRobinTester tester(num_hosts, weighted_subset_percent, weight);
+    const size_t start_mem = Memory::Stats::totalCurrentlyAllocated();
+
+    // We are only interested in timing the initial build.
+    state.ResumeTiming();
+    tester.initialize();
+    state.PauseTiming();
+    const size_t end_mem = Memory::Stats::totalCurrentlyAllocated();
+    state.counters["memory"] = end_mem - start_mem;
+    state.counters["memory_per_host"] = (end_mem - start_mem) / num_hosts;
+    state.ResumeTiming();
+  }
+}
+BENCHMARK(BM_RoundRobinLoadBalancerBuild)
+    ->Args({1, 0, 1})
+    ->Args({500, 0, 1})
+    ->Args({500, 50, 50})
+    ->Args({500, 100, 50})
+    ->Args({2500, 0, 1})
+    ->Args({2500, 50, 50})
+    ->Args({2500, 100, 50})
+    ->Args({10000, 0, 1})
+    ->Args({10000, 50, 50})
+    ->Args({10000, 100, 50})
+    ->Args({25000, 0, 1})
+    ->Args({25000, 50, 50})
+    ->Args({25000, 100, 50})
+    ->Args({50000, 0, 1})
+    ->Args({50000, 50, 50})
+    ->Args({50000, 100, 50})
+    ->Unit(benchmark::kMillisecond);
+
 class RingHashTester : public BaseTester {
 public:
   RingHashTester(uint64_t num_hosts, uint64_t min_ring_size) : BaseTester(num_hosts) {
diff --git a/test/common/upstream/load_balancer_impl_test.cc b/test/common/upstream/load_balancer_impl_test.cc
index 5a42f7bf5edf..e29dd9b0dcd8 100644
--- a/test/common/upstream/load_balancer_impl_test.cc
+++ b/test/common/upstream/load_balancer_impl_test.cc
@@ -1229,7 +1229,7 @@ TEST_P(LeastRequestLoadBalancerTest, SingleHost) {
 
   // Host weight is 100.
   {
-    EXPECT_CALL(random_, random()).WillOnce(Return(0));
+    EXPECT_CALL(random_, random()).WillOnce(Return(0)).WillOnce(Return(2)).WillOnce(Return(3));
     stats_.max_host_weight_.set(100UL);
     EXPECT_EQ(hostSet().healthy_hosts_[0], lb_.chooseHost(nullptr));
   }
@@ -1237,7 +1237,7 @@
   HostVector empty;
   {
     hostSet().runCallbacks(empty, empty);
-    EXPECT_CALL(random_, random()).WillOnce(Return(0));
+    EXPECT_CALL(random_, random()).WillOnce(Return(0)).WillOnce(Return(2)).WillOnce(Return(3));
     EXPECT_EQ(hostSet().healthy_hosts_[0], lb_.chooseHost(nullptr));
   }
 
diff --git a/test/integration/stats_integration_test.cc b/test/integration/stats_integration_test.cc
index cc016ffbea1a..ceadf07bacd7 100644
--- a/test/integration/stats_integration_test.cc
+++ b/test/integration/stats_integration_test.cc
@@ -212,6 +212,7 @@ TEST_P(ClusterMemoryTestRunner, MemoryLargeClusterSizeWithStats) {
   // 2019/07/06  7477     42742       43000   fork gauge representation to drop pending_increment_
   // 2019/07/15  7555     42806       43000   static link libstdc++ in tests
   // 2019/07/24  7503     43030       44000   add upstream filters to clusters
+  // 2019/08/13  7877     42838       44000   skip EdfScheduler creation if all host weights equal
 
   // Note: when adjusting this value: EXPECT_MEMORY_EQ is active only in CI
   // 'release' builds, where we control the platform and tool-chain. So you
@@ -221,7 +222,7 @@
   // On a local clang8/libstdc++/linux flow, the memory usage was observed in
   // June 2019 to be 64 bytes higher than it is in CI/release. Your mileage may
   // vary.
-  EXPECT_MEMORY_EQ(m_per_cluster, 43030); // 104 bytes higher than a debug build.
+  EXPECT_MEMORY_EQ(m_per_cluster, 42838); // 104 bytes higher than a debug build.
   EXPECT_MEMORY_LE(m_per_cluster, 44000);
 }
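
A note on the "all weights equal" path described in the docs hunk above: the following is a minimal, self-contained sketch of an unweighted P2C ("power of two choices") pick — sample N hosts at random (2 by default) and keep the one with the fewest active requests. This is not Envoy code; the Host struct and p2cPick function are hypothetical stand-ins for the real interfaces.

// Illustrative sketch only -- not Envoy's implementation.
#include <cstdint>
#include <iostream>
#include <random>
#include <vector>

struct Host {
  uint32_t active_requests;
};

// Sample `choices` hosts uniformly at random and keep the least-loaded one.
// With choices == 2 this is the classic P2C pick.
const Host* p2cPick(const std::vector<Host>& hosts, uint32_t choices, std::mt19937_64& rng) {
  if (hosts.empty()) {
    return nullptr;
  }
  std::uniform_int_distribution<size_t> dist(0, hosts.size() - 1);
  const Host* best = nullptr;
  for (uint32_t i = 0; i < choices; ++i) {
    const Host& candidate = hosts[dist(rng)];
    // Keep the candidate with the fewest active requests seen so far.
    if (best == nullptr || candidate.active_requests < best->active_requests) {
      best = &candidate;
    }
  }
  return best;
}

int main() {
  std::mt19937_64 rng(42);
  const std::vector<Host> hosts{{0}, {3}, {7}, {1}};
  const Host* picked = p2cPick(hosts, 2, rng);
  std::cout << "picked host with " << picked->active_requests << " active requests\n";
  return 0;
}

A host with the strictly highest active request count can never win a comparison against any other sampled host, which is the drain property the docs call out for the P2C path.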
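
The scheduler.edf_ that the patch now allocates lazily is an EDF (earliest deadline first) scheduler. As a rough sketch of the general technique only — not a copy of Envoy's EdfScheduler, and assuming each entry is (re)inserted at deadline = current_time + 1/weight — higher-weight entries come up proportionally more often, and equal weights degenerate to a plain rotation:

// Rough, illustrative EDF scheduler: a min-heap ordered by deadline.
#include <cstdint>
#include <functional>
#include <iostream>
#include <memory>
#include <queue>
#include <string>
#include <tuple>
#include <vector>

template <class C> class MiniEdfScheduler {
public:
  // Schedule the entry at current_time + 1/weight.
  void add(double weight, std::shared_ptr<C> entry) {
    queue_.push({current_time_ + 1.0 / weight, order_++, std::move(entry)});
  }

  // Pop the entry with the earliest deadline. The caller re-adds it with its
  // (possibly updated) weight, mirroring the pick()/add() pattern in the diff.
  std::shared_ptr<C> pick() {
    if (queue_.empty()) {
      return nullptr;
    }
    EdfEntry top = queue_.top();
    queue_.pop();
    current_time_ = top.deadline;
    return top.entry;
  }

private:
  struct EdfEntry {
    double deadline;
    uint64_t order; // Tie-breaker so equal deadlines pop in insertion order.
    std::shared_ptr<C> entry;
    bool operator>(const EdfEntry& other) const {
      return std::tie(deadline, order) > std::tie(other.deadline, other.order);
    }
  };
  std::priority_queue<EdfEntry, std::vector<EdfEntry>, std::greater<EdfEntry>> queue_;
  double current_time_{0.0};
  uint64_t order_{0};
};

int main() {
  MiniEdfScheduler<const std::string> edf;
  auto a = std::make_shared<const std::string>("a"); // weight 2
  auto b = std::make_shared<const std::string>("b"); // weight 1
  edf.add(2.0, a);
  edf.add(1.0, b);
  // Over six picks "a" shows up roughly twice as often as "b".
  for (int i = 0; i < 6; ++i) {
    auto picked = edf.pick();
    std::cout << *picked << " ";
    edf.add(picked == a ? 2.0 : 1.0, picked);
  }
  std::cout << "\n";
  return 0;
}

When every weight is equal the deadlines advance in lockstep and the pick order is just a rotation of the insert order, which is why the patch can skip building the structure (and its per-host allocations) and fall back to the unweighted pick.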
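
Finally, a small illustration of the synthetic weight used by the weighted least-request path, as described in the docs hunk and in the hostWeight() comment: the configured weight is divided by the active request count at pick/insert time, with the implementation adding 1 to the count to avoid division by zero. syntheticWeight is a hypothetical name used only for this example.

// Illustration of the least-request synthetic weight -- not Envoy code.
#include <cstdint>
#include <iostream>

double syntheticWeight(double configured_weight, uint32_t active_requests) {
  // The +1 mirrors the divide-by-zero guard mentioned in hostWeight()'s comment.
  return configured_weight / (active_requests + 1);
}

int main() {
  // The docs' example (weight 2, 4 active requests) gives 2 / 4 = 0.5; with
  // the +1 guard the computed value is 2 / 5 = 0.4.
  std::cout << syntheticWeight(2.0, 4) << "\n"; // 0.4
  std::cout << syntheticWeight(2.0, 0) << "\n"; // 2.0 when idle
  return 0;
}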