LoadBalancer: Skip EdfScheduler creation in LoadBalancerBase if all host weights are equal. #7877

Merged: 14 commits, Aug 15, 2019
@@ -28,28 +28,24 @@ effective weighting.
Weighted least request
^^^^^^^^^^^^^^^^^^^^^^

The least request load balancer uses different algorithms depending on whether any of the hosts have
weight greater than 1.
The least request load balancer uses different algorithms depending on whether hosts have the
same or different weights.

* *all weights 1*: An O(1) algorithm which selects N random available hosts as specified in the
* *all weights equal*: An O(1) algorithm which selects N random available hosts as specified in the
:ref:`configuration <envoy_api_msg_Cluster.LeastRequestLbConfig>` (2 by default) and picks the
host which has the fewest active requests (`Research
<https://www.eecs.harvard.edu/~michaelm/postscripts/handbook2001.pdf>`_ has shown that this
approach is nearly as good as an O(N) full scan). This is also known as P2C (power of two
choices). The P2C load balancer has the property that a host with the highest number of active
requests in the cluster will never receive new requests. It will be allowed to drain until it is
less than or equal to all of the other hosts.
* *not all weights 1*: If any host in the cluster has a load balancing weight greater than 1, the
load balancer shifts into a mode where it uses a weighted round robin schedule in which weights
are dynamically adjusted based on the host's request load at the time of selection (weight is
divided by the current active request count. For example, a host with weight 2 and an active
request count of 4 will have a synthetic weight of 2 / 4 = 0.5). This algorithm provides good
balance at steady state but may not adapt to load imbalance as quickly. Additionally, unlike P2C,
a host will never truly drain, though it will receive fewer requests over time.

.. note::
If all weights are not 1, but are the same (e.g., 42), Envoy will still use the weighted round
robin schedule instead of P2C.
* *all weights not equal*: If two or more hosts in the cluster have different load balancing
weights, the load balancer shifts into a mode where it uses a weighted round robin schedule in
which weights are dynamically adjusted based on the host's request load at the time of selection
(weight is divided by the current active request count. For example, a host with weight 2 and an
active request count of 4 will have a synthetic weight of 2 / 4 = 0.5). This algorithm provides
good balance at steady state but may not adapt to load imbalance as quickly. Additionally, unlike
P2C, a host will never truly drain, though it will receive fewer requests over time.

.. _arch_overview_load_balancing_types_ring_hash:

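To make the two modes described in the least-request docs above concrete, here is a minimal, self-contained sketch. It is not Envoy's implementation: the Host struct and picker functions are invented for illustration, and the weighted mode uses a proportional random draw instead of the EDF schedule Envoy actually uses.

#include <cstddef>
#include <cstdint>
#include <random>
#include <vector>

struct Host {
  uint32_t weight;
  uint32_t active_requests;
};

// Equal weights: P2C. Pick two random candidates (2 is Envoy's default choice
// count) and keep the one with fewer active requests. Assumes a non-empty
// host list.
const Host& pickP2C(const std::vector<Host>& hosts, std::mt19937& rng) {
  std::uniform_int_distribution<size_t> dist(0, hosts.size() - 1);
  const Host& a = hosts[dist(rng)];
  const Host& b = hosts[dist(rng)];
  return a.active_requests <= b.active_requests ? a : b;
}

// Unequal weights: each host's configured weight is scaled down by its load
// at pick time (+1 avoids division by zero), and the pick is proportional to
// the resulting synthetic weights.
const Host& pickWeighted(const std::vector<Host>& hosts, std::mt19937& rng) {
  std::vector<double> synthetic;
  synthetic.reserve(hosts.size());
  double total = 0.0;
  for (const Host& h : hosts) {
    synthetic.push_back(static_cast<double>(h.weight) / (h.active_requests + 1));
    total += synthetic.back();
  }
  std::uniform_real_distribution<double> dist(0.0, total);
  double target = dist(rng);
  for (size_t i = 0; i < hosts.size(); ++i) {
    if ((target -= synthetic[i]) <= 0.0) {
      return hosts[i];
    }
  }
  return hosts.back();
}

Under this scaling a host with weight 2 and 4 active requests competes with a synthetic weight of 2 / (4 + 1) = 0.4; the docs example above omits the +1 for simplicity, while the load balancer code adds it to avoid division by zero.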
1 change: 1 addition & 0 deletions docs/root/intro/version_history.rst
@@ -24,6 +24,7 @@ Version history
* tls: added verification of IP address SAN fields in certificates against configured SANs in the
certificate validation context.
* upstream: added network filter chains to upstream connections, see :ref:`filters<envoy_api_field_Cluster.filters>`.
* upstream: use p2c to select hosts for least-requests load balancers if all host weights are the same, even in cases where weights are not equal to 1.
* zookeeper: parse responses and emit latency stats.

1.11.0 (July 11, 2019)
43 changes: 32 additions & 11 deletions source/common/upstream/load_balancer_impl.cc
@@ -50,6 +50,20 @@ std::pair<int32_t, size_t> distributeLoad(PriorityLoad& per_priority_load,
return {first_available_priority, total_load};
}

// Returns true if the weights of all the hosts in the HostVector are equal.
bool hostWeightsAreEqual(const HostVector& hosts) {
if (hosts.size() <= 1) {
return true;
}
const uint32_t weight = hosts[0]->weight();
for (size_t i = 1; i < hosts.size(); ++i) {
if (hosts[i]->weight() != weight) {
return false;
}
}
return true;
}

} // namespace

std::pair<uint32_t, LoadBalancerBase::HostAvailability>
@@ -629,6 +643,16 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) {
auto& scheduler = scheduler_[source] = Scheduler{};
refreshHostSource(source);

// Check if the original host weights are equal and skip EDF creation if they are. When all
// original weights are equal we can rely on unweighted host pick to do optimal round robin and
// least-loaded host selection with lower memory and CPU overhead.
if (hostWeightsAreEqual(hosts)) {
// Skip edf creation.
return;
}

scheduler.edf_ = std::make_unique<EdfScheduler<const Host>>();

// Populate scheduler with host list.
// TODO(mattklein123): We must build the EDF schedule even if all of the hosts are currently
// weighted 1. This is because currently we don't refresh host sets if only weights change.
@@ -639,16 +663,16 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) {
// notification, this will only be stale until this host is next picked,
// at which point it is reinserted into the EdfScheduler with its new
// weight in chooseHost().
scheduler.edf_.add(hostWeight(*host), host);
scheduler.edf_->add(hostWeight(*host), host);
}

// Cycle through hosts to achieve the intended offset behavior.
// TODO(htuch): Consider how we can avoid biasing towards earlier hosts in the schedule across
// refreshes for the weighted case.
if (!hosts.empty()) {
for (uint32_t i = 0; i < seed_ % hosts.size(); ++i) {
auto host = scheduler.edf_.pick();
scheduler.edf_.add(hostWeight(*host), host);
auto host = scheduler.edf_->pick();
scheduler.edf_->add(hostWeight(*host), host);
}
}
};
@@ -684,15 +708,12 @@ HostConstSharedPtr EdfLoadBalancerBase::chooseHostOnce(LoadBalancerContext* cont

// As has been commented in both EdfLoadBalancerBase::refresh and
// BaseDynamicClusterImpl::updateDynamicHostList, we must do a runtime pivot here to determine
// whether to use EDF or do unweighted (fast) selection.
// TODO(mattklein123): As commented elsewhere, this is wasteful, and we should just refresh the
// host set if any weights change. Additionally, it has the property that if all weights are
// the same but not 1 (like 42), we will use the EDF schedule not the unweighted pick. This is
// not optimal. If this is fixed, remove the note in the arch overview docs for the LR LB.
if (stats_.max_host_weight_.value() != 1) {
auto host = scheduler.edf_.pick();
// whether to use EDF or do unweighted (fast) selection. EDF is non-null iff the original weights
// of 2 or more hosts differ.
if (scheduler.edf_ != nullptr) {
auto host = scheduler.edf_->pick();
if (host != nullptr) {
scheduler.edf_.add(hostWeight(*host), host);
scheduler.edf_->add(hostWeight(*host), host);
}
return host;
} else {
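The refresh() hunk above fills an EdfScheduler with each host's current hostWeight, and chooseHostOnce() then alternates pick() and add() against it. For readers unfamiliar with the data structure, here is a conceptual sketch of an earliest-deadline-first scheduler; it mirrors the add()/pick() shape used above but is not Envoy's EdfScheduler interface or implementation.

#include <cstdint>
#include <memory>
#include <queue>
#include <tuple>
#include <vector>

// Conceptual EDF scheduler: each entry is queued with a deadline of
// "virtual now + 1 / weight", so heavier entries come due more often.
template <class T> class SimpleEdf {
public:
  void add(double weight, std::shared_ptr<T> entry) {
    queue_.push(Item{current_time_ + 1.0 / weight, order_++, std::move(entry)});
  }

  // Returns the entry with the earliest deadline, or nullptr if empty. The
  // caller re-add()s the entry with its (possibly updated) weight, exactly as
  // chooseHostOnce() does above.
  std::shared_ptr<T> pick() {
    if (queue_.empty()) {
      return nullptr;
    }
    Item item = queue_.top();
    queue_.pop();
    current_time_ = item.deadline; // Advance the virtual clock.
    return item.entry;
  }

private:
  struct Item {
    double deadline;
    uint64_t order; // Tie-breaker: equal deadlines are served in insertion order.
    std::shared_ptr<T> entry;
    bool operator>(const Item& other) const {
      return std::tie(deadline, order) > std::tie(other.deadline, other.order);
    }
  };
  std::priority_queue<Item, std::vector<Item>, std::greater<Item>> queue_;
  double current_time_{0.0};
  uint64_t order_{0};
};

When every host has the same weight, every deadline advances in lockstep and the schedule degenerates into plain round robin, which is why this change can skip building it and fall back to the cheaper unweighted pick.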
18 changes: 9 additions & 9 deletions source/common/upstream/load_balancer_impl.h
@@ -346,8 +346,10 @@ class EdfLoadBalancerBase : public ZoneAwareLoadBalancerBase {

protected:
struct Scheduler {
// EdfScheduler for weighted LB.
EdfScheduler<const Host> edf_;
// EdfScheduler for weighted LB. The edf_ is only created when the original
// host weights of 2 or more hosts differ. When not present, the
// implementation of chooseHostOnce falls back to unweightedHostPick.
std::unique_ptr<EdfScheduler<const Host>> edf_;
Contributor:
Is it possible to use absl::optional here instead? It seems to be a slightly better fit semantically.

Contributor (Author):
I can, but I want to get your opinion about something first:
When using std::unique_ptr, memory usage reported by stats_integration_test.cc is 42838 bytes. With absl::optional, memory usage goes up to 43126.

Relevant diffs:
diff --git a/source/common/upstream/load_balancer_impl.cc b/source/common/upstream/load_balancer_impl.cc
index 79bf87b79..a1dcd8167 100644
--- a/source/common/upstream/load_balancer_impl.cc
+++ b/source/common/upstream/load_balancer_impl.cc
@@ -651,7 +651,7 @@ void EdfLoadBalancerBase::refresh(uint32_t priority) {
       return;
     }
 
-    scheduler.edf_ = std::make_unique<EdfScheduler<const Host>>();
+    scheduler.edf_ = EdfScheduler<const Host>();
 
     // Populate scheduler with host list.
     // TODO(mattklein123): We must build the EDF schedule even if all of the hosts are currently
@@ -710,7 +710,7 @@ HostConstSharedPtr EdfLoadBalancerBase::chooseHostOnce(LoadBalancerContext* cont
   // BaseDynamicClusterImpl::updateDynamicHostList, we must do a runtime pivot here to determine
   // whether to use EDF or do unweighted (fast) selection. EDF is non-null iff the original weights
   // of 2 or more hosts differ.
-  if (scheduler.edf_ != nullptr) {
+  if (scheduler.edf_.has_value()) {
     auto host = scheduler.edf_->pick();
     if (host != nullptr) {
       scheduler.edf_->add(hostWeight(*host), host);
diff --git a/source/common/upstream/load_balancer_impl.h b/source/common/upstream/load_balancer_impl.h
index 98fd0c75d..5dfd91888 100644
--- a/source/common/upstream/load_balancer_impl.h
+++ b/source/common/upstream/load_balancer_impl.h
@@ -349,7 +349,7 @@ protected:
     // EdfScheduler for weighted LB. The edf_ is only created when the original
     // host weights of 2 or more hosts differ. When not present, the
     // implementation of chooseHostOnce falls back to unweightedHostPick.
-    std::unique_ptr<EdfScheduler<const Host>> edf_;
+    absl::optional<EdfScheduler<const Host>> edf_;
   };
 
   void initialize();
diff --git a/test/integration/stats_integration_test.cc b/test/integration/stats_integration_test.cc
index ceadf07ba..e8f958373 100644
--- a/test/integration/stats_integration_test.cc
+++ b/test/integration/stats_integration_test.cc
@@ -212,7 +212,7 @@ TEST_P(ClusterMemoryTestRunner, MemoryLargeClusterSizeWithStats) {
   // 2019/07/06  7477     42742       43000   fork gauge representation to drop pending_increment_
   // 2019/07/15  7555     42806       43000   static link libstdc++ in tests
   // 2019/07/24  7503     43030       44000   add upstream filters to clusters
-  // 2019/08/13  7877     42838       44000   skip EdfScheduler creation if all host weights equal
+  // 2019/08/13  7877     43126       44000   skip EdfScheduler creation if all host weights equal
 
   // Note: when adjusting this value: EXPECT_MEMORY_EQ is active only in CI
   // 'release' builds, where we control the platform and tool-chain. So you
@@ -222,7 +222,7 @@ TEST_P(ClusterMemoryTestRunner, MemoryLargeClusterSizeWithStats) {
   // On a local clang8/libstdc++/linux flow, the memory usage was observed in
   // June 2019 to be 64 bytes higher than it is in CI/release. Your mileage may
   // vary.
-  EXPECT_MEMORY_EQ(m_per_cluster, 42838); // 104 bytes higher than a debug build.
+  EXPECT_MEMORY_EQ(m_per_cluster, 43126); // 104 bytes higher than a debug build.
   EXPECT_MEMORY_LE(m_per_cluster, 44000);
 }

Contributor:

I think the memory savings make using a unique_ptr here the more sensible choice.
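For context on the numbers above: a std::unique_ptr member is a single pointer per Scheduler and costs nothing extra when no schedule is built, while absl::optional embeds the full EdfScheduler inline plus an engaged flag, so every Scheduler pays the object's footprint even when creation is skipped. A small sketch, assuming Abseil is available; FakeScheduler and the byte counts in the comments are illustrative, not Envoy's types:

#include <cstdio>
#include <memory>
#include "absl/types/optional.h"

// Stand-in with roughly the footprint of a small container-based scheduler.
struct FakeScheduler {
  void* internals[3];
  double current_time;
};

int main() {
  // unique_ptr stores only a pointer; optional stores the whole object inline
  // plus an "engaged" flag, so the empty case is much larger.
  std::printf("unique_ptr: %zu bytes\n", sizeof(std::unique_ptr<FakeScheduler>)); // typically 8
  std::printf("optional:   %zu bytes\n", sizeof(absl::optional<FakeScheduler>));  // typically 40
  return 0;
}

That per-instance difference, multiplied across the per-host-source scheduler_ map, is consistent with the roughly 288 bytes per cluster (43126 - 42838) measured by stats_integration_test.cc.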

};

void initialize();
@@ -408,12 +410,12 @@ class RoundRobinLoadBalancer : public EdfLoadBalancerBase {
/**
* Weighted Least Request load balancer.
*
* In a normal setup when all hosts have the same weight of 1 it randomly picks up N healthy hosts
* In a normal setup when all hosts have the same weight it randomly picks up N healthy hosts
* (where N is specified in the LB configuration) and compares number of active requests. Technique
* is based on http://www.eecs.harvard.edu/~michaelm/postscripts/mythesis.pdf and is known as P2C
* (power of two choices).
*
* When any hosts have a weight that is not 1, an RR EDF schedule is used. Host weight is scaled
* When hosts have different weights, an RR EDF schedule is used. Host weight is scaled
* by the number of active requests at pick/insert time. Thus, hosts will never fully drain as
* they would in normal P2C, though they will get picked less and less often. In the future, we
* can consider two alternate algorithms:
@@ -442,11 +444,9 @@ class LeastRequestLoadBalancer : public EdfLoadBalancerBase {
void refreshHostSource(const HostsSource&) override {}
double hostWeight(const Host& host) override {
// Here we scale host weight by the number of active requests at the time we do the pick. We
// always add 1 to avoid division by 0. Note that if all weights are 1, the EDF schedule is
// unlikely to yield the same result as P2C given the lack of randomness as well as the fact
// that hosts are always picked, regardless of their current request load at the time of pick.
// It might be possible to do better by picking two hosts off of the schedule, and selecting
// the one with fewer active requests at the time of selection.
// always add 1 to avoid division by 0. It might be possible to do better by picking two hosts
// off of the schedule, and selecting the one with fewer active requests at the time of
// selection.
// TODO(mattklein123): @htuch brings up the point that how we are scaling weight here might not
// be the only/best way of doing this. Essentially, it makes weight and active requests equally
// important. Are they equally important in practice? There is no right answer here and we might
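For reference, the scaling described in the hostWeight comment above boils down to a one-line computation. The sketch below uses a hypothetical HostLike type; the accessor names are stand-ins for illustration, not Envoy's Host interface:

#include <cstdint>

// Hypothetical host type for illustration only.
struct HostLike {
  uint32_t weight() const { return weight_; }
  uint32_t activeRequests() const { return active_requests_; }
  uint32_t weight_{1};
  uint32_t active_requests_{0};
};

// Configured weight divided by current load, with +1 to avoid division by
// zero on idle hosts, as the comment above describes.
double scaledHostWeight(const HostLike& host) {
  return static_cast<double>(host.weight()) / (host.activeRequests() + 1);
}

With this scaling, weight and active requests are treated as equally important, which is exactly the trade-off the TODO above questions.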
1 change: 1 addition & 0 deletions test/common/upstream/BUILD
@@ -333,6 +333,7 @@ envoy_cc_binary(
"benchmark",
],
deps = [
"//source/common/memory:stats_lib",
"//source/common/upstream:maglev_lb_lib",
"//source/common/upstream:ring_hash_lb_lib",
"//source/common/upstream:upstream_lib",
68 changes: 62 additions & 6 deletions test/common/upstream/load_balancer_benchmark.cc
@@ -2,6 +2,7 @@

#include <memory>

#include "common/memory/stats.h"
#include "common/runtime/runtime_impl.h"
#include "common/upstream/maglev_lb.h"
#include "common/upstream/ring_hash_lb.h"
@@ -27,15 +28,18 @@ class BaseTester {
hosts.push_back(makeTestHost(info_, fmt::format("tcp://10.0.{}.{}:6379", i / 256, i % 256),
should_weight ? weight : 1));
}
HostVectorConstSharedPtr updated_hosts{new HostVector(hosts)};
priority_set_.updateHosts(
0,
updateHostsParams(updated_hosts, nullptr,
std::make_shared<const HealthyHostVector>(*updated_hosts), nullptr),
{}, hosts, {}, absl::nullopt);

HostVectorConstSharedPtr updated_hosts = std::make_shared<HostVector>(hosts);
HostsPerLocalityConstSharedPtr hosts_per_locality = makeHostsPerLocality({hosts});
priority_set_.updateHosts(0, HostSetImpl::partitionHosts(updated_hosts, hosts_per_locality), {},
hosts, {}, absl::nullopt);
local_priority_set_.updateHosts(0,
HostSetImpl::partitionHosts(updated_hosts, hosts_per_locality),
{}, hosts, {}, absl::nullopt);
}

PrioritySetImpl priority_set_;
PrioritySetImpl local_priority_set_;
Stats::IsolatedStoreImpl stats_store_;
ClusterStats stats_{ClusterInfoImpl::generateStats(stats_store_)};
NiceMock<Runtime::MockLoader> runtime_;
@@ -44,6 +48,58 @@ class BaseTester {
std::shared_ptr<MockClusterInfo> info_{new NiceMock<MockClusterInfo>()};
};

class RoundRobinTester : public BaseTester {
public:
RoundRobinTester(uint64_t num_hosts, uint32_t weighted_subset_percent = 0, uint32_t weight = 0)
: BaseTester(num_hosts, weighted_subset_percent, weight) {}

void initialize() {
lb_ = std::make_unique<RoundRobinLoadBalancer>(priority_set_, &local_priority_set_, stats_,
runtime_, random_, common_config_);
}

std::unique_ptr<RoundRobinLoadBalancer> lb_;
};

void BM_RoundRobinLoadBalancerBuild(benchmark::State& state) {
for (auto _ : state) {
state.PauseTiming();
const uint64_t num_hosts = state.range(0);
const uint64_t weighted_subset_percent = state.range(1);
const uint64_t weight = state.range(2);

RoundRobinTester tester(num_hosts, weighted_subset_percent, weight);
const size_t start_mem = Memory::Stats::totalCurrentlyAllocated();

// We are only interested in timing the initial build.
state.ResumeTiming();
tester.initialize();
state.PauseTiming();
const size_t end_mem = Memory::Stats::totalCurrentlyAllocated();
state.counters["memory"] = end_mem - start_mem;
state.counters["memory_per_host"] = (end_mem - start_mem) / num_hosts;
state.ResumeTiming();
}
}
BENCHMARK(BM_RoundRobinLoadBalancerBuild)
->Args({1, 0, 1})
->Args({500, 0, 1})
->Args({500, 50, 50})
->Args({500, 100, 50})
->Args({2500, 0, 1})
->Args({2500, 50, 50})
->Args({2500, 100, 50})
->Args({10000, 0, 1})
->Args({10000, 50, 50})
->Args({10000, 100, 50})
->Args({25000, 0, 1})
->Args({25000, 50, 50})
->Args({25000, 100, 50})
->Args({50000, 0, 1})
->Args({50000, 50, 50})
->Args({50000, 100, 50})
->Unit(benchmark::kMillisecond);

class RingHashTester : public BaseTester {
public:
RingHashTester(uint64_t num_hosts, uint64_t min_ring_size) : BaseTester(num_hosts) {
4 changes: 2 additions & 2 deletions test/common/upstream/load_balancer_impl_test.cc
@@ -1229,15 +1229,15 @@ TEST_P(LeastRequestLoadBalancerTest, SingleHost) {

// Host weight is 100.
{
EXPECT_CALL(random_, random()).WillOnce(Return(0));
EXPECT_CALL(random_, random()).WillOnce(Return(0)).WillOnce(Return(2)).WillOnce(Return(3));
stats_.max_host_weight_.set(100UL);
EXPECT_EQ(hostSet().healthy_hosts_[0], lb_.chooseHost(nullptr));
}

HostVector empty;
{
hostSet().runCallbacks(empty, empty);
EXPECT_CALL(random_, random()).WillOnce(Return(0));
EXPECT_CALL(random_, random()).WillOnce(Return(0)).WillOnce(Return(2)).WillOnce(Return(3));
EXPECT_EQ(hostSet().healthy_hosts_[0], lb_.chooseHost(nullptr));
}

3 changes: 2 additions & 1 deletion test/integration/stats_integration_test.cc
@@ -212,6 +212,7 @@ TEST_P(ClusterMemoryTestRunner, MemoryLargeClusterSizeWithStats) {
// 2019/07/06 7477 42742 43000 fork gauge representation to drop pending_increment_
// 2019/07/15 7555 42806 43000 static link libstdc++ in tests
// 2019/07/24 7503 43030 44000 add upstream filters to clusters
// 2019/08/13 7877 42838 44000 skip EdfScheduler creation if all host weights equal

// Note: when adjusting this value: EXPECT_MEMORY_EQ is active only in CI
// 'release' builds, where we control the platform and tool-chain. So you
@@ -221,7 +222,7 @@ TEST_P(ClusterMemoryTestRunner, MemoryLargeClusterSizeWithStats) {
// On a local clang8/libstdc++/linux flow, the memory usage was observed in
// June 2019 to be 64 bytes higher than it is in CI/release. Your mileage may
// vary.
EXPECT_MEMORY_EQ(m_per_cluster, 43030); // 104 bytes higher than a debug build.
EXPECT_MEMORY_EQ(m_per_cluster, 42838); // 104 bytes higher than a debug build.
EXPECT_MEMORY_LE(m_per_cluster, 44000);
}
