From f2938395528d0ed6e88b33f7cce2f3dfb057d73d Mon Sep 17 00:00:00 2001
From: Roman Lebedev <lebedev.ri@gmail.com>
Date: Tue, 1 Jun 2021 20:16:11 +0300
Subject: [PATCH] Random interleaving of benchmark repetitions - the sequel
 (fixes #1051)

Based on the original implementation by Hai Huang (@haih-g) from
https://github.com/google/benchmark/pull/1105.
---
 README.md                                   |   2 +
 docs/random_interleaving.md                 |  13 ++
 include/benchmark/benchmark.h               |  13 ++
 src/benchmark.cc                            |  71 +++++++++--
 src/benchmark_api_internal.h                |   2 -
 src/benchmark_register.cc                   |   1 -
 src/benchmark_runner.cc                     |  48 ++++----
 src/benchmark_runner.h                      |  26 ++--
 test/CMakeLists.txt                         |   1 +
 test/benchmark_random_interleaving_gtest.cc | 125 ++++++++++++++++++++
 10 files changed, 254 insertions(+), 48 deletions(-)
 create mode 100644 docs/random_interleaving.md
 create mode 100644 test/benchmark_random_interleaving_gtest.cc

diff --git a/README.md b/README.md
index 7947229499..aa61cef1b1 100644
--- a/README.md
+++ b/README.md
@@ -299,6 +299,8 @@ too (`-lkstat`).
 
 [Setting the Time Unit](#setting-the-time-unit)
 
+[Random Interleaving](docs/random_interleaving.md)
+
 [User-Requested Performance Counters](docs/perf_counters.md)
 
 [Preventing Optimization](#preventing-optimization)
diff --git a/docs/random_interleaving.md b/docs/random_interleaving.md
new file mode 100644
index 0000000000..c083036841
--- /dev/null
+++ b/docs/random_interleaving.md
@@ -0,0 +1,13 @@
+
+# Random Interleaving
+
+[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
+technique to lower run-to-run variance. It randomly interleaves repetitions of
+a microbenchmark with repetitions of other microbenchmarks in the same
+benchmark test. Data shows it is able to lower run-to-run variance by
+[40%](https://github.com/google/benchmark/issues/1051) on average.
+
+To use it, set `--benchmark_enable_random_interleaving=true`. Optionally,
+specify a non-zero repetition count (`--benchmark_repetitions=9`) and
+decrease the per-repetition time (`--benchmark_min_time=0.1`).
diff --git a/include/benchmark/benchmark.h b/include/benchmark/benchmark.h
index 290c5c5011..9b5480244d 100644
--- a/include/benchmark/benchmark.h
+++ b/include/benchmark/benchmark.h
@@ -1472,6 +1472,19 @@ class BenchmarkReporter {
     int64_t max_bytes_used;
   };
 
+  struct PerFamilyRunReports {
+    PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}
+
+    // How many runs will all instances of this benchmark perform?
+    int num_runs_total;
+
+    // How many runs have happened already?
+    int num_runs_done;
+
+    // The reports about (non-erroneous!) runs of this family.
+    std::vector<BenchmarkReporter::Run> Runs;
+  };
+
   // Construct a BenchmarkReporter with the output stream set to 'std::cout'
   // and the error stream set to 'std::cerr'
   BenchmarkReporter();
diff --git a/src/benchmark.cc b/src/benchmark.cc
index c4392bfdcf..a726ef8af2 100644
--- a/src/benchmark.cc
+++ b/src/benchmark.cc
@@ -33,8 +33,10 @@
 #include <fstream>
 #include <iostream>
 #include <limits>
+#include <map>
 #include <memory>
+#include <random>
 #include <string>
 #include <thread>
 #include <utility>
@@ -73,6 +75,10 @@ DEFINE_double(benchmark_min_time, 0.5);
 // standard deviation of the runs will be reported.
 DEFINE_int32(benchmark_repetitions, 1);
 
+// If set, enable random interleaving of repetitions of all benchmarks.
+// See http://github.com/google/benchmark/issues/1051 for details.
+DEFINE_bool(benchmark_enable_random_interleaving, false);
+
 // Report the result of each benchmark repetition. When 'true' is specified
 // only the mean, standard deviation, and other statistics are reported for
 // repeated benchmarks. Affects all reporters.
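[Editor's note: illustration, not part of the patch] To make the documented workflow concrete, here is a minimal sketch of a user-side benchmark exercising the new flag. The benchmark and binary names are invented for illustration; only the flag names come from this patch.

```cpp
#include <vector>

#include "benchmark/benchmark.h"

// Any registered benchmark participates in interleaving;
// this one exists purely for illustration.
static void BM_VectorPushBack(benchmark::State& state) {
  for (auto _ : state) {
    std::vector<int> v;
    v.push_back(42);
    benchmark::DoNotOptimize(v.data());
  }
}
BENCHMARK(BM_VectorPushBack);

BENCHMARK_MAIN();

// Build as usual, then run with (hypothetical binary name "mybench"):
//   ./mybench --benchmark_enable_random_interleaving=true \
//             --benchmark_repetitions=9 --benchmark_min_time=0.1
// Repetitions of all selected benchmarks are then shuffled into one
// global execution order instead of running back-to-back.
```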
@@ -297,23 +303,69 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
   context.name_field_width = name_field_width;
 
   // Keep track of running times of all instances of each benchmark family.
-  std::map<int /*family_index*/, std::vector<BenchmarkReporter::Run>>
-      complexity_reports;
+  std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
+      per_family_reports;
 
   if (display_reporter->ReportContext(context) &&
       (!file_reporter || file_reporter->ReportContext(context))) {
     FlushStreams(display_reporter);
     FlushStreams(file_reporter);
 
+    size_t num_repetitions_total = 0;
+
+    std::vector<internal::BenchmarkRunner> runners;
+    runners.reserve(benchmarks.size());
     for (const BenchmarkInstance& benchmark : benchmarks) {
-      std::vector<BenchmarkReporter::Run>* complexity_reports_for_family =
-          nullptr;
+      BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
       if (benchmark.complexity() != oNone)
-        complexity_reports_for_family =
-            &complexity_reports[benchmark.family_index()];
+        reports_for_family = &per_family_reports[benchmark.family_index()];
+
+      runners.emplace_back(benchmark, reports_for_family);
+      int num_repeats_of_this_instance = runners.back().GetNumRepeats();
+      num_repetitions_total += num_repeats_of_this_instance;
+      if (reports_for_family)
+        reports_for_family->num_runs_total += num_repeats_of_this_instance;
+    }
+    assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
+
+    std::vector<size_t> repetition_indices;
+    repetition_indices.reserve(num_repetitions_total);
+    for (size_t runner_index = 0, num_runners = runners.size();
+         runner_index != num_runners; ++runner_index) {
+      const internal::BenchmarkRunner& runner = runners[runner_index];
+      std::fill_n(std::back_inserter(repetition_indices),
+                  runner.GetNumRepeats(), runner_index);
+    }
+    assert(repetition_indices.size() == num_repetitions_total &&
+           "Unexpected number of repetition indexes.");
+
+    if (FLAGS_benchmark_enable_random_interleaving) {
+      std::random_device rd;
+      std::mt19937 g(rd());
+      std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
+    }
 
-      RunResults run_results =
-          RunBenchmark(benchmark, complexity_reports_for_family);
+    for (size_t repetition_index : repetition_indices) {
+      internal::BenchmarkRunner& runner = runners[repetition_index];
+      runner.DoOneRepetition();
+      if (runner.HasRepeatsRemaining()) continue;
+
+      // FIXME: report each repetition separately, not all of them in bulk.
+
+      RunResults run_results = runner.GetResults();
+
+      // Maybe calculate complexity report
+      if (BenchmarkReporter::PerFamilyRunReports* reports_for_family =
+              runner.GetReportsForFamily()) {
+        if (reports_for_family->num_runs_done ==
+            reports_for_family->num_runs_total) {
+          auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
+          run_results.aggregates_only.insert(run_results.aggregates_only.end(),
+                                             additional_run_stats.begin(),
+                                             additional_run_stats.end());
+          per_family_reports.erase(
+              (int)reports_for_family->Runs.front().family_index);
+        }
+      }
 
       Report(display_reporter, file_reporter, run_results);
     }
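[Editor's note: illustration, not part of the patch] The scheduling logic above is compact; this standalone model (not library code) shows what it computes: one slot per repetition, shuffled only when the flag is on.

```cpp
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <iterator>
#include <random>
#include <vector>

int main() {
  const std::size_t num_runners = 3;  // stand-ins for BenchmarkRunner objects
  const std::size_t repeats = 3;      // as from --benchmark_repetitions=3

  // One entry per repetition, holding the owning runner's index.
  // Without interleaving the schedule is 0 0 0 1 1 1 2 2 2.
  std::vector<std::size_t> repetition_indices;
  repetition_indices.reserve(num_runners * repeats);
  for (std::size_t runner = 0; runner < num_runners; ++runner)
    std::fill_n(std::back_inserter(repetition_indices), repeats, runner);

  // --benchmark_enable_random_interleaving=true shuffles the whole schedule.
  std::random_device rd;
  std::mt19937 g(rd());
  std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);

  // Possible output: "1 0 2 2 0 1 1 2 0" - each runner still executes
  // exactly `repeats` times, just not back-to-back.
  for (std::size_t index : repetition_indices) std::cout << index << ' ';
  std::cout << '\n';
}
```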
@@ -471,6 +523,7 @@ void PrintUsageAndExit() {
           " [--benchmark_filter=<regex>]\n"
           " [--benchmark_min_time=<min_time>]\n"
           " [--benchmark_repetitions=<num_repetitions>]\n"
+          " [--benchmark_enable_random_interleaving={true|false}]\n"
           " [--benchmark_report_aggregates_only={true|false}]\n"
           " [--benchmark_display_aggregates_only={true|false}]\n"
           " [--benchmark_format=<console|json|csv>]\n"
@@ -495,6 +548,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
                         &FLAGS_benchmark_min_time) ||
         ParseInt32Flag(argv[i], "benchmark_repetitions",
                        &FLAGS_benchmark_repetitions) ||
+        ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
+                      &FLAGS_benchmark_enable_random_interleaving) ||
         ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
                       &FLAGS_benchmark_report_aggregates_only) ||
         ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
diff --git a/src/benchmark_api_internal.h b/src/benchmark_api_internal.h
index e2afbd833c..9296b7d2c8 100644
--- a/src/benchmark_api_internal.h
+++ b/src/benchmark_api_internal.h
@@ -39,8 +39,6 @@ class BenchmarkInstance {
   IterationCount iterations() const { return iterations_; }
   int threads() const { return threads_; }
 
-  bool last_benchmark_instance;
-
   State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
             internal::ThreadManager* manager,
             internal::PerfCountersMeasurement* perf_counters_measurement) const;
diff --git a/src/benchmark_register.cc b/src/benchmark_register.cc
index 435c48b4aa..574462220e 100644
--- a/src/benchmark_register.cc
+++ b/src/benchmark_register.cc
@@ -166,7 +166,6 @@ bool BenchmarkFamilies::FindBenchmarks(
         const auto full_name = instance.name().str();
         if ((re.Match(full_name) && !isNegativeFilter) ||
             (!re.Match(full_name) && isNegativeFilter)) {
-          instance.last_benchmark_instance = (&args == &family->args_.back());
           benchmarks->push_back(std::move(instance));
 
           ++per_family_instance_index;
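[Editor's note: illustration, not part of the patch] Why the removed `last_benchmark_instance` flag cannot survive interleaving: with a shuffled schedule, no instance reliably finishes last, so family completion has to be detected by counting. A minimal model of the bookkeeping that `PerFamilyRunReports` performs (names other than `num_runs_total`/`num_runs_done` are invented):

```cpp
#include <cassert>
#include <map>

// Mirrors the spirit of BenchmarkReporter::PerFamilyRunReports.
struct FamilyReports {
  int num_runs_total = 0;  // fixed up front, before anything runs
  int num_runs_done = 0;   // incremented as repetitions finish, in any order
};

int main() {
  std::map<int, FamilyReports> per_family;
  per_family[0].num_runs_total = 4;  // e.g. 2 instances x 2 repetitions

  // Repetitions complete in arbitrary interleaved order; the Big-O report
  // fires exactly when the whole family is done, order notwithstanding.
  for (int finished = 0; finished < 4; ++finished) {
    FamilyReports& family = per_family[0];
    ++family.num_runs_done;
    if (family.num_runs_done == family.num_runs_total) {
      // ...this is the point where ComputeBigO(...) runs in the patch...
      per_family.erase(0);
    }
  }
  assert(per_family.empty());
}
```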
diff --git a/src/benchmark_runner.cc b/src/benchmark_runner.cc
index 55d6cf15d2..6742d42dbe 100644
--- a/src/benchmark_runner.cc
+++ b/src/benchmark_runner.cc
@@ -143,9 +143,9 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
 
 BenchmarkRunner::BenchmarkRunner(
     const benchmark::internal::BenchmarkInstance& b_,
-    std::vector<BenchmarkReporter::Run>* complexity_reports_)
+    BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
     : b(b_),
-      complexity_reports(complexity_reports_),
+      reports_for_family(reports_for_family_),
       min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
       repeats(b.repetitions() != 0 ?
                   b.repetitions() : FLAGS_benchmark_repetitions),
@@ -172,22 +172,6 @@ BenchmarkRunner::BenchmarkRunner(
             perf_counters_measurement.IsValid())
         << "Perf counters were requested but could not be set up.";
   }
-
-  for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
-    DoOneRepetition(repetition_num);
-  }
-
-  // Calculate additional statistics
-  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
-
-  // Maybe calculate complexity report
-  if (complexity_reports && b.last_benchmark_instance) {
-    auto additional_run_stats = ComputeBigO(*complexity_reports);
-    run_results.aggregates_only.insert(run_results.aggregates_only.end(),
-                                       additional_run_stats.begin(),
-                                       additional_run_stats.end());
-    complexity_reports->clear();
-  }
 }
 
 BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
@@ -283,8 +267,10 @@ bool BenchmarkRunner::ShouldReportIterationResults(
          ((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
 }
 
-void BenchmarkRunner::DoOneRepetition(int64_t repetition_index) {
-  const bool is_the_first_repetition = repetition_index == 0;
+void BenchmarkRunner::DoOneRepetition() {
+  assert(HasRepeatsRemaining() && "Already done all repetitions?");
+
+  const bool is_the_first_repetition = num_repetitions_done == 0;
   IterationResults i;
 
   // We *may* be gradually increasing the length (iteration count)
@@ -337,19 +323,25 @@
   // Ok, now actually report.
   BenchmarkReporter::Run report =
       CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
-                      repetition_index, repeats);
+                      num_repetitions_done, repeats);
 
-  if (complexity_reports && !report.error_occurred)
-    complexity_reports->push_back(report);
+  if (reports_for_family) {
+    ++reports_for_family->num_runs_done;
+    if (!report.error_occurred) reports_for_family->Runs.push_back(report);
+  }
 
   run_results.non_aggregates.push_back(report);
+
+  ++num_repetitions_done;
 }
 
-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports) {
-  internal::BenchmarkRunner r(b, complexity_reports);
-  return r.get_results();
+RunResults&& BenchmarkRunner::GetResults() {
+  assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
+
+  // Calculate additional statistics over the repetitions of this instance.
+  run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
+
+  return std::move(run_results);
 }
 
 }  // end namespace internal
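[Editor's note: illustration, not part of the patch] The new `BenchmarkRunner` lifecycle, reduced to a skeleton under the assumptions visible in the patch: repetitions are driven one at a time from outside, and the accumulated results are moved out exactly once, after the last repetition. All names here are stand-ins.

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

struct Results {  // stands in for RunResults
  std::vector<std::string> non_aggregates;
};

class Runner {  // stands in for internal::BenchmarkRunner
 public:
  explicit Runner(int repeats) : repeats_(repeats) {}

  int GetNumRepeats() const { return repeats_; }
  bool HasRepeatsRemaining() const { return done_ != repeats_; }

  void DoOneRepetition() {
    assert(HasRepeatsRemaining() && "Already done all repetitions?");
    results_.non_aggregates.push_back("report #" + std::to_string(done_));
    ++done_;
  }

  // Valid to call exactly once, and only after the final repetition:
  // the stored results are moved out, not copied.
  Results&& GetResults() {
    assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
    return std::move(results_);
  }

 private:
  Results results_;
  const int repeats_;
  int done_ = 0;
};

int main() {
  Runner r(3);
  // In the library, these calls may be interleaved with other runners' calls.
  while (r.HasRepeatsRemaining()) r.DoOneRepetition();
  Results res = r.GetResults();
  assert(res.non_aggregates.size() == 3);
}
```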
diff --git a/src/benchmark_runner.h b/src/benchmark_runner.h
index 9730ad386c..8a855236b2 100644
--- a/src/benchmark_runner.h
+++ b/src/benchmark_runner.h
@@ -50,20 +50,34 @@ struct RunResults {
 class BenchmarkRunner {
  public:
   BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
-                  std::vector<BenchmarkReporter::Run>* complexity_reports_);
+                  BenchmarkReporter::PerFamilyRunReports* reports_for_family);
 
-  RunResults&& get_results() { return std::move(run_results); }
+  int GetNumRepeats() const { return repeats; }
+
+  bool HasRepeatsRemaining() const {
+    return GetNumRepeats() != num_repetitions_done;
+  }
+
+  void DoOneRepetition();
+
+  RunResults&& GetResults();
+
+  BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
+    return reports_for_family;
+  }
 
  private:
   RunResults run_results;
 
   const benchmark::internal::BenchmarkInstance& b;
-  std::vector<BenchmarkReporter::Run>* complexity_reports;
+  BenchmarkReporter::PerFamilyRunReports* reports_for_family;
 
   const double min_time;
   const int repeats;
   const bool has_explicit_iteration_count;
 
+  int num_repetitions_done = 0;
+
   std::vector<std::thread> pool;
 
   IterationCount iters;  // preserved between repetitions!
@@ -83,14 +97,8 @@ class BenchmarkRunner {
   IterationCount PredictNumItersNeeded(const IterationResults& i) const;
 
   bool ShouldReportIterationResults(const IterationResults& i) const;
-
-  void DoOneRepetition(int64_t repetition_index);
 };
 
-RunResults RunBenchmark(
-    const benchmark::internal::BenchmarkInstance& b,
-    std::vector<BenchmarkReporter::Run>* complexity_reports);
-
 }  // namespace internal
 }  // end namespace benchmark
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 012f5a8bcd..79cdf53b40 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -199,6 +199,7 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
   add_gtest(benchmark_gtest)
   add_gtest(benchmark_name_gtest)
+  add_gtest(benchmark_random_interleaving_gtest)
   add_gtest(commandlineflags_gtest)
   add_gtest(statistics_gtest)
   add_gtest(string_util_gtest)
diff --git a/test/benchmark_random_interleaving_gtest.cc b/test/benchmark_random_interleaving_gtest.cc
new file mode 100644
index 0000000000..48c4efc98e
--- /dev/null
+++ b/test/benchmark_random_interleaving_gtest.cc
@@ -0,0 +1,125 @@
+#include <map>
+#include <queue>
+#include <string>
+#include <vector>
+
+#include "../src/commandlineflags.h"
+#include "../src/string_util.h"
+#include "benchmark/benchmark.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+DECLARE_bool(benchmark_enable_random_interleaving);
+DECLARE_string(benchmark_filter);
+DECLARE_int32(benchmark_repetitions);
+
+namespace benchmark {
+namespace internal {
+namespace {
+
+class EventQueue : public std::queue<std::string> {
+ public:
+  void Put(const std::string& event) { push(event); }
+
+  void Clear() {
+    while (!empty()) {
+      pop();
+    }
+  }
+
+  std::string Get() {
+    std::string event = front();
+    pop();
+    return event;
+  }
+};
+
+static EventQueue* queue = new EventQueue;
+
+class NullReporter : public BenchmarkReporter {
+ public:
+  bool ReportContext(const Context& /*context*/) override { return true; }
+  void ReportRuns(const std::vector<Run>& /* report */) override {}
+};
+
+class BenchmarkTest : public testing::Test {
+ public:
+  static void SetupHook(int /* num_threads */) { queue->push("Setup"); }
+
+  static void TeardownHook(int /* num_threads */) { queue->push("Teardown"); }
+
+  void Execute(const std::string& pattern) {
+    queue->Clear();
+
+    BenchmarkReporter* reporter = new NullReporter;
+    FLAGS_benchmark_filter = pattern;
+    RunSpecifiedBenchmarks(reporter);
+    delete reporter;
+
+    queue->Put("DONE");  // End marker
+  }
+};
+
+static void BM_Match1(benchmark::State& state) {
+  const int64_t arg = state.range(0);
+
+  for (auto _ : state) {
+  }
+  queue->Put(StrFormat("BM_Match1/%d", static_cast<int>(arg)));
+}
+BENCHMARK(BM_Match1)
+    ->Iterations(100)
+    ->Arg(1)
+    ->Arg(2)
+    ->Arg(3)
+    ->Range(10, 80)
+    ->Args({90})
+    ->Args({100});
+
+TEST_F(BenchmarkTest, Match1) {
+  Execute("BM_Match1");
+  ASSERT_EQ("BM_Match1/1", queue->Get());
+  ASSERT_EQ("BM_Match1/2", queue->Get());
+  ASSERT_EQ("BM_Match1/3", queue->Get());
+  ASSERT_EQ("BM_Match1/10", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/90", queue->Get());
+  ASSERT_EQ("BM_Match1/100", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRepetition) {
+  FLAGS_benchmark_repetitions = 2;
+
+  Execute("BM_Match1/(64|80)");
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/64", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("BM_Match1/80", queue->Get());
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+TEST_F(BenchmarkTest, Match1WithRandomInterleaving) {
+  FLAGS_benchmark_enable_random_interleaving = true;
+  FLAGS_benchmark_repetitions = 100;
+
+  std::map<std::string, int> interleaving_count;
+  const char* regexp = "BM_Match1/(64|80)";
+  Execute(regexp);
+  for (int i = 0; i < 100; ++i) {
+    std::vector<std::string> interleaving;
+    interleaving.push_back(queue->Get());
+    interleaving.push_back(queue->Get());
+    EXPECT_THAT(interleaving,
+                testing::UnorderedElementsAre(testing::MatchesRegex(regexp),
+                                              testing::MatchesRegex(regexp)));
+    interleaving_count[StrFormat("%s,%s", interleaving[0].c_str(),
+                                 interleaving[1].c_str())]++;
+  }
+  EXPECT_GE(interleaving_count.size(), 2) << "Interleaving was not randomized.";
+  ASSERT_EQ("DONE", queue->Get());
+}
+
+}  // namespace
+}  // namespace internal
+}  // namespace benchmark
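[Editor's note: back-of-the-envelope, not part of the patch] The final expectation, `EXPECT_GE(interleaving_count.size(), 2)`, is deliberately loose so the test cannot flake. If the shuffled schedule is modeled as a uniform random permutation of the 200 repetitions (100 per argument), then observing only a single repeated consecutive-pair pattern would require a perfectly alternating schedule, whose probability is

$$P = \frac{2\,(100!)^2}{200!} = \frac{2}{\binom{200}{100}} \approx 2.2\times10^{-59},$$

so a spurious failure is practically impossible.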