Some cache_bench enhancements (#11661)
Summary:
... used in validating some HyperClockCache development in progress.

* Revamp the "populate cache" step to avoid redundant insertions (very rare in practice) and to approach the desired resident_ratio more consistently, while maintaining appropriate skew (still not perfect).
* Track and print the hit ratio on lookups, to help verify that comparisons between implementations are fair.
* Add an option to disable tracking and printing histograms (which produce a lot of output).
* Add an option to specify a random seed (for better reproducibility).
* Remove the confusing/redundant "-skewed" option; key selection now always uses the "-skew" mechanism (see the sketch after this list).
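
For context on the "-skew" mechanism referenced above: taking the minimum of skew + 1 uniform random draws biases key selection toward the low end of the key space, and larger -skew values concentrate the workload onto a hotter set of keys. Below is a minimal standalone sketch of that idea (not part of this commit); it substitutes std::mt19937_64 and a plain modulo for RocksDB's Random64 and FastRange64, so its output will not match cache_bench exactly.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

// Standalone illustration only: mirrors the min-of-(skew+1)-draws idea used by
// cache_bench's KeyGen, with standard-library stand-ins for Random64 and
// FastRange64.
uint64_t SkewedKey(std::mt19937_64& rng, uint64_t max_key, uint32_t skew) {
  uint64_t raw = rng();
  // Each extra draw keeps the minimum, pushing the result toward zero.
  for (uint32_t i = 0; i < skew; ++i) {
    raw = std::min(raw, rng());
  }
  return raw % max_key;  // stand-in for FastRange64(raw, max_key)
}

int main() {
  std::mt19937_64 rng(42);
  const uint64_t max_key = 1000000;
  for (uint32_t skew : {0u, 5u}) {
    std::vector<uint64_t> deciles(10, 0);  // histogram over ten key deciles
    for (int i = 0; i < 1000000; ++i) {
      ++deciles[SkewedKey(rng, max_key, skew) * 10 / max_key];
    }
    printf("skew=%u decile counts:", skew);
    for (uint64_t c : deciles) {
      printf(" %llu", static_cast<unsigned long long>(c));
    }
    printf("\n");
  }
  return 0;
}

With skew=0 the deciles come out roughly uniform; with skew=5 roughly half of all selections fall in the lowest decile and almost none in the highest, which is what gives the benchmark its hot working set.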

Uses BitwiseAnd from #11660 (tested there)
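
As background on how the new seed flows into the caches (my reading of the diff below, not a description of the actual BitwiseAnd utility): the 32-bit -seed value is masked with INT32_MAX before being assigned to the cache options' hash_seed, keeping it non-negative on the assumption that hash_seed is a signed 32-bit field where negative values have special meaning. A rough plain-C++ equivalent of that step:

#include <cstdint>

// Rough stand-in for BitwiseAnd(FLAGS_seed, INT32_MAX) as used in the diff;
// drops the top bit of the user-provided seed so the result fits a
// non-negative int32_t.
int32_t HashSeedFromFlag(uint32_t seed_flag) {
  return static_cast<int32_t>(seed_flag & static_cast<uint32_t>(INT32_MAX));
}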

Pull Request resolved: #11661

Test Plan: manual

Reviewed By: akankshamahajan15, jowlyzhang

Differential Revision: D47937671

Pulled By: pdillinger

fbshipit-source-id: 85a2bb881b1bca4f63e015bac684105fd91c9f35
pdillinger authored and facebook-github-bot committed Aug 2, 2023
1 parent cf95821 commit f9de217
Showing 1 changed file with 116 additions and 59 deletions.
175 changes: 116 additions & 59 deletions cache/cache_bench_tool.cc
@@ -50,7 +50,7 @@ DEFINE_double(resident_ratio, 0.25,
DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");

DEFINE_uint32(skew, 5, "Degree of skew in key selection");
DEFINE_uint32(skew, 5, "Degree of skew in key selection. 0 = no skew");
DEFINE_bool(populate_cache, true, "Populate cache before operations");

DEFINE_uint32(lookup_insert_percent, 87,
@@ -71,7 +71,6 @@ DEFINE_uint32(

DEFINE_uint32(gather_stats_entries_per_lock, 256,
"For Cache::ApplyToAllEntries");
DEFINE_bool(skewed, false, "If true, skew the key access distribution");

DEFINE_bool(lean, false,
"If true, no additional computation is performed besides cache "
@@ -81,6 +80,11 @@ DEFINE_bool(early_exit, false,
"Exit before deallocating most memory. Good for malloc stats, e.g."
"MALLOC_CONF=\"stats_print:true\"");

DEFINE_bool(histograms, true,
"Whether to track and print histogram statistics.");

DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");

DEFINE_string(secondary_cache_uri, "",
"Full URI for creating a custom secondary cache object");
static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
@@ -149,9 +153,6 @@ class SharedState {
public:
explicit SharedState(CacheBench* cache_bench)
: cv_(&mu_),
num_initialized_(0),
start_(false),
num_done_(0),
cache_bench_(cache_bench) {}

~SharedState() {}
@@ -174,15 +175,27 @@ class SharedState {

bool Started() const { return start_; }

void AddLookupStats(uint64_t hits, uint64_t misses) {
MutexLock l(&mu_);
lookup_count_ += hits + misses;
lookup_hits_ += hits;
}

double GetLookupHitRatio() const {
return 1.0 * lookup_hits_ / lookup_count_;
}

private:
port::Mutex mu_;
port::CondVar cv_;

uint64_t num_initialized_;
bool start_;
uint64_t num_done_;

CacheBench* cache_bench_;

uint64_t num_initialized_ = 0;
bool start_ = false;
uint64_t num_done_ = 0;
uint64_t lookup_count_ = 0;
uint64_t lookup_hits_ = 0;
};

// Per-thread state for concurrent executions of the same benchmark.
@@ -194,27 +207,19 @@ struct ThreadState {
uint64_t duration_us = 0;

ThreadState(uint32_t index, SharedState* _shared)
: tid(index), rnd(1000 + index), shared(_shared) {}
: tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {}
};

struct KeyGen {
char key_data[27];

Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
uint64_t key = 0;
if (!FLAGS_skewed) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < FLAGS_skew; ++i) {
raw = std::min(raw, rnd.Next());
}
key = FastRange64(raw, max_key);
} else {
key = rnd.Skewed(max_log);
if (key > max_key) {
key -= max_key;
}
Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < skew; ++i) {
raw = std::min(raw, rnd.Next());
}
uint64_t key = FastRange64(raw, max_key);
// Variable size and alignment
size_t off = key % 8;
key_data[0] = char{42};
@@ -285,31 +290,25 @@ class CacheBench {
lookup_threshold_(insert_threshold_ +
kHundredthUint64 * FLAGS_lookup_percent),
erase_threshold_(lookup_threshold_ +
kHundredthUint64 * FLAGS_erase_percent),
skewed_(FLAGS_skewed) {
kHundredthUint64 * FLAGS_erase_percent) {
if (erase_threshold_ != 100U * kHundredthUint64) {
fprintf(stderr, "Percentages must add to 100.\n");
exit(1);
}

max_log_ = 0;
if (skewed_) {
uint64_t max_key = max_key_;
while (max_key >>= 1) max_log_++;
if (max_key > (static_cast<uint64_t>(1) << max_log_)) max_log_++;
}

if (FLAGS_cache_type == "clock_cache") {
fprintf(stderr, "Old clock cache implementation has been removed.\n");
exit(1);
} else if (FLAGS_cache_type == "hyper_clock_cache") {
cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes,
FLAGS_num_shard_bits)
.MakeSharedCache();
HyperClockCacheOptions opts(FLAGS_cache_size, FLAGS_value_bytes,
FLAGS_num_shard_bits);
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
cache_ = opts.MakeSharedCache();
} else if (FLAGS_cache_type == "lru_cache") {
LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
false /* strict_capacity_limit */,
0.5 /* high_pri_pool_ratio */);
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
if (!FLAGS_secondary_cache_uri.empty()) {
Status s = SecondaryCache::CreateFromString(
ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@@ -333,13 +332,50 @@ class CacheBench {
~CacheBench() {}

void PopulateCache() {
Random64 rnd(1);
Random64 rnd(FLAGS_seed);
KeyGen keygen;
for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_),
createValue(rnd), &helper1, FLAGS_value_bytes);
size_t max_occ = 0;
size_t inserts_since_max_occ_increase = 0;
size_t keys_since_last_not_found = 0;

// Avoid redundant insertions by checking Lookup before Insert.
// Loop until insertions consistently fail to increase max occupancy or
// it becomes difficult to find keys not already inserted.
while (inserts_since_max_occ_increase < 100 &&
keys_since_last_not_found < 100) {
Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew);

Cache::Handle* handle = cache_->Lookup(key);
if (handle != nullptr) {
cache_->Release(handle);
++keys_since_last_not_found;
continue;
}
keys_since_last_not_found = 0;

Status s =
cache_->Insert(key, createValue(rnd), &helper1, FLAGS_value_bytes);
assert(s.ok());

handle = cache_->Lookup(key);
if (!handle) {
fprintf(stderr, "Failed to lookup key just inserted.\n");
assert(false);
exit(42);
} else {
cache_->Release(handle);
}

size_t occ = cache_->GetOccupancyCount();
if (occ > max_occ) {
max_occ = occ;
inserts_since_max_occ_increase = 0;
} else {
++inserts_since_max_occ_increase;
}
}
printf("Population complete (%zu entries, %g average charge)\n", max_occ,
1.0 * FLAGS_cache_size / max_occ);
}

bool Run() {
@@ -398,18 +434,21 @@ class CacheBench {
FLAGS_ops_per_thread / elapsed_secs);
printf("Thread ops/sec = %u\n", ops_per_sec);

printf("\nOperation latency (ns):\n");
HistogramImpl combined;
for (uint32_t i = 0; i < FLAGS_threads; i++) {
combined.Merge(threads[i]->latency_ns_hist);
}
printf("%s", combined.ToString().c_str());
printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());

if (FLAGS_gather_stats) {
printf("\nGather stats latency (us):\n");
printf("%s", stats_hist.ToString().c_str());
}
if (FLAGS_histograms) {
printf("\nOperation latency (ns):\n");
HistogramImpl combined;
for (uint32_t i = 0; i < FLAGS_threads; i++) {
combined.Merge(threads[i]->latency_ns_hist);
}
printf("%s", combined.ToString().c_str());

if (FLAGS_gather_stats) {
printf("\nGather stats latency (us):\n");
printf("%s", stats_hist.ToString().c_str());
}
}
printf("\n%s", stats_report.c_str());

return true;
@@ -423,8 +462,6 @@ class CacheBench {
const uint64_t insert_threshold_;
const uint64_t lookup_threshold_;
const uint64_t erase_threshold_;
const bool skewed_;
int max_log_;

// A benchmark version of gathering stats on an active block cache by
// iterating over it. The primary purpose is to measure the impact of
@@ -494,13 +531,17 @@ class CacheBench {
// Something slightly more expensive as in stats by category
helpers.insert(helper);
};
timer.Start();
if (FLAGS_histograms) {
timer.Start();
}
Cache::ApplyToAllEntriesOptions opts;
opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
stats_hist->Add(timer.ElapsedNanos() / 1000);
if (FLAGS_histograms) {
stats_hist->Add(timer.ElapsedNanos() / 1000);
}
}
}

@@ -531,6 +572,8 @@ class CacheBench {
void OperateCache(ThreadState* thread) {
// To use looked-up values
uint64_t result = 0;
uint64_t lookup_misses = 0;
uint64_t lookup_hits = 0;
// To hold handles for a non-trivial amount of time
Cache::Handle* handle = nullptr;
KeyGen gen;
@@ -539,10 +582,12 @@ class CacheBench {
StopWatchNano timer(clock);

for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew);
uint64_t random_op = thread->rnd.Next();

timer.Start();
if (FLAGS_histograms) {
timer.Start();
}

if (random_op < lookup_insert_threshold_) {
if (handle) {
@@ -553,12 +598,14 @@ class CacheBench {
handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
Cache::Priority::LOW);
if (handle) {
++lookup_hits;
if (!FLAGS_lean) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else {
++lookup_misses;
// do insert
Status s = cache_->Insert(key, createValue(thread->rnd), &helper2,
FLAGS_value_bytes, &handle);
@@ -582,11 +629,14 @@ class CacheBench {
handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
Cache::Priority::LOW);
if (handle) {
++lookup_hits;
if (!FLAGS_lean) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else {
++lookup_misses;
}
} else if (random_op < erase_threshold_) {
// do erase
@@ -595,7 +645,10 @@ class CacheBench {
// Should be extremely unlikely (noop)
assert(random_op >= kHundredthUint64 * 100U);
}
thread->latency_ns_hist.Add(timer.ElapsedNanos());
if (FLAGS_histograms) {
thread->latency_ns_hist.Add(timer.ElapsedNanos());
}
thread->shared->AddLookupStats(lookup_hits, lookup_misses);
}
if (FLAGS_early_exit) {
MutexLock l(thread->shared->GetMutex());
@@ -621,6 +674,7 @@ class CacheBench {
#ifndef NDEBUG
printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
printf("----------------------------\n");
printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
printf("DMutex impl name : %s\n", DMutex::kName());
printf("Number of threads : %u\n", FLAGS_threads);
@@ -960,11 +1014,14 @@ int cache_bench_tool(int argc, char** argv) {
exit(1);
}

if (FLAGS_seed == 0) {
FLAGS_seed = static_cast<uint32_t>(port::GetProcessID());
printf("Using seed = %" PRIu32 "\n", FLAGS_seed);
}

ROCKSDB_NAMESPACE::CacheBench bench;
if (FLAGS_populate_cache) {
bench.PopulateCache();
printf("Population complete\n");
printf("----------------------------\n");
}
if (bench.Run()) {
return 0;