Some cache_bench enhancements (#11661)
Summary:
... used in validating some HyperClockCache development in progress.

* Revamp the "populate cache" step to avoid redundant insertions (very rare in practice) and to approach the desired resident_ratio more consistently, while maintaining appropriate skew (still not perfect).
* Track and print the hit ratio on lookups, to help verify that comparisons between implementations are fair.
* Add an option to disable tracking and printing histograms (which produce a lot of output).
* Add an option to specify a random seed (for better reproducibility).
* Remove the confusing/redundant "-skewed" option; key selection now always uses the "-skew" mechanism (see the sketch after this list).
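
For context on the "-skew" mechanism referenced above: taking the minimum of skew + 1 uniform random draws biases key selection toward the low end of the key space, and larger -skew values concentrate the workload onto a hotter set of keys. Below is a minimal standalone sketch of that idea (not part of this commit); it substitutes std::mt19937_64 and a plain modulo for RocksDB's Random64 and FastRange64, so its output will not match cache_bench exactly.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <random>
#include <vector>

// Standalone illustration only: mirrors the min-of-(skew+1)-draws idea used by
// cache_bench's KeyGen, with standard-library stand-ins for Random64 and
// FastRange64.
uint64_t SkewedKey(std::mt19937_64& rng, uint64_t max_key, uint32_t skew) {
  uint64_t raw = rng();
  // Each extra draw keeps the minimum, pushing the result toward zero.
  for (uint32_t i = 0; i < skew; ++i) {
    raw = std::min(raw, rng());
  }
  return raw % max_key;  // stand-in for FastRange64(raw, max_key)
}

int main() {
  std::mt19937_64 rng(42);
  const uint64_t max_key = 1000000;
  for (uint32_t skew : {0u, 5u}) {
    std::vector<uint64_t> deciles(10, 0);  // histogram over ten key deciles
    for (int i = 0; i < 1000000; ++i) {
      ++deciles[SkewedKey(rng, max_key, skew) * 10 / max_key];
    }
    printf("skew=%u decile counts:", skew);
    for (uint64_t c : deciles) {
      printf(" %llu", static_cast<unsigned long long>(c));
    }
    printf("\n");
  }
  return 0;
}

With skew=0 the deciles come out roughly uniform; with skew=5 roughly half of all selections fall in the lowest decile and almost none in the highest, which is what gives the benchmark its hot working set.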

Uses BitwiseAnd from #11660 (tested there)
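
As background on how the new seed flows into the caches (my reading of the diff below, not a description of the actual BitwiseAnd utility): the 32-bit -seed value is masked with INT32_MAX before being assigned to the cache options' hash_seed, keeping it non-negative on the assumption that hash_seed is a signed 32-bit field where negative values have special meaning. A rough plain-C++ equivalent of that step:

#include <cstdint>

// Rough stand-in for BitwiseAnd(FLAGS_seed, INT32_MAX) as used in the diff;
// drops the top bit of the user-provided seed so the result fits a
// non-negative int32_t.
int32_t HashSeedFromFlag(uint32_t seed_flag) {
  return static_cast<int32_t>(seed_flag & static_cast<uint32_t>(INT32_MAX));
}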

Pull Request resolved: #11661

Test Plan: manual

Reviewed By: akankshamahajan15, jowlyzhang

Differential Revision: D47937671

Pulled By: pdillinger

fbshipit-source-id: 85a2bb881b1bca4f63e015bac684105fd91c9f35
pdillinger authored and facebook-github-bot committed Aug 2, 2023
1 parent cf95821 commit f9de217
Showing 1 changed file with 116 additions and 59 deletions.
175 changes: 116 additions & 59 deletions cache/cache_bench_tool.cc
@@ -50,7 +50,7 @@ DEFINE_double(resident_ratio, 0.25,
DEFINE_uint64(ops_per_thread, 2000000U, "Number of operations per thread.");
DEFINE_uint32(value_bytes, 8 * KiB, "Size of each value added.");

DEFINE_uint32(skew, 5, "Degree of skew in key selection");
DEFINE_uint32(skew, 5, "Degree of skew in key selection. 0 = no skew");
DEFINE_bool(populate_cache, true, "Populate cache before operations");

DEFINE_uint32(lookup_insert_percent, 87,
@@ -71,7 +71,6 @@ DEFINE_uint32(

DEFINE_uint32(gather_stats_entries_per_lock, 256,
"For Cache::ApplyToAllEntries");
DEFINE_bool(skewed, false, "If true, skew the key access distribution");

DEFINE_bool(lean, false,
"If true, no additional computation is performed besides cache "
@@ -81,6 +80,11 @@ DEFINE_bool(early_exit, false,
"Exit before deallocating most memory. Good for malloc stats, e.g."
"MALLOC_CONF=\"stats_print:true\"");

DEFINE_bool(histograms, true,
"Whether to track and print histogram statistics.");

DEFINE_uint32(seed, 0, "Hashing/random seed to use. 0 = choose at random");

DEFINE_string(secondary_cache_uri, "",
"Full URI for creating a custom secondary cache object");
static class std::shared_ptr<ROCKSDB_NAMESPACE::SecondaryCache> secondary_cache;
@@ -149,9 +153,6 @@ class SharedState {
public:
explicit SharedState(CacheBench* cache_bench)
: cv_(&mu_),
num_initialized_(0),
start_(false),
num_done_(0),
cache_bench_(cache_bench) {}

~SharedState() {}
@@ -174,15 +175,27 @@ class SharedState {

bool Started() const { return start_; }

void AddLookupStats(uint64_t hits, uint64_t misses) {
MutexLock l(&mu_);
lookup_count_ += hits + misses;
lookup_hits_ += hits;
}

double GetLookupHitRatio() const {
return 1.0 * lookup_hits_ / lookup_count_;
}

private:
port::Mutex mu_;
port::CondVar cv_;

uint64_t num_initialized_;
bool start_;
uint64_t num_done_;

CacheBench* cache_bench_;

uint64_t num_initialized_ = 0;
bool start_ = false;
uint64_t num_done_ = 0;
uint64_t lookup_count_ = 0;
uint64_t lookup_hits_ = 0;
};

// Per-thread state for concurrent executions of the same benchmark.
@@ -194,27 +207,19 @@ struct ThreadState {
uint64_t duration_us = 0;

ThreadState(uint32_t index, SharedState* _shared)
: tid(index), rnd(1000 + index), shared(_shared) {}
: tid(index), rnd(FLAGS_seed + 1 + index), shared(_shared) {}
};

struct KeyGen {
char key_data[27];

Slice GetRand(Random64& rnd, uint64_t max_key, int max_log) {
uint64_t key = 0;
if (!FLAGS_skewed) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < FLAGS_skew; ++i) {
raw = std::min(raw, rnd.Next());
}
key = FastRange64(raw, max_key);
} else {
key = rnd.Skewed(max_log);
if (key > max_key) {
key -= max_key;
}
Slice GetRand(Random64& rnd, uint64_t max_key, uint32_t skew) {
uint64_t raw = rnd.Next();
// Skew according to setting
for (uint32_t i = 0; i < skew; ++i) {
raw = std::min(raw, rnd.Next());
}
uint64_t key = FastRange64(raw, max_key);
// Variable size and alignment
size_t off = key % 8;
key_data[0] = char{42};
@@ -285,31 +290,25 @@ class CacheBench {
lookup_threshold_(insert_threshold_ +
kHundredthUint64 * FLAGS_lookup_percent),
erase_threshold_(lookup_threshold_ +
kHundredthUint64 * FLAGS_erase_percent),
skewed_(FLAGS_skewed) {
kHundredthUint64 * FLAGS_erase_percent) {
if (erase_threshold_ != 100U * kHundredthUint64) {
fprintf(stderr, "Percentages must add to 100.\n");
exit(1);
}

max_log_ = 0;
if (skewed_) {
uint64_t max_key = max_key_;
while (max_key >>= 1) max_log_++;
if (max_key > (static_cast<uint64_t>(1) << max_log_)) max_log_++;
}

if (FLAGS_cache_type == "clock_cache") {
fprintf(stderr, "Old clock cache implementation has been removed.\n");
exit(1);
} else if (FLAGS_cache_type == "hyper_clock_cache") {
cache_ = HyperClockCacheOptions(FLAGS_cache_size, FLAGS_value_bytes,
FLAGS_num_shard_bits)
.MakeSharedCache();
HyperClockCacheOptions opts(FLAGS_cache_size, FLAGS_value_bytes,
FLAGS_num_shard_bits);
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
cache_ = opts.MakeSharedCache();
} else if (FLAGS_cache_type == "lru_cache") {
LRUCacheOptions opts(FLAGS_cache_size, FLAGS_num_shard_bits,
false /* strict_capacity_limit */,
0.5 /* high_pri_pool_ratio */);
opts.hash_seed = BitwiseAnd(FLAGS_seed, INT32_MAX);
if (!FLAGS_secondary_cache_uri.empty()) {
Status s = SecondaryCache::CreateFromString(
ConfigOptions(), FLAGS_secondary_cache_uri, &secondary_cache);
@@ -333,13 +332,50 @@ class CacheBench {
~CacheBench() {}

void PopulateCache() {
Random64 rnd(1);
Random64 rnd(FLAGS_seed);
KeyGen keygen;
for (uint64_t i = 0; i < 2 * FLAGS_cache_size; i += FLAGS_value_bytes) {
Status s = cache_->Insert(keygen.GetRand(rnd, max_key_, max_log_),
createValue(rnd), &helper1, FLAGS_value_bytes);
size_t max_occ = 0;
size_t inserts_since_max_occ_increase = 0;
size_t keys_since_last_not_found = 0;

// Avoid redundant insertions by checking Lookup before Insert.
// Loop until insertions consistently fail to increase max occupancy or
// it becomes difficult to find keys not already inserted.
while (inserts_since_max_occ_increase < 100 &&
keys_since_last_not_found < 100) {
Slice key = keygen.GetRand(rnd, max_key_, FLAGS_skew);

Cache::Handle* handle = cache_->Lookup(key);
if (handle != nullptr) {
cache_->Release(handle);
++keys_since_last_not_found;
continue;
}
keys_since_last_not_found = 0;

Status s =
cache_->Insert(key, createValue(rnd), &helper1, FLAGS_value_bytes);
assert(s.ok());

handle = cache_->Lookup(key);
if (!handle) {
fprintf(stderr, "Failed to lookup key just inserted.\n");
assert(false);
exit(42);
} else {
cache_->Release(handle);
}

size_t occ = cache_->GetOccupancyCount();
if (occ > max_occ) {
max_occ = occ;
inserts_since_max_occ_increase = 0;
} else {
++inserts_since_max_occ_increase;
}
}
printf("Population complete (%zu entries, %g average charge)\n", max_occ,
1.0 * FLAGS_cache_size / max_occ);
}

bool Run() {
@@ -398,18 +434,21 @@ class CacheBench {
FLAGS_ops_per_thread / elapsed_secs);
printf("Thread ops/sec = %u\n", ops_per_sec);

printf("\nOperation latency (ns):\n");
HistogramImpl combined;
for (uint32_t i = 0; i < FLAGS_threads; i++) {
combined.Merge(threads[i]->latency_ns_hist);
}
printf("%s", combined.ToString().c_str());
printf("Lookup hit ratio: %g\n", shared.GetLookupHitRatio());

if (FLAGS_gather_stats) {
printf("\nGather stats latency (us):\n");
printf("%s", stats_hist.ToString().c_str());
}
if (FLAGS_histograms) {
printf("\nOperation latency (ns):\n");
HistogramImpl combined;
for (uint32_t i = 0; i < FLAGS_threads; i++) {
combined.Merge(threads[i]->latency_ns_hist);
}
printf("%s", combined.ToString().c_str());

if (FLAGS_gather_stats) {
printf("\nGather stats latency (us):\n");
printf("%s", stats_hist.ToString().c_str());
}
}
printf("\n%s", stats_report.c_str());

return true;
@@ -423,8 +462,6 @@ class CacheBench {
const uint64_t insert_threshold_;
const uint64_t lookup_threshold_;
const uint64_t erase_threshold_;
const bool skewed_;
int max_log_;

// A benchmark version of gathering stats on an active block cache by
// iterating over it. The primary purpose is to measure the impact of
@@ -494,13 +531,17 @@ class CacheBench {
// Something slightly more expensive as in stats by category
helpers.insert(helper);
};
timer.Start();
if (FLAGS_histograms) {
timer.Start();
}
Cache::ApplyToAllEntriesOptions opts;
opts.average_entries_per_lock = FLAGS_gather_stats_entries_per_lock;
shared->GetCacheBench()->cache_->ApplyToAllEntries(fn, opts);
table_occupancy = shared->GetCacheBench()->cache_->GetOccupancyCount();
table_size = shared->GetCacheBench()->cache_->GetTableAddressCount();
stats_hist->Add(timer.ElapsedNanos() / 1000);
if (FLAGS_histograms) {
stats_hist->Add(timer.ElapsedNanos() / 1000);
}
}
}

@@ -531,6 +572,8 @@ class CacheBench {
void OperateCache(ThreadState* thread) {
// To use looked-up values
uint64_t result = 0;
uint64_t lookup_misses = 0;
uint64_t lookup_hits = 0;
// To hold handles for a non-trivial amount of time
Cache::Handle* handle = nullptr;
KeyGen gen;
@@ -539,10 +582,12 @@ class CacheBench {
StopWatchNano timer(clock);

for (uint64_t i = 0; i < FLAGS_ops_per_thread; i++) {
Slice key = gen.GetRand(thread->rnd, max_key_, max_log_);
Slice key = gen.GetRand(thread->rnd, max_key_, FLAGS_skew);
uint64_t random_op = thread->rnd.Next();

timer.Start();
if (FLAGS_histograms) {
timer.Start();
}

if (random_op < lookup_insert_threshold_) {
if (handle) {
@@ -553,12 +598,14 @@ class CacheBench {
handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
Cache::Priority::LOW);
if (handle) {
++lookup_hits;
if (!FLAGS_lean) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else {
++lookup_misses;
// do insert
Status s = cache_->Insert(key, createValue(thread->rnd), &helper2,
FLAGS_value_bytes, &handle);
@@ -582,11 +629,14 @@ class CacheBench {
handle = cache_->Lookup(key, &helper2, /*context*/ nullptr,
Cache::Priority::LOW);
if (handle) {
++lookup_hits;
if (!FLAGS_lean) {
// do something with the data
result += NPHash64(static_cast<char*>(cache_->Value(handle)),
FLAGS_value_bytes);
}
} else {
++lookup_misses;
}
} else if (random_op < erase_threshold_) {
// do erase
@@ -595,7 +645,10 @@ class CacheBench {
// Should be extremely unlikely (noop)
assert(random_op >= kHundredthUint64 * 100U);
}
thread->latency_ns_hist.Add(timer.ElapsedNanos());
if (FLAGS_histograms) {
thread->latency_ns_hist.Add(timer.ElapsedNanos());
}
thread->shared->AddLookupStats(lookup_hits, lookup_misses);
}
if (FLAGS_early_exit) {
MutexLock l(thread->shared->GetMutex());
@@ -621,6 +674,7 @@ class CacheBench {
#ifndef NDEBUG
printf("WARNING: Assertions are enabled; benchmarks unnecessarily slow\n");
#endif
printf("----------------------------\n");
printf("RocksDB version : %d.%d\n", kMajorVersion, kMinorVersion);
printf("DMutex impl name : %s\n", DMutex::kName());
printf("Number of threads : %u\n", FLAGS_threads);
@@ -960,11 +1014,14 @@ int cache_bench_tool(int argc, char** argv) {
exit(1);
}

if (FLAGS_seed == 0) {
FLAGS_seed = static_cast<uint32_t>(port::GetProcessID());
printf("Using seed = %" PRIu32 "\n", FLAGS_seed);
}

ROCKSDB_NAMESPACE::CacheBench bench;
if (FLAGS_populate_cache) {
bench.PopulateCache();
printf("Population complete\n");
printf("----------------------------\n");
}
if (bench.Run()) {
return 0;