From 8e4d69429f24eef8be8e5fab48768c3d79e3f164 Mon Sep 17 00:00:00 2001 From: Yingchun Lai <405403881@qq.com> Date: Mon, 28 Sep 2020 18:55:58 +0800 Subject: [PATCH 1/3] [LRUCache] Expose LRU Cache status to metrics Exposing LRU cache status as metrics helps diagnose problems such as high usage or a low hit rate. --- be/src/olap/file_helper.cpp | 2 +- be/src/olap/fs/file_block_manager.cpp | 4 +- be/src/olap/lru_cache.cpp | 121 +++++++++++--------------- be/src/olap/lru_cache.h | 46 +++++----- be/src/olap/page_cache.cpp | 2 +- be/src/olap/storage_engine.cpp | 8 +- be/src/olap/storage_engine.h | 4 - be/src/runtime/load_channel_mgr.cpp | 2 +- be/src/util/file_cache.cpp | 2 +- be/src/util/metrics.h | 3 + be/test/olap/lru_cache_test.cpp | 2 +- be/test/util/file_cache_test.cpp | 2 +- 12 files changed, 88 insertions(+), 110 deletions(-) diff --git a/be/src/olap/file_helper.cpp b/be/src/olap/file_helper.cpp index 8c3d68fbfaa73e..75e1c5bd6778b6 100644 --- a/be/src/olap/file_helper.cpp +++ b/be/src/olap/file_helper.cpp @@ -47,7 +47,7 @@ FileHandler::FileHandler() : static std::once_flag once_flag; #ifdef BE_TEST std::call_once(once_flag, [] { - _s_fd_cache = new_lru_cache(config::file_descriptor_cache_capacity); + _s_fd_cache = new_lru_cache("FileHandlerCacheTest", config::file_descriptor_cache_capacity); }); #else // storage engine may not be opened when doris try to read and write diff --git a/be/src/olap/fs/file_block_manager.cpp b/be/src/olap/fs/file_block_manager.cpp index 291dc0ed61f122..c49388593c44ed 100644 --- a/be/src/olap/fs/file_block_manager.cpp +++ b/be/src/olap/fs/file_block_manager.cpp @@ -387,9 +387,9 @@ FileBlockManager::FileBlockManager(Env* env, BlockManagerOptions opts) : } #ifdef BE_TEST - _file_cache.reset(new FileCache("Readable file cache", config::file_descriptor_cache_capacity)); + _file_cache.reset(new FileCache("Readable_file_cache", config::file_descriptor_cache_capacity)); #else - _file_cache.reset(new FileCache("Readable file cache", 
StorageEngine::instance()->file_cache())); + _file_cache.reset(new FileCache("Readable_file_cache", StorageEngine::instance()->file_cache())); #endif } diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index df6df40c1ffaa9..23a616784c5c0f 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -17,12 +17,20 @@ #include "olap/olap_index.h" #include "olap/row_block.h" #include "olap/utils.h" +#include "util/doris_metrics.h" using std::string; using std::stringstream; namespace doris { +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(capacity, MetricUnit::BYTES); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(usage, MetricUnit::BYTES); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(usage_ratio, MetricUnit::NOUNIT); +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(lookup_count, MetricUnit::OPERATIONS); +DEFINE_COUNTER_METRIC_PROTOTYPE_2ARG(hit_count, MetricUnit::OPERATIONS); +DEFINE_GAUGE_METRIC_PROTOTYPE_2ARG(hit_ratio, MetricUnit::NOUNIT); + uint32_t CacheKey::hash(const char* data, size_t n, uint32_t seed) const { // Similar to murmur hash const uint32_t m = 0xc6a4a793; @@ -82,9 +90,7 @@ LRUHandle* HandleTable::insert(LRUHandle* h) { if (_elems > _length) { // Since each cache entry is fairly large, we aim for a small // average linked list length (<= 1). - if (!_resize()) { - return NULL; - } + _resize(); } } @@ -114,7 +120,7 @@ LRUHandle** HandleTable::_find_pointer(const CacheKey& key, uint32_t hash) { return ptr; } -bool HandleTable::_resize() { +void HandleTable::_resize() { uint32_t new_length = 4; while (new_length < _elems) { @@ -122,21 +128,13 @@ bool HandleTable::_resize() { } - LRUHandle** new_list = new(std::nothrow) LRUHandle*[new_length]; + LRUHandle** new_list = new LRUHandle*[new_length]; /* throwing new: with the NULL check removed, a nothrow allocation failure would hand a null pointer to the memset below */ - - if (NULL == new_list) { - LOG(FATAL) << "failed to malloc new hash list. 
new_length=" << new_length; - return false; - } - memset(new_list, 0, sizeof(new_list[0]) * new_length); uint32_t count = 0; for (uint32_t i = 0; i < _length; i++) { LRUHandle* h = _list[i]; - while (h != NULL) { LRUHandle* next = h->next_hash; - CacheKey key = h->key(); uint32_t hash = h->hash; LRUHandle** ptr = &new_list[hash & (new_length - 1)]; h->next_hash = *ptr; @@ -146,20 +144,13 @@ bool HandleTable::_resize() { } } - if (_elems != count) { - delete [] new_list; - LOG(FATAL) << "_elems not match new count. elems=" << _elems - << ", count=" << count; - return false; - } - + DCHECK_EQ(_elems, count); delete [] _list; _list = new_list; _length = new_length; - return true; } -LRUCache::LRUCache() : _usage(0), _last_id(0), _lookup_count(0), +LRUCache::LRUCache() : _usage(0), _lookup_count(0), _hit_count(0) { // Make empty circular linked list _lru.next = &_lru; @@ -376,15 +367,29 @@ uint32_t ShardedLRUCache::_shard(uint32_t hash) { return hash >> (32 - kNumShardBits); } -ShardedLRUCache::ShardedLRUCache(size_t capacity) - : _last_id(0) { - const size_t per_shard = (capacity + (kNumShards - 1)) / kNumShards; - - for (int s = 0; s < kNumShards; s++) { - _shards[s].set_capacity(per_shard); - } +ShardedLRUCache::ShardedLRUCache(const std::string& name, size_t total_capacity) + : _name(name), _last_id(1) { + const size_t per_shard = (total_capacity + (kNumShards - 1)) / kNumShards; + for (int s = 0; s < kNumShards; s++) { + _shards[s].set_capacity(per_shard); } + _entity = DorisMetrics::instance()->metric_registry() + ->register_entity(std::string("lru_cache:") + name, {{"name", name}}); + _entity->register_hook(name, std::bind(&ShardedLRUCache::update_cache_metrics, this)); + INT_GAUGE_METRIC_REGISTER(_entity, capacity); + INT_GAUGE_METRIC_REGISTER(_entity, usage); + INT_DOUBLE_METRIC_REGISTER(_entity, usage_ratio); + INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, lookup_count); + INT_ATOMIC_COUNTER_METRIC_REGISTER(_entity, hit_count); + 
INT_DOUBLE_METRIC_REGISTER(_entity, hit_ratio); +} + +ShardedLRUCache::~ShardedLRUCache() { + _entity->deregister_hook(_name); + DorisMetrics::instance()->metric_registry()->deregister_entity(_entity); +} + Cache::Handle* ShardedLRUCache::insert( const CacheKey& key, void* value, @@ -420,8 +425,7 @@ Slice ShardedLRUCache::value_slice(Handle* handle) { } uint64_t ShardedLRUCache::new_id() { - MutexLock l(&_id_mutex); - return ++(_last_id); + return _last_id.fetch_add(1, std::memory_order_relaxed);; } void ShardedLRUCache::prune() { @@ -432,51 +436,28 @@ void ShardedLRUCache::prune() { VLOG(7) << "Successfully prune cache, clean " << num_prune << " entries."; } -size_t ShardedLRUCache::get_memory_usage() { +void ShardedLRUCache::update_cache_metrics() const { + size_t total_capacity = 0; size_t total_usage = 0; - for (int s = 0; s < kNumShards; s++) { - total_usage += _shards[s].get_usage(); - } - return total_usage; -} - -void ShardedLRUCache::get_cache_status(rapidjson::Document* document) { - size_t shard_count = sizeof(_shards) / sizeof(LRUCache); - - for (uint32_t i = 0; i < shard_count; ++i) { - size_t capacity = _shards[i].get_capacity(); - size_t usage = _shards[i].get_usage(); - rapidjson::Value shard_info(rapidjson::kObjectType); - shard_info.AddMember("capacity", static_cast(capacity), document->GetAllocator()); - shard_info.AddMember("usage", static_cast(usage), document->GetAllocator()); - - float usage_ratio = 0.0f; - - if (0 != capacity) { - usage_ratio = static_cast(usage) / static_cast(capacity); - } - - shard_info.AddMember("usage_ratio", usage_ratio, document->GetAllocator()); - - size_t lookup_count = _shards[i].get_lookup_count(); - size_t hit_count = _shards[i].get_hit_count(); - shard_info.AddMember("lookup_count", static_cast(lookup_count), document->GetAllocator()); - shard_info.AddMember("hit_count", static_cast(hit_count), document->GetAllocator()); - - float hit_ratio = 0.0f; - - if (0 != lookup_count) { - hit_ratio = 
static_cast(hit_count) / static_cast(lookup_count); - } - - shard_info.AddMember("hit_ratio", hit_ratio, document->GetAllocator()); - document->PushBack(shard_info, document->GetAllocator()); + size_t total_lookup_count = 0; + size_t total_hit_count = 0; + for (int i = 0; i < kNumShards; i++) { + total_capacity += _shards[i].get_capacity(); + total_usage += _shards[i].get_usage(); + total_lookup_count += _shards[i].get_lookup_count(); + total_hit_count += _shards[i].get_hit_count(); } + capacity->set_value(total_capacity); + usage->set_value(total_usage); + lookup_count->set_value(total_lookup_count); + hit_count->set_value(total_hit_count); + /* Ratios are divided in double: size_t / size_t would truncate them to 0 or 1. */ usage_ratio->set_value(total_capacity == 0 ? 0.0 : static_cast<double>(total_usage) / total_capacity); + hit_ratio->set_value(total_lookup_count == 0 ? 0.0 : static_cast<double>(total_hit_count) / total_lookup_count); } -Cache* new_lru_cache(size_t capacity) { - return new ShardedLRUCache(capacity); +Cache* new_lru_cache(const std::string& name, size_t capacity) { + return new ShardedLRUCache(name, capacity); } } // namespace doris diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index ef3d6bbcf44817..03f2b7641a3e92 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -14,6 +14,7 @@ #include #include "olap/olap_common.h" +#include "util/metrics.h" #include "util/mutex.h" #include "util/slice.h" @@ -46,9 +47,9 @@ namespace doris { class Cache; class CacheKey; - // Create a new cache with a fixed size capacity. This implementation + // Create a new cache with a specified name and a fixed size capacity. This implementation // of Cache uses a least-recently-used eviction policy. - extern Cache* new_lru_cache(size_t capacity); + extern Cache* new_lru_cache(const std::string& name, size_t capacity); class CacheKey { public: @@ -221,11 +222,6 @@ namespace doris { // leveldb may change prune() to a pure abstract method. 
virtual void prune() {} - // 获取运行统计项,包括内存占用 - virtual size_t get_memory_usage() = 0; - // cache命中率统计 - virtual void get_cache_status(rapidjson::Document* document) = 0; - private: DISALLOW_COPY_AND_ASSIGN(Cache); }; @@ -235,9 +231,9 @@ namespace doris { typedef struct LRUHandle { void* value; void (*deleter)(const CacheKey&, void* value); - LRUHandle* next_hash; - LRUHandle* next; - LRUHandle* prev; + LRUHandle* next_hash; // next entry in hash table + LRUHandle* next; // next entry in lru list + LRUHandle* prev; // previous entry in lru list size_t charge; size_t key_length; bool in_cache; // Whether entry is in the cache. @@ -296,7 +292,7 @@ namespace doris { // matches key/hash. If there is no such cache entry, return a // pointer to the trailing slot in the corresponding linked list. LRUHandle** _find_pointer(const CacheKey& key, uint32_t hash); - bool _resize(); + void _resize(); }; // A single shard of sharded cache. @@ -323,16 +319,16 @@ namespace doris { void erase(const CacheKey& key, uint32_t hash); int prune(); - uint64_t get_lookup_count() { + uint64_t get_lookup_count() const { return _lookup_count; } - uint64_t get_hit_count() { + uint64_t get_hit_count() const { return _hit_count; } - size_t get_usage() { + size_t get_usage() const { return _usage; } - size_t get_capacity() { + size_t get_capacity() const { return _capacity; } @@ -349,7 +345,6 @@ namespace doris { // _mutex protects the following state. Mutex _mutex; size_t _usage; - uint64_t _last_id; // Dummy head of LRU list. // lru.prev is newest entry, lru.next is oldest entry. 
@@ -367,9 +362,9 @@ namespace doris { class ShardedLRUCache : public Cache { public: - explicit ShardedLRUCache(size_t capacity); + explicit ShardedLRUCache(const std::string& name, size_t total_capacity); // TODO(fdy): 析构时清除所有cache元素 - virtual ~ShardedLRUCache() {} + virtual ~ShardedLRUCache(); virtual Handle* insert( const CacheKey& key, void* value, @@ -383,16 +378,23 @@ namespace doris { Slice value_slice(Handle* handle) override; virtual uint64_t new_id(); virtual void prune(); - virtual size_t get_memory_usage(); - virtual void get_cache_status(rapidjson::Document* document); + void update_cache_metrics() const; private: static inline uint32_t _hash_slice(const CacheKey& s); static uint32_t _shard(uint32_t hash); + std::string _name; LRUCache _shards[kNumShards]; - Mutex _id_mutex; - uint64_t _last_id; + std::atomic _last_id; + + std::shared_ptr _entity = nullptr; + IntGauge* capacity; + IntGauge* usage; + DoubleGauge* usage_ratio; + IntAtomicCounter* lookup_count; + IntAtomicCounter* hit_count; + DoubleGauge* hit_ratio; }; } // namespace doris diff --git a/be/src/olap/page_cache.cpp b/be/src/olap/page_cache.cpp index f92868b3df8086..4643ef809dfe45 100644 --- a/be/src/olap/page_cache.cpp +++ b/be/src/olap/page_cache.cpp @@ -27,7 +27,7 @@ void StoragePageCache::create_global_cache(size_t capacity) { _s_instance = &instance; } -StoragePageCache::StoragePageCache(size_t capacity) : _cache(new_lru_cache(capacity)) { +StoragePageCache::StoragePageCache(size_t capacity) : _cache(new_lru_cache("StoragePageCache", capacity)) { } bool StoragePageCache::lookup(const CacheKey& key, PageCacheHandle* handle) { diff --git a/be/src/olap/storage_engine.cpp b/be/src/olap/storage_engine.cpp index adbf9a5e43245c..16505671c62780 100644 --- a/be/src/olap/storage_engine.cpp +++ b/be/src/olap/storage_engine.cpp @@ -170,9 +170,9 @@ Status StorageEngine::_open() { RETURN_NOT_OK_STATUS_WITH_WARN(_check_file_descriptor_number(), "check fd number failed"); - _index_stream_lru_cache = 
new_lru_cache(config::index_stream_cache_capacity); + _index_stream_lru_cache = new_lru_cache("SegmentIndexCache", config::index_stream_cache_capacity); - _file_cache.reset(new_lru_cache(config::file_descriptor_cache_capacity)); + _file_cache.reset(new_lru_cache("FileHandlerCache", config::file_descriptor_cache_capacity)); auto dirs = get_stores(); load_data_dirs(dirs); @@ -633,10 +633,6 @@ void StorageEngine::_perform_base_compaction(TabletSharedPtr best_tablet) { best_tablet->set_last_base_compaction_failure_time(0); } -void StorageEngine::get_cache_status(rapidjson::Document* document) const { - return _index_stream_lru_cache->get_cache_status(document); -} - OLAPStatus StorageEngine::_start_trash_sweep(double* usage) { OLAPStatus res = OLAP_SUCCESS; LOG(INFO) << "start trash and snapshot sweep."; diff --git a/be/src/olap/storage_engine.h b/be/src/olap/storage_engine.h index 7321358f7aa7f4..bf9650c8c227e8 100644 --- a/be/src/olap/storage_engine.h +++ b/be/src/olap/storage_engine.h @@ -84,9 +84,6 @@ class StorageEngine { void clear_transaction_task(const TTransactionId transaction_id, const std::vector& partition_ids); - // 获取cache的使用情况信息 - void get_cache_status(rapidjson::Document* document) const; - // Note: 这里只能reload原先已经存在的root path,即re-load启动时就登记的root path // 是允许的,但re-load全新的path是不允许的,因为此处没有彻底更新ce调度器信息 void load_data_dirs(const std::vector& stores); @@ -283,7 +280,6 @@ class StorageEngine { int32_t _effective_cluster_id; bool _is_all_cluster_id_exist; - Cache* _file_descriptor_lru_cache; Cache* _index_stream_lru_cache; // _file_cache is a lru_cache for file descriptors of files opened by doris, diff --git a/be/src/runtime/load_channel_mgr.cpp b/be/src/runtime/load_channel_mgr.cpp index 0fb569212b2437..d48f46f708eba9 100644 --- a/be/src/runtime/load_channel_mgr.cpp +++ b/be/src/runtime/load_channel_mgr.cpp @@ -68,7 +68,7 @@ LoadChannelMgr::LoadChannelMgr() : _stop_background_threads_latch(1) { std::lock_guard l(_lock); return _load_channels.size(); }); - 
_last_success_channel = new_lru_cache(1024); + _last_success_channel = new_lru_cache("LastestSuccessChannelCache", 1024); } LoadChannelMgr::~LoadChannelMgr() { diff --git a/be/src/util/file_cache.cpp b/be/src/util/file_cache.cpp index bcad623deeb929..0bdfb13e0a4665 100644 --- a/be/src/util/file_cache.cpp +++ b/be/src/util/file_cache.cpp @@ -25,7 +25,7 @@ namespace doris { template FileCache::FileCache(const std::string& cache_name, int max_open_files) : _cache_name(cache_name), - _cache(new_lru_cache(max_open_files)), + _cache(new_lru_cache(std::string("FileBlockManagerCache:") + cache_name, max_open_files)), _is_cache_own(true) { } template diff --git a/be/src/util/metrics.h b/be/src/util/metrics.h index cc85ffebb36c06..cb6901266fcfec 100644 --- a/be/src/util/metrics.h +++ b/be/src/util/metrics.h @@ -277,6 +277,9 @@ struct MetricPrototype { #define INT_GAUGE_METRIC_REGISTER(entity, metric) \ metric = (IntGauge*)(entity->register_metric(&METRIC_##metric)) +#define INT_DOUBLE_METRIC_REGISTER(entity, metric) \ + metric = (DoubleGauge*)(entity->register_metric(&METRIC_##metric)) + #define INT_UGAUGE_METRIC_REGISTER(entity, metric) \ metric = (UIntGauge*)(entity->register_metric(&METRIC_##metric)) diff --git a/be/test/olap/lru_cache_test.cpp b/be/test/olap/lru_cache_test.cpp index ca5c7b17bd25f0..537b69678327ff 100644 --- a/be/test/olap/lru_cache_test.cpp +++ b/be/test/olap/lru_cache_test.cpp @@ -71,7 +71,7 @@ class CacheTest : public testing::Test { std::vector _deleted_values; Cache* _cache; - CacheTest() : _cache(new_lru_cache(kCacheSize)) { + CacheTest() : _cache(new_lru_cache("test", kCacheSize)) { _s_current = this; } diff --git a/be/test/util/file_cache_test.cpp b/be/test/util/file_cache_test.cpp index efb4aa69490ab8..33ac37f7e46e67 100644 --- a/be/test/util/file_cache_test.cpp +++ b/be/test/util/file_cache_test.cpp @@ -28,7 +28,7 @@ class FileCacheTest : public testing::Test { FileCacheTest() { } void SetUp() override { - _file_cache.reset(new FileCache("test 
cache", 10000)); + _file_cache.reset(new FileCache("test_cache", 10000)); _file_exist = "file_exist"; std::unique_ptr file; auto st = Env::Default()->new_writable_file(_file_exist, &file); From b337c00498a072ef6493af901850813251bd5393 Mon Sep 17 00:00:00 2001 From: Yingchun Lai <405403881@qq.com> Date: Sun, 18 Oct 2020 15:42:17 +0000 Subject: [PATCH 2/3] init member --- be/src/olap/lru_cache.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/be/src/olap/lru_cache.h b/be/src/olap/lru_cache.h index 03f2b7641a3e92..b57c82bb191648 100644 --- a/be/src/olap/lru_cache.h +++ b/be/src/olap/lru_cache.h @@ -389,12 +389,12 @@ namespace doris { std::atomic _last_id; std::shared_ptr _entity = nullptr; - IntGauge* capacity; - IntGauge* usage; - DoubleGauge* usage_ratio; - IntAtomicCounter* lookup_count; - IntAtomicCounter* hit_count; - DoubleGauge* hit_ratio; + IntGauge* capacity = nullptr; + IntGauge* usage = nullptr; + DoubleGauge* usage_ratio = nullptr; + IntAtomicCounter* lookup_count = nullptr; + IntAtomicCounter* hit_count = nullptr; + DoubleGauge* hit_ratio = nullptr; }; } // namespace doris From c0148596492e51e1a4f94e75e194e7cf79d8cc3a Mon Sep 17 00:00:00 2001 From: Yingchun Lai <405403881@qq.com> Date: Wed, 21 Oct 2020 10:44:50 +0800 Subject: [PATCH 3/3] Update be/src/olap/lru_cache.cpp Co-authored-by: Mingyu Chen --- be/src/olap/lru_cache.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/be/src/olap/lru_cache.cpp b/be/src/olap/lru_cache.cpp index 23a616784c5c0f..dd68b13c9bc3fb 100644 --- a/be/src/olap/lru_cache.cpp +++ b/be/src/olap/lru_cache.cpp @@ -425,7 +425,7 @@ Slice ShardedLRUCache::value_slice(Handle* handle) { } uint64_t ShardedLRUCache::new_id() { - return _last_id.fetch_add(1, std::memory_order_relaxed);; + return _last_id.fetch_add(1, std::memory_order_relaxed); } void ShardedLRUCache::prune() {