diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h index 7dc63683688d03..31607057a057f3 100644 --- a/cloud/src/common/config.h +++ b/cloud/src/common/config.h @@ -285,6 +285,9 @@ CONF_Bool(enable_loopback_address_for_ms, "false"); // Comma seprated list: recycler_storage_vault_white_list="aaa,bbb,ccc" CONF_Strings(recycler_storage_vault_white_list, ""); +// for get_delete_bitmap_update_lock +CONF_mBool(enable_batch_get_mow_tablet_stats_and_meta, "true"); + // aws sdk log level // Off = 0, // Fatal = 1, diff --git a/cloud/src/meta-service/meta_service.cpp b/cloud/src/meta-service/meta_service.cpp index 9ecb08363cdc65..66dec09662f430 100644 --- a/cloud/src/meta-service/meta_service.cpp +++ b/cloud/src/meta-service/meta_service.cpp @@ -2574,9 +2574,19 @@ void MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl return; } + if (!get_mow_tablet_stats_and_meta(code, msg, request, response, instance_id, lock_key)) { + return; + }; +} + +bool MetaServiceImpl::get_mow_tablet_stats_and_meta(MetaServiceCode& code, std::string& msg, + const GetDeleteBitmapUpdateLockRequest* request, + GetDeleteBitmapUpdateLockResponse* response, + std::string& instance_id, + std::string& lock_key) { bool require_tablet_stats = request->has_require_compaction_stats() ? request->require_compaction_stats() : false; - if (!require_tablet_stats) return; + if (!require_tablet_stats) return true; // this request is from fe when it commits txn for MOW table, we send the compaction stats // along with the GetDeleteBitmapUpdateLockResponse which will be sent to BE later to let // BE eliminate unnecessary sync_rowsets() calls if possible @@ -2587,79 +2597,168 @@ void MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl // these steps can be done in different fdb txns StopWatch read_stats_sw; - err = txn_kv_->create_txn(&txn); + std::unique_ptr txn; + TxnErrorCode err = txn_kv_->create_txn(&txn); if (err != TxnErrorCode::TXN_OK) { code = cast_as(err); msg = "failed to init txn"; - return; + return false; } - - for (const auto& tablet_idx : request->tablet_indexes()) { + auto table_id = request->table_id(); + std::stringstream ss; + if (!config::enable_batch_get_mow_tablet_stats_and_meta) { + for (const auto& tablet_idx : request->tablet_indexes()) { + // 1. get compaction cnts + TabletStatsPB tablet_stat; + std::string stats_key = + stats_tablet_key({instance_id, tablet_idx.table_id(), tablet_idx.index_id(), + tablet_idx.partition_id(), tablet_idx.tablet_id()}); + std::string stats_val; + TxnErrorCode err = txn->get(stats_key, &stats_val); + TEST_SYNC_POINT_CALLBACK( + "get_delete_bitmap_update_lock.get_compaction_cnts_inject_error", &err); + if (err == TxnErrorCode::TXN_TOO_OLD) { + code = MetaServiceCode::OK; + err = txn_kv_->create_txn(&txn); + if (err != TxnErrorCode::TXN_OK) { + code = cast_as(err); + ss << "failed to init txn when get tablet stats"; + msg = ss.str(); + return false; + } + err = txn->get(stats_key, &stats_val); + } + if (err != TxnErrorCode::TXN_OK) { + code = cast_as(err); + msg = fmt::format("failed to get tablet stats, err={} tablet_id={}", err, + tablet_idx.tablet_id()); + return false; + } + if (!tablet_stat.ParseFromArray(stats_val.data(), stats_val.size())) { + code = MetaServiceCode::PROTOBUF_PARSE_ERR; + msg = fmt::format("marformed tablet stats value, key={}", hex(stats_key)); + return false; + } + response->add_base_compaction_cnts(tablet_stat.base_compaction_cnt()); + response->add_cumulative_compaction_cnts(tablet_stat.cumulative_compaction_cnt()); + response->add_cumulative_points(tablet_stat.cumulative_point()); + + // 2. get tablet states + std::string tablet_meta_key = + meta_tablet_key({instance_id, tablet_idx.table_id(), tablet_idx.index_id(), + tablet_idx.partition_id(), tablet_idx.tablet_id()}); + std::string tablet_meta_val; + err = txn->get(tablet_meta_key, &tablet_meta_val); + if (err != TxnErrorCode::TXN_OK) { + ss << "failed to get tablet meta" + << (err == TxnErrorCode::TXN_KEY_NOT_FOUND ? " (not found)" : "") + << " instance_id=" << instance_id << " tablet_id=" << tablet_idx.tablet_id() + << " key=" << hex(tablet_meta_key) << " err=" << err; + msg = ss.str(); + code = err == TxnErrorCode::TXN_KEY_NOT_FOUND ? MetaServiceCode::TABLET_NOT_FOUND + : cast_as(err); + return false; + } + doris::TabletMetaCloudPB tablet_meta; + if (!tablet_meta.ParseFromString(tablet_meta_val)) { + code = MetaServiceCode::PROTOBUF_PARSE_ERR; + msg = "malformed tablet meta"; + return false; + } + response->add_tablet_states( + static_cast>(tablet_meta.tablet_state())); + } + } else { // 1. get compaction cnts - TabletStatsPB tablet_stat; - std::string stats_key = - stats_tablet_key({instance_id, tablet_idx.table_id(), tablet_idx.index_id(), - tablet_idx.partition_id(), tablet_idx.tablet_id()}); - std::string stats_val; - TxnErrorCode err = txn->get(stats_key, &stats_val); + std::vector stats_tablet_keys; + for (const auto& tablet_idx : request->tablet_indexes()) { + stats_tablet_keys.push_back( + stats_tablet_key({instance_id, tablet_idx.table_id(), tablet_idx.index_id(), + tablet_idx.partition_id(), tablet_idx.tablet_id()})); + } + std::vector> stats_tablet_values; + err = txn->batch_get(&stats_tablet_values, stats_tablet_keys); TEST_SYNC_POINT_CALLBACK("get_delete_bitmap_update_lock.get_compaction_cnts_inject_error", &err); + if (err != TxnErrorCode::TXN_OK) { + code = cast_as(err); + msg = fmt::format("failed to get tablet stats, err={} table_id={} lock_id={}", err, + table_id, request->lock_id()); + return false; + } + for (size_t i = 0; i < stats_tablet_keys.size(); i++) { + if (!stats_tablet_values[i].has_value()) { + code = cast_as(err); + msg = fmt::format("failed to get tablet stats, err={} tablet_id={}", err, + request->tablet_indexes(i).tablet_id()); + return false; + } + TabletStatsPB tablet_stat; + if (!tablet_stat.ParseFromString(stats_tablet_values[i].value())) { + code = MetaServiceCode::PROTOBUF_PARSE_ERR; + msg = fmt::format("marformed tablet stats value"); + return false; + } + response->add_base_compaction_cnts(tablet_stat.base_compaction_cnt()); + response->add_cumulative_compaction_cnts(tablet_stat.cumulative_compaction_cnt()); + response->add_cumulative_points(tablet_stat.cumulative_point()); + } + stats_tablet_keys.clear(); + stats_tablet_values.clear(); + DCHECK(request->tablet_indexes_size() == response->base_compaction_cnts_size()); + DCHECK(request->tablet_indexes_size() == response->cumulative_compaction_cnts_size()); + DCHECK(request->tablet_indexes_size() == response->cumulative_points_size()); + + // 2. get tablet states + std::vector tablet_meta_keys; + for (const auto& tablet_idx : request->tablet_indexes()) { + tablet_meta_keys.push_back( + meta_tablet_key({instance_id, tablet_idx.table_id(), tablet_idx.index_id(), + tablet_idx.partition_id(), tablet_idx.tablet_id()})); + } + std::vector> tablet_meta_values; + err = txn->batch_get(&tablet_meta_values, tablet_meta_keys); if (err == TxnErrorCode::TXN_TOO_OLD) { code = MetaServiceCode::OK; err = txn_kv_->create_txn(&txn); if (err != TxnErrorCode::TXN_OK) { code = cast_as(err); - ss << "failed to init txn when get tablet stats"; + ss << "failed to init txn when get tablet meta"; msg = ss.str(); - return; + return false; } - err = txn->get(stats_key, &stats_val); + err = txn->batch_get(&tablet_meta_values, tablet_meta_keys); } if (err != TxnErrorCode::TXN_OK) { code = cast_as(err); - msg = fmt::format("failed to get tablet stats, err={} tablet_id={}", err, - tablet_idx.tablet_id()); - return; - } - if (!tablet_stat.ParseFromArray(stats_val.data(), stats_val.size())) { - code = MetaServiceCode::PROTOBUF_PARSE_ERR; - msg = fmt::format("marformed tablet stats value, key={}", hex(stats_key)); - return; - } - response->add_base_compaction_cnts(tablet_stat.base_compaction_cnt()); - response->add_cumulative_compaction_cnts(tablet_stat.cumulative_compaction_cnt()); - response->add_cumulative_points(tablet_stat.cumulative_point()); - - // 2. get tablet states - std::string tablet_meta_key = - meta_tablet_key({instance_id, tablet_idx.table_id(), tablet_idx.index_id(), - tablet_idx.partition_id(), tablet_idx.tablet_id()}); - std::string tablet_meta_val; - err = txn->get(tablet_meta_key, &tablet_meta_val); - if (err != TxnErrorCode::TXN_OK) { - ss << "failed to get tablet meta" - << (err == TxnErrorCode::TXN_KEY_NOT_FOUND ? " (not found)" : "") - << " instance_id=" << instance_id << " tablet_id=" << tablet_idx.tablet_id() - << " key=" << hex(tablet_meta_key) << " err=" << err; - msg = ss.str(); - code = err == TxnErrorCode::TXN_KEY_NOT_FOUND ? MetaServiceCode::TABLET_NOT_FOUND - : cast_as(err); - return; + msg = fmt::format("failed to get tablet meta, err={} table_id={} lock_id={}", err, + table_id, request->lock_id()); + return false; } - doris::TabletMetaCloudPB tablet_meta; - if (!tablet_meta.ParseFromString(tablet_meta_val)) { - code = MetaServiceCode::PROTOBUF_PARSE_ERR; - msg = "malformed tablet meta"; - return; + for (size_t i = 0; i < tablet_meta_keys.size(); i++) { + if (!tablet_meta_values[i].has_value()) { + code = cast_as(err); + msg = fmt::format("failed to get tablet meta, err={} tablet_id={}", err, + request->tablet_indexes(i).tablet_id()); + return false; + } + doris::TabletMetaCloudPB tablet_meta; + if (!tablet_meta.ParseFromString(tablet_meta_values[i].value())) { + code = MetaServiceCode::PROTOBUF_PARSE_ERR; + msg = fmt::format("marformed tablet meta value"); + return false; + } + response->add_tablet_states( + static_cast>(tablet_meta.tablet_state())); } - response->add_tablet_states( - static_cast>(tablet_meta.tablet_state())); + DCHECK(request->tablet_indexes_size() == response->tablet_states_size()); } read_stats_sw.pause(); LOG(INFO) << fmt::format( - "tablet_idxes.size()={}, read tablet compaction cnts and tablet states cost={} ms", - request->tablet_indexes().size(), read_stats_sw.elapsed_us() / 1000); + "table_id={}, tablet_idxes.size()={}, read tablet compaction cnts and tablet states " + "cost={} ms", + table_id, request->tablet_indexes().size(), read_stats_sw.elapsed_us() / 1000); DeleteBitmapUpdateLockPB lock_info_tmp; if (!check_delete_bitmap_lock(code, msg, ss, txn, table_id, request->lock_id(), @@ -2669,7 +2768,9 @@ void MetaServiceImpl::get_delete_bitmap_update_lock(google::protobuf::RpcControl << table_id << " request lock_id=" << request->lock_id() << " request initiator=" << request->initiator() << " code=" << code << " msg=" << msg; + return false; } + return true; } void MetaServiceImpl::remove_delete_bitmap_update_lock( diff --git a/cloud/src/meta-service/meta_service.h b/cloud/src/meta-service/meta_service.h index 4be017edc9a386..a4136ed39be26a 100644 --- a/cloud/src/meta-service/meta_service.h +++ b/cloud/src/meta-service/meta_service.h @@ -329,6 +329,11 @@ class MetaServiceImpl : public cloud::MetaService { const AlterInstanceRequest* request, std::function(InstanceInfoPB*)> action); + bool get_mow_tablet_stats_and_meta(MetaServiceCode& code, std::string& msg, + const GetDeleteBitmapUpdateLockRequest* request, + GetDeleteBitmapUpdateLockResponse* response, + std::string& instance_id, std::string& lock_key); + std::shared_ptr txn_kv_; std::shared_ptr resource_mgr_; std::shared_ptr rate_limiter_; diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index fbbfbff19fe802..d02ea7b5011257 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ -4689,50 +4689,58 @@ TEST(MetaServiceTest, GetDeleteBitmapUpdateLockNoReadStats) { TEST(MetaServiceTest, GetDeleteBitmapUpdateLockTabletStatsNormal) { auto meta_service = get_meta_service(); - std::string instance_id = "test_get_delete_bitmap_update_lock_normal"; - [[maybe_unused]] auto* sp = SyncPoint::get_instance(); - std::unique_ptr> defer((int*)0x01, [](int*) { - SyncPoint::get_instance()->disable_processing(); - SyncPoint::get_instance()->clear_all_call_backs(); - }); - sp->set_call_back("get_instance_id", [&](auto&& args) { - auto* ret = try_any_cast_ret(args); - ret->first = instance_id; - ret->second = true; - }); - sp->enable_processing(); + bool enable_batch_get_mow_tablet_stats_and_meta_vals[] = {false, true}; + for (bool val : enable_batch_get_mow_tablet_stats_and_meta_vals) { + config::enable_batch_get_mow_tablet_stats_and_meta = val; - int64_t db_id = 1000; - int64_t table_id = 2001; - int64_t index_id = 3001; - // [(partition_id, tablet_id)] - std::vector> tablet_idxes {{70001, 12345}, {80001, 3456}, {90001, 6789}}; + std::string instance_id = "test_get_delete_bitmap_update_lock_normal"; + [[maybe_unused]] auto* sp = SyncPoint::get_instance(); + std::unique_ptr> defer((int*)0x01, [](int*) { + SyncPoint::get_instance()->disable_processing(); + SyncPoint::get_instance()->clear_all_call_backs(); + }); + sp->set_call_back("get_instance_id", [&](auto&& args) { + auto* ret = try_any_cast_ret(args); + ret->first = instance_id; + ret->second = true; + }); + sp->enable_processing(); - add_tablet_metas(meta_service.get(), instance_id, table_id, index_id, tablet_idxes); + int64_t db_id = 1000; + int64_t table_id = 2001; + int64_t index_id = 3001; + // [(partition_id, tablet_id)] + std::vector> tablet_idxes { + {70001, 12345}, {80001, 3456}, {90001, 6789}}; - GetDeleteBitmapUpdateLockResponse res; - get_delete_bitmap_update_lock(meta_service.get(), res, db_id, table_id, index_id, tablet_idxes, - 5, 999999, -1, true); - ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + add_tablet_metas(meta_service.get(), instance_id, table_id, index_id, tablet_idxes); - ASSERT_EQ(res.base_compaction_cnts().size(), tablet_idxes.size()); - for (const auto& base_compaction_cnt : res.base_compaction_cnts()) { - ASSERT_EQ(base_compaction_cnt, 10); - } - ASSERT_EQ(res.cumulative_compaction_cnts().size(), tablet_idxes.size()); - for (const auto& cumu_compaction_cnt : res.cumulative_compaction_cnts()) { - ASSERT_EQ(cumu_compaction_cnt, 20); - } - ASSERT_EQ(res.cumulative_points().size(), tablet_idxes.size()); - for (const auto& cumulative_point : res.cumulative_points()) { - ASSERT_EQ(cumulative_point, 30); + GetDeleteBitmapUpdateLockResponse res; + get_delete_bitmap_update_lock(meta_service.get(), res, db_id, table_id, index_id, + tablet_idxes, 5, 999999, -1, true); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + + ASSERT_EQ(res.base_compaction_cnts().size(), tablet_idxes.size()); + for (const auto& base_compaction_cnt : res.base_compaction_cnts()) { + ASSERT_EQ(base_compaction_cnt, 10); + } + ASSERT_EQ(res.cumulative_compaction_cnts().size(), tablet_idxes.size()); + for (const auto& cumu_compaction_cnt : res.cumulative_compaction_cnts()) { + ASSERT_EQ(cumu_compaction_cnt, 20); + } + ASSERT_EQ(res.cumulative_points().size(), tablet_idxes.size()); + for (const auto& cumulative_point : res.cumulative_points()) { + ASSERT_EQ(cumulative_point, 30); + } } } TEST(MetaServiceTest, GetDeleteBitmapUpdateLockTabletStatsLockExpired) { auto meta_service = get_meta_service(); - { + bool enable_batch_get_mow_tablet_stats_and_meta_vals[] = {false, true}; + for (bool val : enable_batch_get_mow_tablet_stats_and_meta_vals) { + config::enable_batch_get_mow_tablet_stats_and_meta = val; // 2.1 abnormal path, lock has been expired and taken by another load/compaction during // the reading of tablet stats std::string instance_id = "test_get_delete_bitmap_update_lock_abnormal1"; @@ -4773,7 +4781,9 @@ TEST(MetaServiceTest, GetDeleteBitmapUpdateLockTabletStatsLockExpired) { ASSERT_EQ(res.cumulative_points().size(), 0); } - { + for (bool val : enable_batch_get_mow_tablet_stats_and_meta_vals) { + config::enable_batch_get_mow_tablet_stats_and_meta = val; + // 2.2 abnormal path, lock has been taken by another load/compaction and been released during // the reading of tablet stats std::string instance_id = "test_get_delete_bitmap_update_lock_abnormal2"; @@ -4815,7 +4825,9 @@ TEST(MetaServiceTest, GetDeleteBitmapUpdateLockTabletStatsLockExpired) { TEST(MetaServiceTest, GetDeleteBitmapUpdateLockTabletStatsError) { auto meta_service = get_meta_service(); - { + bool enable_batch_get_mow_tablet_stats_and_meta_vals[] = {false, true}; + for (bool val : enable_batch_get_mow_tablet_stats_and_meta_vals) { + config::enable_batch_get_mow_tablet_stats_and_meta = val; // 2.3 abnormal path, meeting error when reading tablets' stats std::string instance_id = "test_get_delete_bitmap_update_lock_abnormal3"; [[maybe_unused]] auto* sp = SyncPoint::get_instance(); @@ -4853,7 +4865,8 @@ TEST(MetaServiceTest, GetDeleteBitmapUpdateLockTabletStatsError) { ASSERT_EQ(res.status().code(), MetaServiceCode::KV_TXN_GET_ERR); } - { + for (bool val : enable_batch_get_mow_tablet_stats_and_meta_vals) { + config::enable_batch_get_mow_tablet_stats_and_meta = val; // 2.4 abnormal path, meeting TXN_TOO_OLD error when reading tablets' stats, // this should not fail if lock is not expired std::string instance_id = "test_get_delete_bitmap_update_lock_abnormal4";