diff --git a/cloud/src/common/config.h b/cloud/src/common/config.h index 78d3358b581ed9..1e58f0d4d74037 100644 --- a/cloud/src/common/config.h +++ b/cloud/src/common/config.h @@ -229,6 +229,15 @@ CONF_mInt64(max_s3_client_retry, "10"); // Max byte getting delete bitmap can return, default is 1GB CONF_mInt64(max_get_delete_bitmap_byte, "1073741824"); +// Max size in bytes of a single fdb txn commit when updating delete bitmap, +// default is 7MB. The size of one fdb transaction can't exceed 10MB, and +// fdb does not have an accurate way to estimate the size of a txn. +// In testing, when txn->approximate_bytes() is bigger than 8MB, the commit +// may fail with a "Transaction exceeds byte limit" error. To be safe we +// reserve 1MB of buffer below that, so the default value is set to 7MB +// (7340032 bytes). +CONF_mInt64(max_txn_commit_byte, "7340032"); + CONF_Bool(enable_cloud_txn_lazy_commit, "true"); CONF_Int32(txn_lazy_commit_rowsets_thresold, "1000"); CONF_Int32(txn_lazy_commit_num_threads, "8"); diff --git a/cloud/src/meta-service/meta_service.cpp b/cloud/src/meta-service/meta_service.cpp index 17154a24777905..7914bf5db11cf6 100644 --- a/cloud/src/meta-service/meta_service.cpp +++ b/cloud/src/meta-service/meta_service.cpp @@ -1851,25 +1851,40 @@ void MetaServiceImpl::update_delete_bitmap(google::protobuf::RpcController* cont } // 4. Update delete bitmap for curent txn - size_t total_key = 0; - size_t total_size = 0; + size_t current_key_count = 0; + size_t current_value_count = 0; + size_t total_key_count = 0; + size_t total_value_count = 0; + size_t total_txn_put_keys = 0; + size_t total_txn_put_bytes = 0; + size_t total_txn_size = 0; for (size_t i = 0; i < request->rowset_ids_size(); ++i) { auto& key = delete_bitmap_keys.delete_bitmap_keys(i); auto& val = request->segment_delete_bitmaps(i); // Split into multiple fdb transactions, because the size of one fdb // transaction can't exceed 10MB. 
- if (fdb_txn_size + key.size() + val.size() > 9 * 1024 * 1024) { - LOG(INFO) << "fdb txn size more than 9MB, current size: " << fdb_txn_size - << " lock_id=" << request->lock_id(); + if (txn->approximate_bytes() + key.size() * 3 + val.size() > config::max_txn_commit_byte) { + LOG(INFO) << "fdb txn size more than " << config::max_txn_commit_byte + << ", current size: " << txn->approximate_bytes() + << " lock_id=" << request->lock_id() << ", need to commit"; err = txn->commit(); + total_txn_put_keys += txn->num_put_keys(); + total_txn_put_bytes += txn->put_bytes(); + total_txn_size += txn->approximate_bytes(); if (err != TxnErrorCode::TXN_OK) { code = cast_as(err); - ss << "failed to update delete bitmap, err=" << err; + ss << "failed to update delete bitmap, err=" << err << " tablet_id=" << tablet_id + << " lock_id=" << request->lock_id() + << " delete_bitmap_key=" << current_key_count + << " delete_bitmap_value=" << current_value_count + << " put_size=" << txn->put_bytes() << " num_put_keys=" << txn->num_put_keys() + << " txn_size=" << txn->approximate_bytes(); msg = ss.str(); return; } - fdb_txn_size = 0; + current_key_count = 0; + current_value_count = 0; TxnErrorCode err = txn_kv_->create_txn(&txn); if (err != TxnErrorCode::TXN_OK) { code = cast_as(err); @@ -1888,24 +1903,34 @@ void MetaServiceImpl::update_delete_bitmap(google::protobuf::RpcController* cont } // splitting large values (>90*1000) into multiple KVs cloud::put(txn.get(), key, val, 0); - fdb_txn_size = fdb_txn_size + key.size() + val.size(); - total_key++; - total_size += key.size() + val.size(); + current_key_count++; + current_value_count += val.size(); + total_key_count++; + total_value_count += val.size(); VLOG_DEBUG << "xxx update delete bitmap put delete_bitmap_key=" << hex(key) << " lock_id=" << request->lock_id() << " key_size: " << key.size() << " value_size: " << val.size(); } - err = txn->commit(); + total_txn_put_keys += txn->num_put_keys(); + total_txn_put_bytes += txn->put_bytes(); + 
total_txn_size += txn->approximate_bytes(); if (err != TxnErrorCode::TXN_OK) { code = cast_as(err); - ss << "failed to update delete bitmap, err=" << err; + ss << "failed to update delete bitmap, err=" << err << " tablet_id=" << tablet_id + << " lock_id=" << request->lock_id() << " delete_bitmap_key=" << current_key_count + << " delete_bitmap_value=" << current_value_count << " put_size=" << txn->put_bytes() + << " num_put_keys=" << txn->num_put_keys() << " txn_size=" << txn->approximate_bytes(); msg = ss.str(); return; } LOG(INFO) << "update_delete_bitmap tablet_id=" << tablet_id << " lock_id=" << request->lock_id() - << " rowset_num=" << request->rowset_ids_size() << " total_key=" << total_key - << " total_size=" << total_size << " unlock=" << unlock; + << " rowset_num=" << request->rowset_ids_size() + << " total_key_count=" << total_key_count + << " total_value_count=" << total_value_count << " unlock=" << unlock + << " total_txn_put_keys=" << total_txn_put_keys + << " total_txn_put_bytes=" << total_txn_put_bytes + << " total_txn_size=" << total_txn_size; } void MetaServiceImpl::get_delete_bitmap(google::protobuf::RpcController* controller, diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index b7004716035a46..fb17c29629b04e 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ -104,6 +104,24 @@ std::unique_ptr get_meta_service() { return get_meta_service(true); } +std::unique_ptr get_fdb_meta_service() { + config::fdb_cluster_file_path = "fdb.cluster"; + static auto txn_kv = std::dynamic_pointer_cast(std::make_shared()); + static std::atomic init {false}; + bool tmp = false; + if (init.compare_exchange_strong(tmp, true)) { + int ret = txn_kv->init(); + [&] { + ASSERT_EQ(ret, 0); + ASSERT_NE(txn_kv.get(), nullptr); + }(); + } + auto rs = std::make_shared(txn_kv); + auto rl = std::make_shared(); + auto meta_service = std::make_unique(txn_kv, rs, rl); + return 
std::make_unique(std::move(meta_service)); +} + static std::string next_rowset_id() { static int cnt = 0; return std::to_string(++cnt); @@ -4857,6 +4875,43 @@ static std::string generate_random_string(int length) { return randomString; } +TEST(MetaServiceTest, UpdateDeleteBitmapWithBigKeys) { + auto meta_service = get_fdb_meta_service(); + // Acquire the delete bitmap update lock (lock_id=-1, initiator=100) that the update request below reuses + brpc::Controller cntl; + GetDeleteBitmapUpdateLockRequest get_lock_req; + GetDeleteBitmapUpdateLockResponse get_lock_res; + get_lock_req.set_cloud_unique_id("test_cloud_unique_id"); + get_lock_req.set_table_id(1999); + get_lock_req.add_partition_ids(123); + get_lock_req.set_expiration(5); + get_lock_req.set_lock_id(-1); + get_lock_req.set_initiator(100); + meta_service->get_delete_bitmap_update_lock( + reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &get_lock_req, + &get_lock_res, nullptr); + ASSERT_EQ(get_lock_res.status().code(), MetaServiceCode::OK); + UpdateDeleteBitmapRequest update_delete_bitmap_req; + UpdateDeleteBitmapResponse update_delete_bitmap_res; + update_delete_bitmap_req.set_cloud_unique_id("test_cloud_unique_id"); + update_delete_bitmap_req.set_table_id(1999); + update_delete_bitmap_req.set_partition_id(123); + update_delete_bitmap_req.set_lock_id(-1); + update_delete_bitmap_req.set_initiator(100); + update_delete_bitmap_req.set_tablet_id(333); + std::string large_value = generate_random_string(300 * 1000 * 3); + for (int i = 0; i < 100000; i++) { + update_delete_bitmap_req.add_rowset_ids("0200000003ea308a3647dbea83220ed4b8897f2288244a91"); + update_delete_bitmap_req.add_segment_ids(0); + update_delete_bitmap_req.add_versions(i); + update_delete_bitmap_req.add_segment_delete_bitmaps("1"); + } + meta_service->update_delete_bitmap(reinterpret_cast(&cntl), + &update_delete_bitmap_req, &update_delete_bitmap_res, + nullptr); + ASSERT_EQ(update_delete_bitmap_res.status().code(), MetaServiceCode::OK); +} + TEST(MetaServiceTest, UpdateDeleteBitmap) { auto meta_service = 
get_meta_service();