diff --git a/be/src/cloud/cloud_internal_service.cpp b/be/src/cloud/cloud_internal_service.cpp index 22fa9e1d4bdae5..2e058571ba0d87 100644 --- a/be/src/cloud/cloud_internal_service.cpp +++ b/be/src/cloud/cloud_internal_service.cpp @@ -21,10 +21,12 @@ #include "cloud/cloud_storage_engine.h" #include "cloud/cloud_tablet_mgr.h" +#include "cloud/cloud_warm_up_manager.h" #include "cloud/config.h" #include "io/cache/block_file_cache.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" +#include "util/debug_points.h" namespace doris { @@ -223,9 +225,27 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c expiration_time = 0; } + if (!tablet->add_rowset_warmup_state(rs_meta, WarmUpState::TRIGGERED_BY_JOB)) { + LOG(INFO) << "found duplicate warmup task for rowset " << rowset_id.to_string() + << ", skip it"; + continue; + } + for (int64_t segment_id = 0; segment_id < rs_meta.num_segments(); segment_id++) { auto segment_size = rs_meta.segment_file_size(segment_id); - auto download_done = [=](Status st) { + auto download_done = [=, version = rs_meta.version()](Status st) { + DBUG_EXECUTE_IF("CloudInternalServiceImpl::warm_up_rowset.download_segment", { + auto sleep_time = dp->param("sleep", 3); + LOG_INFO("[verbose] block download for rowset={}, version={}, sleep={}", + rowset_id.to_string(), version.to_string(), sleep_time); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + }); + DBUG_EXECUTE_IF( + "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error", { + st = Status::InternalError("injected error"); + LOG_INFO("[verbose] inject error, tablet={}, rowset={}, st={}", + tablet_id, rowset_id.to_string(), st.to_string()); + }); if (st.ok()) { g_file_cache_event_driven_warm_up_finished_segment_num << 1; g_file_cache_event_driven_warm_up_finished_segment_size << segment_size; @@ -256,6 +276,11 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c LOG(WARNING) << "download segment failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id.to_string() << ", error: " << st; } + if (tablet->complete_rowset_segment_warmup(rowset_id, st, 1, 0) == + WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" + << rowset_id.to_string() << ") completed"; + } if (wait) { wait->signal(); } @@ -267,13 +292,10 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c .offset = 0, .download_size = segment_size, .file_system = storage_resource.value()->fs, - .ctx = - { - .is_index_data = false, - .expiration_time = expiration_time, - .is_dryrun = - config::enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.is_index_data = false, + .expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = std::move(download_done), }; g_file_cache_event_driven_warm_up_submitted_segment_num << 1; @@ -283,9 +305,18 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c } _engine.file_cache_block_downloader().submit_download_task(download_meta); - auto download_inverted_index = [&](std::string index_path, uint64_t idx_size) { + auto download_inverted_index = [&, tablet](std::string index_path, uint64_t idx_size) { auto storage_resource = rs_meta.remote_storage_resource(); - auto download_done = [=](Status st) { + auto download_done = [=, version = rs_meta.version()](Status st) { + DBUG_EXECUTE_IF( + 
"CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", { + auto sleep_time = dp->param("sleep", 3); + LOG_INFO( + "[verbose] block download for rowset={}, inverted index " + "file={}, sleep={}", + rowset_id.to_string(), index_path, sleep_time); + std::this_thread::sleep_for(std::chrono::seconds(sleep_time)); + }); if (st.ok()) { g_file_cache_event_driven_warm_up_finished_index_num << 1; g_file_cache_event_driven_warm_up_finished_index_size << idx_size; @@ -319,6 +350,11 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c << "download inverted index failed, tablet_id: " << tablet_id << " rowset_id: " << rowset_id.to_string() << ", error: " << st; } + if (tablet->complete_rowset_segment_warmup(rowset_id, st, 0, 1) == + WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << version.to_string() << "(" + << rowset_id.to_string() << ") completed"; + } if (wait) { wait->signal(); } @@ -327,18 +363,15 @@ void CloudInternalServiceImpl::warm_up_rowset(google::protobuf::RpcController* c .path = io::Path(index_path), .file_size = static_cast(idx_size), .file_system = storage_resource.value()->fs, - .ctx = - { - .is_index_data = false, // DORIS-20877 - .expiration_time = expiration_time, - .is_dryrun = config:: - enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.is_index_data = false, // DORIS-20877 + .expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = std::move(download_done), }; g_file_cache_event_driven_warm_up_submitted_index_num << 1; g_file_cache_event_driven_warm_up_submitted_index_size << idx_size; - + tablet->update_rowset_warmup_state_inverted_idx_num(rowset_id, 1); if (wait) { wait->add_count(); } diff --git a/be/src/cloud/cloud_meta_mgr.cpp b/be/src/cloud/cloud_meta_mgr.cpp index 5be30dcac792c3..6a332a292faaf2 100644 --- a/be/src/cloud/cloud_meta_mgr.cpp +++ b/be/src/cloud/cloud_meta_mgr.cpp @@ -766,7 +766,9 @@ Status CloudMetaMgr::sync_tablet_rowsets_unlocked(CloudTablet* tablet, // after doing EMPTY_CUMULATIVE compaction, MS cp is 13, get_rowset will return [2-11][12-12]. 
bool version_overlap = tablet->max_version_unlocked() >= rowsets.front()->start_version(); - tablet->add_rowsets(std::move(rowsets), version_overlap, wlock, warmup_delta_data); + tablet->add_rowsets( + std::move(rowsets), version_overlap, wlock, + warmup_delta_data || config::enable_warmup_immediately_on_new_rowset); RETURN_IF_ERROR(tablet->merge_rowsets_schema()); } diff --git a/be/src/cloud/cloud_schema_change_job.cpp b/be/src/cloud/cloud_schema_change_job.cpp index 9b629c0b038a35..583cc9a905b614 100644 --- a/be/src/cloud/cloud_schema_change_job.cpp +++ b/be/src/cloud/cloud_schema_change_job.cpp @@ -141,7 +141,7 @@ Status CloudSchemaChangeJob::process_alter_tablet(const TAlterTabletReqV2& reque if (request.alter_version > 1) { // [0-1] is a placeholder rowset, no need to convert RETURN_IF_ERROR(_base_tablet->capture_rs_readers({2, start_resp.alter_version()}, - &rs_splits, false)); + &rs_splits, CaptureRowsetOps {})); } Defer defer2 {[&]() { _new_tablet->set_alter_version(-1); diff --git a/be/src/cloud/cloud_storage_engine.cpp b/be/src/cloud/cloud_storage_engine.cpp index ed11a5c52aee0f..fe4c1f5f09994d 100644 --- a/be/src/cloud/cloud_storage_engine.cpp +++ b/be/src/cloud/cloud_storage_engine.cpp @@ -99,6 +99,7 @@ CloudStorageEngine::CloudStorageEngine(const EngineOptions& options) std::make_shared(); _cumulative_compaction_policies[CUMULATIVE_TIME_SERIES_POLICY] = std::make_shared(); + _startup_timepoint = std::chrono::system_clock::now(); } CloudStorageEngine::~CloudStorageEngine() { diff --git a/be/src/cloud/cloud_storage_engine.h b/be/src/cloud/cloud_storage_engine.h index 2b97c0b34b90b8..cfa13cf89ea609 100644 --- a/be/src/cloud/cloud_storage_engine.h +++ b/be/src/cloud/cloud_storage_engine.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -156,6 +157,16 @@ class CloudStorageEngine final : public BaseStorageEngine { Status unregister_compaction_stop_token(CloudTabletSPtr tablet, bool clear_ms); + std::chrono::time_point startup_timepoint() const { + return _startup_timepoint; + } + +#ifdef BE_TEST + void set_startup_timepoint(const std::chrono::time_point& tp) { + _startup_timepoint = tp; + } +#endif + private: void _refresh_storage_vault_info_thread_callback(); void _vacuum_stale_rowsets_thread_callback(); @@ -227,6 +238,8 @@ class CloudStorageEngine final : public BaseStorageEngine { EngineOptions _options; std::mutex _store_lock; + + std::chrono::time_point _startup_timepoint; }; } // namespace doris diff --git a/be/src/cloud/cloud_tablet.cpp b/be/src/cloud/cloud_tablet.cpp index a86abb7fe4d16a..4913f657f31603 100644 --- a/be/src/cloud/cloud_tablet.cpp +++ b/be/src/cloud/cloud_tablet.cpp @@ -18,6 +18,7 @@ #include "cloud/cloud_tablet.h" #include +#include #include #include #include @@ -27,8 +28,11 @@ #include #include +#include #include #include +#include +#include #include #include #include @@ -39,6 +43,7 @@ #include "cloud/cloud_warm_up_manager.h" #include "common/config.h" #include "common/logging.h" +#include "common/status.h" #include "io/cache/block_file_cache_downloader.h" #include "io/cache/block_file_cache_factory.h" #include "olap/base_tablet.h" @@ -63,6 +68,20 @@ using namespace ErrorCode; bvar::Adder g_unused_rowsets_count("unused_rowsets_count"); +bvar::Adder g_capture_prefer_cache_count("capture_prefer_cache_count"); +bvar::Adder g_capture_with_freshness_tolerance_count( + "capture_with_freshness_tolerance_count"); +bvar::Adder g_capture_with_freshness_tolerance_fallback_count( + "capture_with_freshness_tolerance_fallback_count"); +bvar::Window> 
g_capture_prefer_cache_count_window( + "capture_prefer_cache_count_window", &g_capture_prefer_cache_count, 30); +bvar::Window> g_capture_with_freshness_tolerance_count_window( + "capture_with_freshness_tolerance_count_window", &g_capture_with_freshness_tolerance_count, + 30); +bvar::Window> g_capture_with_freshness_tolerance_fallback_count_window( + "capture_with_freshness_tolerance_fallback_count_window", + &g_capture_with_freshness_tolerance_fallback_count, 30); + static constexpr int LOAD_INITIATOR_ID = -1; bvar::Adder g_file_cache_cloud_tablet_submitted_segment_size( @@ -89,6 +108,23 @@ bvar::Adder g_file_cache_recycle_cached_data_segment_size( bvar::Adder g_file_cache_recycle_cached_data_index_num( "file_cache_recycle_cached_data_index_num"); +bvar::Adder g_file_cache_warm_up_segment_complete_num( + "file_cache_warm_up_segment_complete_num"); +bvar::Adder g_file_cache_warm_up_segment_failed_num( + "file_cache_warm_up_segment_failed_num"); +bvar::Adder g_file_cache_warm_up_inverted_idx_complete_num( + "file_cache_warm_up_inverted_idx_complete_num"); +bvar::Adder g_file_cache_warm_up_inverted_idx_failed_num( + "file_cache_warm_up_inverted_idx_failed_num"); +bvar::Adder g_file_cache_warm_up_rowset_complete_num( + "file_cache_warm_up_rowset_complete_num"); +bvar::Adder g_file_cache_warm_up_rowset_triggered_by_job_num( + "file_cache_warm_up_rowset_triggered_by_job_num"); +bvar::Adder g_file_cache_warm_up_rowset_triggered_by_sync_rowset_num( + "file_cache_warm_up_rowset_triggered_by_sync_rowset_num"); +bvar::LatencyRecorder g_file_cache_warm_up_rowset_all_segments_latency( + "file_cache_warm_up_rowset_all_segments_latency"); + CloudTablet::CloudTablet(CloudStorageEngine& engine, TabletMetaSharedPtr tablet_meta) : BaseTablet(std::move(tablet_meta)), _engine(engine) {} @@ -104,17 +140,27 @@ std::string CloudTablet::tablet_path() const { Status CloudTablet::capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) { + const CaptureRowsetOps& opts) { DBUG_EXECUTE_IF("CloudTablet.capture_rs_readers.return.e-230", { LOG_WARNING("CloudTablet.capture_rs_readers.return e-230").tag("tablet_id", tablet_id()); return Status::Error(-230, "injected error"); }); std::shared_lock rlock(_meta_lock); *rs_splits = DORIS_TRY(capture_rs_readers_unlocked( - spec_version, CaptureRowsetOps {.skip_missing_versions = skip_missing_version})); + spec_version, CaptureRowsetOps {.skip_missing_versions = opts.skip_missing_versions})); return Status::OK(); } +[[nodiscard]] Result> CloudTablet::capture_consistent_versions_unlocked( + const Version& version_range, const CaptureRowsetOps& options) const { + if (options.query_freshness_tolerance_ms > 0) { + return capture_versions_with_freshness_tolerance(version_range, options); + } else if (options.enable_prefer_cached_rowset && !enable_unique_key_merge_on_write()) { + return capture_versions_prefer_cache(version_range); + } + return BaseTablet::capture_consistent_versions_unlocked(version_range, options); +} + Status CloudTablet::merge_rowsets_schema() { // Find the rowset with the max version auto max_version_rowset = @@ -145,6 +191,130 @@ Status CloudTablet::merge_rowsets_schema() { return Status::OK(); } +Result> CloudTablet::capture_versions_prefer_cache( + const Version& spec_version) const { + g_capture_prefer_cache_count << 1; + Versions version_path; + std::shared_lock rlock(_meta_lock); + auto st = _timestamped_version_tracker.capture_consistent_versions_prefer_cache( + spec_version, version_path, + [&](int64_t start, 
int64_t end) { return rowset_is_warmed_up_unlocked(start, end); }); + if (!st.ok()) { + return ResultError(st); + } + int64_t path_max_version = version_path.back().second; + VLOG_DEBUG << fmt::format( + "[verbose] CloudTablet::capture_versions_prefer_cache, capture path: {}, " + "tablet_id={}, spec_version={}, path_max_version={}", + fmt::join(version_path | std::views::transform([](const auto& version) { + return fmt::format("{}", version.to_string()); + }), + ", "), + tablet_id(), spec_version.to_string(), path_max_version); + return version_path; +} + +bool CloudTablet::rowset_is_warmed_up_unlocked(int64_t start_version, int64_t end_version) const { + if (start_version > end_version) { + return false; + } + Version version {start_version, end_version}; + auto it = _rs_version_map.find(version); + if (it == _rs_version_map.end()) { + it = _stale_rs_version_map.find(version); + if (it == _stale_rs_version_map.end()) { + LOG_WARNING( + "failed to find rowset in rs_version or stale_rs_version for version. " + "tablet={}, version={}", + tablet_id(), version.to_string()); + return false; + } + } + const auto& rs = it->second; + if (rs->visible_timestamp() < _engine.startup_timepoint()) { + // We only care about rowsets that are created after the startup time point. For other rowsets, + // we assume they are warmed up. + return true; + } + return is_rowset_warmed_up(rs->rowset_id()); +} + +Result<Versions> CloudTablet::capture_versions_with_freshness_tolerance( + const Version& spec_version, const CaptureRowsetOps& options) const { + g_capture_with_freshness_tolerance_count << 1; + using namespace std::chrono; + auto query_freshness_tolerance_ms = options.query_freshness_tolerance_ms; + auto freshness_limit_tp = system_clock::now() - milliseconds(query_freshness_tolerance_ms); + // find a version path where every edge (rowset) has been warmed up + Versions version_path; + std::shared_lock rlock(_meta_lock); + if (enable_unique_key_merge_on_write()) { + // For merge-on-write tables, newly generated delete bitmap marks will be on the rowsets which are in the newest layout. + // So we can only capture rowsets which are in the newest data layout. Otherwise there may be data correctness issues. 
+ RETURN_IF_ERROR_RESULT( + _timestamped_version_tracker.capture_consistent_versions_with_validator_mow( + spec_version, version_path, [&](int64_t start, int64_t end) { + return rowset_is_warmed_up_unlocked(start, end); + })); + } else { + RETURN_IF_ERROR_RESULT( + _timestamped_version_tracker.capture_consistent_versions_with_validator( + spec_version, version_path, [&](int64_t start, int64_t end) { + return rowset_is_warmed_up_unlocked(start, end); + })); + } + int64_t path_max_version = version_path.back().second; + auto should_be_visible_but_not_warmed_up = [&](const auto& rs_meta) -> bool { + if (rs_meta->version() == Version {0, 1}) { + // skip rowset [0-1] + return false; + } + bool ret = rs_meta->start_version() > path_max_version && + rs_meta->visible_timestamp() < freshness_limit_tp; + if (ret && config::read_cluster_cache_opt_verbose_log) { + std::time_t t1 = system_clock::to_time_t(rs_meta->visible_timestamp()); + std::tm tm1 = *std::localtime(&t1); + std::ostringstream oss1; + oss1 << std::put_time(&tm1, "%Y-%m-%d %H:%M:%S"); + + std::time_t t2 = system_clock::to_time_t(freshness_limit_tp); + std::tm tm2 = *std::localtime(&t2); + std::ostringstream oss2; + oss2 << std::put_time(&tm2, "%Y-%m-%d %H:%M:%S"); + LOG_INFO( + "[verbose] CloudTablet::capture_versions_with_freshness_tolerance, " + "found a rowset which should be visible but is not warmed up, tablet_id={}, " + "path_max_version={}, rowset_id={}, version={}, visible_time={}, " + "freshness_limit={}, version_graph={}, rowset_warmup_digest={}", + tablet_id(), path_max_version, rs_meta->rowset_id().to_string(), + rs_meta->version().to_string(), oss1.str(), oss2.str(), + _timestamped_version_tracker.debug_string(), rowset_warmup_digest()); + } + return ret; + }; + // use std::views::concat after C++26 + bool should_fallback = std::ranges::any_of(_tablet_meta->all_rs_metas(), + should_be_visible_but_not_warmed_up) || + std::ranges::any_of(_tablet_meta->all_stale_rs_metas(), + should_be_visible_but_not_warmed_up); + if (should_fallback) { + rlock.unlock(); + g_capture_with_freshness_tolerance_fallback_count << 1; + // if there exists a rowset whose start version is larger than the path max version and which became + // visible before the freshness limit but has not been warmed up yet, fall back to capturing rowsets as usual + return BaseTablet::capture_consistent_versions_unlocked(spec_version, options); + } + VLOG_DEBUG << fmt::format( + "[verbose] CloudTablet::capture_versions_with_freshness_tolerance, capture path: {}, " + "tablet_id={}, spec_version={}, path_max_version={}", + fmt::join(version_path | std::views::transform([](const auto& version) { + return fmt::format("{}", version.to_string()); + }), + ", "), + tablet_id(), spec_version.to_string(), path_max_version); + return version_path; +} + // There are only two tablet_states RUNNING and NOT_READY in cloud mode // This function will erase the tablet from `CloudTabletMgr` when it can't find this tablet in MS. 
Status CloudTablet::sync_rowsets(int64_t query_version, bool warmup_delta_data, @@ -243,6 +413,7 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ for (auto& rs : rowsets) { if (version_overlap || warmup_delta_data) { #ifndef BE_TEST + bool warm_up_state_updated = false; // Warmup rowset data in background for (int seg_id = 0; seg_id < rs->num_segments(); ++seg_id) { const auto& rowset_meta = rs->rowset_meta(); @@ -271,6 +442,19 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ g_file_cache_cloud_tablet_submitted_segment_size << rs->rowset_meta()->segment_file_size(seg_id); } + if (!warm_up_state_updated) { + VLOG_DEBUG << "warm up rowset " << rs->version() << "(" << rs->rowset_id() + << ") triggerd by sync rowset"; + if (!add_rowset_warmup_state_unlocked( + *(rs->rowset_meta()), WarmUpState::TRIGGERED_BY_SYNC_ROWSET)) { + LOG(INFO) << "found duplicate warmup task for rowset " + << rs->rowset_id() << ", skip it"; + break; + } + warm_up_state_updated = true; + } + // clang-format off + auto self = std::dynamic_pointer_cast(shared_from_this()); _engine.file_cache_block_downloader().submit_download_task(io::DownloadFileMeta { .path = storage_resource.value()->remote_segment_path(*rowset_meta, seg_id), @@ -279,17 +463,30 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ .ctx = { .expiration_time = expiration_time, - .is_dryrun = config:: - enable_reader_dryrun_when_download_file_cache, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true }, - .download_done {[](Status st) { + .download_done {[=](Status st) { + DBUG_EXECUTE_IF("CloudTablet::add_rowsets.download_data.callback.block_compaction_rowset", { + if (rs->version().second > rs->version().first) { + auto sleep_time = dp->param("sleep", 3); + LOG_INFO( + "[verbose] block download for rowset={}, " + "version={}, sleep={}", + rs->rowset_id().to_string(), + rs->version().to_string(), sleep_time); + std::this_thread::sleep_for( + std::chrono::seconds(sleep_time)); + } + }); + self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st, 1, 0); if (!st) { LOG_WARNING("add rowset warm up error ").error(st); } }}, }); - auto download_idx_file = [&](const io::Path& idx_path, int64_t idx_size) { + auto download_idx_file = [&, self](const io::Path& idx_path, int64_t idx_size) { io::DownloadFileMeta meta { .path = idx_path, .file_size = idx_size, @@ -297,15 +494,30 @@ void CloudTablet::add_rowsets(std::vector to_add, bool version_ .ctx = { .expiration_time = expiration_time, - .is_dryrun = config:: - enable_reader_dryrun_when_download_file_cache, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true }, - .download_done {[](Status st) { + .download_done {[=](Status st) { + DBUG_EXECUTE_IF("CloudTablet::add_rowsets.download_idx.callback.block", { + // clang-format on + auto sleep_time = dp->param("sleep", 3); + LOG_INFO( + "[verbose] block download for " + "rowset={}, inverted_idx_file={}, " + "sleep={}", + rs->rowset_id().to_string(), + idx_path.string(), sleep_time); + std::this_thread::sleep_for( + std::chrono::seconds(sleep_time)); + // clang-format off + }); + self->complete_rowset_segment_warmup(rowset_meta->rowset_id(), st, 0, 1); if (!st) { LOG_WARNING("add rowset warm up error ").error(st); } }}, }; + self->update_rowset_warmup_state_inverted_idx_num_unlocked(rowset_meta->rowset_id(), 1); _engine.file_cache_block_downloader().submit_download_task(std::move(meta)); g_file_cache_cloud_tablet_submitted_index_num << 1; 
g_file_cache_cloud_tablet_submitted_index_size << idx_size; @@ -563,6 +775,7 @@ void CloudTablet::remove_unused_rowsets() { continue; } tablet_meta()->remove_rowset_delete_bitmap(rs->rowset_id(), rs->version()); + _rowset_warm_up_states.erase(rs->rowset_id()); rs->clear_cache(); removed_rowsets.push_back(std::move(rs)); g_unused_rowsets_count << -1; @@ -1288,5 +1501,94 @@ Status CloudTablet::check_delete_bitmap_cache(int64_t txn_id, return Status::OK(); } +WarmUpState CloudTablet::get_rowset_warmup_state(RowsetId rowset_id) { + std::shared_lock rlock(_meta_lock); + if (!_rowset_warm_up_states.contains(rowset_id)) { + return WarmUpState::NONE; + } + return _rowset_warm_up_states[rowset_id].state; +} + +bool CloudTablet::add_rowset_warmup_state(const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp) { + std::lock_guard wlock(_meta_lock); + return add_rowset_warmup_state_unlocked(rowset, state, start_tp); +} + +void CloudTablet::update_rowset_warmup_state_inverted_idx_num(RowsetId rowset_id, int64_t delta) { + std::lock_guard wlock(_meta_lock); + update_rowset_warmup_state_inverted_idx_num_unlocked(rowset_id, delta); +} + +void CloudTablet::update_rowset_warmup_state_inverted_idx_num_unlocked(RowsetId rowset_id, + int64_t delta) { + if (!_rowset_warm_up_states.contains(rowset_id)) { + return; + } + _rowset_warm_up_states[rowset_id].num_inverted_idx += delta; +} + +bool CloudTablet::add_rowset_warmup_state_unlocked(const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp) { + if (_rowset_warm_up_states.contains(rowset.rowset_id())) { + return false; + } + if (state == WarmUpState::TRIGGERED_BY_JOB) { + g_file_cache_warm_up_rowset_triggered_by_job_num << 1; + } else if (state == WarmUpState::TRIGGERED_BY_SYNC_ROWSET) { + g_file_cache_warm_up_rowset_triggered_by_sync_rowset_num << 1; + } + _rowset_warm_up_states[rowset.rowset_id()] = { + .state = state, .num_segments = rowset.num_segments(), .start_tp = start_tp}; + return true; +} + +WarmUpState CloudTablet::complete_rowset_segment_warmup(RowsetId rowset_id, Status status, + int64_t segment_num, + int64_t inverted_idx_num) { + std::lock_guard wlock(_meta_lock); + if (!_rowset_warm_up_states.contains(rowset_id)) { + return WarmUpState::NONE; + } + VLOG_DEBUG << "complete rowset segment warmup for rowset " << rowset_id << ", " << status; + if (segment_num > 0) { + g_file_cache_warm_up_segment_complete_num << segment_num; + if (!status.ok()) { + g_file_cache_warm_up_segment_failed_num << segment_num; + } + } + if (inverted_idx_num > 0) { + g_file_cache_warm_up_inverted_idx_complete_num << inverted_idx_num; + if (!status.ok()) { + g_file_cache_warm_up_inverted_idx_failed_num << inverted_idx_num; + } + } + _rowset_warm_up_states[rowset_id].done(segment_num, inverted_idx_num); + if (_rowset_warm_up_states[rowset_id].has_finished()) { + g_file_cache_warm_up_rowset_complete_num << 1; + auto cost = std::chrono::duration_cast( + std::chrono::steady_clock::now() - + _rowset_warm_up_states[rowset_id].start_tp) + .count(); + g_file_cache_warm_up_rowset_all_segments_latency << cost; + _rowset_warm_up_states[rowset_id].state = WarmUpState::DONE; + } + return _rowset_warm_up_states[rowset_id].state; +} + +bool CloudTablet::is_rowset_warmed_up(const RowsetId& rowset_id) const { + auto it = _rowset_warm_up_states.find(rowset_id); + if (it == _rowset_warm_up_states.end()) { + return false; + } + return it->second.state == WarmUpState::DONE; +} + +void 
CloudTablet::add_warmed_up_rowset(const RowsetId& rowset_id) { + _rowset_warm_up_states[rowset_id] = {.state = WarmUpState::DONE, + .num_segments = 1, + .start_tp = std::chrono::steady_clock::now()}; +} + #include "common/compile_check_end.h" } // namespace doris diff --git a/be/src/cloud/cloud_tablet.h b/be/src/cloud/cloud_tablet.h index 38b9cf94e6a042..109408bf4bb441 100644 --- a/be/src/cloud/cloud_tablet.h +++ b/be/src/cloud/cloud_tablet.h @@ -21,10 +21,12 @@ #include "olap/base_tablet.h" #include "olap/partial_update_info.h" +#include "olap/rowset/rowset.h" namespace doris { class CloudStorageEngine; +enum class WarmUpState : int; struct SyncRowsetStats { int64_t get_remote_rowsets_num {0}; @@ -59,7 +61,33 @@ class CloudTablet final : public BaseTablet { bool vertical) override; Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) override; + const CaptureRowsetOps& opts) override; + + [[nodiscard]] Result> capture_consistent_versions_unlocked( + const Version& version_range, const CaptureRowsetOps& options) const override; + + // Capture versions with cache preference optimization. + // This method prioritizes using cached/warmed-up rowsets when building version paths, + // avoiding cold data reads when possible. It uses capture_consistent_versions_prefer_cache + // to find a consistent version path that prefers already warmed-up rowsets. + Result> capture_versions_prefer_cache(const Version& spec_version) const; + + // Capture versions with query freshness tolerance. + // This method finds a consistent version path where all rowsets are warmed up, + // but allows fallback to normal capture if there are newer rowsets that should be + // visible (based on freshness tolerance) but haven't been warmed up yet. + // For merge-on-write tables, uses special validation to ensure data correctness. + // + // IMPORTANT: The returned version may be smaller than the requested version if newer + // data hasn't been warmed up yet. This can cause different tablets in the same query + // to read from different versions, potentially leading to inconsistent query results. + // + // @param options.query_freshness_tolerance_ms: Time tolerance in milliseconds. Rowsets that + // became visible within this time range (after current_time - query_freshness_tolerance_ms) + // can be skipped if not warmed up. However, if older rowsets (before this time point) + // are not warmed up, the method will fallback to normal capture. 
+ Result> capture_versions_with_freshness_tolerance( + const Version& spec_version, const CaptureRowsetOps& options) const; size_t tablet_footprint() override { return _approximate_data_size.load(std::memory_order_relaxed); @@ -281,12 +309,52 @@ class CloudTablet final : public BaseTablet { static std::vector recycle_cached_data( const std::vector& rowsets); + // Add warmup state management + WarmUpState get_rowset_warmup_state(RowsetId rowset_id); + bool add_rowset_warmup_state( + const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp = std::chrono::steady_clock::now()); + void update_rowset_warmup_state_inverted_idx_num(RowsetId rowset_id, int64_t delta); + void update_rowset_warmup_state_inverted_idx_num_unlocked(RowsetId rowset_id, int64_t delta); + WarmUpState complete_rowset_segment_warmup(RowsetId rowset_id, Status status, + int64_t segment_num, int64_t inverted_idx_num); + + bool is_rowset_warmed_up(const RowsetId& rowset_id) const; + + void add_warmed_up_rowset(const RowsetId& rowset_id); + + std::string rowset_warmup_digest() const { + std::string res; + auto add_log = [&](const RowsetSharedPtr& rs) { + auto tmp = fmt::format("{}{}", rs->rowset_id().to_string(), rs->version().to_string()); + if (_rowset_warm_up_states.contains(rs->rowset_id())) { + tmp += fmt::format( + ", state={}, segments_warmed_up={}/{}, inverted_idx_warmed_up={}/{}", + _rowset_warm_up_states.at(rs->rowset_id()).state, + _rowset_warm_up_states.at(rs->rowset_id()).num_segments_warmed_up, + _rowset_warm_up_states.at(rs->rowset_id()).num_segments, + _rowset_warm_up_states.at(rs->rowset_id()).num_inverted_idx_warmed_up, + _rowset_warm_up_states.at(rs->rowset_id()).num_inverted_idx); + } + res += fmt::format("[{}],", tmp); + }; + traverse_rowsets_unlocked(add_log, true); + return res; + } + private: // FIXME(plat1ko): No need to record base size if rowsets are ordered by version void update_base_size(const Rowset& rs); Status sync_if_not_running(SyncRowsetStats* stats = nullptr); + bool add_rowset_warmup_state_unlocked( + const RowsetMeta& rowset, WarmUpState state, + std::chrono::steady_clock::time_point start_tp = std::chrono::steady_clock::now()); + + // used by capture_rs_reader_xxx functions + bool rowset_is_warmed_up_unlocked(int64_t start_version, int64_t end_version) const; + CloudStorageEngine& _engine; // this mutex MUST ONLY be used when sync meta @@ -346,6 +414,30 @@ class CloudTablet final : public BaseTablet { std::mutex _gc_mutex; std::unordered_map _unused_rowsets; std::vector, DeleteBitmapKeyRanges>> _unused_delete_bitmap; + + // for warm up states management + struct RowsetWarmUpInfo { + WarmUpState state; + int64_t num_segments = 0; + int64_t num_inverted_idx = 0; + int64_t num_segments_warmed_up = 0; + int64_t num_inverted_idx_warmed_up = 0; + std::chrono::steady_clock::time_point start_tp; + + void done(int64_t num_segments, int64_t num_inverted_idx) { + num_segments_warmed_up += num_segments; + num_inverted_idx_warmed_up += num_inverted_idx; + } + + bool has_finished() const { + return (num_segments_warmed_up >= num_segments) && + (num_inverted_idx_warmed_up >= num_inverted_idx); + } + }; + std::unordered_map _rowset_warm_up_states; + + mutable std::shared_mutex _warmed_up_rowsets_mutex; + std::unordered_set _warmed_up_rowsets; }; using CloudTabletSPtr = std::shared_ptr; diff --git a/be/src/cloud/cloud_warm_up_manager.cpp b/be/src/cloud/cloud_warm_up_manager.cpp index 2915b81f2536a3..3a1a05749861f4 100644 --- a/be/src/cloud/cloud_warm_up_manager.cpp +++ 
b/be/src/cloud/cloud_warm_up_manager.cpp @@ -110,7 +110,7 @@ void CloudWarmUpManager::submit_download_tasks(io::Path path, int64_t file_size, io::FileSystemSPtr file_system, int64_t expiration_time, std::shared_ptr wait, - bool is_index) { + bool is_index, std::function done_cb) { if (file_size < 0) { auto st = file_system->file_size(path, &file_size); if (!st.ok()) [[unlikely]] { @@ -141,13 +141,12 @@ void CloudWarmUpManager::submit_download_tasks(io::Path path, int64_t file_size, .offset = offset, .download_size = current_chunk_size, .file_system = file_system, - .ctx = - { - .expiration_time = expiration_time, - .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, - }, + .ctx = {.expiration_time = expiration_time, + .is_dryrun = config::enable_reader_dryrun_when_download_file_cache, + .is_warmup = true}, .download_done = - [=](Status st) { + [&](Status st) { + if (done_cb) done_cb(st); if (!st) { LOG_WARNING("Warm up error ").error(st); } else if (is_index) { @@ -227,12 +226,24 @@ void CloudWarmUpManager::handle_jobs() { if (expiration_time <= UnixSeconds()) { expiration_time = 0; } + if (!tablet->add_rowset_warmup_state(*rs, WarmUpState::TRIGGERED_BY_JOB)) { + LOG(INFO) << "found duplicate warmup task for rowset " << rs->rowset_id() + << ", skip it"; + continue; + } // 1st. download segment files submit_download_tasks( storage_resource.value()->remote_segment_path(*rs, seg_id), rs->segment_file_size(seg_id), storage_resource.value()->fs, - expiration_time, wait); + expiration_time, wait, false, [tablet, rs, seg_id](Status st) { + VLOG_DEBUG << "warmup rowset " << rs->version() << " segment " + << seg_id << " completed"; + if (tablet->complete_rowset_segment_warmup( + rs->rowset_id(), st, 1, 0) == WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << rs->version() << " completed"; + } + }); // 2nd. download inverted index files int64_t file_size = -1; @@ -254,8 +265,20 @@ void CloudWarmUpManager::handle_jobs() { } } } - submit_download_tasks(idx_path, file_size, storage_resource.value()->fs, - expiration_time, wait, true); + tablet->update_rowset_warmup_state_inverted_idx_num(rs->rowset_id(), 1); + submit_download_tasks( + idx_path, file_size, storage_resource.value()->fs, + expiration_time, wait, true, [=](Status st) { + VLOG_DEBUG << "warmup rowset " << rs->version() + << " segment " << seg_id + << "inverted idx:" << idx_path << " completed"; + if (tablet->complete_rowset_segment_warmup(rs->rowset_id(), + st, 0, 1) == + WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << rs->version() + << " completed"; + } + }); } } else { if (schema_ptr->has_inverted_index()) { @@ -263,8 +286,20 @@ void CloudWarmUpManager::handle_jobs() { storage_resource.value()->remote_idx_v2_path(*rs, seg_id); file_size = idx_file_info.has_index_size() ? 
idx_file_info.index_size() : -1; - submit_download_tasks(idx_path, file_size, storage_resource.value()->fs, - expiration_time, wait, true); + tablet->update_rowset_warmup_state_inverted_idx_num(rs->rowset_id(), 1); + submit_download_tasks( + idx_path, file_size, storage_resource.value()->fs, + expiration_time, wait, true, [=](Status st) { + VLOG_DEBUG << "warmup rowset " << rs->version() + << " segment " << seg_id + << "inverted idx:" << idx_path << " completed"; + if (tablet->complete_rowset_segment_warmup(rs->rowset_id(), + st, 0, 1) == + WarmUpState::DONE) { + VLOG_DEBUG << "warmup rowset " << rs->version() + << " completed"; + } + }); } } } diff --git a/be/src/cloud/cloud_warm_up_manager.h b/be/src/cloud/cloud_warm_up_manager.h index c801e77acc787b..73da26b2bfdcb0 100644 --- a/be/src/cloud/cloud_warm_up_manager.h +++ b/be/src/cloud/cloud_warm_up_manager.h @@ -38,6 +38,13 @@ enum class DownloadType { S3, }; +enum class WarmUpState : int { + NONE, + TRIGGERED_BY_SYNC_ROWSET, + TRIGGERED_BY_JOB, + DONE, +}; + struct JobMeta { JobMeta() = default; JobMeta(const TJobMeta& meta); @@ -95,8 +102,8 @@ class CloudWarmUpManager { void submit_download_tasks(io::Path path, int64_t file_size, io::FileSystemSPtr file_system, int64_t expiration_time, - std::shared_ptr wait, - bool is_index = false); + std::shared_ptr wait, bool is_index = false, + std::function done_cb = nullptr); std::mutex _mtx; std::condition_variable _cond; int64_t _cur_job_id {0}; diff --git a/be/src/cloud/config.cpp b/be/src/cloud/config.cpp index 16d6aa7f782b65..5d22c7d8f4cfe2 100644 --- a/be/src/cloud/config.cpp +++ b/be/src/cloud/config.cpp @@ -106,5 +106,7 @@ DEFINE_mInt64(warm_up_rowset_sync_wait_min_timeout_ms, "10000"); DEFINE_mInt64(warm_up_rowset_sync_wait_max_timeout_ms, "120000"); +DEFINE_mBool(enable_warmup_immediately_on_new_rowset, "false"); + #include "common/compile_check_end.h" } // namespace doris::config diff --git a/be/src/cloud/config.h b/be/src/cloud/config.h index a52ac758e671a8..6dd36d0b7b935b 100644 --- a/be/src/cloud/config.h +++ b/be/src/cloud/config.h @@ -148,5 +148,7 @@ DECLARE_mInt64(warm_up_rowset_sync_wait_min_timeout_ms); DECLARE_mInt64(warm_up_rowset_sync_wait_max_timeout_ms); +DECLARE_mBool(enable_warmup_immediately_on_new_rowset); + #include "common/compile_check_end.h" } // namespace doris::config diff --git a/be/src/cloud/pb_convert.cpp b/be/src/cloud/pb_convert.cpp index 352b9ae935654b..0da239557e2f85 100644 --- a/be/src/cloud/pb_convert.cpp +++ b/be/src/cloud/pb_convert.cpp @@ -88,6 +88,9 @@ void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, const RowsetMetaPB& in) out->mutable_inverted_index_file_info()->CopyFrom(in.inverted_index_file_info()); out->set_source_rowset_id(in.source_rowset_id()); out->set_source_tablet_id(in.source_tablet_id()); + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); + } } void doris_rowset_meta_to_cloud(RowsetMetaCloudPB* out, RowsetMetaPB&& in) { @@ -176,6 +179,9 @@ static void fill_schema_with_dict(const RowsetMetaCloudPB& in, RowsetMetaPB* out *unique_id_map.at(dict_val.parent_unique_id())->add_sparse_columns() = dict_val; VLOG_DEBUG << "fill dict sparse column" << dict_val.ShortDebugString(); } + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); + } } RowsetMetaPB cloud_rowset_meta_to_doris(const RowsetMetaCloudPB& in, @@ -246,6 +252,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, const RowsetMetaCloudPB& in, out->mutable_inverted_index_file_info()->CopyFrom(in.inverted_index_file_info()); 
out->set_source_rowset_id(in.source_rowset_id()); out->set_source_tablet_id(in.source_tablet_id()); + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); + } } void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in, @@ -304,6 +313,9 @@ void cloud_rowset_meta_to_doris(RowsetMetaPB* out, RowsetMetaCloudPB&& in, out->mutable_inverted_index_file_info()->Swap(in.mutable_inverted_index_file_info()); out->set_source_rowset_id(in.source_rowset_id()); out->set_source_tablet_id(in.source_tablet_id()); + if (in.has_visible_ts_ms()) { + out->set_visible_ts_ms(in.visible_ts_ms()); + } } TabletSchemaCloudPB doris_tablet_schema_to_cloud(const TabletSchemaPB& in) { diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp index 18116506856b7f..cee699e8a9e43b 100644 --- a/be/src/common/config.cpp +++ b/be/src/common/config.cpp @@ -1573,6 +1573,10 @@ DEFINE_mBool(enable_wal_tde, "false"); DEFINE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction, "true"); DEFINE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction, "true"); +DEFINE_mBool(print_stack_when_cache_miss, "false"); + +DEFINE_mBool(read_cluster_cache_opt_verbose_log, "false"); + // clang-format off #ifdef BE_TEST // test s3 diff --git a/be/src/common/config.h b/be/src/common/config.h index 7b76d436694d5a..6ff07645336595 100644 --- a/be/src/common/config.h +++ b/be/src/common/config.h @@ -1639,6 +1639,10 @@ DECLARE_mBool(enable_wal_tde); DECLARE_mBool(enable_prefill_output_dbm_agg_cache_after_compaction); DECLARE_mBool(enable_prefill_all_dbm_agg_cache_after_compaction); +DECLARE_mBool(print_stack_when_cache_miss); + +DECLARE_mBool(read_cluster_cache_opt_verbose_log); + #ifdef BE_TEST // test s3 DECLARE_String(test_s3_resource); diff --git a/be/src/io/cache/block_file_cache.cpp b/be/src/io/cache/block_file_cache.cpp index e7b050d5edea7f..6f3b3061990a3d 100644 --- a/be/src/io/cache/block_file_cache.cpp +++ b/be/src/io/cache/block_file_cache.cpp @@ -203,12 +203,38 @@ BlockFileCache::BlockFileCache(const std::string& cache_base_path, _cache_base_path.c_str(), "file_cache_num_read_blocks_1h", _num_read_blocks.get(), 3600); + _no_warmup_num_read_blocks = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks"); + _no_warmup_num_hit_blocks = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks"); + + _no_warmup_num_hit_blocks_5m = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks_5m", + _no_warmup_num_hit_blocks.get(), 300); + _no_warmup_num_read_blocks_5m = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks_5m", + _no_warmup_num_read_blocks.get(), 300); + _no_warmup_num_hit_blocks_1h = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_hit_blocks_1h", + _no_warmup_num_hit_blocks.get(), 3600); + _no_warmup_num_read_blocks_1h = std::make_shared>>( + _cache_base_path.c_str(), "file_cache_no_warmup_num_read_blocks_1h", + _no_warmup_num_read_blocks.get(), 3600); + _hit_ratio = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio", 0.0); _hit_ratio_5m = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_5m", 0.0); _hit_ratio_1h = std::make_shared>(_cache_base_path.c_str(), "file_cache_hit_ratio_1h", 0.0); + + _no_warmup_hit_ratio = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_hit_ratio", 0.0); + _no_warmup_hit_ratio_5m = std::make_shared>( + _cache_base_path.c_str(), 
"file_cache_no_warmup_hit_ratio_5m", 0.0); + _no_warmup_hit_ratio_1h = std::make_shared>( + _cache_base_path.c_str(), "file_cache_no_warmup_hit_ratio_1h", 0.0); + _disk_limit_mode_metrics = std::make_shared>( _cache_base_path.c_str(), "file_cache_disk_limit_mode", 0); _need_evict_cache_in_advance_metrics = std::make_shared>( @@ -794,9 +820,15 @@ FileBlocksHolder BlockFileCache::get_or_set(const UInt128Wrapper& hash, size_t o } DCHECK(!file_blocks.empty()); *_num_read_blocks << file_blocks.size(); + if (!context.is_warmup) { + *_no_warmup_num_read_blocks << file_blocks.size(); + } for (auto& block : file_blocks) { if (block->state_unsafe() == FileBlock::State::DOWNLOADED) { *_num_hit_blocks << 1; + if (!context.is_warmup) { + *_no_warmup_num_hit_blocks << 1; + } } } } @@ -1940,6 +1972,21 @@ void BlockFileCache::run_background_monitor() { _hit_ratio_1h->set_value((double)_num_hit_blocks_1h->get_value() / _num_read_blocks_1h->get_value()); } + + if (_no_warmup_num_hit_blocks->get_value() > 0) { + _no_warmup_hit_ratio->set_value((double)_no_warmup_num_hit_blocks->get_value() / + (double)_no_warmup_num_read_blocks->get_value()); + } + if (_no_warmup_num_hit_blocks_5m->get_value() > 0) { + _no_warmup_hit_ratio_5m->set_value( + (double)_no_warmup_num_hit_blocks_5m->get_value() / + (double)_no_warmup_num_read_blocks_5m->get_value()); + } + if (_no_warmup_num_hit_blocks_1h->get_value() > 0) { + _no_warmup_hit_ratio_1h->set_value( + (double)_no_warmup_num_hit_blocks_1h->get_value() / + (double)_no_warmup_num_read_blocks_1h->get_value()); + } } } } diff --git a/be/src/io/cache/block_file_cache.h b/be/src/io/cache/block_file_cache.h index d152e7403c0310..e8e768f7ce2325 100644 --- a/be/src/io/cache/block_file_cache.h +++ b/be/src/io/cache/block_file_cache.h @@ -532,9 +532,20 @@ class BlockFileCache { std::shared_ptr> _num_hit_blocks; std::shared_ptr> _num_removed_blocks; + std::shared_ptr> _no_warmup_num_read_blocks; + std::shared_ptr> _no_warmup_num_hit_blocks; + + std::shared_ptr>> _no_warmup_num_hit_blocks_5m; + std::shared_ptr>> _no_warmup_num_read_blocks_5m; + std::shared_ptr>> _no_warmup_num_hit_blocks_1h; + std::shared_ptr>> _no_warmup_num_read_blocks_1h; + std::shared_ptr> _hit_ratio; std::shared_ptr> _hit_ratio_5m; std::shared_ptr> _hit_ratio_1h; + std::shared_ptr> _no_warmup_hit_ratio; + std::shared_ptr> _no_warmup_hit_ratio_5m; + std::shared_ptr> _no_warmup_hit_ratio_1h; std::shared_ptr> _disk_limit_mode_metrics; std::shared_ptr> _need_evict_cache_in_advance_metrics; diff --git a/be/src/io/cache/cached_remote_file_reader.cpp b/be/src/io/cache/cached_remote_file_reader.cpp index a9b87734222249..aaa1e5f4feb920 100644 --- a/be/src/io/cache/cached_remote_file_reader.cpp +++ b/be/src/io/cache/cached_remote_file_reader.cpp @@ -149,7 +149,21 @@ Status CachedRemoteFileReader::read_at_impl(size_t offset, Slice result, size_t* return Status::OK(); } ReadStatistics stats; + MonotonicStopWatch read_at_sw; + read_at_sw.start(); auto defer_func = [&](int*) { + if (config::print_stack_when_cache_miss) { + if (io_ctx->file_cache_stats == nullptr && !stats.hit_cache && !io_ctx->is_warmup) { + LOG_INFO("[verbose] {}", Status::InternalError("not hit cache")); + } + } + if (!stats.hit_cache && config::read_cluster_cache_opt_verbose_log) { + LOG_INFO( + "[verbose] not hit cache, path: {}, offset: {}, size: {}, cost: {} ms, warmup: " + "{}", + path().native(), offset, bytes_req, read_at_sw.elapsed_time_milliseconds(), + io_ctx->is_warmup); + } if (io_ctx->file_cache_stats && !is_dryrun) { // update stats in 
io_ctx, for query profile _update_stats(stats, io_ctx->file_cache_stats, io_ctx->is_inverted_index); diff --git a/be/src/io/cache/file_cache_common.h b/be/src/io/cache/file_cache_common.h index f9ac525d0bef86..abbc4ff12fb735 100644 --- a/be/src/io/cache/file_cache_common.h +++ b/be/src/io/cache/file_cache_common.h @@ -148,6 +148,7 @@ struct CacheContext { cache_type = FileCacheType::NORMAL; } query_id = io_context->query_id ? *io_context->query_id : TUniqueId(); + is_warmup = io_context->is_warmup; } CacheContext() = default; bool operator==(const CacheContext& rhs) const { @@ -159,6 +160,7 @@ struct CacheContext { int64_t expiration_time {0}; bool is_cold_data {false}; ReadStatistics* stats; + bool is_warmup {false}; }; template diff --git a/be/src/io/io_common.h b/be/src/io/io_common.h index 6934aa6a75a519..82e9ae30ecada2 100644 --- a/be/src/io/io_common.h +++ b/be/src/io/io_common.h @@ -85,6 +85,8 @@ struct IOContext { // if is_dryrun, read IO will download data to cache but return no data to reader // useful to skip cache data read from local disk to accelarate warm up bool is_dryrun = false; + // if `is_warmup` == true, this I/O request is from a warm up task + bool is_warmup {false}; }; } // namespace io diff --git a/be/src/olap/base_tablet.h b/be/src/olap/base_tablet.h index cb61252c6fe75a..f8182d14c07e48 100644 --- a/be/src/olap/base_tablet.h +++ b/be/src/olap/base_tablet.h @@ -56,7 +56,7 @@ struct TabletWithVersion { enum class CompactionStage { NOT_SCHEDULED, PENDING, EXECUTING }; // Base class for all tablet classes -class BaseTablet { +class BaseTablet : public std::enable_shared_from_this { public: explicit BaseTablet(TabletMetaSharedPtr tablet_meta); virtual ~BaseTablet(); @@ -114,7 +114,7 @@ class BaseTablet { virtual Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) = 0; + const CaptureRowsetOps& opts) = 0; virtual size_t tablet_footprint() = 0; @@ -305,11 +305,16 @@ class BaseTablet { void traverse_rowsets(std::function visitor, bool include_stale = false) { std::shared_lock rlock(_meta_lock); - for (auto& [v, rs] : _rs_version_map) { + traverse_rowsets_unlocked(visitor, include_stale); + } + + void traverse_rowsets_unlocked(std::function visitor, + bool include_stale = false) const { + for (const auto& [v, rs] : _rs_version_map) { visitor(rs); } if (!include_stale) return; - for (auto& [v, rs] : _stale_rs_version_map) { + for (const auto& [v, rs] : _stale_rs_version_map) { visitor(rs); } } @@ -333,7 +338,7 @@ class BaseTablet { [[nodiscard]] Result capture_consistent_rowsets_unlocked( const Version& version_range, const CaptureRowsetOps& options) const; - [[nodiscard]] Result> capture_consistent_versions_unlocked( + [[nodiscard]] virtual Result> capture_consistent_versions_unlocked( const Version& version_range, const CaptureRowsetOps& options) const; [[nodiscard]] Result> capture_rs_readers_unlocked( @@ -409,6 +414,21 @@ struct CaptureRowsetOps { bool quiet = false; bool include_stale_rowsets = true; bool enable_fetch_rowsets_from_peers = false; + + // ======== only take effect in cloud mode ======== + + // Enable preference for cached/warmed-up rowsets when building version paths. + // When enabled, the capture process will prioritize already cached rowsets + // to avoid cold data reads and improve query performance. + bool enable_prefer_cached_rowset {false}; + + // Query freshness tolerance in milliseconds. + // Defines the time window for considering data as "fresh enough". 
+ // Rowsets that became visible within this time range can be skipped if not warmed up, + // but older rowsets (before current_time - query_freshness_tolerance_ms) that are + // not warmed up will trigger fallback to normal capture. + // Set to -1 to disable freshness tolerance checking. + int64_t query_freshness_tolerance_ms {-1}; }; struct CaptureRowsetResult { diff --git a/be/src/olap/rowset/rowset.cpp b/be/src/olap/rowset/rowset.cpp index f18ec4b6bceb61..7fb3a0030497ae 100644 --- a/be/src/olap/rowset/rowset.cpp +++ b/be/src/olap/rowset/rowset.cpp @@ -225,4 +225,7 @@ int64_t Rowset::approximate_cache_index_size() { return total_cache_size; } +std::chrono::time_point Rowset::visible_timestamp() const { + return _rowset_meta->visible_timestamp(); +} } // namespace doris diff --git a/be/src/olap/rowset/rowset.h b/be/src/olap/rowset/rowset.h index e1324f49396b7e..29866df47a0e88 100644 --- a/be/src/olap/rowset/rowset.h +++ b/be/src/olap/rowset/rowset.h @@ -327,6 +327,8 @@ class Rowset : public std::enable_shared_from_this, public MetadataAdder int64_t approximate_cache_index_size(); + std::chrono::time_point visible_timestamp() const; + protected: friend class RowsetFactory; diff --git a/be/src/olap/rowset/rowset_meta.h b/be/src/olap/rowset/rowset_meta.h index 887659e61c761f..6a96b8959fc6ff 100644 --- a/be/src/olap/rowset/rowset_meta.h +++ b/be/src/olap/rowset/rowset_meta.h @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -359,6 +360,22 @@ class RowsetMeta : public MetadataAdder { int64_t newest_write_timestamp() const { return _rowset_meta_pb.newest_write_timestamp(); } + // for cloud only + bool has_visible_ts_ms() const { return _rowset_meta_pb.has_visible_ts_ms(); } + int64_t visible_ts_ms() const { return _rowset_meta_pb.visible_ts_ms(); } + std::chrono::time_point visible_timestamp() const { + using namespace std::chrono; + if (has_visible_ts_ms()) { + return time_point(milliseconds(visible_ts_ms())); + } + return system_clock::from_time_t(newest_write_timestamp()); + } +#ifdef BE_TEST + void set_visible_ts_ms(int64_t visible_ts_ms) { + _rowset_meta_pb.set_visible_ts_ms(visible_ts_ms); + } +#endif + void set_tablet_schema(const TabletSchemaSPtr& tablet_schema); void set_tablet_schema(const TabletSchemaPB& tablet_schema); diff --git a/be/src/olap/tablet.cpp b/be/src/olap/tablet.cpp index a79de2964620b8..373c2d2963c048 100644 --- a/be/src/olap/tablet.cpp +++ b/be/src/olap/tablet.cpp @@ -945,11 +945,11 @@ void Tablet::acquire_version_and_rowsets( } Status Tablet::capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) { + const CaptureRowsetOps& opts) { std::shared_lock rlock(_meta_lock); std::vector version_path; *rs_splits = DORIS_TRY(capture_rs_readers_unlocked( - spec_version, CaptureRowsetOps {.skip_missing_versions = skip_missing_version})); + spec_version, CaptureRowsetOps {.skip_missing_versions = opts.skip_missing_versions})); return Status::OK(); } diff --git a/be/src/olap/tablet.h b/be/src/olap/tablet.h index a8c9df89ff0889..a9230a838532fa 100644 --- a/be/src/olap/tablet.h +++ b/be/src/olap/tablet.h @@ -188,7 +188,7 @@ class Tablet final : public BaseTablet { // If skip_missing_version is true, skip versions if they are missing. Status capture_rs_readers(const Version& spec_version, std::vector* rs_splits, - bool skip_missing_version) override; + const CaptureRowsetOps& opts) override; // Find the missed versions until the spec_version. 
// diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index 68f6d323bbc5ba..6acfcb5785eac3 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -249,7 +249,11 @@ class TabletMeta : public MetadataAdder { void remove_rowset_delete_bitmap(const RowsetId& rowset_id, const Version& version); bool enable_unique_key_merge_on_write() const { return _enable_unique_key_merge_on_write; } - +#ifdef BE_TEST + void set_enable_unique_key_merge_on_write(bool value) { + _enable_unique_key_merge_on_write = value; + } +#endif // TODO(Drogon): thread safety const BinlogConfig& binlog_config() const { return _binlog_config; } void set_binlog_config(BinlogConfig binlog_config) { diff --git a/be/src/olap/version_graph.cpp b/be/src/olap/version_graph.cpp index c5f8aff9d47def..6c9a8072d66948 100644 --- a/be/src/olap/version_graph.cpp +++ b/be/src/olap/version_graph.cpp @@ -25,6 +25,7 @@ #include // IWYU pragma: keep #include #include +#include #include #include @@ -329,6 +330,27 @@ Status TimestampedVersionTracker::capture_consistent_versions( return _version_graph.capture_consistent_versions(spec_version, version_path); } +Status TimestampedVersionTracker::capture_consistent_versions_with_validator( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_consistent_versions_with_validator(spec_version, version_path, + validator); +} + +Status TimestampedVersionTracker::capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_consistent_versions_prefer_cache(spec_version, version_path, + validator); +} + +Status TimestampedVersionTracker::capture_consistent_versions_with_validator_mow( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + return _version_graph.capture_consistent_versions_with_validator_mow(spec_version, version_path, + validator); +} + void TimestampedVersionTracker::capture_expired_paths( int64_t stale_sweep_endtime, std::vector* path_version_vec) const { std::map::const_iterator iter = @@ -406,6 +428,10 @@ double TimestampedVersionTracker::get_orphan_vertex_ratio() { return _version_graph.get_orphan_vertex_ratio(); } +std::string TimestampedVersionTracker::debug_string() const { + return _version_graph.debug_string(); +} + void TimestampedVersionPathContainer::add_timestamped_version(TimestampedVersionSharedPtr version) { // Compare and refresh `_max_create_time`. if (version->get_create_time() > _max_create_time) { @@ -628,6 +654,172 @@ Status VersionGraph::capture_consistent_versions(const Version& spec_version, return Status::OK(); } +Status VersionGraph::capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. spec_version={}-{}", spec_version.first, + spec_version.second); + } + + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == spec_version.first) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. 
spec_version={}", + spec_version.to_string()); + } + + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { + int64_t next_idx = -1; + int64_t first_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. + if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + if (first_idx == -1) { + first_idx = it; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else if (first_idx != -1) { + // if all edges are not in cache, use the first edge if possible + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[first_idx].value - 1); + cur_idx = first_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + +Status VersionGraph::capture_consistent_versions_with_validator( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. spec_version={}-{}", spec_version.first, + spec_version.second); + } + + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == spec_version.first) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. spec_version={}", + spec_version.to_string()); + } + + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { + int64_t next_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. + if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + +Status VersionGraph::capture_consistent_versions_with_validator_mow( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const { + if (spec_version.first > spec_version.second) { + return Status::Error( + "invalid specified version. spec_version={}-{}", spec_version.first, + spec_version.second); + } + + int64_t cur_idx = -1; + for (size_t i = 0; i < _version_graph.size(); i++) { + if (_version_graph[i].value == spec_version.first) { + cur_idx = i; + break; + } + } + + if (cur_idx < 0) { + return Status::InternalError("failed to find path in version_graph. spec_version={}", + spec_version.to_string()); + } + + int64_t end_value = spec_version.second + 1; + while (_version_graph[cur_idx].value < end_value) { + int64_t next_idx = -1; + for (const auto& it : _version_graph[cur_idx].edges) { + // Only consider incremental versions. 
+ if (_version_graph[it].value < _version_graph[cur_idx].value) { + break; + } + + if (!validator(_version_graph[cur_idx].value, _version_graph[it].value - 1)) { + if (_version_graph[cur_idx].value + 1 == _version_graph[it].value) { + break; + } + end_value = std::min(_version_graph[it].value, end_value); + continue; + } + + next_idx = it; + break; + } + + if (next_idx > -1) { + version_path.emplace_back(_version_graph[cur_idx].value, + _version_graph[next_idx].value - 1); + + cur_idx = next_idx; + } else { + return Status::OK(); + } + } + return Status::OK(); +} + double VersionGraph::get_orphan_vertex_ratio() { int64_t vertex_num = _version_graph.size(); int64_t orphan_vertex_num = 0; @@ -639,4 +831,19 @@ double VersionGraph::get_orphan_vertex_ratio() { return orphan_vertex_num / (double)vertex_num; } +std::string VersionGraph::debug_string() const { + std::stringstream ss; + ss << "VersionGraph: ["; + for (size_t i = 0; i < _version_graph.size(); ++i) { + ss << "{value: " << _version_graph[i].value << ", edges: ["; + for (const auto& edge : _version_graph[i].edges) { + if (_version_graph[edge].value > _version_graph[i].value) { + ss << _version_graph[edge].value << ", "; + } + } + ss << "]}, "; + } + ss << "]"; + return ss.str(); +} } // namespace doris diff --git a/be/src/olap/version_graph.h b/be/src/olap/version_graph.h index 56d07a52871ae7..4c65d9208614c1 100644 --- a/be/src/olap/version_graph.h +++ b/be/src/olap/version_graph.h @@ -20,6 +20,7 @@ #include #include +#include #include #include #include @@ -55,9 +56,40 @@ class VersionGraph { Status capture_consistent_versions(const Version& spec_version, std::vector* version_path) const; + Status capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + + // Given a start, this method can find a version path which satisfy the following conditions: + // 1. all edges satisfy the conditions specified by `validator` in the graph. + // 2. the destination version is as far as possible. + // 3. the path is the shortest path. + // The version paths are added to version_path as return info. + // If this version not in main version, version_path can be included expired rowset. + // NOTE: this method may return edges which is in stale path + // + // @param validator: Function that takes (start_version, end_version) representing a rowset + // and returns true if the rowset should be included in the path, false to skip it + Status capture_consistent_versions_with_validator( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + + // Capture consistent versions with validator for merge-on-write (MOW) tables. + // Similar to capture_consistent_versions_with_validator but with special handling for MOW tables. + // For MOW tables, newly generated delete bitmap marks will be on the rowsets which are in newest layout. + // So we can only capture rowsets which are in newest data layout to ensure data correctness. 
+ // + // @param validator: Function that takes (start_version, end_version) representing a rowset + // and returns true if the rowset is warmed up, false if not warmed up + Status capture_consistent_versions_with_validator_mow( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + // See comment of TimestampedVersionTracker's get_orphan_vertex_ratio(); double get_orphan_vertex_ratio(); + std::string debug_string() const; + private: /// Private method add a version to graph. void _add_vertex_to_graph(int64_t vertex_value); @@ -168,6 +200,35 @@ class TimestampedVersionTracker { Status capture_consistent_versions(const Version& spec_version, std::vector* version_path) const; + Status capture_consistent_versions_prefer_cache( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + + // Given a start, this method can find a version path which satisfy the following conditions: + // 1. all edges satisfy the conditions specified by `validator` in the graph. + // 2. the destination version is as far as possible. + // 3. the path is the shortest path. + // The version paths are added to version_path as return info. + // If this version not in main version, version_path can be included expired rowset. + // NOTE: this method may return edges which is in stale path + // + // @param validator: Function that takes (start_version, end_version) representing a rowset + // and returns true if the rowset should be included in the path, false to skip it + Status capture_consistent_versions_with_validator( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + + // Capture consistent versions with validator for merge-on-write (MOW) tables. + // Similar to capture_consistent_versions_with_validator but with special handling for MOW tables. + // For MOW tables, newly generated delete bitmap marks will be on the rowsets which are in newest layout. + // So we can only capture rowsets which are in newest data layout to ensure data correctness. + // + // @param validator: Function that takes (start_version, end_version) representing a rowset + // and returns true if the rowset is warmed up, false if not warmed up + Status capture_consistent_versions_with_validator_mow( + const Version& spec_version, std::vector& version_path, + const std::function& validator) const; + /// Capture all expired path version. /// When the last rowset create time of a path greater than expired time which can be expressed /// "now() - tablet_rowset_stale_sweep_time_sec" , this path will be remained. @@ -193,6 +254,8 @@ class TimestampedVersionTracker { // If a vertex is no longer the starting point of any edge, then this vertex is defined as orphan vertex double get_orphan_vertex_ratio(); + std::string debug_string() const; + private: /// Construct rowsets version tracker with main path rowset meta. 
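(Editor's note, not part of the patch: a minimal sketch of how a caller might drive the new validator-based capture API described in the comments above. The `warmed_up` predicate is a hypothetical stand-in for the tablet's real warm-up bookkeeping; the signature assumes the `std::vector<Version>&` / `std::function<bool(int64_t, int64_t)>` parameter types implied by the @param comments.)

// Sketch only: forward a caller-supplied warm-up predicate as the validator.
Status capture_cached_path_sketch(const TimestampedVersionTracker& tracker,
                                  const Version& spec_version,
                                  const std::function<bool(int64_t, int64_t)>& warmed_up,
                                  std::vector<Version>& version_path) {
    // The validator sees each candidate edge as the rowset [start, end]; returning
    // false makes the traversal skip that edge and try an alternative path.
    return tracker.capture_consistent_versions_with_validator(
            spec_version, version_path,
            [&](int64_t start, int64_t end) { return warmed_up(start, end); });
}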
void _construct_versioned_tracker(const std::vector& rs_metas); diff --git a/be/src/pipeline/exec/olap_scan_operator.cpp b/be/src/pipeline/exec/olap_scan_operator.cpp index 64437641d4e8b1..9f38f77b3d9a6d 100644 --- a/be/src/pipeline/exec/olap_scan_operator.cpp +++ b/be/src/pipeline/exec/olap_scan_operator.cpp @@ -561,12 +561,20 @@ Status OlapScanLocalState::prepare(RuntimeState* state) { } } + CaptureRowsetOps opts { + .skip_missing_versions = PipelineXLocalState<>::_state->skip_missing_version(), + .enable_fetch_rowsets_from_peers = config::enable_fetch_rowsets_from_peer_replicas, + .enable_prefer_cached_rowset = + config::is_cloud_mode() + ? PipelineXLocalState<>::_state->enable_prefer_cached_rowset() + : false, + .query_freshness_tolerance_ms = + config::is_cloud_mode() + ? PipelineXLocalState<>::_state->query_freshness_tolerance_ms() + : -1}; for (size_t i = 0; i < _scan_ranges.size(); i++) { - _read_sources[i] = DORIS_TRY(_tablets[i].tablet->capture_read_source( - {0, _tablets[i].version}, - {.skip_missing_versions = RuntimeFilterConsumer::_state->skip_missing_version(), - .enable_fetch_rowsets_from_peers = - config::enable_fetch_rowsets_from_peer_replicas})); + _read_sources[i] = + DORIS_TRY(_tablets[i].tablet->capture_read_source({0, _tablets[i].version}, opts)); if (!PipelineXLocalState<>::_state->skip_delete_predicate()) { _read_sources[i].fill_delete_predicates(); } diff --git a/be/src/runtime/runtime_state.h b/be/src/runtime/runtime_state.h index 14320be48e8b77..44cc3508e44491 100644 --- a/be/src/runtime/runtime_state.h +++ b/be/src/runtime/runtime_state.h @@ -433,6 +433,20 @@ class RuntimeState { return _query_options.partitioned_hash_agg_rows_threshold; } + bool enable_prefer_cached_rowset() const { + return _query_options.__isset.enable_prefer_cached_rowset && + _query_options.enable_prefer_cached_rowset; + } + + int64_t query_freshness_tolerance_ms() const { + return _query_options.query_freshness_tolerance_ms; + } + + bool enable_query_freshness_tolerance() const { + return _query_options.__isset.query_freshness_tolerance_ms && + _query_options.query_freshness_tolerance_ms > 0; + } + std::vector tablet_commit_infos() const { std::lock_guard lock(_tablet_infos_mutex); return _tablet_commit_infos; diff --git a/be/src/vec/exec/scan/new_olap_scanner.cpp b/be/src/vec/exec/scan/new_olap_scanner.cpp index 707defa16902db..ddb17471e087fb 100644 --- a/be/src/vec/exec/scan/new_olap_scanner.cpp +++ b/be/src/vec/exec/scan/new_olap_scanner.cpp @@ -200,11 +200,16 @@ Status NewOlapScanner::init() { ExecEnv::GetInstance()->storage_engine().to_cloud().tablet_hotspot().count(*tablet); } - auto maybe_read_source = tablet->capture_read_source( - _tablet_reader_params.version, - {.skip_missing_versions = _state->skip_missing_version(), - .enable_fetch_rowsets_from_peers = - config::enable_fetch_rowsets_from_peer_replicas}); + CaptureRowsetOps opts { + .skip_missing_versions = _state->skip_missing_version(), + .enable_fetch_rowsets_from_peers = + config::enable_fetch_rowsets_from_peer_replicas, + .enable_prefer_cached_rowset = + config::is_cloud_mode() ? _state->enable_prefer_cached_rowset() : false, + .query_freshness_tolerance_ms = + config::is_cloud_mode() ? _state->query_freshness_tolerance_ms() : -1}; + auto maybe_read_source = + tablet->capture_read_source(_tablet_reader_params.version, opts); if (!maybe_read_source) { LOG(WARNING) << "fail to init reader. 
res=" << maybe_read_source.error(); return maybe_read_source.error(); diff --git a/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp new file mode 100644 index 00000000000000..928701dae39357 --- /dev/null +++ b/be/test/cloud/cloud_tablet_query_prefer_cache_test.cpp @@ -0,0 +1,804 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include + +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet.h" +#include "olap/base_tablet.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_meta.h" +#include "olap/tablet_meta.h" +#include "util/uid_util.h" + +namespace doris { + +using namespace std::chrono; + +class TestQueryPreferCache : public testing::Test { +public: + TestQueryPreferCache() : _engine(CloudStorageEngine(EngineOptions {})) {} + + void SetUp() override { + config::read_cluster_cache_opt_verbose_log = true; + _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}}, + UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK, + TCompressionType::LZ4F)); + } + void TearDown() override { config::read_cluster_cache_opt_verbose_log = false; } + + RowsetSharedPtr create_rowset_without_visible_time(Version version) { + auto rs_meta = std::make_shared(); + rs_meta->set_rowset_type(BETA_ROWSET); + rs_meta->set_version(version); + rs_meta->set_rowset_id(_engine.next_rowset_id()); + RowsetSharedPtr rowset; + Status st = RowsetFactory::create_rowset(nullptr, "", rs_meta, &rowset); + if (!st.ok()) { + return nullptr; + } + return rowset; + } + + RowsetSharedPtr create_rowset(Version version, + time_point visible_timestamp = system_clock::now() - + seconds(100)) { + auto rs = create_rowset_without_visible_time(version); + if (!rs) { + return nullptr; + } + rs->rowset_meta()->set_visible_ts_ms( + duration_cast(visible_timestamp.time_since_epoch()).count()); + return rs; + } + + CloudTabletSPtr create_tablet_with_initial_rowsets(int max_version, bool is_mow = false, + bool warmup = true) { + CloudTabletSPtr tablet = + std::make_shared(_engine, std::make_shared(*_tablet_meta)); + tablet->tablet_meta()->set_enable_unique_key_merge_on_write(is_mow); + std::vector rowsets; + auto rs1 = create_rowset(Version {0, 1}); + rowsets.emplace_back(rs1); + tablet->add_warmed_up_rowset(rs1->rowset_id()); + for (int ver = 2; ver <= max_version; ver++) { + auto rs = create_rowset(Version {ver, ver}); + if (warmup) { + tablet->add_warmed_up_rowset(rs->rowset_id()); + } + rowsets.emplace_back(rs); + } + { + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets(rowsets, false, wlock, false); + } + return tablet; + } + + void 
add_new_version_rowset(CloudTabletSPtr tablet, int64_t version, bool warmed_up, + time_point visible_timestamp) { + auto rowset = create_rowset(Version {version, version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(rowset->rowset_id()); + } + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets({rowset}, false, wlock, false); + } + + void do_cumu_compaction(CloudTabletSPtr tablet, int64_t start_version, int64_t end_version, + bool warmed_up, time_point visible_timestamp) { + std::unique_lock wrlock {tablet->get_header_lock()}; + std::vector input_rowsets; + auto output_rowset = create_rowset(Version {start_version, end_version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(output_rowset->rowset_id()); + } + std::ranges::copy_if(std::views::values(tablet->rowset_map()), + std::back_inserter(input_rowsets), [=](const RowsetSharedPtr& rowset) { + return rowset->version().first >= start_version && + rowset->version().first <= end_version; + }); + if (input_rowsets.size() == 1) { + tablet->add_rowsets({output_rowset}, true, wrlock); + } else { + tablet->delete_rowsets(input_rowsets, wrlock); + tablet->add_rowsets({output_rowset}, false, wrlock); + } + } + + void check_capture_result(CloudTabletSPtr tablet, Version spec_version, + const std::vector& expected_versions) { + CaptureRowsetOps opts {.skip_missing_versions = false, + .enable_prefer_cached_rowset = true, + .query_freshness_tolerance_ms = -1}; + auto res = tablet->capture_read_source(spec_version, opts); + ASSERT_TRUE(res.has_value()); + std::vector rs_splits = std::move(res.value().rs_splits); + auto dump_versions = [](const std::vector& expected_versions, + const std::vector& splits) { + std::vector expected_str; + for (const auto& version : expected_versions) { + expected_str.push_back(version.to_string()); + } + std::vector versions; + for (const auto& split : splits) { + versions.push_back(split.rs_reader->rowset()->version().to_string()); + } + return fmt::format("expected_versions: {}, actual_versions: {}", + fmt::join(expected_str, ", "), fmt::join(versions, ", ")); + }; + ASSERT_EQ(rs_splits.size(), expected_versions.size()) + << dump_versions(expected_versions, rs_splits); + for (size_t i = 0; i < rs_splits.size(); i++) { + ASSERT_EQ(rs_splits[i].rs_reader->rowset()->version(), expected_versions[i]) + << dump_versions(expected_versions, rs_splits); + } + } + +protected: + std::string _json_rowset_meta; + TabletMetaSharedPtr _tablet_meta; + +private: + CloudStorageEngine _engine; +}; + +TEST_F(TestQueryPreferCache, testCapture_1_1) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │in cache│ │incache│ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_1_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │in cache││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_1_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, 
true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_1_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_1) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │ ││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, 
system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_2_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_1) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │ │ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. 
+*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │ ││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - 
seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_3_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +TEST_F(TestQueryPreferCache, testCapture_4_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │ ││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [11-16]│ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ │ │ │ ││ │ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-16],[17-17],[18-18] + note: 
when there are no warmed up rowset at some vertex, choose the latest edge +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, false, false); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, false, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, expected_versions); +} + +} // namespace doris \ No newline at end of file diff --git a/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp new file mode 100644 index 00000000000000..1a24ea275be007 --- /dev/null +++ b/be/test/cloud/cloud_tablet_query_with_tolerance_test.cpp @@ -0,0 +1,1074 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include + +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_tablet.h" +#include "olap/base_tablet.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_meta.h" +#include "olap/tablet_meta.h" +#include "util/uid_util.h" + +namespace doris { + +using namespace std::chrono; + +class TestFreshnessTolerance : public testing::Test { +public: + TestFreshnessTolerance() : _engine(CloudStorageEngine(EngineOptions {})) {} + + void SetUp() override { + config::read_cluster_cache_opt_verbose_log = true; + _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}}, + UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK, + TCompressionType::LZ4F)); + } + void TearDown() override { config::read_cluster_cache_opt_verbose_log = false; } + + RowsetSharedPtr create_rowset_without_visible_time(Version version) { + auto rs_meta = std::make_shared(); + rs_meta->set_rowset_type(BETA_ROWSET); + rs_meta->set_version(version); + rs_meta->set_rowset_id(_engine.next_rowset_id()); + RowsetSharedPtr rowset; + Status st = RowsetFactory::create_rowset(nullptr, "", rs_meta, &rowset); + if (!st.ok()) { + return nullptr; + } + return rowset; + } + + RowsetSharedPtr create_rowset(Version version, + time_point visible_timestamp = system_clock::now() - + seconds(100)) { + auto rs = create_rowset_without_visible_time(version); + if (!rs) { + return nullptr; + } + rs->rowset_meta()->set_visible_ts_ms( + duration_cast(visible_timestamp.time_since_epoch()).count()); + return rs; + } + + CloudTabletSPtr create_tablet_with_initial_rowsets(int max_version, bool is_mow = false) { + CloudTabletSPtr tablet = + std::make_shared(_engine, std::make_shared(*_tablet_meta)); + tablet->tablet_meta()->set_enable_unique_key_merge_on_write(is_mow); + std::vector rowsets; + auto rs1 = create_rowset(Version {0, 1}); + rowsets.emplace_back(rs1); + tablet->add_warmed_up_rowset(rs1->rowset_id()); + for (int ver = 2; ver <= max_version; ver++) { + auto rs = create_rowset(Version {ver, ver}); + tablet->add_warmed_up_rowset(rs->rowset_id()); + rowsets.emplace_back(rs); + } + { + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets(rowsets, false, wlock, false); + } + return tablet; + } + + void add_new_version_rowset(CloudTabletSPtr tablet, int64_t version, bool warmed_up, + time_point visible_timestamp) { + auto rowset = create_rowset(Version {version, version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(rowset->rowset_id()); + } + std::unique_lock wlock {tablet->get_header_lock()}; + tablet->add_rowsets({rowset}, false, wlock, false); + } + + void do_cumu_compaction(CloudTabletSPtr tablet, int64_t start_version, int64_t end_version, + bool warmed_up, time_point visible_timestamp) { + std::unique_lock wrlock {tablet->get_header_lock()}; + std::vector input_rowsets; + auto output_rowset = create_rowset(Version {start_version, end_version}, visible_timestamp); + if (warmed_up) { + tablet->add_warmed_up_rowset(output_rowset->rowset_id()); + } + std::ranges::copy_if(std::views::values(tablet->rowset_map()), + std::back_inserter(input_rowsets), [=](const RowsetSharedPtr& rowset) { + return rowset->version().first >= start_version && + rowset->version().first <= end_version; + }); + if (input_rowsets.size() == 1) { + tablet->add_rowsets({output_rowset}, true, wrlock); + } else { + tablet->delete_rowsets(input_rowsets, wrlock); + 
tablet->add_rowsets({output_rowset}, false, wrlock); + } + } + + void check_capture_result(CloudTabletSPtr tablet, Version spec_version, + int64_t query_freshness_tolerance_ms, + const std::vector& expected_versions) { + CaptureRowsetOps opts {.skip_missing_versions = false, + .enable_prefer_cached_rowset = false, + .query_freshness_tolerance_ms = query_freshness_tolerance_ms}; + auto res = tablet->capture_read_source(spec_version, opts); + ASSERT_TRUE(res.has_value()); + std::vector rs_splits = std::move(res.value().rs_splits); + auto dump_versions = [](const std::vector& expected_versions, + const std::vector& splits) { + std::vector expected_str; + for (const auto& version : expected_versions) { + expected_str.push_back(version.to_string()); + } + std::vector versions; + for (const auto& split : splits) { + versions.push_back(split.rs_reader->rowset()->version().to_string()); + } + return fmt::format("expected_versions: {}, actual_versions: {}", + fmt::join(expected_str, ", "), fmt::join(versions, ", ")); + }; + ASSERT_EQ(rs_splits.size(), expected_versions.size()) + << dump_versions(expected_versions, rs_splits); + for (size_t i = 0; i < rs_splits.size(); i++) { + ASSERT_EQ(rs_splits[i].rs_reader->rowset()->version(), expected_versions[i]) + << dump_versions(expected_versions, rs_splits); + } + } + +protected: + std::string _json_rowset_meta; + TabletMetaSharedPtr _tablet_meta; + +private: + CloudStorageEngine _engine; +}; + +TEST_F(TestFreshnessTolerance, testVisibleTimestamp) { + { + // for historical rowset, visible time is not set, RowsetMeta::visible_timestamp() uses + // newest_write_timestamp + auto tp1 = system_clock::now() - seconds(100); + auto rs = create_rowset_without_visible_time({2, 2}); + auto d = duration_cast(tp1.time_since_epoch()).count(); + rs->rowset_meta()->set_newest_write_timestamp(d); + ASSERT_EQ(rs->rowset_meta()->visible_timestamp(), system_clock::from_time_t(d)); + } + + { + // when visible_ts_ms is set, RowsetMeta::visible_timestamp() uses visible_ts_ms which is more precise + auto tp1 = system_clock::now() - seconds(100); + auto tp2 = system_clock::now() - seconds(50); + auto rs = create_rowset_without_visible_time({2, 2}); + auto d1 = duration_cast(tp1.time_since_epoch()).count(); + auto d2 = duration_cast(tp2.time_since_epoch()).count(); + rs->rowset_meta()->set_newest_write_timestamp(d1); + rs->rowset_meta()->set_visible_ts_ms(d2); + ASSERT_EQ(rs->rowset_meta()->visible_timestamp(), + time_point(milliseconds(d2))); + } +} + +TEST_F(TestFreshnessTolerance, testCapture_1_1) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │in cache ││ │ │ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + 
tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_1_2) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │ ││ │ │ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + NOTE: rowset[16-16] should be visible becasue it's within the query freshness tolerance time limit. + However, since the data files of rowset[16-16] is not in the cache, there is no difference between + capturing up to version 16 and capturing up to version 18. So we capture up to version 18. +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_1_3) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16],[17-17] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_1_4) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ 
┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17] + note: We only care about rowsets that are created after startup time point. For other historical rowsets, + we just assume that they are warmuped up. +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ 
│┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16],[17-17] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │in cache││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_2_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in 
cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + note: should not capture [2-16], otherwise we will meet cache miss +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCapture_3_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │ │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-17],[18-18] + note: should fallback +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_1_1) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │in cache ││ │ │ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ 
+ │ │ + │ │ + return: [2-10],[11-15],[16-16] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_1_2) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │ ││ │ │ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16],[17-17],[18-18] + NOTE: rowset[16-16] must be visible because it's within the query freshness tolerance time limit. + However, since the data files of rowset[16-16] are not in the cache, there is no difference between + capturing up to version 16 and capturing up to version 18 +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, + {16, 16}, {17, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_1_3) { + /* + now-10s now + + │ 10s │ + ◄───────────────────────────┤ +┌────────┐ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│in cache│ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ │ │ │ │ ││ │ │ │ │ │ +│ [2-10] │ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ + now-40s now-20s now-15s │ now-7s now-3s │ + │ │ + │ │ + return: [2-10],[11-15],[16-16],[17-17] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 
18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_1_4) { + /* + be startup time now-10s now + now - 30s + │ │ 10s │ + │ ◄───────────────────────────┤ +┌────────┐│ ┌─────────┐ ┌─────────┐│ ┌────────┐ ┌───────┐ │ +│ ││ │ in cache│ │in cache ││ │in cache│ │ │ │ +│ ││ │ │ │ ││ │ │ │ │ │ +│ [2-10] ││ │ [11-15] │ │ [16-16] ││ │ [17-17]│ │[18-18]│ │ +└────────┘│ └─────────┘ └─────────┘│ └────────┘ └───────┘ │ + │ │ │ + now-40s │ now-20s now-15s │ now-7s now-3s │ + │ │ │ + │ │ │ + + return: [2-10],[11-15],[16-16],[17-17] + note: We only care about rowsets that are created after the startup time point. For other historical rowsets, + we just assume that they are warmed up. +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(30)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, false, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_2_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + 
std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_2_2) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-15],[16-16],[17-17] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_2_3) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌────────┐│ + │in cache│ ││ │ │ ││ + │ │ ││ │ │in cache││ + │ [2-10] │ ││ [11-17]│ │[18-18] ││ + └────────┘ │└────────┘ └────────┘│ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16],[17-17] + note: due to the existence of rowset [11-17], we can only capture up to version 17 + because newly added rowsets may generate delete bitmap marks on [11-17]. 
If we capture [18-18], + we may meet data correctness issue if [18-18] has duplicate rows with [11-17] + */ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}, {17, 17}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_2_4) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ [2-16] │ │ │[18-18] ││ + └────────┘ │ └────────┘│ + │ │ + now-13s │ now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16] + note: due to the existence of rowset [2-16], we can only capture up to version 16 + because newly added rowsets may generate delete bitmap marks on [2-16]. 
If we capture [17-17], + we may meet data correctness issue if [17-17] has duplicate rows with [2-16] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_2_5) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ in cache│ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-10],[11-15],[16-16] +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 15}, {16, 16}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_2_6) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + │ ┌────────┐ ┌────────┐│ + │ │ │ │ ││ + │ │ │ │in cache││ + │ │ [2-17] │ │[18-18] ││ + │ └────────┘ └────────┘│ + │ │ + │ now-1s now-3s │ + │ │ + │ │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ │ │ │ +│ ┌────────┐ │ │ │ +│ │ │ │ │ │ +│ │ │ │ │ │ +│ │ [2-16] │ │ │ │ +│ └────────┘ │ │ │ +│ │ │ │ +│ 
now-13s │ │ │ +│ │ │ │ +│ │ │ │ +│ ┌────────┐ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │in cache│ │ │ │in cache │ ││in cache│ │ │ +│ │ │ │ │ │ │ ││ │ │ │ +│ │ [2-10] │ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └────────┘ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-40s now-20s now-15s │ now-7s │ │ +│ │ │ │ +│ │ │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + return: [2-17],[18-18] + note: because rowset [11-15] is not warmed up, we can only choose a path whose max version is below 15 + but rowset version 16 is within the query freshness tolerance time limit. So we should fall back to + capture rowsets with tablet's max version +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, false, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, true, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, true, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, true, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 2, 16, false, system_clock::now() - seconds(13)); + do_cumu_compaction(tablet, 2, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} + +TEST_F(TestFreshnessTolerance, testCaptureMow_3_1) { + /* + now-10s now + │ 10s │ + ◄────────────────────────┼ + │ │ + ┌────────┐ │┌────────┐ ┌───────┐ │ + │in cache│ ││ │ │ │ │ + │ │ ││ │ │ │ │ + │ [2-10] │ ││ [11-17]│ │[18-18]│ │ + └────────┘ │└────────┘ └───────┘ │ + │ │ + now-40s │ now-1s now-3s │ +┌───────────────────────────────────────────────────────────────────────┐ +│ │ │ │ +│ stale rowsets │ │ │ +│ ┌─────────┐ ┌─────────┐ │┌────────┐ │ │ +│ │ in cache│ │ │ ││ │ │ │ +│ │ │ │ │ ││ │ │ │ +│ │ [11-15] │ │ [16-16] │ ││ [17-17]│ │ │ +│ └─────────┘ └─────────┘ │└────────┘ │ │ +│ │ │ │ +│ now-20s now-15s │ now-7s │ │ +└───────────────────────────────────────────────────────────────────────┘ + │ │ + + return: [2-10],[11-17],[18-18] + note: should fallback +*/ + _engine.set_startup_timepoint(system_clock::now() - seconds(200)); + auto tablet = create_tablet_with_initial_rowsets(15, true); + do_cumu_compaction(tablet, 2, 10, true, system_clock::now() - seconds(40)); + do_cumu_compaction(tablet, 11, 15, true, system_clock::now() - seconds(20)); + add_new_version_rowset(tablet, 16, false, system_clock::now() - seconds(15)); + add_new_version_rowset(tablet, 17, false, system_clock::now() - seconds(7)); + add_new_version_rowset(tablet, 18, false, system_clock::now() - seconds(3)); + do_cumu_compaction(tablet, 11, 17, false, system_clock::now() - seconds(1)); + + std::string compaction_status; + tablet->get_compaction_status(&compaction_status); + std::cout << compaction_status << std::endl; + + int64_t query_freshness_tolerance_ms = 10000; // 10s + std::vector expected_versions = {{0, 1}, {2, 10}, {11, 17}, {18, 18}}; + check_capture_result(tablet, Version {0, 18}, query_freshness_tolerance_ms, expected_versions); +} +} // namespace doris diff --git a/be/test/cloud/cloud_tablet_test.cpp 
b/be/test/cloud/cloud_tablet_test.cpp new file mode 100644 index 00000000000000..fe9751ff7bfbc9 --- /dev/null +++ b/be/test/cloud/cloud_tablet_test.cpp @@ -0,0 +1,359 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "cloud/cloud_tablet.h" + +#include +#include +#include +#include + +#include +#include +#include + +#include "cloud/cloud_storage_engine.h" +#include "cloud/cloud_warm_up_manager.h" +#include "olap/rowset/rowset.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/rowset_meta.h" +#include "olap/tablet_meta.h" +#include "util/uid_util.h" + +namespace doris { + +using namespace std::chrono; + +class CloudTabletWarmUpStateTest : public testing::Test { +public: + CloudTabletWarmUpStateTest() : _engine(CloudStorageEngine(EngineOptions {})) {} + + void SetUp() override { + _tablet_meta.reset(new TabletMeta(1, 2, 15673, 15674, 4, 5, TTabletSchema(), 6, {{7, 8}}, + UniqueId(9, 10), TTabletType::TABLET_TYPE_DISK, + TCompressionType::LZ4F)); + _tablet = + std::make_shared(_engine, std::make_shared(*_tablet_meta)); + } + void TearDown() override {} + + RowsetSharedPtr create_rowset(Version version, int num_segments = 1) { + auto rs_meta = std::make_shared(); + rs_meta->set_rowset_type(BETA_ROWSET); + rs_meta->set_version(version); + rs_meta->set_rowset_id(_engine.next_rowset_id()); + rs_meta->set_num_segments(num_segments); + RowsetSharedPtr rowset; + Status st = RowsetFactory::create_rowset(nullptr, "", rs_meta, &rowset); + if (!st.ok()) { + return nullptr; + } + return rowset; + } + +protected: + std::string _json_rowset_meta; + TabletMetaSharedPtr _tablet_meta; + std::shared_ptr _tablet; + CloudStorageEngine _engine; +}; + +// Test get_rowset_warmup_state for non-existent rowset +TEST_F(CloudTabletWarmUpStateTest, TestGetRowsetWarmupStateNonExistent) { + auto rowset = create_rowset(Version(1, 1)); + ASSERT_NE(rowset, nullptr); + + auto non_existent_id = _engine.next_rowset_id(); + + WarmUpState state = _tablet->get_rowset_warmup_state(non_existent_id); + EXPECT_EQ(state, WarmUpState::NONE); +} + +// Test add_rowset_warmup_state with TRIGGERED_BY_JOB state +TEST_F(CloudTabletWarmUpStateTest, TestAddRowsetWarmupStateTriggeredByJob) { + auto rowset = create_rowset(Version(1, 1), 5); + ASSERT_NE(rowset, nullptr); + + bool result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(result); + + // Verify the state is correctly set + WarmUpState state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(state, WarmUpState::TRIGGERED_BY_JOB); +} + +// Test add_rowset_warmup_state with TRIGGERED_BY_SYNC_ROWSET state +TEST_F(CloudTabletWarmUpStateTest, TestAddRowsetWarmupStateTriggeredBySyncRowset) { + auto rowset = create_rowset(Version(2, 2), 3); 
+ ASSERT_NE(rowset, nullptr); + + bool result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_SYNC_ROWSET); + EXPECT_TRUE(result); + + // Verify the state is correctly set + WarmUpState state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(state, WarmUpState::TRIGGERED_BY_SYNC_ROWSET); +} + +// Test adding duplicate rowset warmup state should fail +TEST_F(CloudTabletWarmUpStateTest, TestAddDuplicateRowsetWarmupState) { + auto rowset = create_rowset(Version(3, 3), 2); + ASSERT_NE(rowset, nullptr); + + // First addition should succeed + bool result1 = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(result1); + + // Second addition should fail + bool result2 = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_SYNC_ROWSET); + EXPECT_FALSE(result2); + + // State should remain the original one + WarmUpState state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(state, WarmUpState::TRIGGERED_BY_JOB); +} + +// Test complete_rowset_segment_warmup for non-existent rowset +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupNonExistent) { + auto non_existent_id = _engine.next_rowset_id(); + + WarmUpState result = + _tablet->complete_rowset_segment_warmup(non_existent_id, Status::OK(), 1, 0); + EXPECT_EQ(result, WarmUpState::NONE); +} + +// Test complete_rowset_segment_warmup with partial completion +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupPartial) { + auto rowset = create_rowset(Version(4, 4), 3); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + // Complete one segment, should still be in TRIGGERED_BY_JOB state + WarmUpState result1 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB); + + // Complete second segment, should still be in TRIGGERED_BY_JOB state + WarmUpState result2 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result2, WarmUpState::TRIGGERED_BY_JOB); + + // Verify current state is still TRIGGERED_BY_JOB + WarmUpState current_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(current_state, WarmUpState::TRIGGERED_BY_JOB); +} + +// Test complete_rowset_segment_warmup with full completion +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupFull) { + auto rowset = create_rowset(Version(5, 5), 2); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_SYNC_ROWSET); + EXPECT_TRUE(add_result); + + // Complete first segment + WarmUpState result1 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_SYNC_ROWSET); + + // Complete second segment, should transition to DONE state + WarmUpState result2 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result2, WarmUpState::DONE); + + // Verify final state is DONE + WarmUpState final_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(final_state, WarmUpState::DONE); +} + +// Test complete_rowset_segment_warmup with inverted index file, partial 
completion +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupWithInvertedIndexPartial) { + auto rowset = create_rowset(Version(6, 6), 1); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + + // Complete one segment file + WarmUpState result1 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB); + + // Complete inverted index file, should still be in TRIGGERED_BY_JOB state + WarmUpState result2 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 0, 1); + EXPECT_EQ(result2, WarmUpState::TRIGGERED_BY_JOB); + + // Verify current state is still TRIGGERED_BY_JOB + WarmUpState current_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(current_state, WarmUpState::TRIGGERED_BY_JOB); +} + +// Test complete_rowset_segment_warmup with inverted index file, full completion +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupWithInvertedIndexFull) { + auto rowset = create_rowset(Version(6, 6), 1); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + _tablet->update_rowset_warmup_state_inverted_idx_num(rowset->rowset_id(), 1); + + // Complete segment file + WarmUpState result1 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + EXPECT_EQ(result1, WarmUpState::TRIGGERED_BY_JOB); + + // Complete inverted index file + WarmUpState result2 = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 0, 1); + EXPECT_EQ(result2, WarmUpState::DONE); + + // Verify final state is DONE + WarmUpState final_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(final_state, WarmUpState::DONE); +} + +// Test complete_rowset_segment_warmup with error status +TEST_F(CloudTabletWarmUpStateTest, TestCompleteRowsetSegmentWarmupWithError) { + auto rowset = create_rowset(Version(6, 6), 1); + ASSERT_NE(rowset, nullptr); + + // Add rowset warmup state + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + // Complete with error status, should still transition to DONE when all segments complete + Status error_status = Status::InternalError("Test error"); + WarmUpState result = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), error_status, 1, 0); + EXPECT_EQ(result, WarmUpState::DONE); + + // Verify final state is DONE even with error + WarmUpState final_state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(final_state, WarmUpState::DONE); +} + +// Test multiple rowsets warmup state management +TEST_F(CloudTabletWarmUpStateTest, TestMultipleRowsetsWarmupState) { + auto rowset1 = create_rowset(Version(7, 7), 2); + auto rowset2 = create_rowset(Version(8, 8), 3); + auto rowset3 = create_rowset(Version(9, 9), 1); + ASSERT_NE(rowset1, nullptr); + ASSERT_NE(rowset2, nullptr); + ASSERT_NE(rowset3, nullptr); + + // Add multiple rowsets + EXPECT_TRUE(_tablet->add_rowset_warmup_state(*(rowset1->rowset_meta()), + 
WarmUpState::TRIGGERED_BY_JOB)); + EXPECT_TRUE(_tablet->add_rowset_warmup_state(*(rowset2->rowset_meta()), + WarmUpState::TRIGGERED_BY_SYNC_ROWSET)); + EXPECT_TRUE(_tablet->add_rowset_warmup_state(*(rowset3->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB)); + + // Verify all states + EXPECT_EQ(_tablet->get_rowset_warmup_state(rowset1->rowset_id()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_EQ(_tablet->get_rowset_warmup_state(rowset2->rowset_id()), + WarmUpState::TRIGGERED_BY_SYNC_ROWSET); + EXPECT_EQ(_tablet->get_rowset_warmup_state(rowset3->rowset_id()), + WarmUpState::TRIGGERED_BY_JOB); + + // Complete rowset1 (2 segments) + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), + WarmUpState::DONE); + + // Complete rowset3 (1 segment) + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset3->rowset_id(), Status::OK(), 1, 0), + WarmUpState::DONE); + + // Verify states after completion + EXPECT_EQ(_tablet->get_rowset_warmup_state(rowset1->rowset_id()), WarmUpState::DONE); + EXPECT_EQ(_tablet->get_rowset_warmup_state(rowset2->rowset_id()), + WarmUpState::TRIGGERED_BY_SYNC_ROWSET); + EXPECT_EQ(_tablet->get_rowset_warmup_state(rowset3->rowset_id()), WarmUpState::DONE); +} + +// Test warmup state with zero segments (edge case) +TEST_F(CloudTabletWarmUpStateTest, TestWarmupStateWithZeroSegments) { + auto rowset = create_rowset(Version(10, 10), 0); + ASSERT_NE(rowset, nullptr); + + // Add rowset with zero segments + bool add_result = _tablet->add_rowset_warmup_state(*(rowset->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_TRUE(add_result); + + // State should be immediately ready for completion since there are no segments to warm up + WarmUpState state = _tablet->get_rowset_warmup_state(rowset->rowset_id()); + EXPECT_EQ(state, WarmUpState::TRIGGERED_BY_JOB); + + // Any completion call should handle the edge case gracefully + WarmUpState result = + _tablet->complete_rowset_segment_warmup(rowset->rowset_id(), Status::OK(), 1, 0); + // With 0 segments, the counter should already be 0, so this should transition to DONE + EXPECT_EQ(result, WarmUpState::DONE); +} + +// Test concurrent access to warmup state (basic thread safety verification) +TEST_F(CloudTabletWarmUpStateTest, TestConcurrentWarmupStateAccess) { + auto rowset1 = create_rowset(Version(11, 11), 4); + auto rowset2 = create_rowset(Version(12, 12), 3); + ASSERT_NE(rowset1, nullptr); + ASSERT_NE(rowset2, nullptr); + + // Add rowsets from different "threads" (simulated by sequential calls) + EXPECT_TRUE(_tablet->add_rowset_warmup_state(*(rowset1->rowset_meta()), + WarmUpState::TRIGGERED_BY_JOB)); + EXPECT_TRUE(_tablet->add_rowset_warmup_state(*(rowset2->rowset_meta()), + WarmUpState::TRIGGERED_BY_SYNC_ROWSET)); + + // Interleaved completion operations + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset2->rowset_id(), Status::OK(), 1, 0), + WarmUpState::TRIGGERED_BY_SYNC_ROWSET); + EXPECT_EQ(_tablet->complete_rowset_segment_warmup(rowset1->rowset_id(), Status::OK(), 1, 0), + WarmUpState::TRIGGERED_BY_JOB); + + // Check states are maintained correctly + EXPECT_EQ(_tablet->get_rowset_warmup_state(rowset1->rowset_id()), + WarmUpState::TRIGGERED_BY_JOB); + EXPECT_EQ(_tablet->get_rowset_warmup_state(rowset2->rowset_id()), + 
WarmUpState::TRIGGERED_BY_SYNC_ROWSET); +} +} // namespace doris diff --git a/be/test/olap/tablet_test.cpp b/be/test/olap/tablet_test.cpp index b1eb148e8a304b..d0fe006edad465 100644 --- a/be/test/olap/tablet_test.cpp +++ b/be/test/olap/tablet_test.cpp @@ -297,11 +297,11 @@ TEST_F(TestTablet, pad_rowset) { Version version(5, 5); std::vector splits; - ASSERT_FALSE(_tablet->capture_rs_readers(version, &splits, false).ok()); + ASSERT_FALSE(_tablet->capture_rs_readers(version, &splits, {}).ok()); splits.clear(); static_cast(PadRowsetAction::_pad_rowset(_tablet.get(), version)); - ASSERT_TRUE(_tablet->capture_rs_readers(version, &splits, false).ok()); + ASSERT_TRUE(_tablet->capture_rs_readers(version, &splits, {}).ok()); } TEST_F(TestTablet, cooldown_policy) { diff --git a/cloud/src/meta-service/meta_service_job.cpp b/cloud/src/meta-service/meta_service_job.cpp index 501dc4f43b3a05..c740a8720cfbb1 100644 --- a/cloud/src/meta-service/meta_service_job.cpp +++ b/cloud/src/meta-service/meta_service_job.cpp @@ -1048,7 +1048,6 @@ void process_compaction_job(MetaServiceCode& code, std::string& msg, std::string return; } - // We don't actually need to parse the rowset meta doris::RowsetMetaCloudPB rs_meta; rs_meta.ParseFromString(tmp_rowset_val); if (rs_meta.txn_id() <= 0) { @@ -1063,9 +1062,22 @@ void process_compaction_job(MetaServiceCode& code, std::string& msg, std::string INSTANCE_LOG(INFO) << "remove tmp rowset meta, tablet_id=" << tablet_id << " tmp_rowset_key=" << hex(tmp_rowset_key); + using namespace std::chrono; + auto rowset_visible_time = + duration_cast(system_clock::now().time_since_epoch()).count(); + rs_meta.set_visible_ts_ms(rowset_visible_time); + std::string rowset_val; + if (!rs_meta.SerializeToString(&rowset_val)) { + code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR; + SS << "failed to serialize rowset meta, tablet_id=" << tablet_id + << " rowset_id=" << rowset_id; + msg = ss.str(); + return; + } + int64_t version = compaction.output_versions(0); auto rowset_key = meta_rowset_key({instance_id, tablet_id, version}); - txn->put(rowset_key, tmp_rowset_val); + txn->put(rowset_key, rowset_val); INSTANCE_LOG(INFO) << "put rowset meta, tablet_id=" << tablet_id << " rowset_key=" << hex(rowset_key); @@ -1450,9 +1462,31 @@ void process_schema_change_job(MetaServiceCode& code, std::string& msg, std::str : cast_as(err); return; } + + RowsetMetaCloudPB tmp_rowset_meta; + if (!tmp_rowset_meta.ParseFromString(tmp_rowset_val)) { + code = MetaServiceCode::PROTOBUF_PARSE_ERR; + SS << "malformed tmp rowset meta, unable to deserialize, tablet_id=" << new_tablet_id + << " key=" << hex(tmp_rowset_key); + msg = ss.str(); + return; + } + using namespace std::chrono; + auto rowset_visible_time = + duration_cast(system_clock::now().time_since_epoch()).count(); + tmp_rowset_meta.set_visible_ts_ms(rowset_visible_time); + std::string rowset_val; + if (!tmp_rowset_meta.SerializeToString(&rowset_val)) { + code = MetaServiceCode::PROTOBUF_SERIALIZE_ERR; + SS << "failed to serialize rowset meta, tablet_id=" << new_tablet_id + << " rowset_id=" << tmp_rowset_meta.rowset_id_v2(); + msg = ss.str(); + return; + } + auto rowset_key = meta_rowset_key( {instance_id, new_tablet_id, schema_change.output_versions().at(i)}); - txn->put(rowset_key, tmp_rowset_val); + txn->put(rowset_key, rowset_val); txn->remove(tmp_rowset_key); } diff --git a/cloud/src/meta-service/meta_service_txn.cpp b/cloud/src/meta-service/meta_service_txn.cpp index 229a4e205fa869..4ed3c2a1c3d749 100644 --- a/cloud/src/meta-service/meta_service_txn.cpp +++ 
b/cloud/src/meta-service/meta_service_txn.cpp @@ -1158,6 +1158,10 @@ void commit_txn_immediately( std::vector> rowsets; std::unordered_map tablet_stats; // tablet_id -> stats rowsets.reserve(tmp_rowsets_meta.size()); + + int64_t rowsets_visible_ts_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + for (auto& [_, i] : tmp_rowsets_meta) { int64_t tablet_id = i.tablet_id(); int64_t table_id = tablet_ids[tablet_id].table_id(); @@ -1179,6 +1183,7 @@ void commit_txn_immediately( int64_t new_version = new_versions[ver_key]; i.set_start_version(new_version); i.set_end_version(new_version); + i.set_visible_ts_ms(rowsets_visible_ts_ms); std::string key = meta_rowset_key({instance_id, tablet_id, i.end_version()}); std::string val; @@ -2327,6 +2332,9 @@ void commit_txn_with_sub_txn(const CommitTxnRequest* request, CommitTxnResponse* continue; } + int64_t rowsets_visible_ts_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + std::vector> rowsets; std::unordered_map tablet_stats; // tablet_id -> stats for (const auto& sub_txn_info : sub_txn_infos) { @@ -2360,6 +2368,7 @@ void commit_txn_with_sub_txn(const CommitTxnRequest* request, CommitTxnResponse* } i.set_start_version(new_version); i.set_end_version(new_version); + i.set_visible_ts_ms(rowsets_visible_ts_ms); LOG(INFO) << "xxx update rowset version, txn_id=" << txn_id << ", sub_txn_id=" << sub_txn_id << ", table_id=" << table_id << ", partition_id=" << partition_id << ", tablet_id=" << tablet_id diff --git a/cloud/src/meta-service/txn_lazy_committer.cpp b/cloud/src/meta-service/txn_lazy_committer.cpp index 1de69f59d08a81..5be472ffc8ed54 100644 --- a/cloud/src/meta-service/txn_lazy_committer.cpp +++ b/cloud/src/meta-service/txn_lazy_committer.cpp @@ -62,6 +62,9 @@ void convert_tmp_rowsets( // tablet_id -> stats std::unordered_map tablet_stats; + int64_t rowsets_visible_ts_ms = + duration_cast(system_clock::now().time_since_epoch()).count(); + for (auto& [tmp_rowset_key, tmp_rowset_pb] : tmp_rowsets_meta) { std::string tmp_rowst_data; err = txn->get(tmp_rowset_key, &tmp_rowst_data); @@ -171,6 +174,7 @@ void convert_tmp_rowsets( tmp_rowset_pb.set_start_version(version); tmp_rowset_pb.set_end_version(version); + tmp_rowset_pb.set_visible_ts_ms(rowsets_visible_ts_ms); rowset_val.clear(); if (!tmp_rowset_pb.SerializeToString(&rowset_val)) { diff --git a/cloud/test/meta_service_job_test.cpp b/cloud/test/meta_service_job_test.cpp index 89bf7dcb54fb75..2df9ef629820a5 100644 --- a/cloud/test/meta_service_job_test.cpp +++ b/cloud/test/meta_service_job_test.cpp @@ -1094,6 +1094,14 @@ TEST(MetaServiceJobTest, CompactionJobTest) { auto rowset_key = meta_rowset_key({instance_id, tablet_id, input_version_end}); std::string rowset_val; EXPECT_EQ(txn->get(rowset_key, &rowset_val), TxnErrorCode::TXN_OK) << hex(rowset_key); + doris::RowsetMetaCloudPB rowset_meta; + ASSERT_TRUE(rowset_meta.ParseFromString(rowset_val)); + ASSERT_TRUE(rowset_meta.has_visible_ts_ms() && rowset_meta.visible_ts_ms() > 0); + using namespace std::chrono; + auto visible_tp = time_point(milliseconds(rowset_meta.visible_ts_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; }; auto test_abort_compaction_job = [&](int64_t table_id, int64_t index_id, int64_t partition_id, @@ -3205,6 +3213,12 @@ TEST(MetaServiceJobTest, SchemaChangeJobTest) { EXPECT_EQ(saved_rowset.start_version(), rs.start_version()); 
EXPECT_EQ(saved_rowset.end_version(), rs.end_version()); EXPECT_EQ(saved_rowset.rowset_id_v2(), rs.rowset_id_v2()); + ASSERT_TRUE(saved_rowset.has_visible_ts_ms() && saved_rowset.visible_ts_ms() > 0); + using namespace std::chrono; + auto visible_tp = time_point(milliseconds(saved_rowset.visible_ts_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; } for (int i = 3; i < 5; ++i) { // [14-14][15-15] auto [k, v] = it->next(); diff --git a/cloud/test/meta_service_test.cpp b/cloud/test/meta_service_test.cpp index 9f2bc6b11a22fd..fe1bd9e2ccb93d 100644 --- a/cloud/test/meta_service_test.cpp +++ b/cloud/test/meta_service_test.cpp @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -10999,4 +11000,66 @@ TEST(MetaServiceTest, CreateTabletIdempotentAndHandlingError) { ASSERT_EQ(res.status().code(), MetaServiceCode::KV_TXN_GET_ERR); } +TEST(MetaServiceTest, RowsetVisibleTimeTest) { + auto meta_service = get_meta_service(); + using namespace std::chrono; + int64_t txn_id = -1; + // begin txn + { + brpc::Controller cntl; + BeginTxnRequest req; + req.set_cloud_unique_id("test_cloud_unique_id"); + TxnInfoPB txn_info_pb; + txn_info_pb.set_db_id(666); + txn_info_pb.set_label("test_label"); + txn_info_pb.add_table_ids(1234); + txn_info_pb.set_timeout_ms(36000); + req.mutable_txn_info()->CopyFrom(txn_info_pb); + BeginTxnResponse res; + meta_service->begin_txn(reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &req, + &res, nullptr); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + txn_id = res.txn_id(); + } + + // mock rowset and tablet + int64_t tablet_id_base = 1103; + for (int i = 0; i < 5; ++i) { + create_tablet(meta_service.get(), 1234, 1235, 1236, tablet_id_base + i); + auto tmp_rowset = create_rowset(txn_id, tablet_id_base + i); + CreateRowsetResponse res; + commit_rowset(meta_service.get(), tmp_rowset, res); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + } + { + brpc::Controller cntl; + CommitTxnRequest req; + req.set_cloud_unique_id("test_cloud_unique_id"); + req.set_db_id(666); + req.set_txn_id(txn_id); + CommitTxnResponse res; + meta_service->commit_txn(reinterpret_cast<::google::protobuf::RpcController*>(&cntl), &req, + &res, nullptr); + ASSERT_EQ(res.status().code(), MetaServiceCode::OK); + } + + for (int i = 0; i < 5; ++i) { + int64_t tablet_id = tablet_id_base + i; + int64_t ver = 2; + std::string rowset_key = meta_rowset_key({mock_instance, tablet_id, ver}); + std::string val; + std::unique_ptr txn; + ASSERT_EQ(meta_service->txn_kv()->create_txn(&txn), TxnErrorCode::TXN_OK); + ASSERT_EQ(txn->get(rowset_key, &val), TxnErrorCode::TXN_OK); + RowsetMetaCloudPB rowset_pb; + ASSERT_TRUE(rowset_pb.ParseFromString(val)); + ASSERT_TRUE(rowset_pb.has_visible_ts_ms()); + std::cout << rowset_pb.visible_ts_ms() << "\n"; + ASSERT_GT(rowset_pb.visible_ts_ms(), 0); + auto visible_tp = time_point(milliseconds(rowset_pb.visible_ts_ms())); + std::time_t visible_time = system_clock::to_time_t(visible_tp); + std::cout << "visible time: " + << std::put_time(std::localtime(&visible_time), "%Y%m%d %H:%M:%S") << "\n"; + } +} } // namespace doris::cloud diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java index e1b0a972a7f539..1c70ce07566696 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java +++ 
b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/Auth.java @@ -1248,6 +1248,24 @@ public Pair isWorkloadGroupInUse(String groupName) { } } + public boolean getEnablePreferCachedRowset(String qualifiedUser) { + readLock(); + try { + return propertyMgr.getEnablePreferCachedRowset(qualifiedUser); + } finally { + readUnlock(); + } + } + + public long getQueryFreshnessToleranceMs(String qualifiedUser) { + readLock(); + try { + return propertyMgr.getQueryFreshnessToleranceMs(qualifiedUser); + } finally { + readUnlock(); + } + } + public void getAllDomains(Set allDomains) { readLock(); try { diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java index 67f3c7859e26f0..71789cd4ff1bef 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/CommonUserProperties.java @@ -73,6 +73,12 @@ public class CommonUserProperties implements Writable, GsonPostProcessable { @SerializedName(value = "wg", alternate = {"workloadGroup"}) private String workloadGroup = WorkloadGroupMgr.DEFAULT_GROUP_NAME; + @SerializedName(value = "epcr", alternate = {"enablePreferCachedRowset"}) + private boolean enablePreferCachedRowset = false; + + @SerializedName(value = "qft", alternate = {"queryFreshnessTolerance"}) + private long queryFreshnessToleranceMs = -1; + private String[] sqlBlockRulesSplit = {}; long getMaxConn() { @@ -186,6 +192,22 @@ public void write(DataOutput out) throws IOException { Text.writeString(out, json); } + public long getQueryFreshnessToleranceMs() { + return queryFreshnessToleranceMs; + } + + public void setQueryFreshnessToleranceMs(long queryFreshnessToleranceMs) { + this.queryFreshnessToleranceMs = queryFreshnessToleranceMs; + } + + public boolean getEnablePreferCachedRowset() { + return enablePreferCachedRowset; + } + + public void setEnablePreferCachedRowset(boolean enablePreferCachedRowset) { + this.enablePreferCachedRowset = enablePreferCachedRowset; + } + @Override public void gsonPostProcess() throws IOException { if (!Strings.isNullOrEmpty(sqlBlockRules)) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java index b79530af6f5d49..84d4af1c8c8555 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserProperty.java @@ -85,6 +85,9 @@ public class UserProperty implements Writable { public static final String DEFAULT_CLOUD_CLUSTER = "default_cloud_cluster"; public static final String DEFAULT_COMPUTE_GROUP = "default_compute_group"; + public static final String PROP_ENABLE_PREFER_CACHED_ROWSET = "enable_prefer_cached_rowset"; + public static final String PROP_QUERY_FRESHNESS_TOLERANCE = "query_freshness_tolerance_ms"; + // for system user public static final Set ADVANCED_PROPERTIES = Sets.newHashSet(); // for normal user @@ -132,6 +135,8 @@ public class UserProperty implements Writable { COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_WORKLOAD_GROUP + "$", Pattern.CASE_INSENSITIVE)); COMMON_PROPERTIES.add(Pattern.compile("^" + DEFAULT_CLOUD_CLUSTER + "$", Pattern.CASE_INSENSITIVE)); COMMON_PROPERTIES.add(Pattern.compile("^" + DEFAULT_COMPUTE_GROUP + "$", Pattern.CASE_INSENSITIVE)); + COMMON_PROPERTIES.add(Pattern.compile("^" + 
PROP_QUERY_FRESHNESS_TOLERANCE + "$", Pattern.CASE_INSENSITIVE)); + COMMON_PROPERTIES.add(Pattern.compile("^" + PROP_ENABLE_PREFER_CACHED_ROWSET + "$", Pattern.CASE_INSENSITIVE)); } public UserProperty() { @@ -194,6 +199,14 @@ public long getExecMemLimit() { return commonProperties.getExecMemLimit(); } + public long getQueryFreshnessToleranceMs() { + return commonProperties.getQueryFreshnessToleranceMs(); + } + + public boolean getEnablePreferCachedRowset() { + return commonProperties.getEnablePreferCachedRowset(); + } + public void update(List> properties) throws UserException { update(properties, false); } @@ -211,6 +224,8 @@ public void update(List> properties, boolean isReplay) thro int insertTimeout = this.commonProperties.getInsertTimeout(); String initCatalog = this.commonProperties.getInitCatalog(); String workloadGroup = this.commonProperties.getWorkloadGroup(); + long queryFreshnessToleranceMs = this.commonProperties.getQueryFreshnessToleranceMs(); + boolean enablePreferCachedRowset = this.commonProperties.getEnablePreferCachedRowset(); String newDefaultCloudCluster = defaultCloudCluster; @@ -343,6 +358,21 @@ public void update(List> properties, boolean isReplay) thro throw new DdlException("workload group " + value + " not exists"); } workloadGroup = value; + } else if (keyArr[0].equalsIgnoreCase(PROP_QUERY_FRESHNESS_TOLERANCE)) { + // set property "query_freshness_tolerance" = "1000"; + if (keyArr.length != 1) { + throw new DdlException(PROP_QUERY_FRESHNESS_TOLERANCE + " format error"); + } + queryFreshnessToleranceMs = getLongProperty(key, value, keyArr, PROP_QUERY_FRESHNESS_TOLERANCE); + } else if (keyArr[0].equalsIgnoreCase(PROP_ENABLE_PREFER_CACHED_ROWSET)) { + if (keyArr.length != 1) { + throw new DdlException(PROP_ENABLE_PREFER_CACHED_ROWSET + " format error"); + } + try { + enablePreferCachedRowset = Boolean.parseBoolean(value); + } catch (NumberFormatException e) { + throw new DdlException(PROP_ENABLE_PREFER_CACHED_ROWSET + " is not boolean"); + } } else { if (isReplay) { // After using SET PROPERTY to modify the user property, if FE rolls back to a version without @@ -367,6 +397,8 @@ public void update(List> properties, boolean isReplay) thro this.commonProperties.setInsertTimeout(insertTimeout); this.commonProperties.setInitCatalog(initCatalog); this.commonProperties.setWorkloadGroup(workloadGroup); + this.commonProperties.setQueryFreshnessToleranceMs(queryFreshnessToleranceMs); + this.commonProperties.setEnablePreferCachedRowset(enablePreferCachedRowset); defaultCloudCluster = newDefaultCloudCluster; } @@ -464,6 +496,11 @@ public List> fetchProperty() { result.add(Lists.newArrayList(PROP_WORKLOAD_GROUP, String.valueOf(commonProperties.getWorkloadGroup()))); + result.add(Lists.newArrayList(PROP_ENABLE_PREFER_CACHED_ROWSET, + String.valueOf(commonProperties.getEnablePreferCachedRowset()))); + result.add(Lists.newArrayList(PROP_QUERY_FRESHNESS_TOLERANCE, + String.valueOf(commonProperties.getQueryFreshnessToleranceMs()))); + // default cloud cluster if (defaultCloudCluster != null) { result.add(Lists.newArrayList(DEFAULT_CLOUD_CLUSTER, defaultCloudCluster)); diff --git a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java index d34dbb9aeaef81..ea77f1a78bac70 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java +++ b/fe/fe-core/src/main/java/org/apache/doris/mysql/privilege/UserPropertyMgr.java @@ -241,6 +241,24 @@ public Pair 
isWorkloadGroupInUse(String groupName) { return Pair.of(false, ""); } + public boolean getEnablePreferCachedRowset(String qualifiedUser) { + UserProperty existProperty = propertyMap.get(qualifiedUser); + existProperty = getPropertyIfNull(qualifiedUser, existProperty); + if (existProperty == null) { + return false; + } + return existProperty.getEnablePreferCachedRowset(); + } + + public long getQueryFreshnessToleranceMs(String qualifiedUser) { + UserProperty existProperty = propertyMap.get(qualifiedUser); + existProperty = getPropertyIfNull(qualifiedUser, existProperty); + if (existProperty == null) { + return -1; + } + return existProperty.getQueryFreshnessToleranceMs(); + } + /** * The method determines which user property to return based on the existProperty parameter * and system configuration: diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 5b959c8b9814ff..859877f374f1db 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -751,6 +751,9 @@ public class SessionVariable implements Serializable, Writable { public static final String DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE = "default_variant_max_sparse_column_statistics_size"; + public static final String ENABLE_PREFER_CACHED_ROWSET = "enable_prefer_cached_rowset"; + public static final String QUERY_FRESHNESS_TOLERANCE_MS = "query_freshness_tolerance_ms"; + /** * If set false, user couldn't submit analyze SQL and FE won't allocate any related resources. */ @@ -2258,6 +2261,14 @@ public void setEnableLeftZigZag(boolean enableLeftZigZag) { needForward = true) public boolean enableExternalTableBatchMode = true; + @VariableMgr.VarAttr(name = ENABLE_PREFER_CACHED_ROWSET, needForward = false, + description = {"是否启用 prefer cached rowset 功能", + "Whether to enable prefer cached rowset feature"}) + public boolean enablePreferCachedRowset = false; + + @VariableMgr.VarAttr(name = QUERY_FRESHNESS_TOLERANCE_MS, needForward = false) + public long queryFreshnessToleranceMs = -1; + public Set getIgnoredRuntimeFilterIds() { Set ids = Sets.newLinkedHashSet(); if (ignoreRuntimeFilterIds.isEmpty()) { @@ -3295,6 +3306,30 @@ public int getParallelExecInstanceNum() { } } + public boolean getEnablePreferCachedRowset() { + ConnectContext connectContext = ConnectContext.get(); + if (connectContext != null && connectContext.getEnv() != null && connectContext.getEnv().getAuth() != null) { + boolean userEnablePreferCachedRowset = connectContext.getEnv().getAuth() + .getEnablePreferCachedRowset(connectContext.getQualifiedUser()); + if (userEnablePreferCachedRowset) { + return userEnablePreferCachedRowset; + } + } + return enablePreferCachedRowset; + } + + public long getQueryFreshnessToleranceMs() { + ConnectContext connectContext = ConnectContext.get(); + if (connectContext != null && connectContext.getEnv() != null && connectContext.getEnv().getAuth() != null) { + long userQueryFreshnessToleranceMs = connectContext.getEnv().getAuth() + .getQueryFreshnessToleranceMs(connectContext.getQualifiedUser()); + if (userQueryFreshnessToleranceMs > 0) { + return userQueryFreshnessToleranceMs; + } + } + return queryFreshnessToleranceMs; + } + public int getExchangeInstanceParallel() { return exchangeInstanceParallel; } @@ -4295,6 +4330,10 @@ public TQueryOptions toThrift() { tResult.setEnableJoinSpill(enableJoinSpill); tResult.setEnableSortSpill(enableSortSpill); 
tResult.setEnableAggSpill(enableAggSpill); + + tResult.setEnablePreferCachedRowset(getEnablePreferCachedRowset()); + tResult.setQueryFreshnessToleranceMs(getQueryFreshnessToleranceMs()); + tResult.setEnableForceSpill(enableForceSpill); tResult.setMinRevocableMem(minRevocableMem); tResult.setDataQueueMaxBlocks(dataQueueMaxBlocks); diff --git a/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java b/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java index 6762f6bbad0380..143cbfaa2fdbc2 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/catalog/UserPropertyTest.java @@ -109,6 +109,8 @@ public void testUpdate() throws UserException { properties.add(Pair.of("sql_block_rules", "rule1,rule2")); properties.add(Pair.of("cpu_resource_limit", "2")); properties.add(Pair.of("query_timeout", "500")); + properties.add(Pair.of("enable_prefer_cached_rowset", "true")); + properties.add(Pair.of("query_freshness_tolerance_ms", "4500")); UserProperty userProperty = new UserProperty(); userProperty.update(properties); @@ -119,6 +121,8 @@ public void testUpdate() throws UserException { Assert.assertEquals(2, userProperty.getCpuResourceLimit()); Assert.assertEquals(500, userProperty.getQueryTimeout()); Assert.assertEquals(Sets.newHashSet(), userProperty.getCopiedResourceTags()); + Assert.assertEquals(true, userProperty.getEnablePreferCachedRowset()); + Assert.assertEquals(4500, userProperty.getQueryFreshnessToleranceMs()); // fetch property List> rows = userProperty.fetchProperty(); diff --git a/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java b/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java index 850d8b27b062af..e9ae96128468bf 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/planner/ResourceTagQueryTest.java @@ -280,7 +280,7 @@ public void test() throws Exception { Assert.assertEquals(1000000, execMemLimit); List> userProps = Env.getCurrentEnv().getAuth().getUserProperties(Auth.ROOT_USER); - Assert.assertEquals(13, userProps.size()); + Assert.assertEquals(15, userProps.size()); // now : // be1 be2 be3 ==>tag1; diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index 1e97d5ad476cb1..776072576988fd 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -140,6 +140,8 @@ message RowsetMetaPB { optional bool enable_inverted_index_file_info = 1006; repeated InvertedIndexFileInfo inverted_index_file_info = 1007; + + optional int64 visible_ts_ms = 1010; } message SchemaDictKeyList { @@ -233,6 +235,8 @@ message RowsetMetaCloudPB { optional bool enable_inverted_index_file_info = 106; repeated InvertedIndexFileInfo inverted_index_file_info = 107; + + optional int64 visible_ts_ms = 109; } message SegmentStatisticsPB { diff --git a/gensrc/thrift/PaloInternalService.thrift b/gensrc/thrift/PaloInternalService.thrift index 508d64f772ae28..2dfab470dd0160 100644 --- a/gensrc/thrift/PaloInternalService.thrift +++ b/gensrc/thrift/PaloInternalService.thrift @@ -366,6 +366,9 @@ struct TQueryOptions { // upgrade options. keep them same in every branch. 
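The FE changes above expose the two new knobs both as user properties (persisted via SET PROPERTY and read back through UserPropertyMgr) and as session variables that are forwarded to the BE through the new TQueryOptions fields. A rough usage sketch in the regression-test style used later in this patch, assuming the property key spellings from UserPropertyTest above; the user name 'root' and the concrete values are illustrative only:

    // user-level properties: persisted by SET PROPERTY and replayed on the followers
    sql """set property for 'root' 'query_freshness_tolerance_ms' = '5000'"""
    sql """set property for 'root' 'enable_prefer_cached_rowset' = 'true'"""

    // session-level variables: forwarded to the BE via TQueryOptions; per
    // SessionVariable.getQueryFreshnessToleranceMs above, a positive user-level
    // value takes precedence over the session value
    sql "set query_freshness_tolerance_ms = 2000"
    sql "set enable_prefer_cached_rowset = false"

    // inspect what is currently effective at the user level
    def props = sql """show property for 'root' like '%query_freshness_tolerance%'"""
    logger.info("user property after update: ${props}")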
200: optional bool new_is_ip_address_in_range = false; + 172: optional bool enable_prefer_cached_rowset + 173: optional i64 query_freshness_tolerance_ms + // For cloud, to control if the content would be written into file cache // In write path, to control if the content would be written into file cache. // In read path, read from file cache or remote storage when execute query. diff --git a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_basic.out b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_basic.out new file mode 100644 index 00000000000000..99e1f4ad641944 --- /dev/null +++ b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_basic.out @@ -0,0 +1,16 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !sql -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !sql -- +1 {"a":1} +2 {"a":111.1111} +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} +6 {"a":1111.11111} + diff --git a/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_schema_change_add_key_column.csv.gz b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_schema_change_add_key_column.csv.gz new file mode 100644 index 00000000000000..bc9d3dd70ea8a5 Binary files /dev/null and b/regression-test/data/cloud_p0/cache/multi_cluster/warm_up/cluster/test_schema_change_add_key_column.csv.gz differ diff --git a/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out new file mode 100644 index 00000000000000..04cf3be33e8192 --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.out @@ -0,0 +1,32 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster1 -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster2_0 -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster1_new_data -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} +6 {"a":1111.11111} + +-- !cluster2_1 -- +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 +6 {"a":1111.11111} 0 7 + +-- !cluster2_2 -- +1 {"a":1} 0 2 +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 +6 {"a":1111.11111} 0 7 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out new file mode 100644 index 00000000000000..b99240d21e24ff --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.out @@ -0,0 +1,24 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !cluster1 -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster2_0 -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} + +-- !cluster1_new_data -- +3 {"a":"11111"} +4 {"a":1111111111} +5 {"a":1111.11111} +6 {"a":1111.11111} + +-- !cluster2_1 -- +1 {"a":1} 0 2 +1 \N 1 3 +3 {"a":"11111"} 0 4 +4 {"a":1111111111} 0 5 +5 {"a":1111.11111} 0 6 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out new file mode 100644 index 00000000000000..7cefab58718a8e --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.out @@ -0,0 +1,9 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2 -- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out new file mode 100644 index 00000000000000..8191de7859e2eb --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2_0 -- +3 3 +4 4 +5 5 +6 6 + +-- !cluster2_1 -- +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 +9 9 0 8 + +-- !cluster2_2 -- +1 1 0 2 +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out new file mode 100644 index 00000000000000..e7fffb3bcb7ec1 --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.out @@ -0,0 +1,11 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2_0 -- +2 2 + +-- !cluster2_1 -- +1 \N 1 3 +2 2 0 2 +9 9 0 4 + +-- !cluster2_2 -- + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out new file mode 100644 index 00000000000000..b915a1b6c9449a --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.out @@ -0,0 +1,23 @@ +-- This file is automatically generated. You should know what you did if you want to edit this +-- !cluster2_0 -- +3 3 +4 4 +5 5 +6 6 + +-- !cluster2_1 -- +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 +9 9 0 8 + +-- !cluster2 -- +1 1 0 2 +1 2 1 3 +3 3 0 4 +4 4 0 5 +5 5 0 6 +6 6 0 7 + diff --git a/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out new file mode 100644 index 00000000000000..59c500c665d402 --- /dev/null +++ b/regression-test/data/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.out @@ -0,0 +1,10 @@ +-- This file is automatically generated. 
You should know what you did if you want to edit this +-- !cluster2 -- +1 1 +2 2 +3 3 +4 4 +5 5 +6 6 +9 9 + diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_basic.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_basic.groovy new file mode 100644 index 00000000000000..7bbe3f01895424 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_basic.groovy @@ -0,0 +1,154 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_immediate_warmup_basic', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'block_file_cache_monitor_interval_sec=1', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(5000) + } + + def updateBeConf = {cluster, key, value -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + def (code, out, err) = update_be_config(ip, port, key, value) + logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) + } + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = 
be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + updateBeConf(clusterName2, "enable_warmup_immediately_on_new_rowset", "true") + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + sql """ + create table test ( + col0 int not null, + col1 variant NOT NULL + ) DUPLICATE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(15000) + + sql """insert into test values (1, '{"a" : 1.0}')""" + sql """insert into test values (2, '{"a" : 111.1111}')""" + sql """insert into test values (3, '{"a" : "11111"}')""" + sql """insert into test values (4, '{"a" : 1111111111}')""" + sql """insert into test values (5, '{"a" : 1111.11111}')""" + + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + qt_sql """select * from test""" + assertEquals(5, getBrpcMetricsByCluster(clusterName2, "file_cache_download_submitted_num")) + assertEquals(5, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_sync_rowset_num")) + assertEquals(0, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_job_num")) + + // switch to source cluster and trigger compaction + sql """use @${clusterName1}""" + trigger_and_wait_compaction("test", "cumulative") + sql """insert into test values (6, '{"a" : 1111.11111}')""" + sleep(2000) + + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + qt_sql """select * from test""" + // wait until the injection complete + sleep(1000) + + assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_download_submitted_num")) + assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_sync_rowset_num")) + assertEquals(0, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_job_num")) + sleep(5000) + assertEquals(7, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_complete_num")) + } +} diff --git a/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_multi_segments.groovy b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_multi_segments.groovy new file mode 100644 index 00000000000000..fc1416984d2e87 --- /dev/null +++ b/regression-test/suites/cloud_p0/cache/multi_cluster/warm_up/cluster/test_immediate_warmup_multi_segments.groovy @@ -0,0 +1,232 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.Http +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_immediate_warmup_multi_segments', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'block_file_cache_monitor_interval_sec=1', + 'tablet_rowset_stale_sweep_time_sec=0', + 'vacuum_stale_rowsets_interval_s=10', + 'doris_scanner_row_bytes=1', + ] + options.enableDebugPoints() + options.cloudMode = true + + def testTable = "test" + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(5000) + } + + def updateBeConf = {cluster, key, value -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + def (code, out, err) = update_be_config(ip, port, key, value) + logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) + } + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def injectS3FileReadSlow = {cluster, sleep_s -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + def injectName = 'S3FileReader::read_at_impl.io_slow' + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + GetDebugPoint().enableDebugPoint(ip, port as int, NodeType.BE, injectName, [sleep:sleep_s, execute:1]) + } + } + + def getTabletStatus = { cluster, tablet_id, rowsetIndex, lastRowsetSegmentNum, enableAssert = false -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : 
\"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[4] + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + + def tabletJson = parseJson(out.trim()) + assert tabletJson.rowsets instanceof List + assertTrue(tabletJson.rowsets.size() >= rowsetIndex) + def rowset = tabletJson.rowsets.get(rowsetIndex - 1) + logger.info("rowset: ${rowset}") + + int start_index = rowset.indexOf("]") + int end_index = rowset.indexOf("DATA") + def segmentNumStr = rowset.substring(start_index + 1, end_index).trim() + logger.info("segmentNumStr: ${segmentNumStr}") + if (enableAssert) { + assertEquals(lastRowsetSegmentNum, Integer.parseInt(segmentNumStr)) + } else { + return lastRowsetSegmentNum == Integer.parseInt(segmentNumStr); + } + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + updateBeConf(clusterName2, "enable_warmup_immediately_on_new_rowset", "true") + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + sql """ DROP TABLE IF EXISTS ${testTable} """ + sql """ CREATE TABLE IF NOT EXISTS ${testTable} ( + `k1` int(11) NULL, + `k2` int(11) NULL, + `v3` int(11) NULL, + `v4` int(11) NULL + ) unique KEY(`k1`, `k2`) + DISTRIBUTED BY HASH(`k1`) BUCKETS 1 + PROPERTIES ( + "replication_num" = "1", + "disable_auto_compaction" = "true" + ); + """ + + clearFileCacheOnAllBackends() + sleep(15000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + GetDebugPoint().enableDebugPointForAllBEs("MemTable.need_flush") + try { + // load 1 + streamLoad { + table "${testTable}" + set 'column_separator', ',' + set 'compress_type', 'GZ' + file 'test_schema_change_add_key_column.csv.gz' + time 10000 // limit inflight 10s + + check { res, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + def json = parseJson(res) + assertEquals("success", json.Status.toLowerCase()) + assertEquals(8192, json.NumberTotalRows) + assertEquals(0, json.NumberFilteredRows) + } + } + sql "sync" + def rowCount1 = sql """ select count() from ${testTable}; """ + logger.info("rowCount1: ${rowCount1}") + // check generate 3 segments + getTabletStatus(clusterName1, tablet_id, 2, 3, true) + + // switch to read cluster, trigger a sync rowset + injectS3FileReadSlow(clusterName2, 10) + // the query will be blocked by the injection, we call it async + def future = thread { + sql """use @${clusterName2}""" + sql """select * from test""" + } + sleep(1000) + assertEquals(1, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_triggered_by_sync_rowset_num")) + assertEquals(2, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_segment_complete_num")) + assertEquals(0, 
getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_complete_num")) + + future.get() + assertEquals(3, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_segment_complete_num")) + assertEquals(1, getBrpcMetricsByCluster(clusterName2, "file_cache_warm_up_rowset_complete_num")) + } finally { + GetDebugPoint().clearDebugPointsForAllBEs() + } + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy new file mode 100644 index 00000000000000..34ca1d7e8a4b4e --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_enable_prefer_cached_rowset.groovy @@ -0,0 +1,178 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_enable_prefer_cached_rowset', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'block_file_cache_monitor_interval_sec=1', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(2000) + } + + def updateBeConf = {cluster, key, value -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + def (code, out, err) = update_be_config(ip, port, key, value) + logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) + } + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql 
"""SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def injectCompactionRowsetDownloadSlow = {cluster, sleep_s -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + def injectName = 'CloudTablet::add_rowsets.download_data.callback.block_compaction_rowset' + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + GetDebugPoint().enableDebugPoint(ip, port as int, NodeType.BE, injectName, [sleep:sleep_s]) + } + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + updateBeConf(clusterName2, "enable_warmup_immediately_on_new_rowset", "true") + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + sql """ + create table test ( + col0 int not null, + col1 variant NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """insert into test values (1, '{"a" : 1.0}')""" + sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);""" + sql """insert into test values (3, '{"a" : "11111"}')""" + sql """insert into test values (4, '{"a" : 1111111111}')""" + sql """insert into test values (5, '{"a" : 1111.11111}')""" + + sql """use @${clusterName1}""" + qt_cluster1 """select * from test""" + + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + qt_cluster2_0 """select * from test""" + + // switch to source cluster and trigger compaction + sql """use @${clusterName1}""" + trigger_and_wait_compaction("test", "cumulative") + // load new data to increase the version + sql """insert into test values (6, '{"a" : 1111.11111}')""" + qt_cluster1_new_data "select * from test;" + + // inject to let cluster2 read compaction rowset data slowly + injectCompactionRowsetDownloadSlow(clusterName2, 10) + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + + // when enable_prefer_cached_rowset = false, need to read all data including compaction rowsets + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + + sql "set enable_prefer_cached_rowset = true" + // when enable_prefer_cached_rowset = true, only need to read newly load data, compaction rowsets data will be skipped + def t1 = System.currentTimeMillis() + def capturePreferCacheCount = getBrpcMetricsByCluster(clusterName2, "capture_prefer_cache_count") + qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 2000 + assert getBrpcMetricsByCluster(clusterName2, "capture_prefer_cache_count") == 
capturePreferCacheCount + 1 + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy new file mode 100644 index 00000000000000..215e588137ed8e --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/no_warmup/test_query_freshness_tolerance.groovy @@ -0,0 +1,183 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_query_freshness_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'block_file_cache_monitor_interval_sec=1', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(2000) + } + + def updateBeConf = {cluster, key, value -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + def (code, out, err) = update_be_config(ip, port, key, value) + logger.info("update config: code=" + code + ", out=" + out + ", err=" + err) + } + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + def ret = matcher[0][1] as long + logger.info("getBrpcMetrics, ${url}, name:${name}, value:${ret}") + return ret + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = 
be[5] + return getBrpcMetrics(ip, port, name) + } + + def injectS3FileReadSlow = {cluster, sleep_s -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + def injectName = 'S3FileReader::read_at_impl.io_slow' + for (be in cluster_bes) { + def ip = be[1] + def port = be[4] + GetDebugPoint().enableDebugPoint(ip, port as int, NodeType.BE, injectName, [sleep:sleep_s]) + } + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + updateBeConf(clusterName2, "enable_warmup_immediately_on_new_rowset", "true") + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + sql """ + create table test ( + col0 int not null, + col1 variant NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """insert into test values (1, '{"a" : 1.0}')""" + sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);""" + sql """insert into test values (3, '{"a" : "11111"}')""" + sql """insert into test values (4, '{"a" : 1111111111}')""" + sql """insert into test values (5, '{"a" : 1111.11111}')""" + + sql """use @${clusterName1}""" + qt_cluster1 """select * from test""" + + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + qt_cluster2_0 """select * from test""" + + // sleep for 5s to let these rowsets meet the requirement of query freshness tolerance + sleep(5000) + + // switch to source cluster and trigger compaction + sql """use @${clusterName1}""" + trigger_and_wait_compaction("test", "cumulative") + // load new data to increase the version + sql """insert into test values (6, '{"a" : 1111.11111}')""" + qt_cluster1_new_data "select * from test;" + + // inject to let cluster2 read compaction rowset data slowly + injectS3FileReadSlow(clusterName2, 10) + // switch to read cluster, trigger a sync rowset + sql """use @${clusterName2}""" + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // when query_freshness_tolerance_ms is set, newly load data and compaction rowsets data will be skipped + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + // query with freshness tolerance should not fallback + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + } +} diff --git 
a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy new file mode 100644 index 00000000000000..3ce4ee58f4b771 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_compaction_query_tolerance.groovy @@ -0,0 +1,316 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_compaction_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=20000', + 'warm_up_rowset_sync_wait_max_timeout_ms=20000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(1000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { 
it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, 
"CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(5000) + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test values (2, 2)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(3000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) + + // trigger and wait compaction async + def future = thread { + sql """use @${clusterName1}""" + do_cumu_compaction(src_be, "test", tablet_id, 2, 5) + } + // wait until the warmup for compaction started + waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000) + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + assertEquals(num_submitted + 1, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num")) + assertEquals(num_finished, getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num")) + + + // a new insert will trigger the sync rowset 
operation in the following query + sql """insert into test values (9, 9)""" + + + // in this moment, compaction has completed, but not commited, it's waiting for warm up + // trigger a query on read cluster, can't read the compaction data + sql """use @${clusterName2}""" + sql "select * from test" + def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id) + def rowsets = tablet_status ["rowsets"] + assert rowsets[1].contains("[2-2]") + assert rowsets[2].contains("[3-3]") + assert rowsets[3].contains("[4-4]") + assert rowsets[4].contains("[5-5]") + assert rowsets[5].contains("[6-6]") + assert rowsets[6].contains("[7-7]") + assert rowsets[7].contains("[8-8]") + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // should not contains (9,9) + qt_cluster2 """select * from test""" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + future.get() + assert num_finished + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert 0 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy new file mode 100644 index 00000000000000..de4887624e4945 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_idx_query_tolerance.groovy @@ -0,0 +1,332 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
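The delay suites in this directory, including the inverted-index variant below, share one skeleton: block warm-up of the compaction output on the read cluster with a BE debug point, then verify that a query issued within query_freshness_tolerance_ms is served from rowsets that are already cached and does not fall back. A condensed sketch of that pattern, assuming the be/clusterName2 bindings and the getBrpcMetricsByCluster helper defined in these suites:

    // block warm-up of the compaction output on the read cluster
    // (debug point added by the BE changes at the top of this patch)
    GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE,
            "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep: 10])

    sql """use @${clusterName2}"""
    sql "set query_freshness_tolerance_ms = 5000"

    def captured = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count")
    def fellBack = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count")

    // served from already-warmed rowsets inside the tolerance window, so it returns quickly
    sql "select * from test"
    assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == captured + 1
    // a query inside the tolerance window must not fall back to capturing the newest rowsets
    assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fellBack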
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_idx_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=100', // to cauase timeout + 'warm_up_rowset_sync_wait_max_timeout_ms=100', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(5000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, 
"file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = 
"event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL, + INDEX idx1(col1) USING INVERTED + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test(col0,col1,__DORIS_DELETE_SIGN__) values (1, 2, 1)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + def num_idx_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_index_num") + def num_idx_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + assert num_idx_submitted >= 6 + assert num_idx_finished == num_idx_submitted + + sql """use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_inverted_idx", [sleep:10]) + + // trigger and wait compaction async + def future = thread { + sql """use @${clusterName1}""" + do_cumu_compaction(src_be, "test", tablet_id, 2, 5) + } + sleep(500) + // wait until the warmup for compaction started + waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000) + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + assert num_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + assert num_finished + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_idx_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_index_num") + assert num_idx_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + + + // a new insert will trigger the sync rowset operation in the following query + sql """insert into test values (9, 9)""" + + // trigger a query on read cluster without query tolerance, read the origin data + sql """use @${clusterName2}""" + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + qt_cluster2_1 "select * from test order by 
col0, __DORIS_VERSION_COL__;" + def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id) + def rowsets = tablet_status ["rowsets"] + assert rowsets[1].contains("[2-5]") + assert rowsets[2].contains("[6-6]") + assert rowsets[3].contains("[7-7]") + assert rowsets[4].contains("[8-8]") + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + // trigger a query on read cluster with query freshness tolerance, read the compacted data + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // should not contain (9,9) + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + sleep(10000) + // assert num_finished + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_index_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_index_num") + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy new file mode 100644 index 00000000000000..688aa5e4446a57 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_sc_query_tolerance.groovy @@ -0,0 +1,308 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
+ +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_sc_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=100', + 'warm_up_rowset_sync_wait_max_timeout_ms=100', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(1000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def 
failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = 
jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1),(2,2);""" + sql """insert into test(col0,__DORIS_DELETE_SIGN__) values (1, 1);""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert num_submitted >= 1 + assert num_finished == num_submitted + + sql """use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) + + sql """insert into test values (9, 9)""" + + do_cumu_compaction(src_be, "test", tablet_id, 2, 4) + + // trigger a heavy SC + sql "alter table test modify column col1 varchar(1000);" + + waitForSchemaChangeDone { + sql """ SHOW ALTER TABLE COLUMN WHERE TableName='test' ORDER BY createtime DESC LIMIT 1 """ + time 1000 + } + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + // assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + // assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + + + sql """use @${clusterName2}""" + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + qt_cluster2_2 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + sleep(10000) + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == 
getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy new file mode 100644 index 00000000000000..b13609ed42e1e8 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_delay_timeout_compaction_query_tolerance.groovy @@ -0,0 +1,335 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_delay_timeout_compaction_query_tolerance', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'read_cluster_cache_opt_verbose_log=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=100', + 'warm_up_rowset_sync_wait_max_timeout_ms=100', // to cause timeout + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(2000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: 
firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, "file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def waitForBrpcMetricValue = { ip, port, metricName, targetValue, timeoutMs -> + def delta_time = 100 + def useTime = 0 + + for(int t = delta_time; t <= timeoutMs; t += delta_time){ + try { + def currentValue = getBrpcMetrics(ip, port, metricName) + + if (currentValue == targetValue) { + logger.info("BE ${ip}:${port} metric ${metricName} reached target value: ${targetValue}") + return true + } + + logger.info("BE ${ip}:${port} metric ${metricName} current value: ${currentValue}, target: ${targetValue}") + + } catch (Exception e) { + logger.warn("Failed to get metric ${metricName} from BE ${ip}:${port}: ${e.message}") + } + + useTime = t + sleep(delta_time) + } + + assertTrue(useTime <= timeoutMs, "waitForBrpcMetricValue timeout") + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def do_cumu_compaction = { def be, def tbl, def tablet_id, int start, int end -> + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets", [tablet_id: "${tablet_id}", 
start_version: "${start}", end_version: "${end}"]) + trigger_and_wait_compaction(tbl, "cumulative") + GetDebugPoint().disableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudSizeBasedCumulativeCompactionPolicy::pick_input_rowsets.set_input_rowsets") + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true", + "enable_unique_key_merge_on_write" = "false"); + """ + + clearFileCacheOnAllBackends() + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test(col0,col1,__DORIS_DELETE_SIGN__) values (1, 2, 1)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + def num_requested = getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + assert num_requested == num_finished + + sql """use @${clusterName2}""" + // ensure that base rowsets' meta are loaded on target cluster + qt_cluster2_0 "select * from test order by col0, __DORIS_VERSION_COL__;" + sql """use @${clusterName1}""" + + // inject sleep when read cluster warm up rowset for compaction and load + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment", [sleep:10]) + + // trigger and wait compaction async + def future = thread { + sql """use @${clusterName1}""" 
+ do_cumu_compaction(src_be, "test", tablet_id, 2, 5) + } + // wait until the warmup for compaction started + waitForBrpcMetricValue(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_num", 1, /*timeout*/10000) + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + assert num_submitted + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + + + // a new insert will trigger the sync rowset operation in the following query + sql """insert into test values (9, 9)""" + sleep(500) + assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + + + // trigger a query on read cluster without query freshness tolerance, read the original data + sql """use @${clusterName2}""" + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + qt_cluster2_1 "select * from test order by col0, __DORIS_VERSION_COL__;" + def tablet_status = getTabletStatus(be.ip, be.http_port, tablet_id) + def rowsets = tablet_status ["rowsets"] + assert rowsets[1].contains("[2-5]") + assert rowsets[2].contains("[6-6]") + assert rowsets[3].contains("[7-7]") + assert rowsets[4].contains("[8-8]") + + // this query will trigger sync_rowsets; because the compaction cnts have changed, version_overlap will be true, so the compaction rowset's + // and the new load rowset's warmup tasks would be triggered. However, these rowsets' warmup tasks have already been triggered by passive warmup, + // so we check that they will not be triggered again + assert num_submitted + 2 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + + sql "set enable_profile=true;" + sql "set profile_level=2;" + + // trigger a query on read cluster with query freshness tolerance, read the compacted data + sql "set query_freshness_tolerance_ms = 5000" + def t1 = System.currentTimeMillis() + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + // should not contain (9,9) + sql "set skip_delete_sign=true;" + sql "set show_hidden_columns=true;" + sql "set skip_storage_engine_merge=true;" + qt_cluster2 "select * from test order by col0, __DORIS_VERSION_COL__;" + def t2 = System.currentTimeMillis() + logger.info("query in cluster2 cost=${t2 - t1} ms") + assert t2 - t1 < 3000 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + future.get() + assert num_finished == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + assert 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_warm_up_rowset_wait_for_compaction_timeout_num") + + sleep(10000) + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + } +} diff --git a/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy
b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy new file mode 100644 index 00000000000000..ea1aafbc44cf37 --- /dev/null +++ b/regression-test/suites/cloud_p0/read_cluster_cache/warmup/test_warmup_download_fail.groovy @@ -0,0 +1,254 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +import org.apache.doris.regression.suite.ClusterOptions +import org.apache.doris.regression.util.NodeType +import groovy.json.JsonSlurper + +suite('test_warmup_download_fail', 'docker') { + def options = new ClusterOptions() + options.feConfigs += [ + 'cloud_cluster_check_interval_second=1', + 'cloud_tablet_rebalancer_interval_second=1', + ] + options.beConfigs += [ + 'file_cache_enter_disk_resource_limit_mode_percent=99', + 'enable_evict_file_cache_in_advance=false', + 'file_cache_background_monitor_interval_ms=1000', + 'warm_up_rowset_slow_log_ms=1', + 'enable_compaction_delay_commit_for_warm_up=true', + 'warm_up_rowset_sync_wait_min_timeout_ms=20000', + 'warm_up_rowset_sync_wait_max_timeout_ms=20000', + ] + options.enableDebugPoints() + options.cloudMode = true + + def clearFileCache = {ip, port -> + def url = "http://${ip}:${port}/api/file_cache?op=clear&sync=true" + def response = new URL(url).text + def json = new JsonSlurper().parseText(response) + + // Check the status + if (json.status != "OK") { + throw new RuntimeException("Clear cache on ${ip}:${port} failed: ${json.status}") + } + } + + def clearFileCacheOnAllBackends = { + def backends = sql """SHOW BACKENDS""" + + for (be in backends) { + def ip = be[1] + def port = be[4] + clearFileCache(ip, port) + } + + // clear file cache is async, wait it done + sleep(1000) + } + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + logger.info("Metric ${name} on ${ip}:${port} is ${matcher[0][1]}") + return matcher[0][1] as long + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + def getBeIpAndPort = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + + if (cluster_bes.isEmpty()) { + throw new RuntimeException("No BE found for cluster: ${cluster}") + } + + def firstBe = cluster_bes[0] + return [ip: firstBe[1], http_port:firstBe[4], rpc_port: firstBe[5]] + } + + def logFileCacheDownloadMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted = getBrpcMetrics(ip, port, 
"file_cache_download_submitted_num") + def finished = getBrpcMetrics(ip, port, "file_cache_download_finished_num") + def failed = getBrpcMetrics(ip, port, "file_cache_download_failed_num") + logger.info("${cluster} be ${ip}:${port}, downloader submitted=${submitted}" + + ", finished=${finished}, failed=${failed}") + } + } + + def logWarmUpRowsetMetrics = { cluster -> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + for (be in cluster_bes) { + def ip = be[1] + def port = be[5] + def submitted_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_segment_num") + def finished_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_segment_num") + def failed_segment = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_segment_num") + def submitted_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_submitted_index_num") + def finished_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_finished_index_num") + def failed_index = getBrpcMetrics(ip, port, "file_cache_event_driven_warm_up_failed_index_num") + def compaction_sync_wait = getBrpcMetrics(ip, port, "file_cache_warm_up_rowset_wait_for_compaction_num") + logger.info("${cluster} be ${ip}:${port}, submitted_segment=${submitted_segment}" + + ", finished_segment=${finished_segment}, failed_segment=${failed_segment}" + + ", submitted_index=${submitted_index}" + + ", finished_index=${finished_index}" + + ", failed_index=${failed_index}" + + ", compaction_sync_wait=${compaction_sync_wait}") + } + } + + def getTabletStatus = { ip, port, tablet_id -> + StringBuilder sb = new StringBuilder(); + sb.append("curl -X GET http://${ip}:${port}") + sb.append("/api/compaction/show?tablet_id=") + sb.append(tablet_id) + + String command = sb.toString() + logger.info(command) + def process = command.execute() + def code = process.waitFor() + def out = process.getText() + logger.info("Get tablet status: =" + code + ", out=" + out) + assertEquals(code, 0) + def tabletStatus = parseJson(out.trim()) + return tabletStatus + } + + def getBrpcMetricsByCluster = {cluster, name-> + def backends = sql """SHOW BACKENDS""" + def cluster_bes = backends.findAll { it[19].contains("""\"compute_group_name\" : \"${cluster}\"""") } + assert cluster_bes.size() > 0, "No backend found for cluster ${cluster}" + def be = cluster_bes[0] + def ip = be[1] + def port = be[5] + return getBrpcMetrics(ip, port, name) + } + + docker(options) { + def clusterName1 = "warmup_source" + def clusterName2 = "warmup_target" + + // Add two clusters + cluster.addBackend(1, clusterName1) + cluster.addBackend(1, clusterName2) + + def tag1 = getCloudBeTagByName(clusterName1) + def tag2 = getCloudBeTagByName(clusterName2) + + logger.info("Cluster tag1: {}", tag1) + logger.info("Cluster tag2: {}", tag2) + + def jsonSlurper = new JsonSlurper() + + def getJobState = { jobId -> + def jobStateResult = sql """SHOW WARM UP JOB WHERE ID = ${jobId}""" + return jobStateResult[0][3] + } + + // Ensure we are in source cluster + sql """use @${clusterName1}""" + + // Start warm up job + def jobId_ = sql """ + WARM UP CLUSTER ${clusterName2} WITH CLUSTER ${clusterName1} + PROPERTIES ( + "sync_mode" = "event_driven", + "sync_event" = "load" + ) + """ + + def jobId = jobId_[0][0] + logger.info("Warm-up job ID: ${jobId}") + + sql """ + create table test ( + col0 int not null, + col1 int NOT NULL + ) UNIQUE KEY(`col0`) + DISTRIBUTED BY 
HASH(col0) BUCKETS 1 + PROPERTIES ("file_cache_ttl_seconds" = "3600", "disable_auto_compaction" = "true"); + """ + + clearFileCacheOnAllBackends() + sleep(1000) + + sql """use @${clusterName1}""" + // load data + sql """insert into test values (1, 1)""" + sql """insert into test values (2, 2)""" + sql """insert into test values (3, 3)""" + sql """insert into test values (4, 4)""" + sql """insert into test values (5, 5)""" + sql """insert into test values (6, 6)""" + sleep(5000) + + def tablets = sql_return_maparray """ show tablets from test; """ + logger.info("tablets: " + tablets) + assertEquals(1, tablets.size()) + def tablet = tablets[0] + String tablet_id = tablet.TabletId + + def be = getBeIpAndPort(clusterName2) + def src_be = getBeIpAndPort(clusterName1) + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + def num_submitted = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_submitted_segment_num") + def num_finished = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + def num_failed = getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_failed_segment_num") + assert num_submitted >= 6 + assert num_finished == num_submitted + assert num_failed == 0 + + GetDebugPoint().enableDebugPoint(be.ip, be.http_port as int, NodeType.BE, "CloudInternalServiceImpl::warm_up_rowset.download_segment.inject_error") + + // a new insert will trigger the sync rowset operation in the following query + sql """insert into test values (9, 9)""" + sleep(1000) + + assert num_failed + 1 == getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_failed_segment_num") + + + sql """use @${clusterName2}""" + sql "set enable_profile=true;" + sql "set profile_level=2;" + + // although download failed, the query should still read the newly inserted data + sql "set query_freshness_tolerance_ms = 5000" + def queryFreshnessToleranceCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") + def fallbackCount = getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") + qt_cluster2 """select * from test""" + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_count") == queryFreshnessToleranceCount + 1 + assert getBrpcMetricsByCluster(clusterName2, "capture_with_freshness_tolerance_fallback_count") == fallbackCount + + logFileCacheDownloadMetrics(clusterName2) + logWarmUpRowsetMetrics(clusterName2) + + assert getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_finished_segment_num") + getBrpcMetrics(be.ip, be.rpc_port, "file_cache_event_driven_warm_up_failed_segment_num") + == getBrpcMetrics(src_be.ip, src_be.rpc_port, "file_cache_event_driven_warm_up_requested_segment_num") + } +} diff --git a/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy new file mode 100644 index 00000000000000..12d9c682001d6d --- /dev/null +++ b/regression-test/suites/cloud_p0/test_read_cluster_var_property.groovy @@ -0,0 +1,214 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +suite('test_read_cluster_var_property') { + if (!isCloudMode()) { + return + } + String userName = "test_read_cluster_var_property_user" + String pwd = '123456' + sql """drop user if exists ${userName}""" + sql """CREATE USER '${userName}' IDENTIFIED BY '${pwd}'""" + sql """GRANT ADMIN_PRIV ON *.*.* TO ${userName}""" + + def getBrpcMetrics = {ip, port, name -> + def url = "http://${ip}:${port}/brpc_metrics" + def metrics = new URL(url).text + def matcher = metrics =~ ~"${name}\\s+(\\d+)" + if (matcher.find()) { + def ret = matcher[0][1] as long + logger.info("getBrpcMetrics, ${url}, name:${name}, value:${ret}") + return ret + } else { + throw new RuntimeException("${name} not found for ${ip}:${port}") + } + } + + connect(userName, "${pwd}", context.config.jdbcUrl) { + // test non-mow table + try { + def tableName = "test_read_cluster_var_property" + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ CREATE TABLE ${tableName} + (k int, v1 int, v2 int ) + DUPLICATE KEY(k) + DISTRIBUTED BY HASH (k) + BUCKETS 1 PROPERTIES( + "replication_num" = "1", + "disable_auto_compaction" = "true"); + """ + + (1..20).each{ id -> + sql """insert into ${tableName} select number, number, number from numbers("number"="10");""" + } + + sql "select * from ${tableName};" + + def backends = sql_return_maparray('show backends') + def tabletStats = sql_return_maparray("show tablets from ${tableName};") + assert tabletStats.size() == 1 + def tabletId = tabletStats[0].TabletId + def tabletBackendId = tabletStats[0].BackendId + def tabletBackend + for (def be : backends) { + if (be.BackendId == tabletBackendId) { + tabletBackend = be + break; + } + } + logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}"); + + try { + // 1. test enable_prefer_cached_rowset + sql "set enable_prefer_cached_rowset=true;" + def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + 1 + + sql "set enable_prefer_cached_rowset=false;" + preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + + // user property has higher priority than session variable + sql "set property for '${userName}' 'enable_prefer_cached_rowset'='true';" + preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == 1 + preferCachedRowsetCount + } finally { + sql "set enable_prefer_cached_rowset=false;" + sql "set property for '${userName}' 'enable_prefer_cached_rowset'='false';" + } + + try { + // 2.
test query_freshness_tolerance_ms + sql "set query_freshness_tolerance_ms=1000;" + def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 + + sql "set query_freshness_tolerance_ms=-1;" + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + + // user property has higher priority than session variable + sql "set property for '${userName}' 'query_freshness_tolerance_ms'='2000';" + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == 1 + queryFreshnessTolerance + } finally { + sql "set query_freshness_tolerance_ms=-1;" + sql "set property for '${userName}' 'query_freshness_tolerance_ms'='-1';" + } + } catch (Exception e) { + logger.error("Error occurred while testing query_freshness_tolerance_ms: ${e.message}") + } finally { + sql "set enable_prefer_cached_rowset=false;" + sql "set query_freshness_tolerance_ms=-1;" + sql "set property for '${userName}' 'enable_prefer_cached_rowset'='false';" + sql "set property for '${userName}' 'query_freshness_tolerance_ms'='-1';" + } + + // test mow table + try { + def tableName = "test_read_cluster_var_property_mow" + sql """ DROP TABLE IF EXISTS ${tableName} """ + sql """ CREATE TABLE ${tableName} + (k int, v1 int, v2 int ) + UNIQUE KEY(k) DISTRIBUTED BY HASH (k) + BUCKETS 1 PROPERTIES( + "replication_num" = "1", + "enable_unique_key_merge_on_write" = "true", + "disable_auto_compaction" = "true"); + """ + + (1..20).each{ id -> + sql """insert into ${tableName} select number, number, number from numbers("number"="10");""" + } + + sql "select * from ${tableName};" + + def backends = sql_return_maparray('show backends') + def tabletStats = sql_return_maparray("show tablets from ${tableName};") + assert tabletStats.size() == 1 + def tabletId = tabletStats[0].TabletId + def tabletBackendId = tabletStats[0].BackendId + def tabletBackend + for (def be : backends) { + if (be.BackendId == tabletBackendId) { + tabletBackend = be + break; + } + } + logger.info("tablet ${tabletId} on backend ${tabletBackend.Host} with backendId=${tabletBackend.BackendId}"); + + try { + // 1.
test enable_prefer_cached_rowset + // enable_prefer_cached_rowset should not take effect on mow table + sql "set enable_prefer_cached_rowset=true;" + def preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + + sql "set enable_prefer_cached_rowset=false;" + preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + + // user property has higher priority than session variable + sql "set property for '${userName}' 'enable_prefer_cached_rowset'='true';" + preferCachedRowsetCount = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_prefer_cache_count") == preferCachedRowsetCount + } finally { + sql "set enable_prefer_cached_rowset=false;" + sql "set property for '${userName}' 'enable_prefer_cached_rowset'='false';" + } + + try { + // 2. test query_freshness_tolerance_ms + sql "set query_freshness_tolerance_ms=1000;" + def queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + 1 + + sql "set query_freshness_tolerance_ms=-1;" + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == queryFreshnessTolerance + + // user property has higher priority than session variable + sql "set property for '${userName}' 'query_freshness_tolerance_ms'='2000';" + queryFreshnessTolerance = getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") + sql "select * from ${tableName};" + assert getBrpcMetrics(tabletBackend.Host, tabletBackend.BrpcPort, "capture_with_freshness_tolerance_count") == 1 + queryFreshnessTolerance + } finally { + sql "set query_freshness_tolerance_ms=-1;" + sql "set property for '${userName}' 'query_freshness_tolerance_ms'='-1';" + } + } catch (Exception e) { + logger.error("Error occurred while testing query_freshness_tolerance_ms: ${e.message}") + throw e + } finally { + sql "set enable_prefer_cached_rowset=false;" + sql "set query_freshness_tolerance_ms=-1;" + sql "set property for '${userName}' 'enable_prefer_cached_rowset'='false';" + sql "set property for '${userName}' 'query_freshness_tolerance_ms'='-1';" + } + } +} \ No newline at end of file